# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [2]:
import string
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, brier_score_loss
from gensim.models.word2vec import Word2Vec

In [3]:
s = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

solution1 = ' '.join(s.split())

wrdsp = r'[a-zA-Z]+\s'
last = r'[a-zA-Z]+\.'

solution2 = ''.join(re.findall(wrdsp, s) + re.findall(last, s))

print(solution1)

This is a string that has a lot of extra whitespace.


### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [4]:
dates_url = 'https://raw.githubusercontent.com/ryanleeallred/datasets/master/dates.txt'
dates_ = pd.read_csv(dates_url, 
                    header=None
                   ).rename(columns={0: 'monthday', 1:'year'})

def monthdaysplit(dat): 
    def getsplit(i: int): 
        return dat.monthday.apply(lambda s: s.split()[i])
    return dat.assign(month = getsplit(0), day = getsplit(1)).drop('monthday', axis=1)

monthdaysplit(dates_).head()

Unnamed: 0,year,month,day
0,2015,March,8
1,2015,March,15
2,2015,March,22
3,2015,March,29
4,2015,April,5


In [5]:
month_pat = re.compile(r'^[A-Z]{1}[a-z]+')
day_pat = re.compile(r'[0-9]{1,2}')
year_pat = re.compile(r'2015')

rows_list=[]
dates_df = pd.DataFrame(columns=['month', 'day', 'year'])
with open('dates.txt', 'r') as f:
    for i,s in enumerate(f.readlines()):
        month = month_pat.search(s)
        day = day_pat.search(s)
        year = year_pat.search(s)
        
        rows_list.append({'month': month.group(), 'day': day.group(), 'year': year.group()})

dates_df = pd.DataFrame(rows_list, columns=['month', 'day', 'year'])

dates_df

Unnamed: 0,month,day,year
0,March,8,2015
1,March,15,2015
2,March,22,2015
3,March,29,2015
4,April,5,2015
5,April,12,2015
6,April,19,2015
7,April,26,2015
8,May,3,2015
9,May,10,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [6]:
bow_url = 'https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv'

def clean_bow(dat: pd.DataFrame) -> pd.DataFrame: 
    replacewords = stopwords.words('english') + [c for c in string.punctuation]
    return dat.assign(SentimentText = dat.SentimentText.str.lower().replace(replacewords, ''))

df_bow = clean_bow(pd.read_csv(bow_url).sample(2**14))
print(df_bow.shape)

df_bow.SentimentText.sample(3).apply(word_tokenize)

(16384, 2)


59089                                 [@, beechercreature]
14850    [..., on, date, ., boy, doesn, want, a, gf, ,,...
77872    [@, cadmium66, i, think, the, skanks, might, l...
Name: SentimentText, dtype: object

### How should TF-IDF scores be interpreted? How are they calculated?

TF-IDF is sort of like _entropy_ in that it measures our **uncertainty** about variables and penalizes stuff that's too common to be interesting. 

Given a list of documents, _term frequency `t`_ is a count for each word in each document. _document frequency_ `d` for each word documents the number of times it occurs when iterating through the whole list of documents, so `1/d` is what you're interested in. 

Ultimately, you want to divide these quantities and put them into `log`, because uncertainty measures always use `log`. 

# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [7]:
cntvecz = CountVectorizer(max_features=2**13)

X_train, X_test, y_train, y_test = train_test_split(df_bow.SentimentText, df_bow.Sentiment)

print([str(x.shape) + ' -- ' for x in [X_train, X_test, y_train, y_test]])

cntvecz.fit(X_train)

BoWd_train = pd.DataFrame(cntvecz.transform(X_train).toarray(), columns=cntvecz.get_feature_names())

BoWd_test = pd.DataFrame(cntvecz.transform(X_test).toarray(), columns=cntvecz.get_feature_names())


['(12288,) -- ', '(4096,) -- ', '(12288,) -- ', '(4096,) -- ']


In [8]:
rfc = RandomForestClassifier(n_estimators=20, criterion='entropy')
lr = LogisticRegression(solver='lbfgs')

rfc.fit(BoWd_train, y_train)
lr.fit(BoWd_train, y_train)

def report3(model): 
    
    train_predict = [t[0] for t in model.predict_proba(BoWd_train)]
    test_predict = [t[0] for t in model.predict_proba(BoWd_test)]
    
    avgaccur = f"""--MEAN ACCURACY--
            \tTrain score: {model.score(BoWd_train, y_train):.4}
            \tTest score: {model.score(BoWd_test, y_test):.4}"""
    rocauc = f"""\n--ROC AUC--
        \tTrain rocauc: {roc_auc_score(y_train, train_predict):.4}
        \tTest rocauc: {roc_auc_score(y_test.values,test_predict):.4}"""

    brier = f"""\n--BRIER SCORE LOSS--
        \tTrain brier: {brier_score_loss(y_train, train_predict):.4}
        \tTest brier: {brier_score_loss(y_test, test_predict):.4}"""

    return avgaccur + rocauc + brier

print("Random Forest Classification ")
print(report3(rfc))
print(''.join([' ' for _ in range(2**5)]))
print("Logistic Regression")
print(report3(lr))



Random Forest Classification 
--MEAN ACCURACY--
            	Train score: 0.9945
            	Test score: 0.7231
--ROC AUC--
        	Train rocauc: 0.0005414
        	Test rocauc: 0.2152
--BRIER SCORE LOSS--
        	Train brier: 0.7581
        	Test brier: 0.4491
                                
Logistic Regression
--MEAN ACCURACY--
            	Train score: 0.8867
            	Test score: 0.7371
--ROC AUC--
        	Train rocauc: 0.04773
        	Test rocauc: 0.1954
--BRIER SCORE LOSS--
        	Train brier: 0.6072
        	Test brier: 0.5063


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [9]:
min_count = 2
size = 50
window = 4

sentences = X_train.apply(word_tokenize)

model = Word2Vec(sentences, min_count=min_count, size=size, window=window)

print(len(list(model.wv.vocab.keys())))

model.most_similar('twitter',topn=10)


7091


  # This is added back by InteractiveShellApp.init_path()


[('whats', 0.9979201555252075),
 ('keeps', 0.9976176023483276),
 ('film', 0.9975176453590393),
 ('photo', 0.9971795082092285),
 ('sun', 0.9971530437469482),
 ('dr', 0.9968245625495911),
 ('cold', 0.9968036413192749),
 ('10', 0.9965555667877197),
 ('favorite', 0.9960423111915588),
 ('rock', 0.9960168600082397)]