In [2]:
import nltk
import pandas as pd
import re
import string

from gensim import models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_auc_score
from urllib.request import urlopen

# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [3]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [4]:
whitespace_string = ' '.join(whitespace_string.split())
print(whitespace_string)

This is a string that has a lot of extra whitespace.


### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [5]:
days = []
months = []
years = []

with urlopen('https://raw.githubusercontent.com/ryanleeallred/datasets/master/dates.txt') as file:
    
    encoding = file.info().get_param('charset', 'utf8')
    html = file.read().decode(encoding)

    data = re.sub(',','', html)
    data = data.splitlines()
    for line in data:
        split_str = re.findall(r"\S+", line)
        months.append(split_str[0])
        days.append(split_str[1])
        years.append(split_str[2])
    


In [6]:
df = pd.DataFrame()
df['Day'] = days
df['Month'] = months
df['Year'] = years

In [7]:
df.head()

Unnamed: 0,Day,Month,Year
0,8,March,2015
1,15,March,2015
2,22,March,2015
3,29,March,2015
4,5,April,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [7]:
df = pd.read_csv("https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv") 

# we create a small utility function so we can apply our clean method to series/df
def regex_cleaner(df_cell):
    current = ''.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\ / \ / \S+)", "", df_cell))
    return current
                             
df['SentimentText'] = df['SentimentText'].apply(regex_cleaner)
df['tokenized'] = df['SentimentText'].apply(nltk.word_tokenize)

In [8]:
df.head()

Unnamed: 0,Sentiment,SentimentText,tokenized
0,0,is so sad for my APL friend,"[is, so, sad, for, my, APL, friend]"
1,0,I missed the New Moon trailer,"[I, missed, the, New, Moon, trailer]"
2,1,omg its already 730 O,"[omg, its, already, 730, O]"
3,0,Omgaga Im sooo im gunna CRy Ive be...,"[Omgaga, Im, sooo, im, gunna, CRy, Ive, been, ..."
4,0,i think mi bf is cheating on me TT,"[i, think, mi, bf, is, cheating, on, me, TT]"


### How should TF-IDF scores be interpreted? How are they calculated?

TF-IDF scores are a model for word importance in the context of it's document:corpus

#### TF-IDF is calculated by two contextual divisions that assign weights to words occuring in documents.
The first assumes that the greater the occurence of a word in a singular document, the greater it's importance. This is referred to as the Term Frequency and can be adjusted for many factors, such as document length in comparison to other documents.

The second assumes that the greater the occurence of a word in the corpus, the lesser it's importance and uses this assumption as an offset. This is referred to as the Inverse Document Frequency, and it reduces the words weight, or importance as occurence across the corpus increases and buffs the weight of rare terminology.


TF-IDF is calculated by multiplying Term Frequency (TF) and Inverse Document Frequency(IDF). Hence the name abbreviation :P 



# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [20]:
X = df['SentimentText']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30)

stop_words = set(nltk.corpus.stopwords.words('english'))

In [21]:
logreg_vect_pipe = make_pipeline(TfidfVectorizer(stop_words=stop_words, lowercase=False, sublinear_tf=True),
                        LogisticRegression(solver='lbfgs',
                                       max_iter=10000))
grid_params = [{'tfidfvectorizer__ngram_range' : [(1, 1), (1, 2), (1,3), (1,4), (1,5)],
                  'tfidfvectorizer__max_features' : [1000, 10000, 100000, None]}]

lr_grid = GridSearchCV(logreg_vect_pipe, 
                       grid_params, 
                       cv=2,
                      scoring='roc_auc')
lr_grid.fit(X_train, y_train);

In [22]:
print ('Train data: ROC AUC: %.2f' % roc_auc_score(lr_grid.predict(X_train), y_train), 
       'Test data: ROC AUC: %.2f' % roc_auc_score(lr_grid.predict(X_test), y_test))

Train data: ROC AUC: 0.85 Test data: ROC AUC: 0.76


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [16]:
vector_model = models.Word2Vec(df.tokenized)
vector_model.wv.most_similar('twitter')

[('facebook', 0.771111011505127),
 ('Twitter', 0.7573177814483643),
 ('myspace', 0.754893958568573),
 ('everyone', 0.6923916339874268),
 ('message', 0.6493430137634277),
 ('someone', 0.6354849338531494),
 ('FB', 0.6302226781845093),
 ('rely', 0.6148366928100586),
 ('youtube', 0.603368878364563),
 ('direct', 0.6029317378997803)]