In [116]:
from gensim.models.word2vec import Word2Vec

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re
import requests
import pandas as pd
import string

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_auc_score

# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [2]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [128]:
print (re.sub('[\s]+', ' ', whitespace_string.strip()).replace('a lot of', 'no'))

This is a string that has no extra whitespace.


### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [36]:
r = requests.get('https://raw.githubusercontent.com/ryanleeallred/datasets/master/dates.txt')
dates = r.text.replace('\r', '').split('\n')

In [35]:
months = [re.findall(r'[A-Z][a-z]+', date)[0] for date in dates]
days = [re.findall(r'[\d]{1,2},', date)[0][:-1] for date in dates]
years = [re.findall(r'[\d]{4}', date)[0] for date in dates]
df_dates = pd.DataFrame({'Month' : months,
                        'Day' : days,
                        'Year' : years})
df_dates.head()

Unnamed: 0,Month,Day,Year
0,March,8,2015
1,March,15,2015
2,March,22,2015
3,March,29,2015
4,April,5,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [39]:
# load the tweets
tweets = pd.read_csv('https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv')
tweets.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [49]:
def clean_tweets(text):
    """
    Takes in a string of Twitter text and returns cleaned string
    """
    return ''.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\ / \ / \S+)","", text.lower()))

In [69]:
tweets['text_clean'] = tweets['SentimentText'].apply(clean_tweets)
tweets['text_tokens'] = tweets['text_clean'].apply(word_tokenize)

In [70]:
tweets.head()

Unnamed: 0,Sentiment,SentimentText,text_clean,text_tokens
0,0,is so sad for my APL frie...,is so sad for my apl friend,"[is, so, sad, for, my, apl, friend]"
1,0,I missed the New Moon trail...,i missed the new moon trailer,"[i, missed, the, new, moon, trailer]"
2,1,omg its already 7:30 :O,omg its already 730 o,"[omg, its, already, 730, o]"
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga im sooo im gunna cry ive be...,"[omgaga, im, sooo, im, gunna, cry, ive, been, ..."
4,0,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me tt,"[i, think, mi, bf, is, cheating, on, me, tt]"


### How should TF-IDF scores be interpreted? How are they calculated?

TF-IDF scores are a way of determining how important a word is to a document in the context of a collection of documents (corpus).

The basic assumptions for determining a word's importance under TF-IDF are

1. The more frequently a word appears in a document, the more important it is
2. The more frequently a word appears in the corpus, the less important it is

For example, if I'm looking through a document and the word 'gun' is mentioned several times, it's likely important. But, if I'm looking through a set of documents about WWII, 'gun' is likely mentioned in most of them, and therefore less important.

TF-IDF is calculated by multiplying Term Frequency (TF) and Inverse Document Frequency.

Term Frequency is the number of times a word occurs in a given document. This 'frequency' can be adjusted when you're working with different documents (e.g. using boolean frequency or adjusting for document length).

Inverse Document Frequency is calculated as the log of (documents containing word / total number of documents).

# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [75]:
X = tweets.text_clean
y = tweets.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size=0.2,
                                                   stratify=y)

stop = stopwords.words('english')

### Logistic Regression + Count Vectorizer ###

In [111]:
lr_count = make_pipeline(CountVectorizer(stop_words=stop),
                        LogisticRegression(solver='lbfgs',
                                       max_iter=500))
lr_grid_params = [{'countvectorizer__ngram_range' : [(1,1), (1,2), (1,3)],
                  'countvectorizer__max_features' : [50, 100, None]}]

lr_grid = GridSearchCV(lr_count, 
                       lr_grid_params, 
                       cv=3,
                      scoring='roc_auc')
lr_grid.fit(X_train, y_train);

In [112]:
print ('Best Params\n', lr_grid.best_params_)
print ('\nTrain ROC AUC: %.2f' % roc_auc_score(lr_grid.predict(X_train), y_train))
print ('Test ROC AUC: %.2f' % roc_auc_score(lr_grid.predict(X_test), y_test))

Best Params
 {'countvectorizer__max_features': None, 'countvectorizer__ngram_range': (1, 3)}

Train ROC AUC: 0.98
Test ROC AUC: 0.76


### Naive Bayes + TF-IDF Vectorizer

In [117]:
nb_tfidf = make_pipeline(TfidfVectorizer(stop_words=stop),
                        MultinomialNB())
nb_grid_params = [{'tfidfvectorizer__ngram_range' : [(1,2), (1,3), (2,5)],
                  'tfidfvectorizer__max_features' : [100, None]}]
nb_grid = GridSearchCV(nb_tfidf, 
                       nb_grid_params, 
                       cv=3,
                      scoring='roc_auc')
nb_grid.fit(X_train, y_train);

In [124]:
print ('Best Params\n', nb_grid.best_params_)
print ('\nTrain ROC AUC: %.2f' % roc_auc_score(nb_grid.predict(X_train), y_train))
print ('Test ROC AUC: %.2f' % roc_auc_score(nb_grid.predict(X_test), y_test))

Best Params
 {'tfidfvectorizer__max_features': None, 'tfidfvectorizer__ngram_range': (1, 3)}

Train ROC AUC: 0.97
Test ROC AUC: 0.76


### XGBoost (!!)

In [126]:
gb_tfidf = TfidfVectorizer(max_features=None,
                           ngram_range=(1,3),
                           stop_words=stop)

gb = GradientBoostingClassifier(max_depth=10)

gb_pipe = make_pipeline(gb_tfidf,
                       gb)
gb_pipe.fit(X_train, y_train);

In [127]:
print ('\nTrain ROC AUC: %.2f' % roc_auc_score(gb_pipe.predict(X_train), y_train))
print ('Test ROC AUC: %.2f' % roc_auc_score(gb_pipe.predict(X_test), y_test))


Train ROC AUC: 0.79
Test ROC AUC: 0.73


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [99]:
twitter_model = Word2Vec(tweets.text_tokens)

In [101]:
twitter_model.most_similar('twitter')

  """Entry point for launching an IPython kernel.


[('facebook', 0.764838695526123),
 ('myspace', 0.7137081027030945),
 ('fb', 0.6582540273666382),
 ('youtube', 0.650816798210144),
 ('list', 0.6289170980453491),
 ('everyone', 0.6272796988487244),
 ('blog', 0.6241987943649292),
 ('message', 0.609565019607544),
 ('comment', 0.6038779020309448),
 ('updates', 0.5980688333511353)]