# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen

import nltk
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.models.word2vec import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gutierrez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [3]:
##### Your Code Here #####
clean_string = whitespace_string.strip()
clean_string = (' ').join(clean_string.split())
print(clean_string)

This is a string that has a lot of extra whitespace.


### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [28]:
##### Your Code Here #####
dates = '''March 8, 2015
March 15, 2015
March 22, 2015
March 29, 2015
April 5, 2015
April 12, 2015
April 19, 2015
April 26, 2015
May 3, 2015
May 10, 2015
May 17, 2015
May 24, 2015
May 31, 2015
June 7, 2015
June 14, 2015
June 21, 2015
June 28, 2015
July 5, 2015
July 12, 2015
July 19, 2015'''

In [57]:
regex = r"([a-zA-Z]+) ([\d]+), (\d{4})"

search_result = re.findall(regex, dates)
print(search_result)

[('March', '8', '2015'), ('March', '15', '2015'), ('March', '22', '2015'), ('March', '29', '2015'), ('April', '5', '2015'), ('April', '12', '2015'), ('April', '19', '2015'), ('April', '26', '2015'), ('May', '3', '2015'), ('May', '10', '2015'), ('May', '17', '2015'), ('May', '24', '2015'), ('May', '31', '2015'), ('June', '7', '2015'), ('June', '14', '2015'), ('June', '21', '2015'), ('June', '28', '2015'), ('July', '5', '2015'), ('July', '12', '2015'), ('July', '19', '2015')]


In [59]:
df_dates = pd.DataFrame(search_result, columns=['Month', 'Day', 'Year'])
df_dates

Unnamed: 0,Month,Day,Year
0,March,8,2015
1,March,15,2015
2,March,22,2015
3,March,29,2015
4,April,5,2015
5,April,12,2015
6,April,19,2015
7,April,26,2015
8,May,3,2015
9,May,10,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [4]:
##### Your Code Here #####
df = pd.read_csv('https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv')

In [5]:
df.SentimentText.values[:5]

array(['                     is so sad for my APL friend.............',
       '                   I missed the New Moon trailer...',
       '              omg its already 7:30 :O',
       "          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...",
       '         i think mi bf is cheating on me!!!       T_T'],
      dtype=object)

In [6]:
def clean_text(doc):
    # make all text lowercase
    doc = str(doc).lower()
    # split into tokens by white space
    tokens = doc.split(' ')
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

df['SentimentText'] = df['SentimentText'].apply(clean_text)

In [7]:
df.SentimentText[:5]

0                                   [sad, apl, friend]
1                         [missed, new, moon, trailer]
2                                       [omg, already]
3    [omgaga, im, sooo, im, gunna, cry, ive, dentis...
4                        [think, mi, bf, cheating, tt]
Name: SentimentText, dtype: object

In [8]:
df.shape

(99989, 2)

### How should TF-IDF scores be interpreted? How are they calculated?

#### Your Answer Here #####

TF-IDF creates a matrix of words in documents provided where words are assigned a score between 0 and 1.
Words with higher relevance are given a higer score, while those with less relevance have a lower score.
Scores are calculated by taking the frequency of a word in a document divided by the number of documents that word is in. 

# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [9]:
# Joining text to make sentence
def join_text(doc):
    joined_doc = (' ').join(doc)
    return joined_doc

In [10]:
df['SentimentText1'] = df['SentimentText'].apply(join_text)

In [11]:
##### Your Code Here #####
df.head()

Unnamed: 0,Sentiment,SentimentText,SentimentText1
0,0,"[sad, apl, friend]",sad apl friend
1,0,"[missed, new, moon, trailer]",missed new moon trailer
2,1,"[omg, already]",omg already
3,0,"[omgaga, im, sooo, im, gunna, cry, ive, dentis...",omgaga im sooo im gunna cry ive dentist since ...
4,0,"[think, mi, bf, cheating, tt]",think mi bf cheating tt


In [12]:
df.shape

(99989, 3)

### Train_Test_Split to create train and test datasets.

In [13]:
from sklearn.model_selection import train_test_split

X = df.SentimentText1
y = df.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((79991,), (19998,), (79991,), (19998,))

### **Method 1:** Count Vectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=100, ngram_range=(1,1), stop_words='english')
vectorizer.fit(X_train)

# Vocabulary created with Count Vectorizer
print(vectorizer.vocabulary_)

{'sleep': 65, 'haha': 24, 'love': 46, 'little': 42, 'dont': 11, 'know': 38, 'wanna': 86, 'hear': 28, 'like': 41, 'got': 21, 'ill': 35, 'week': 90, 'time': 77, 'long': 44, 'thing': 74, 'good': 20, 'night': 55, 'youre': 99, 'way': 89, 'thats': 73, 'today': 78, 'yes': 98, 'cool': 6, 'right': 61, 'lol': 43, 'new': 53, 'nice': 54, 'happy': 25, 'day': 7, 'im': 36, 'sad': 62, 'come': 5, 'yeah': 97, 'doesnt': 10, 'think': 75, 'really': 60, 'want': 87, 'bad': 2, 'miss': 50, 'wont': 94, 'help': 29, 'amp': 0, 'twitter': 83, 'gonna': 19, 'let': 39, 'oh': 56, 'didnt': 9, 'ok': 57, 'thanks': 72, 'said': 63, 'work': 95, 'tomorrow': 79, 'hope': 33, 'feel': 12, 'better': 4, 'soon': 66, 'ive': 37, 'hi': 31, 'make': 47, 'going': 18, 'follow': 13, 'getting': 15, 'need': 52, 'great': 22, 'ya': 96, 'morning': 51, 'people': 58, 'id': 34, 'wait': 85, 'thank': 71, 'welcome': 92, 'ur': 84, 'life': 40, 'fun': 14, 'sorry': 67, 'look': 45, 'guys': 23, 'girl': 16, 'wish': 93, 'tonight': 80, 'tell': 70, 'home': 32, 

In [16]:
train_word_counts  = vectorizer.transform(X_train)
X_train_vectorized = pd.DataFrame(train_word_counts.toarray(),
                                 columns=vectorizer.get_feature_names())

print(X_train_vectorized.shape)
X_train_vectorized.head()

(79991, 100)


Unnamed: 0,amp,awesome,bad,best,better,come,cool,day,days,didnt,...,week,weekend,welcome,wish,wont,work,ya,yeah,yes,youre
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
test_word_counts = vectorizer.transform(X_test)
X_test_vectorized = pd.DataFrame(test_word_counts.toarray(),
                                columns=vectorizer.get_feature_names())

print(X_test_vectorized.shape)
X_test_vectorized.head()

(19998, 100)


Unnamed: 0,amp,awesome,bad,best,better,come,cool,day,days,didnt,...,week,weekend,welcome,wish,wont,work,ya,yeah,yes,youre
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(n_estimators=200).fit(X_train_vectorized, y_train)

rfc_train_predictions = RFC.predict(X_train_vectorized)
rfc_test_predictions = RFC.predict(X_test_vectorized)

# Using ROC_AUC Score (not Accuracy)
print(f'Train ROC_AUC Score: {roc_auc_score(y_train, rfc_train_predictions)}')
print(f'Test ROC_AUC Score: {roc_auc_score(y_test, rfc_test_predictions)}')

Train ROC_AUC Score: 0.7348323308546927
Test ROC_AUC Score: 0.6249485470432838


### Gradient Boosting Classifier

In [61]:
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(learning_rate=0.1, 
                           n_estimators=300, max_depth=3).fit(X_train_vectorized, y_train)

gbc_train_predictions = GBC.predict(X_train_vectorized)
gbc_test_predictions = GBC.predict(X_test_vectorized)

# Using ROC_AUC Score (not Accuracy)
print(f'Train ROC_AUC Score: {roc_auc_score(y_train, gbc_train_predictions)}')
print(f'Test ROC_AUC Score: {roc_auc_score(y_test, gbc_test_predictions)}')

Train ROC_AUC Score: 0.6319499651220892
Test ROC_AUC Score: 0.6265282158097948


### **Method 2:** TF-IDF Vectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tfidf = TfidfVectorizer(max_features=100, ngram_range=(1,1), stop_words='english')
vectorizer_tfidf.fit(X_train)

# Vocabulary created with TF-IDF Vectorizer
print(vectorizer.vocabulary_)

{'sleep': 65, 'haha': 24, 'love': 46, 'little': 42, 'dont': 11, 'know': 38, 'wanna': 86, 'hear': 28, 'like': 41, 'got': 21, 'ill': 35, 'week': 90, 'time': 77, 'long': 44, 'thing': 74, 'good': 20, 'night': 55, 'youre': 99, 'way': 89, 'thats': 73, 'today': 78, 'yes': 98, 'cool': 6, 'right': 61, 'lol': 43, 'new': 53, 'nice': 54, 'happy': 25, 'day': 7, 'im': 36, 'sad': 62, 'come': 5, 'yeah': 97, 'doesnt': 10, 'think': 75, 'really': 60, 'want': 87, 'bad': 2, 'miss': 50, 'wont': 94, 'help': 29, 'amp': 0, 'twitter': 83, 'gonna': 19, 'let': 39, 'oh': 56, 'didnt': 9, 'ok': 57, 'thanks': 72, 'said': 63, 'work': 95, 'tomorrow': 79, 'hope': 33, 'feel': 12, 'better': 4, 'soon': 66, 'ive': 37, 'hi': 31, 'make': 47, 'going': 18, 'follow': 13, 'getting': 15, 'need': 52, 'great': 22, 'ya': 96, 'morning': 51, 'people': 58, 'id': 34, 'wait': 85, 'thank': 71, 'welcome': 92, 'ur': 84, 'life': 40, 'fun': 14, 'sorry': 67, 'look': 45, 'guys': 23, 'girl': 16, 'wish': 93, 'tonight': 80, 'tell': 70, 'home': 32, 

In [20]:
train_word_counts_tfidf = vectorizer_tfidf.transform(X_train)
X_train_vectorized_tfidf = pd.DataFrame(train_word_counts_tfidf.toarray(),
                                       columns=vectorizer_tfidf.get_feature_names())

print(X_train_vectorized_tfidf.shape)
X_train_vectorized_tfidf.head()

(79991, 100)


Unnamed: 0,amp,awesome,bad,best,better,come,cool,day,days,didnt,...,week,weekend,welcome,wish,wont,work,ya,yeah,yes,youre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
test_word_counts_tfidf = vectorizer_tfidf.transform(X_test)
X_test_vectorized_tfidf = pd.DataFrame(test_word_counts_tfidf.toarray(),
                                      columns=vectorizer_tfidf.get_feature_names())

print(X_test_vectorized_tfidf.shape)
X_test_vectorized_tfidf.head()

(19998, 100)


Unnamed: 0,amp,awesome,bad,best,better,come,cool,day,days,didnt,...,week,weekend,welcome,wish,wont,work,ya,yeah,yes,youre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.66453,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Random Forest Classifier

In [27]:
RFC_tfidf = RandomForestClassifier(n_estimators=100).fit(X_train_vectorized_tfidf, y_train)
train_predictions_tfidf = RFC_tfidf.predict(X_train_vectorized_tfidf)
test_predictions_tfidf = RFC_tfidf.predict(X_test_vectorized_tfidf)

# Using ROC_AUC Score (not Accuracy)
print(f'Train ROC_AUC Score: {roc_auc_score(y_train, train_predictions_tfidf)}')
print(f'Test ROC_AUC Score: {roc_auc_score(y_test, test_predictions_tfidf)}')

Train ROC_AUC Score: 0.7353071990655697
Test ROC_AUC Score: 0.6291072139808982


### Gradient Boosting Classifier

In [62]:
from sklearn.ensemble import GradientBoostingClassifier
GBC_tfidf = GradientBoostingClassifier(learning_rate=0.1, 
                           n_estimators=300, max_depth=3).fit(X_train_vectorized, y_train)

gbc_train_predictions_tfidf = GBC_tfidf.predict(X_train_vectorized)
gbc_test_predictions_tfidf = GBC_tfidf.predict(X_test_vectorized)

# Using ROC_AUC Score (not Accuracy)
print(f'Train ROC_AUC Score: {roc_auc_score(y_train, gbc_train_predictions_tfidf)}')
print(f'Test ROC_AUC Score: {roc_auc_score(y_test, gbc_test_predictions_tfidf)}')

Train ROC_AUC Score: 0.6318801944829562
Test ROC_AUC Score: 0.6269408351960983


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [23]:
##### Your Code Here #####
from gensim.models import Word2Vec
w2v = Word2Vec(df.SentimentText, min_count=20, window=3, size=300, negative=20)

In [24]:
w2v.wv.most_similar(positive=["twitter"], topn=10)

[('account', 0.7636793851852417),
 ('facebook', 0.7506049871444702),
 ('page', 0.7476189732551575),
 ('via', 0.7369566559791565),
 ('list', 0.7338579893112183),
 ('web', 0.7260967493057251),
 ('updates', 0.7207168936729431),
 ('myspace', 0.7120668292045593),
 ('app', 0.7016830444335938),
 ('others', 0.6991567611694336)]