# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [1]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [2]:
better_string = ' '.join(whitespace_string.split())
better_string

'This is a string that has a lot of extra whitespace.'

### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [3]:
import re
regex = r"([a-zA-Z]+) (\d+)"
dates = '''March 8, 2015
March 15, 2015
March 22, 2015
March 29, 2015
April 5, 2015
April 12, 2015
April 19, 2015
April 26, 2015
May 3, 2015
May 10, 2015
May 17, 2015
May 24, 2015
May 31, 2015
June 7, 2015
June 14, 2015
June 21, 2015
June 28, 2015
July 5, 2015
July 12, 2015
July 19, 2015'''
search_result = re.findall(regex, dates)
print(search_result)
for i in range(len(search_result)):
    search_result[i] += (2015,)
print(search_result)

[('March', '8'), ('March', '15'), ('March', '22'), ('March', '29'), ('April', '5'), ('April', '12'), ('April', '19'), ('April', '26'), ('May', '3'), ('May', '10'), ('May', '17'), ('May', '24'), ('May', '31'), ('June', '7'), ('June', '14'), ('June', '21'), ('June', '28'), ('July', '5'), ('July', '12'), ('July', '19')]
[('March', '8', 2015), ('March', '15', 2015), ('March', '22', 2015), ('March', '29', 2015), ('April', '5', 2015), ('April', '12', 2015), ('April', '19', 2015), ('April', '26', 2015), ('May', '3', 2015), ('May', '10', 2015), ('May', '17', 2015), ('May', '24', 2015), ('May', '31', 2015), ('June', '7', 2015), ('June', '14', 2015), ('June', '21', 2015), ('June', '28', 2015), ('July', '5', 2015), ('July', '12', 2015), ('July', '19', 2015)]


In [4]:
import pandas as pd
df = pd.DataFrame(search_result, columns=['Month', 'Day', 'Year'])
df.head()

Unnamed: 0,Month,Day,Year
0,March,8,2015
1,March,15,2015
2,March,22,2015
3,March,29,2015
4,April,5,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv')
df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [6]:
pd.set_option('display.max_colwidth', 200)
df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL friend.............
1,0,I missed the New Moon trailer...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
4,0,i think mi bf is cheating on me!!! T_T


In [7]:
df.isna().sum()

Sentiment        0
SentimentText    0
dtype: int64

In [8]:
df.Sentiment.value_counts(normalize=True)

1    0.564632
0    0.435368
Name: Sentiment, dtype: float64

In [9]:
df.SentimentText = df.SentimentText.str.lower()
df.tail()

Unnamed: 0,Sentiment,SentimentText
99984,0,@cupcake seems like a repeating problem hope you're able to find something.
99985,1,"@cupcake__ arrrr we both replied to each other over different tweets at the same time , i'll see you then, duno where the hell kateyy is!"
99986,0,@cupcake_2120 ya i thought so
99987,1,@cupcake_dollie yes. yes. i'm glad you had more fun with me.
99988,1,@cupcake_kayla haha yes you do


In [10]:
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist
import string

table = str.maketrans('','', string.punctuation)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def nltk_tokenize(input):
    # Tokenize by word
    tokens = word_tokenize(input)
    #print("Tokens:", tokens)
    # Strip punctuation from within words
    no_punctuation = [x.translate(table) for x in tokens]
    #print("No Punctuation:", no_punctuation)
    # Remove words that aren't alphabetic
    alphabetic = [word for word in no_punctuation if word.isalpha()]
    #print("Alphabetic:", alphabetic)
    # Remove stopwords
    words = [w for w in alphabetic if not w in stop_words]
    #print("Cleaned Words:", words)
    #print("--------------------------------")
    # lemmatize!
    lemmas = [lemmatizer.lemmatize(w) for w in words]
    # Append to list
    return ' '.join(words)

In [11]:
cleaned = []
for x in df.SentimentText:
    x = nltk_tokenize(x)
    cleaned.append(x)
df['cleaned'] = cleaned
df.head()

Unnamed: 0,Sentiment,SentimentText,cleaned
0,0,is so sad for my apl friend.............,sad apl friend
1,0,i missed the new moon trailer...,missed new moon trailer
2,1,omg its already 7:30 :o,omg already
3,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get a crown put on (30mins)...,omgaga im sooo im gunna cry dentist since suposed get crown put
4,0,i think mi bf is cheating on me!!! t_t,think mi bf cheating tt


### How should TF-IDF scores be interpreted? How are they calculated?

Term frequency refers to the percentage of words in a document for each word.

Inverse document frequency refers to the penalty for the word existing in a high number of documents.

The purpose of TF-IDF is to find what words are unique or important to each document. Because of this we will penalize the term frequencies of words that are common across all documents, which will allow for each document's most different topics to rise to the top. The TF-IDF score increases proportionally to the number of times the word appears in a document and decreases inversely with the number of documents in a collection, aka corpus, of documents that the word appears in.

The TF-IDF score for term i in document j = TF(i, j) * IDF(i)
TF = (Term i frequency in document j) / (Total words in document j)
IDF = log base 2 * (Total documents)/(Documents with term i)

# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [12]:
from sklearn.model_selection import train_test_split
X = df.cleaned
y = df.Sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(79991,)
(19998,)
(79991,)
(19998,)


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=50, ngram_range=(1,1), stop_words='english')
vectorizer.fit(X_train)
print(vectorizer.vocabulary_)

{'ca': 3, 'nt': 31, 'quot': 33, 'haha': 12, 'love': 22, 'got': 10, 'know': 19, 'lt': 23, 'na': 26, 'like': 20, 'http': 17, 'time': 41, 'good': 9, 'night': 30, 'way': 45, 'today': 42, 'yes': 49, 'day': 5, 'right': 35, 'lol': 21, 'new': 28, 'nice': 29, 'happy': 13, 'sad': 36, 'come': 4, 'yeah': 48, 'think': 40, 'im': 18, 'really': 34, 'want': 44, 'bad': 1, 'miss': 25, 'amp': 0, 'twitter': 43, 'oh': 32, 'thanks': 39, 'work': 47, 'hope': 16, 'feel': 6, 'better': 2, 'make': 24, 'going': 8, 'need': 27, 'great': 11, 'thank': 38, 'fun': 7, 'sorry': 37, 'wish': 46, 'home': 15, 'hey': 14}


In [15]:
pd.options.display.max_columns = 50
train_word_counts = vectorizer.transform(X_train)

X_train_vectorized = pd.DataFrame(train_word_counts.toarray(), columns=vectorizer.get_feature_names())

print(X_train_vectorized.shape)
X_train_vectorized.head()

(79991, 50)


Unnamed: 0,amp,bad,better,ca,come,day,feel,fun,going,good,got,great,haha,happy,hey,home,hope,http,im,know,like,lol,love,lt,make,miss,na,need,new,nice,night,nt,oh,quot,really,right,sad,sorry,thank,thanks,think,time,today,twitter,want,way,wish,work,yeah,yes
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
test_word_counts = vectorizer.transform(X_test)

X_test_vectorized = pd.DataFrame(test_word_counts.toarray(), columns=vectorizer.get_feature_names())

print(X_test_vectorized.shape)
X_test_vectorized.head()

(19998, 50)


Unnamed: 0,amp,bad,better,ca,come,day,feel,fun,going,good,got,great,haha,happy,hey,home,hope,http,im,know,like,lol,love,lt,make,miss,na,need,new,nice,night,nt,oh,quot,really,right,sad,sorry,thank,thanks,think,time,today,twitter,want,way,wish,work,yeah,yes
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(random_state=42).fit(X_train_vectorized, y_train)

train_predictions = LR.predict(X_train_vectorized)
test_predictions = LR.predict(X_test_vectorized)



In [18]:
from sklearn.metrics import accuracy_score

print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')

Train Accuracy: 0.6563238364315985
Test Accuracy: 0.6552655265526552


In [19]:
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB().fit(X_train_vectorized, y_train)

train_predictions = MNB.predict(X_train_vectorized)
test_predictions = MNB.predict(X_test_vectorized)

print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')

Train Accuracy: 0.6557862759560451
Test Accuracy: 0.6543154315431543


In [20]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(n_estimators=100).fit(X_train_vectorized, y_train)

train_predictions = RFC.predict(X_train_vectorized)
test_predictions = RFC.predict(X_test_vectorized)

print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')

Train Accuracy: 0.7023790176394844
Test Accuracy: 0.6464146414641464


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [21]:
# turn a doc into clean tokens
def listify(doc):
    # split into tokens by white space
    tokens = doc.split()
    return tokens

df['list'] = df.cleaned.apply(listify)
print(df.shape)
df.head()

(99989, 4)


Unnamed: 0,Sentiment,SentimentText,cleaned,list
0,0,is so sad for my apl friend.............,sad apl friend,"[sad, apl, friend]"
1,0,i missed the new moon trailer...,missed new moon trailer,"[missed, new, moon, trailer]"
2,1,omg its already 7:30 :o,omg already,"[omg, already]"
3,0,.. omgaga. im sooo im gunna cry. i've been at this dentist since 11.. i was suposed 2 just get a crown put on (30mins)...,omgaga im sooo im gunna cry dentist since suposed get crown put,"[omgaga, im, sooo, im, gunna, cry, dentist, since, suposed, get, crown, put]"
4,0,i think mi bf is cheating on me!!! t_t,think mi bf cheating tt,"[think, mi, bf, cheating, tt]"


In [22]:
from gensim.models.word2vec import Word2Vec
w2v = Word2Vec(df.list, min_count=20, window=3, size=300, negative=20)



In [23]:
words = list(w2v.wv.vocab)
print(f'Vocabulary Size: {len(words)}')

Vocabulary Size: 3592


In [24]:
w2v.wv.most_similar('twitter', topn=10)

[('updates', 0.7911983132362366),
 ('account', 0.7756571173667908),
 ('facebook', 0.7693488597869873),
 ('page', 0.7638937830924988),
 ('email', 0.7558614611625671),
 ('dm', 0.7541946768760681),
 ('address', 0.7459713220596313),
 ('myspace', 0.7456074953079224),
 ('list', 0.740282416343689),
 ('link', 0.7369334101676941)]

In [25]:
w2v.wv.doesnt_match(['twitter', 'facebook', 'address'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'twitter'

In [26]:
w2v.wv.doesnt_match(['twitter', 'facebook', 'myspace'])

'twitter'

In [27]:
w2v.wv.most_similar(positive=["twitter"], topn=10)

[('updates', 0.7911983132362366),
 ('account', 0.7756571173667908),
 ('facebook', 0.7693488597869873),
 ('page', 0.7638937830924988),
 ('email', 0.7558614611625671),
 ('dm', 0.7541946768760681),
 ('address', 0.7459713220596313),
 ('myspace', 0.7456074953079224),
 ('list', 0.740282416343689),
 ('link', 0.7369334101676941)]