# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [2]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [3]:
 ##### Your Code Here #####
whitespace_string = whitespace_string.split()
whitespace_string = " ".join(whitespace_string)
print(whitespace_string)

This is a string that has a lot of extra whitespace.


### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [4]:
with open('dates.txt', 'r', encoding='utf-8') as f:
  contents = f.read()
  
contents

'March 8, 2015\nMarch 15, 2015\nMarch 22, 2015\nMarch 29, 2015\nApril 5, 2015\nApril 12, 2015\nApril 19, 2015\nApril 26, 2015\nMay 3, 2015\nMay 10, 2015\nMay 17, 2015\nMay 24, 2015\nMay 31, 2015\nJune 7, 2015\nJune 14, 2015\nJune 21, 2015\nJune 28, 2015\nJuly 5, 2015\nJuly 12, 2015\nJuly 19, 2015'

In [5]:
##### Your Code Here #####
import re

regex1 = r'[A-Z][a-z]+\s(\d+)\,\s\d+'

day = re.findall(regex1, contents)

regex2 = r'([A-Z][a-z]+)\s\d+\,\s\d+'

month = re.findall(regex2, contents)

regex3 = r'[A-Z][a-z]+\s\d+\,\s(\d+)'

year = re.findall(regex3, contents)
for x in year:
    print(x)

2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015
2015


In [6]:
import pandas as pd

df = pd.DataFrame({'day':day, 'month':month,'year':year})
df.head()

Unnamed: 0,day,month,year
0,8,March,2015
1,15,March,2015
2,22,March,2015
3,29,March,2015
4,5,April,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [7]:
##### Your Code Here #####
df = pd.read_csv('https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv')

df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [8]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize # Word Tokenizer


cleaned_listings = []

table = str.maketrans('','', string.punctuation)
stop_words = set(stopwords.words('english'))

for entry in df.SentimentText.tolist():
    # Tokenize
    tokens = word_tokenize(entry)
    # Lowercase
    lowercase_tokens = [w.lower() for w in tokens]
    # Strip punctuation
    no_punctuation = [x.translate(table) for x in lowercase_tokens]
    # Remove words that aren't alphabetic
    alphabetic = [word for word in no_punctuation if word.isalpha()]
    # Remove stopwords
    words = [w for w in alphabetic if not w in stop_words]
    cleaned_listings.append(words)
    
cleaned_listings

[['sad', 'apl', 'friend'],
 ['missed', 'new', 'moon', 'trailer'],
 ['omg', 'already'],
 ['omgaga',
  'im',
  'sooo',
  'im',
  'gunna',
  'cry',
  'dentist',
  'since',
  'suposed',
  'get',
  'crown',
  'put'],
 ['think', 'mi', 'bf', 'cheating', 'tt'],
 ['worry', 'much'],
 ['juuuuuuuuuuuuuuuuussssst', 'chillin'],
 ['sunny', 'work', 'tomorrow', 'tv', 'tonight'],
 ['handed', 'uniform', 'today', 'miss', 'already'],
 ['hmmmm', 'wonder', 'number'],
 ['must', 'think', 'positive'],
 ['thanks', 'haters', 'face', 'day'],
 ['weekend', 'sucked', 'far'],
 ['jb', 'isnt', 'showing', 'australia'],
 ['ok', 'thats', 'win'],
 ['lt', 'way', 'feel', 'right'],
 ['awhhe', 'man', 'completely', 'useless', 'rt', 'funny', 'twitter', 'http'],
 ['feeling',
  'strangely',
  'fine',
  'gon',
  'na',
  'go',
  'listen',
  'semisonic',
  'celebrate'],
 ['huge', 'roll', 'thunder', 'scary'],
 ['cut',
  'beard',
  'growing',
  'well',
  'year',
  'gon',
  'na',
  'start',
  'shaunamanu',
  'happy',
  'meantime'],
 ['sa

In [9]:
import itertools

single_cleaned_listing = list(itertools.chain.from_iterable(cleaned_listings))

single_cleaned_listing

['sad',
 'apl',
 'friend',
 'missed',
 'new',
 'moon',
 'trailer',
 'omg',
 'already',
 'omgaga',
 'im',
 'sooo',
 'im',
 'gunna',
 'cry',
 'dentist',
 'since',
 'suposed',
 'get',
 'crown',
 'put',
 'think',
 'mi',
 'bf',
 'cheating',
 'tt',
 'worry',
 'much',
 'juuuuuuuuuuuuuuuuussssst',
 'chillin',
 'sunny',
 'work',
 'tomorrow',
 'tv',
 'tonight',
 'handed',
 'uniform',
 'today',
 'miss',
 'already',
 'hmmmm',
 'wonder',
 'number',
 'must',
 'think',
 'positive',
 'thanks',
 'haters',
 'face',
 'day',
 'weekend',
 'sucked',
 'far',
 'jb',
 'isnt',
 'showing',
 'australia',
 'ok',
 'thats',
 'win',
 'lt',
 'way',
 'feel',
 'right',
 'awhhe',
 'man',
 'completely',
 'useless',
 'rt',
 'funny',
 'twitter',
 'http',
 'feeling',
 'strangely',
 'fine',
 'gon',
 'na',
 'go',
 'listen',
 'semisonic',
 'celebrate',
 'huge',
 'roll',
 'thunder',
 'scary',
 'cut',
 'beard',
 'growing',
 'well',
 'year',
 'gon',
 'na',
 'start',
 'shaunamanu',
 'happy',
 'meantime',
 'sad',
 'iran',
 'wompppp'

### How should TF-IDF scores be interpreted? How are they calculated?

#### Your Answer Here #####
Scores indicate the term frequencies of UNIQUE words to each document. They are calculated by multiplying the term frequency and the inverse document frequency for each term in a specific document.

# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [10]:
##### Your Code Here #####
for_vector_count = []

for listing in cleaned_listings:
    new_listing = " ".join(listing)
    for_vector_count.append(new_listing)

df = df.assign(cleaned=cleaned_listings)
df = df.assign(Cleaned = for_vector_count)
df.head()


Unnamed: 0,Sentiment,SentimentText,cleaned,Cleaned
0,0,is so sad for my APL frie...,"[sad, apl, friend]",sad apl friend
1,0,I missed the New Moon trail...,"[missed, new, moon, trailer]",missed new moon trailer
2,1,omg its already 7:30 :O,"[omg, already]",omg already
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,"[omgaga, im, sooo, im, gunna, cry, dentist, si...",omgaga im sooo im gunna cry dentist since supo...
4,0,i think mi bf is cheating on me!!! ...,"[think, mi, bf, cheating, tt]",think mi bf cheating tt


In [11]:
from sklearn.model_selection import train_test_split

X = df['Cleaned']
y=df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10000, ngram_range=(1,1), stop_words='english')

vectorizer.fit(X_train)

print(vectorizer.vocabulary_)



In [13]:
train_word_counts = vectorizer.transform(X_train)

X_train_vectorized = pd.DataFrame(train_word_counts.toarray(), columns = vectorizer.get_feature_names())

print(X_train_vectorized.shape)
X_train_vectorized.head()

(79991, 10000)


Unnamed: 0,aa,aaa,aaah,aaahh,aafreen,aah,aahhh,aalaap,aaliyon,aamwilliams,...,zip,zombie,zombies,zone,zones,zoo,zoom,zune,zurich,ðµ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
test_word_counts = vectorizer.transform(X_test)

X_test_vectorized = pd.DataFrame(test_word_counts.toarray(), columns = vectorizer.get_feature_names())

print(X_test_vectorized.shape)
X_test_vectorized.head()

(19998, 10000)


Unnamed: 0,aa,aaa,aaah,aaahh,aafreen,aah,aahhh,aalaap,aaliyon,aamwilliams,...,zip,zombie,zombies,zone,zones,zoo,zoom,zune,zurich,ðµ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

LR = LogisticRegression(random_state=42).fit(X_train_vectorized, y_train)

train_predictions = LR.predict(X_train_vectorized)
test_predictions = LR.predict(X_test_vectorized)



In [17]:
print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')

Train Accuracy: 0.8006650748209173
Test Accuracy: 0.7511751175117511


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [18]:
##### Your Code Here #####
from gensim.models import Word2Vec
w2v = Word2Vec(df.cleaned, min_count=20, window=3, size=300, negative=20)

In [19]:
w2v.wv.most_similar('twitter', topn=10)

[('updates', 0.7682567834854126),
 ('email', 0.7594246864318848),
 ('list', 0.7587671279907227),
 ('account', 0.7582287192344666),
 ('myspace', 0.7524596452713013),
 ('address', 0.7512942552566528),
 ('info', 0.744848906993866),
 ('page', 0.7412559390068054),
 ('facebook', 0.7402005791664124),
 ('message', 0.7371288537979126)]