# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [3]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [4]:
def remove_wspace(s):
    if '\n' in s or '   ' in s or '. ' in s:
        s = ''.join(s.split('\n'))
        s = ' '.join(s.split('  '))
        s = '.'.join(s.split('.  '))
        remove_wspace(s)
    return s

In [5]:
remove_wspace(whitespace_string)

' This is a  string  that has  a lot of extra  whitespace.'

### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [6]:
import pandas as pd
import re

url = 'https://github.com/ryanleeallred/datasets/blob/master/dates.txt'
dates_df = pd.read_csv(url, sep='delimiter', names=['date'])

  """


In [7]:
pd.set_option('display.max_colwidth', 500)

In [8]:
dates_df.head()

Unnamed: 0,date
0,<!DOCTYPE html>
1,"<html lang=""en"">"
2,<head>
3,"<meta charset=""utf-8"">"
4,"<link rel=""dns-prefetch"" href=""https://github.githubassets.com"">"


In [9]:
from bs4 import BeautifulSoup


def clean_html_with_bs4(string):
    soup = BeautifulSoup(string)
    string = soup.get_text()
    return string


dates_df.date = dates_df.date.apply(clean_html_with_bs4)

In [10]:
import numpy as np

dates_df = dates_df.replace('', np.nan)
dates_df = dates_df.dropna()
# dates_df.tail(50)  # dates are in certain lines from 481 to 576, inclusive

In [25]:
# Do regex search and return from a function
import re


def find_and_format_dates(df, col):
    output_list = []
    df_string = str(df[col].values)
    regex = r"[A-Z][a-z]+\s\d{1,2}"
    search_result = re.findall(regex, df_string)
    for string in search_result:
        string = string.split(' ')
        output_list.append(tuple([string[0], string[1], '2015']))
    # print(len(output_list), '\n', output_list)
    return output_list

In [21]:
find_and_format_dates(dates_df, 'date')

21 
 [('Mar', '29', '2015'), ('March', '8', '2015'), ('March', '15', '2015'), ('March', '22', '2015'), ('March', '29', '2015'), ('April', '5', '2015'), ('April', '12', '2015'), ('April', '19', '2015'), ('April', '26', '2015'), ('May', '3', '2015'), ('May', '10', '2015'), ('May', '17', '2015'), ('May', '24', '2015'), ('May', '31', '2015'), ('June', '7', '2015'), ('June', '14', '2015'), ('June', '21', '2015'), ('June', '28', '2015'), ('July', '5', '2015'), ('July', '12', '2015'), ('July', '19', '2015')]


In [26]:
data = find_and_format_dates(dates_df, 'date')
output_dates_df = pd.DataFrame(data, columns=['Day', 'Month', 'Year'])

In [28]:
output_dates_df.head(25)

Unnamed: 0,Day,Month,Year
0,Mar,29,2015
1,March,8,2015
2,March,15,2015
3,March,22,2015
4,March,29,2015
5,April,5,2015
6,April,12,2015
7,April,19,2015
8,April,26,2015
9,May,3,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [29]:
# Load data

url = "https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv"

tweet_sentiment_df = pd.read_csv(url, encoding="ISO-8859-1")
print(tweet_sentiment_df.shape)
tweet_sentiment_df.head()

(99989, 2)


Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL friend.............
1,0,I missed the New Moon trailer...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
4,0,i think mi bf is cheating on me!!! T_T


In [30]:
# Make copy of tweet_sentiment_df

df = tweet_sentiment_df.copy()

In [32]:
# Imports
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize  # Sentence Tokenizer
from nltk.tokenize import word_tokenize  # Word Tokenizer

In [40]:
# Make list for SentimentText column
punct_list = ['.', '!', '?', ':', '\'', '-', '_', '|', '&', ',', ';', '(', ')', '/']

text_list = []
for text in df.SentimentText.values:
    text = str(text)
    text = text.lower()
    for punctuation in punct_list:
        text = ''.join(text.split(punctuation))
    text_list.append(text)

In [41]:
# Inspect text_list

text_list[:20]

['                     is so sad for my apl friend',
 '                   i missed the new moon trailer',
 '              omg its already 730 o',
 '           omgaga im sooo  im gunna cry ive been at this dentist since 11 i was suposed 2 just get a crown put on 30mins',
 '         i think mi bf is cheating on me       tt',
 '         or i just worry too much        ',
 '       juuuuuuuuuuuuuuuuussssst chillin',
 '       sunny again        work tomorrow         tv tonight',
 '      handed in my uniform today  i miss you already',
 '      hmmmm i wonder how she my number @',
 '      i must think about positive',
 '      thanks to all the haters up in my face all day 112102',
 '      this weekend has sucked so far',
 '     jb isnt showing in australia any more',
 '     ok thats it you win',
 '    lt this is the way i feel right now',
 '    awhhe man im completely useless rt now funny all i can do is twitter httpmylocme27hx',
 '    feeling strangely fine now im gonna go listen to some semi

In [43]:
# Remove stopwords from text_list
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jhump\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
# Use stopwords from previous coursework

stopwords = ['i','me','my','myself','we', 'our','ours','ourselves',
'you','your','yours','yourself','yourselves','he','him','his','himself',
'she','her','hers','herself','it','its','itself','they','them','their',
'theirs','themselves','what','which','who','whom','this','that','these',
'those','am','is','are','was','were','be','been','being','have','has',
'had','having','do','does','did','doing','a','an','the','and','but',
'if','or','because','as','until','while','of','at','by','for','with',
'about','against','between','into','through','during','before','after',
'above','below','to','from','up','down','in','out','on','off','over',
'under','again','further','then','once','here','there','when','where',
'why','how','all','any','both','each','few','more','most','other','some',
'such','no','nor','not','only','own','same','so','than','too','very',
's','t','can','will','just','don','should','now']

In [46]:
# Strip stopwords from text_list

for text in text_list:
    for word in stopwords:
        if word in text:
            text = ''.join(text.split(word))

In [47]:
# Inspect text_list again

text_list[:10]

['                     is so sad for my apl friend',
 '                   i missed the new moon trailer',
 '              omg its already 730 o',
 '           omgaga im sooo  im gunna cry ive been at this dentist since 11 i was suposed 2 just get a crown put on 30mins',
 '         i think mi bf is cheating on me       tt',
 '         or i just worry too much        ',
 '       juuuuuuuuuuuuuuuuussssst chillin',
 '       sunny again        work tomorrow         tv tonight',
 '      handed in my uniform today  i miss you already',
 '      hmmmm i wonder how she my number @']

In [49]:
# Tokenize text_list to sent level

sentence_tokens = sent_tokenize(str(text_list))

In [50]:
# Tokenize to word level

sent_count = 0
tokens_list = []
for sent in sentence_tokens:
    sent_count += 1
    tokens = word_tokenize(sent)
    tokens_list.append(tokens)
print('\nThe sentence count is:', sent_count)


The sentence count is: 1


In [51]:
# Inspect text_list

text_list[:20]

['                     is so sad for my apl friend',
 '                   i missed the new moon trailer',
 '              omg its already 730 o',
 '           omgaga im sooo  im gunna cry ive been at this dentist since 11 i was suposed 2 just get a crown put on 30mins',
 '         i think mi bf is cheating on me       tt',
 '         or i just worry too much        ',
 '       juuuuuuuuuuuuuuuuussssst chillin',
 '       sunny again        work tomorrow         tv tonight',
 '      handed in my uniform today  i miss you already',
 '      hmmmm i wonder how she my number @',
 '      i must think about positive',
 '      thanks to all the haters up in my face all day 112102',
 '      this weekend has sucked so far',
 '     jb isnt showing in australia any more',
 '     ok thats it you win',
 '    lt this is the way i feel right now',
 '    awhhe man im completely useless rt now funny all i can do is twitter httpmylocme27hx',
 '    feeling strangely fine now im gonna go listen to some semi

### How should TF-IDF scores be interpreted? How are they calculated?

#### TF-IDF scores reflect frequency of terms, and are calculated from log-based weighting of document and query terms.

# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Train-test split model validation

X = df.SentimentText
y = df.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
# Check shape of 4 pandas Series objects

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((79991,), (19998,), (79991,), (19998,))

In [55]:
X_train.isna().sum(), X_test.isna().sum()

(0, 0)

In [56]:
vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 2), stop_words='english')
vectorizer.fit(X_train)
print(vectorizer.vocabulary_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [57]:
# Complete vectorization by transforming X_train and X_test

train_word_counts = vectorizer.transform(X_train)
test_word_counts = vectorizer.transform(X_test)

In [None]:
# Fit random forest classifier and get accuracy score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

RFC = RandomForestClassifier(n_estimators=100).fit(train_word_counts, y_train)

train_predictions = RFC.predict(train_word_counts)
test_predictions = RFC.predict(test_word_counts)

print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')

# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [None]:
# Imports
# import gensim
from gensim.models.word2vec import Word2Vec

In [None]:
# Fit model

model = Word2Vec(sentences, min_count=1, size=5)

In [None]:
model.wv.most_similar('twitter')