# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [17]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "
print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [5]:
#Eliminate all new lines and anything more than a single space
less_whitespace = " ".join(whitespace_string.split())
print(less_whitespace)

This is a string that has a lot of extra whitespace.


### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [4]:
#Get text
import requests
url = 'https://raw.githubusercontent.com/ryanleeallred/datasets/master/dates.txt'
contents= requests.get(url)
contents.text

'March 8, 2015\r\nMarch 15, 2015\r\nMarch 22, 2015\r\nMarch 29, 2015\r\nApril 5, 2015\r\nApril 12, 2015\r\nApril 19, 2015\r\nApril 26, 2015\r\nMay 3, 2015\r\nMay 10, 2015\r\nMay 17, 2015\r\nMay 24, 2015\r\nMay 31, 2015\r\nJune 7, 2015\r\nJune 14, 2015\r\nJune 21, 2015\r\nJune 28, 2015\r\nJuly 5, 2015\r\nJuly 12, 2015\r\nJuly 19, 2015'

In [19]:
import pandas as pd
#Create list of dates only
lines = contents.text.split('\r\n')
#Separate everything before first whitespace for month
#Day and year will automatically separate because of comma
month = []
for x in range(0,len(lines)):
    month.append(lines[x].split(' '))
#Name columns and convert to dataframe
cols = ['Month', 'Day', 'Year']
df = pd.DataFrame(month, columns = cols)
#Get rid of comma in Day column
drop_comma = df['Day'].replace( '[,]','', regex=True )
df['Day'] = drop_comma
df

Unnamed: 0,Month,Day,Year
0,March,8,2015
1,March,15,2015
2,March,22,2015
3,March,29,2015
4,April,5,2015
5,April,12,2015
6,April,19,2015
7,April,26,2015
8,May,3,2015
9,May,10,2015


# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [112]:
url = "https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv"
tweetdf = pd.read_csv(url)
tweettxt = contents = requests.get(url).text
print(f'Shape:\n{tweetdf.shape}')
print(f'Check NaN values:\n{tweetdf.isnull().sum()}')
print(f'datatypes:\n{tweetdf.dtypes}')
tweetdf.head()

Shape:
(99989, 2)
Check NaN values:
Sentiment        0
SentimentText    0
dtype: int64
datatypes:
Sentiment         int64
SentimentText    object
dtype: object


Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [107]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

[nltk_data] Downloading package punkt to /home/rick1270/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rick1270/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [118]:
table = str.maketrans('','', string.punctuation)
stop_words = set(stopwords.words('english'))


def nltk_tokenize(input):
  # Tokenize by word
  tokens = word_tokenize(input)
  # Make all words lowercase
  lowercase_tokens = [w.lower() for w in tokens]
  # Strip punctuation from within words
  no_punctuation = [x.translate(table) for x in lowercase_tokens]
  # Remove stopwords
  no_stopwords = [w for w in no_punctuation if not w in stop_words]
  # Remove empty strings
  words = [w for w in no_stopwords if len(w) > 0]
  return words
nltk_tokenize(tweettxt)

['sentiment',
 'sentimenttext',
 '0',
 'sad',
 'apl',
 'friend',
 '0',
 'missed',
 'new',
 'moon',
 'trailer',
 '1',
 'omg',
 'already',
 '730',
 '0',
 'omgaga',
 'im',
 'sooo',
 'im',
 'gunna',
 'cry',
 'dentist',
 'since',
 '11',
 'suposed',
 '2',
 'get',
 'crown',
 'put',
 '30mins',
 '0',
 'think',
 'mi',
 'bf',
 'cheating',
 'tt',
 '0',
 'worry',
 'much',
 '1',
 'juuuuuuuuuuuuuuuuussssst',
 'chillin',
 '0',
 'sunny',
 'work',
 'tomorrow',
 'tv',
 'tonight',
 '1',
 'handed',
 'uniform',
 'today',
 'miss',
 'already',
 '1',
 'hmmmm',
 'wonder',
 'number',
 '0',
 'must',
 'think',
 'positive',
 '1',
 'thanks',
 'haters',
 'face',
 'day',
 '112102',
 '0',
 'weekend',
 'sucked',
 'far',
 '0',
 'jb',
 'isnt',
 'showing',
 'australia',
 '0',
 'ok',
 'thats',
 'win',
 '0',
 'lt',
 'way',
 'feel',
 'right',
 '0',
 'awhhe',
 'man',
 'completely',
 'useless',
 'rt',
 'funny',
 'twitter',
 'http',
 'mylocme27hx',
 '1',
 'feeling',
 'strangely',
 'fine',
 'gon',
 'na',
 'go',
 'listen',
 'semis

In [119]:
# nltk_tokenize sentimentText
tweetdf['SentimentText_Tokenized'] = tweetdf['SentimentText'].apply(nltk_tokenize)
tweetdf.head()

Unnamed: 0,Sentiment,SentimentText,SentimentText_Tokenized
0,0,is so sad for my APL frie...,"[sad, apl, friend]"
1,0,I missed the New Moon trail...,"[missed, new, moon, trailer]"
2,1,omg its already 7:30 :O,"[omg, already, 730]"
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,"[omgaga, im, sooo, im, gunna, cry, dentist, si..."
4,0,i think mi bf is cheating on me!!! ...,"[think, mi, bf, cheating, tt]"


### How should TF-IDF scores be interpreted? How are they calculated?

Put simply, the higher the TF*IDF score (weight), the rarer the term and vice versa.

Typically, the tf-idf weight is composed by two terms: the first computes the normalized Term Frequency (TF), aka. the number of times a word appears in a document, divided by the total number of words in that document; the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

TF: Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization: 

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following: 

IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [105]:
df = pd.read_csv(url)
df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [120]:
from sklearn.model_selection import train_test_split

X = tweetdf.SentimentText
y = tweetdf.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(79991,)
(19998,)
(79991,)
(19998,)


In [121]:
# Count Vectorizer method
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=300, ngram_range=(1,1), stop_words='english')
vectorizer.fit(X_train)

print(len(vectorizer.vocabulary_))

300


In [122]:
train_word_counts = vectorizer.transform(X_train)

X_train_vectorized = pd.DataFrame(train_word_counts.toarray(), columns=vectorizer.get_feature_names())

print(X_train_vectorized.shape)
X_train_vectorized.head()

(79991, 300)


Unnamed: 0,10,able,actually,add,ago,agree,ah,amazing,amp,anymore,...,xx,ya,yay,yea,yeah,year,years,yep,yes,yesterday
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
test_word_counts = vectorizer.transform(X_test)

X_test_vectorized = pd.DataFrame(test_word_counts.toarray(), columns=vectorizer.get_feature_names())

print(X_test_vectorized.shape)
X_test_vectorized.head()

(19998, 300)


Unnamed: 0,10,able,actually,add,ago,agree,ah,amazing,amp,anymore,...,xx,ya,yay,yea,yeah,year,years,yep,yes,yesterday
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

LR = LogisticRegression(solver='lbfgs', random_state=42).fit(X_train_vectorized, y_train)

train_predictions = LR.predict(X_train_vectorized)
test_predictions = LR.predict(X_test_vectorized)

print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')

Train Accuracy: 0.701391406533235
Test Accuracy: 0.6985698569856986


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [127]:
import gensim
from gensim.models import Word2Vec

In [129]:
w2v = Word2Vec(tweetdf.SentimentText_Tokenized, min_count=20, window=3, 
               size=300, negative=20)
words = list(w2v.wv.vocab)
len(words)

3730

In [130]:
w2v.wv.most_similar('twitter', topn=10)

[('account', 0.7438366413116455),
 ('email', 0.7323489189147949),
 ('updates', 0.7306891679763794),
 ('info', 0.7264102697372437),
 ('facebook', 0.7244384288787842),
 ('page', 0.7240075469017029),
 ('fb', 0.7183884382247925),
 ('message', 0.715042233467102),
 ('myspace', 0.7150341272354126),
 ('dm', 0.7144572734832764)]