# TRANSFORMING AND CLEANING THE DATASET
This notebook shows the cleaning and transformation of the dataset and testing our Machine Learning model. In order for our model to be able to give the highest accuracy at predicting the sentiment analysis, tokenization and removing stop words are crucial to the cleaning aspect of the dataset. 

In [9]:
#Import all dependencies
import pandas as pd
import re
import io 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import string 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from functools import reduce


In [10]:
stop_words = set(stopwords.words("english"))
punctuation = string.punctuation

#Create a function to clean dataset, tokenize words and remove stopwords and punctuation
def tokenize_words(text, stopwords, punctuation):
    text = text.lower() 
    text = text.replace("<br />", " ")
    text = re.sub(r"[^a-z ]", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = word_tokenize(text)
    filtered = []
    for w in tokens:
        if w not in stopwords and w not in punctuation:
            filtered.append(w)
    text = reduce((lambda x,y: x + " " + y), filtered)
    return text

In [11]:
#word_tokenize accepts a string as an input, not a file. 
stop_words = set(stopwords.words('english')) 
stop_words = [re.sub(r"[^a-z ]", "", w) for w in stop_words]

#Read in .txt file
test_neg_path = "../train_neg.txt" 
test_pos_path = "../train_pos.txt" #need to change this

test_neg_df = pd.read_table(test_neg_path, sep="\n", header=None, names=['Reviews'])
test_pos_df = pd.read_table(test_pos_path, sep="\n", header=None, names=['Reviews'])

#Tokenize words, removing stop words, removing punctuation and creating the dataframe
test_neg_df['Reviews (Cleaned)'] = test_neg_df['Reviews'].apply(tokenize_words, args=(stop_words, punctuation))
test_pos_df['Reviews (Cleaned)'] = test_pos_df['Reviews'].apply(tokenize_words, args=(stop_words, punctuation))

In [4]:
test_neg_df

Unnamed: 0,Reviews,Reviews (Cleaned)
0,Working with one of the best Shakespeare sourc...,working one best shakespeare sources film mana...
1,"Well...tremors I, the original started off in ...",well tremors original started found movie quit...
2,Ouch! This one was a bit painful to sit throug...,ouch one bit painful sit cute amusing premise ...
3,"I've seen some crappy movies in my life, but t...",seen crappy movies life one must among worst d...
4,Carriers follows the exploits of two guys and ...,carriers follows exploits two guys two gals st...
...,...,...
12495,"My comments may be a bit of a spoiler, for wha...",comments may bit spoiler worth stop care enoug...
12496,"The ""saucy"" misadventures of four au pairs who...",saucy misadventures four au pairs arrive londo...
12497,"Oh, those Italians! Assuming that movies about...",oh italians assuming movies aristocrats weird ...
12498,Eight academy nominations? It's beyond belief....,eight academy nominations beyond belief think ...


In [5]:
#Creating a for loop to find the word frequency for tokenized words for visualization purposes
neg_freq = {}
for sentence in test_neg_df['Reviews (Cleaned)']:
    tokens = word_tokenize(sentence)
    for token in tokens:
        if token not in neg_freq.keys():
            neg_freq[token] = 1
        else:
            neg_freq[token] += 1
            
neg_freq

{'working': 331,
 'one': 13138,
 'best': 2096,
 'shakespeare': 112,
 'sources': 35,
 'film': 19221,
 'manages': 246,
 'creditable': 5,
 'source': 97,
 'whilst': 138,
 'still': 2283,
 'appealing': 96,
 'wider': 13,
 'audience': 1178,
 'branagh': 67,
 'steals': 66,
 'fishburne': 16,
 'nose': 98,
 'talented': 292,
 'cast': 1696,
 'good': 7423,
 'form': 313,
 'well': 4257,
 'tremors': 32,
 'original': 1984,
 'started': 545,
 'found': 1267,
 'movie': 24969,
 'quite': 1596,
 'enjoyable': 269,
 'watch': 3550,
 'however': 1741,
 'proceeded': 16,
 'make': 4722,
 'ii': 204,
 'iii': 76,
 'trust': 149,
 'movies': 4081,
 'going': 2319,
 'downhill': 79,
 'right': 1625,
 'finished': 168,
 'first': 4307,
 'mean': 1117,
 'ass': 163,
 'blasters': 2,
 'god': 766,
 'capable': 115,
 'answering': 18,
 'question': 358,
 'gods': 34,
 'name': 921,
 'would': 7036,
 'create': 296,
 'another': 2254,
 'dumpster': 7,
 'dives': 12,
 'iv': 38,
 'considered': 223,
 'bad': 7401,
 'fact': 1839,
 'even': 7691,
 'epitome'

In [12]:
#Create dataframe and save into csv 
df_neg_new = pd.DataFrame.from_dict(neg_freq, orient="index")
df_neg_new.to_csv('neg_word_frequency.csv', index=True)

In [6]:
test_pos_df

Unnamed: 0,Reviews,Reviews (Cleaned)
0,For a movie that gets no respect there sure ar...,movie gets respect sure lot memorable quotes l...
1,Bizarre horror movie filled with famous faces ...,bizarre horror movie filled famous faces stole...
2,"A solid, if unremarkable film. Matthau, as Ein...",solid unremarkable film matthau einstein wonde...
3,It's a strange feeling to sit alone in a theat...,strange feeling sit alone theater occupied par...
4,"You probably all already know this by now, but...",probably already know additional episodes neve...
...,...,...
12495,About a year ago I finally gave up on American...,year ago finally gave american television thou...
12496,When I saw the elaborate DVD box for this and ...,saw elaborate dvd box dreadful red queen figur...
12497,"Last November, I had a chance to see this film...",last november chance see film reno film festiv...
12498,Great movie -I loved it. Great editing and use...,great movie loved great editing use soundtrack...


In [7]:
#Creating a for loop to find the word frequency for tokenized words for visualization purposes
pos_freq = {}
for sentence in test_pos_df['Reviews (Cleaned)']:
    tokens = word_tokenize(sentence)
    for token in tokens:
        if token not in pos_freq.keys():
            pos_freq[token] = 1
        else:
            pos_freq[token] += 1
            
pos_freq

{'movie': 19078,
 'gets': 1490,
 'respect': 263,
 'sure': 1253,
 'lot': 2088,
 'memorable': 467,
 'quotes': 54,
 'listed': 50,
 'gem': 290,
 'imagine': 321,
 'joe': 390,
 'piscopo': 14,
 'actually': 1790,
 'funny': 1953,
 'maureen': 39,
 'stapleton': 10,
 'scene': 2567,
 'stealer': 13,
 'moroni': 2,
 'character': 3516,
 'absolute': 154,
 'scream': 106,
 'watch': 3424,
 'alan': 220,
 'skipper': 9,
 'hale': 30,
 'jr': 193,
 'police': 579,
 'sgt': 39,
 'bizarre': 220,
 'horror': 1441,
 'filled': 313,
 'famous': 485,
 'faces': 179,
 'stolen': 86,
 'cristina': 18,
 'raines': 72,
 'later': 1324,
 'tv': 1400,
 'flamingo': 3,
 'road': 231,
 'pretty': 1549,
 'somewhat': 555,
 'unstable': 25,
 'model': 118,
 'gummy': 2,
 'smile': 182,
 'slated': 11,
 'pay': 244,
 'attempted': 57,
 'suicides': 8,
 'guarding': 7,
 'gateway': 9,
 'hell': 365,
 'scenes': 2428,
 'modeling': 13,
 'well': 6411,
 'captured': 180,
 'mood': 231,
 'music': 1739,
 'perfect': 1242,
 'deborah': 15,
 'raffin': 5,
 'charming': 

In [None]:
#Create dataframe and save into csv 
df_new = pd.DataFrame.from_dict(wordfreq, orient="index")
df_new.to_csv('word_frequency.csv', index=True)

In [8]:
#Encoding each review with 0 for negative and 1 for positive 
test_neg_df['Encoding'] = 0
test_pos_df['Encoding'] = 1

#Concatenating both negative and positive reviews to insert into a dataframe
test_df = pd.concat([test_neg_df, test_pos_df])

test_df

Unnamed: 0,Reviews,Reviews (Cleaned),Encoding
0,Working with one of the best Shakespeare sourc...,working one best shakespeare sources film mana...,0
1,"Well...tremors I, the original started off in ...",well tremors original started found movie quit...,0
2,Ouch! This one was a bit painful to sit throug...,ouch one bit painful sit cute amusing premise ...,0
3,"I've seen some crappy movies in my life, but t...",seen crappy movies life one must among worst d...,0
4,Carriers follows the exploits of two guys and ...,carriers follows exploits two guys two gals st...,0
...,...,...,...
12495,About a year ago I finally gave up on American...,year ago finally gave american television thou...,1
12496,When I saw the elaborate DVD box for this and ...,saw elaborate dvd box dreadful red queen figur...,1
12497,"Last November, I had a chance to see this film...",last november chance see film reno film festiv...,1
12498,Great movie -I loved it. Great editing and use...,great movie loved great editing use soundtrack...,1


In [None]:
#Creating a for loop to find the word frequency for tokenized words for visualization purposes
wordfreq = {}
for sentence in test_df['Reviews (Cleaned)']:
    tokens = word_tokenize(sentence)
    for token in tokens:
        if token not in wordfreq.keys():
            wordfreq[token] = 1
        else:
            wordfreq[token] += 1

In [None]:
#Top 200 most frequent words 
import heapq
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

#Create dataframe and save into csv 
df_new = pd.DataFrame.from_dict(wordfreq, orient="index")
df_new.to_csv('word_frequency.csv', index=True)

# TESTING OUR MODEL 
The code below shows the steps of how we tested our model. 

In [5]:
reviews_np = test_df['Reviews (Cleaned)']

In [6]:
# Vectorizing our words
CV = CountVectorizer(input="content", lowercase=False)
CV

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [7]:
#Standardize the data
cv_matrix = CV.fit_transform(reviews_np)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
#Depicts term frequency vector for each review (bag of words)
vocab = CV.get_feature_names()
df_reviews = pd.DataFrame(cv_matrix, columns=vocab)
df_reviews.head(500)

Unnamed: 0,aa,aaa,aaaaaaah,aaaaah,aaaaatch,aaaahhhhhhh,aaaand,aaaarrgh,aaah,aaargh,...,zyuranger,zz,zzzz,zzzzz,zzzzzzzz,zzzzzzzzzzzz,zzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#Make a new column in dataframe to match each matrix to its corresponding review
test_df['matrix'] = list(cv_matrix)

In [10]:
#Logistic Regression 
LogisticRegression

sklearn.linear_model.logistic.LogisticRegression

In [11]:
# Set variables to train dataset
X_train = cv_matrix
y_train = test_df['Encoding']

In [12]:
#Create model variable
model = LogisticRegression()

In [13]:
#Fit linear model 
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
#Check model accuracy 
model.score(X_train, y_train)

0.99824

# Weight of a word
In this section we tried to see the weight of each word in each review in accordance with all other reviews.

In [19]:
#Cheacking coefficent of words
model_example = model.coef_
#model_example

array([[-1.95057656e-01, -1.21714029e-01, -2.21655083e-07, ...,
         3.20608939e-06,  3.20608939e-06,  3.20608939e-06]])

In [None]:
#One example of a review
review_ex = test_df.iloc[3620]
review_ex

In [23]:
import numpy as np

In [25]:
np.dot(review_ex['matrix'], model_example[0])

-2.537607604777733

In [28]:
strength = [ model_example[0][i] * review_ex['matrix'][i] for i in range(73081)]

In [31]:
look = pd.DataFrame({"Weights": strength, "Vocab": vocab})


Unnamed: 0,Weights,Vocab
0,-0.0,aa
1,-0.0,aaa
2,-0.0,aaaaaaah
3,0.0,aaaaah
4,0.0,aaaaatch
...,...,...
73076,-0.0,zzzzzzzzzzzz
73077,-0.0,zzzzzzzzzzzzz
73078,0.0,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
73079,0.0,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz


In [44]:
review_weights = look.loc[look['Weights'] != 0]

In [45]:
#Max weight 
review_weights.max()

Weights    1.61446
Vocab         work
dtype: object

In [46]:
#Min Weight
review_weights.min()

Weights   -1.78048
Vocab       across
dtype: object

In [51]:
#Weight SUM
review_weights["Weights"].sum()

-2.537607604777733

In [52]:
#Save file as CSV
review_weights.to_csv("example_weights.csv")

In [60]:
#FINAL: Model correctly predicts negative review and the probability of a positive review is 0.73
model.predict_proba(review_ex['matrix'].reshape(1,-1))

array([[0.9262405, 0.0737595]])