# TRANSFORMING AND CLEANING THE DATASET
This notebook shows the cleaning and transformation of the dataset. In order for our model to be able to give the highest accuracy at predicting the sentiment analysis, tokenization and removing stop words are crucial to the cleaning aspect of the dataset. 

In [1]:
#Import all dependencies
import pandas as pd
import re
import io 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import string 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from functools import reduce


In [2]:
stop_words = set(stopwords.words("english"))
punctuation = string.punctuation

#Create a function to clean dataset, tokenize words and remove stopwords and punctuation
def tokenize_words(text, stopwords, punctuation):
    text = text.lower() 
    text = text.replace("<br />", " ")
    text = re.sub(r"[^a-z ]", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = word_tokenize(text)
    filtered = []
    for w in tokens:
        if w not in stopwords and w not in punctuation:
            filtered.append(w)
    text = reduce((lambda x,y: x + " " + y), filtered)
    return text

In [3]:
#word_tokenize accepts a string as an input, not a file. 
stop_words = set(stopwords.words('english')) 
stop_words = [re.sub(r"[^a-z ]", "", w) for w in stop_words]

#Read in .txt file
test_neg_path = "../train_neg.txt" 
test_pos_path = "../train_pos.txt" #need to change this

test_neg_df = pd.read_table(test_neg_path, sep="\n", header=None, names=['Reviews'])
test_pos_df = pd.read_table(test_pos_path, sep="\n", header=None, names=['Reviews'])

#Encoding each review with 0 for negative and 1 for positive 
test_neg_df['Encoding'] = 0
test_pos_df['Encoding'] = 1

#Concatenating both negative and positive reviews to insert into a dataframe
test_df = pd.concat([test_neg_df, test_pos_df])

#Tokenize words, removing stop words, removing punctuation and creating the dataframe
test_df['Reviews (Cleaned)'] = test_df['Reviews'].apply(tokenize_words, args=(stop_words, punctuation))

test_df

Unnamed: 0,Reviews,Encoding,Reviews (Cleaned)
0,Working with one of the best Shakespeare sourc...,0,working one best shakespeare sources film mana...
1,"Well...tremors I, the original started off in ...",0,well tremors original started found movie quit...
2,Ouch! This one was a bit painful to sit throug...,0,ouch one bit painful sit cute amusing premise ...
3,"I've seen some crappy movies in my life, but t...",0,seen crappy movies life one must among worst d...
4,Carriers follows the exploits of two guys and ...,0,carriers follows exploits two guys two gals st...
...,...,...,...
12495,"My comments may be a bit of a spoiler, for wha...",1,comments may bit spoiler worth stop care enoug...
12496,"The ""saucy"" misadventures of four au pairs who...",1,saucy misadventures four au pairs arrive londo...
12497,"Oh, those Italians! Assuming that movies about...",1,oh italians assuming movies aristocrats weird ...
12498,Eight academy nominations? It's beyond belief....,1,eight academy nominations beyond belief think ...


In [4]:
#Creating a for loop to find the word frequency for tokenized words for visualization purposes
wordfreq = {}
for sentence in test_df['Reviews (Cleaned)']:
    tokens = word_tokenize(sentence)
    for token in tokens:
        if token not in wordfreq.keys():
            wordfreq[token] = 1
        else:
            wordfreq[token] += 1

In [None]:
#Top 200 most frequent words 
import heapq
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

#Create dataframe and save into csv 
df_new = pd.DataFrame.from_dict(wordfreq, orient="index")
df_new.to_csv('word_frequency.csv', index=True)

# TESTING OUR MODEL 
The code below shows the steps of how we tested our model. 

In [5]:
reviews_np = test_df['Reviews (Cleaned)']

In [6]:
# Vectorizing our words
CV = CountVectorizer(input="content", lowercase=False)
CV

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [30]:
#Standardize the data
cv_matrix = CV.fit_transform(reviews_np)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [36]:
#Depicts term frequency vector for each review (bag of words)
vocab = CV.get_feature_names()
df_reviews = pd.DataFrame(cv_matrix, columns=vocab)
df_reviews.head(500)

Unnamed: 0,aa,aaa,aaaaaaah,aaaaah,aaaahhhhhhh,aaaarrgh,aaah,aaargh,aaaugh,aaawwwwnnn,...,zyuranger,zz,zzzz,zzzzz,zzzzzzzz,zzzzzzzzzzzz,zzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
#Make a new column in dataframe to match each matrix to its corresponding review
test_df['matrix'] = list(cv_matrix)

In [21]:
#Logistic Regression 
LogisticRegression

sklearn.linear_model.logistic.LogisticRegression

In [48]:
# Set variables to train dataset
X_train = cv_matrix
y_train = test_df['Encoding']

In [49]:
#Create model variable
model = LogisticRegression()

In [50]:
#Fit linear model 
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
#Check model accuracy 
model.score(X_train, y_train)

0.5