# NTSAMA TABETSING Yann Fabio 4GI 19P020

## I. Import libraries

In [1]:
#Dataset management
import numpy as np
import pandas as pd

#Matplot
import matplotlib.pyplot as plt
%matplotlib inline

#ntlk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

#utils
import os
import time
import re
import pickle

#Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import precision_score, recall_score


## II. Import data

In [2]:
%%time
path = "../aclImdb/"

positiveFiles = [x for x in os.listdir(path+"train/pos/") if x.endswith(".txt")]
negativeFiles = [x for x in os.listdir(path+"train/neg/") if x.endswith(".txt")]

positiveReviews, negativeReviews = [], []

for pfile in positiveFiles:
    with open(path+"train/pos/"+pfile, encoding="latin1") as f:
        positiveReviews.append(f.read())
        
for nfile in negativeFiles:
    with open(path+"train/neg/"+nfile, encoding="latin1") as f:
        negativeReviews.append(f.read())
        
data = pd.concat([
    pd.DataFrame({"reviews":positiveReviews, "sentiment":1}),
    pd.DataFrame({"reviews":negativeReviews, "sentiment":0}),
], ignore_index=True).sample(frac=0.25, random_state=1)



display(data.sample(6))

Unnamed: 0,reviews,sentiment
15299,"""I Am Curious: Yellow"" is a risible and preten...",0
24872,"Cassavetes was clearly an intelligent, sensiti...",0
4732,A film that is so much a 30's Warners film in ...,1
7268,"After ""A Dirty Shame"", I never thought that I ...",1
19475,With the rising popularity of the now iconic G...,0
21809,This will be best known for the Ferrari that w...,0


Wall time: 3min 24s


In [3]:
print(data.shape)


(6250, 2)


## III. Preprocessing reviews's Text

In [4]:
stemmer = SnowballStemmer("english")

TEXT_CLEANING_REGEX = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

def preprocess(text, stem=False):
    # Preprocessing of the reviews
    text = re.sub(TEXT_CLEANING_REGEX, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if stem:
            tokens.append(stemmer.stem(token))
        else:
            tokens.append(token)
    return " ".join(tokens)

In [5]:
%%time

data.reviews = data.reviews.apply(lambda x:preprocess(x, stem=True))

Wall time: 20.2 s


In [6]:
data.head()

Unnamed: 0,reviews,sentiment
21492,i have copi of this on vhs i think they the te...,0
9488,after sever extrem well rate to the point of s...,1
16933,i still don t know whi i forc myself to sit th...,0
12604,mt littl sister and i are self proclaim horror...,0
8222,i have person seen mani disney movi in my life...,1


## IV. Training and testing data

In [7]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

train_X, train_y = train['reviews'], train['sentiment']
test_X, test_y = test['reviews'], test['sentiment']

In [8]:
train_y.value_counts()

1    2514
0    2486
Name: sentiment, dtype: int64

## V. Extracting feature vectors from words

In [9]:
tfidf = TfidfVectorizer(stop_words='english')


tfidf.fit_transform(train_X)
train_X_vector = tfidf.transform(train_X)
test_X_vector = tfidf.transform(test_X)

## VI. Training models

In [10]:
comment = ['I did not like this movie at all I gave this movie away']
comment[0] = preprocess(comment[0], True)

comment_vector = tfidf.transform(comment)

### 1. SVM

In [11]:
%%time

svc = SVC(kernel='linear')
svc.fit(train_X_vector, train_y)

Wall time: 15.4 s


SVC(kernel='linear')

In [12]:
print(svc.predict(comment_vector))

[0]


### 2. Decision Tree

In [13]:
%%time

dec_tree = DecisionTreeClassifier()
dec_tree.fit(train_X_vector, train_y)

Wall time: 2.7 s


DecisionTreeClassifier()

In [14]:
print(dec_tree.predict(comment_vector))

[1]


### 3. Naive Bayes

In [15]:
%%time

gnb = GaussianNB()
gnb.fit(train_X_vector.toarray(), train_y)

Wall time: 27.9 s


GaussianNB()

In [17]:
%%time 

mnb = MultinomialNB()
mnb.fit(train_X_vector.toarray(), train_y)

Wall time: 4.86 s


MultinomialNB()

In [18]:
print(mnb.predict(comment_vector.toarray()))

[0]


In [69]:
print(float(gnb.predict(comment_vector.toarray())))

0.0


### 4. LDA

In [41]:
%%time

lda = LDA()
lda.fit(train_X_vector.toarray(), train_y)

Wall time: 5h 28min 3s


LinearDiscriminantAnalysis()

In [47]:
print(lda.predict(comment_vector.toarray()))

[0]


## VII. Models evaluation



In [20]:
#SVM
print("SVM evaluation")
recall = recall_score(test_y, svc.predict(test_X_vector))
print('Recall: %f' % recall)
precision = precision_score(test_y, svc.predict(test_X_vector))
print('Precision: %f' % precision)
print("---------------------------------------------------")

#Decision Tree
print("Decision Tree evaluation")
recall = recall_score(test_y, dec_tree.predict(test_X_vector))
print('Recall: %f' % recall)
precision = precision_score(test_y, dec_tree.predict(test_X_vector))
print('Precision: %f' % precision)
print("---------------------------------------------------")

#Gaussian Naive Bayes
print("Gaussian Naive Bayes evaluation")
recall = recall_score(test_y, mnb.predict(test_X_vector.toarray()))
print('Recall: %f' % recall)
precision = precision_score(test_y, mnb.predict(test_X_vector.toarray()))
print('Precision: %f' % precision)
print("---------------------------------------------------")

#LDA
"""
print("LDA evaluation")
recall = recall_score(test_y, lda.predict(test_X_vector.toarray()))
print('Recall: %f' % recall)
precision = precision_score(test_y, lda.predict(test_X_vector.toarray()))
print('Precision: %f' % precision)"""

SVM evaluation
Recall: 0.874214
Precision: 0.859351
---------------------------------------------------
Decision Tree evaluation
Recall: 0.685535
Precision: 0.689873
---------------------------------------------------
Gaussian Naive Bayes evaluation
Recall: 0.842767
Precision: 0.861736
---------------------------------------------------


'\nprint("LDA evaluation")\nrecall = recall_score(test_y, lda.predict(test_X_vector.toarray()))\nprint(\'Recall: %f\' % recall)\nprecision = precision_score(test_y, lda.predict(test_X_vector.toarray()))\nprint(\'Precision: %f\' % precision)'

## VIII. Serializing models

In [59]:
with open('sentiment_model_svm.pkl', 'wb') as f:
    pickle.dump(svc, f)
    
with open('sentiment_model_decision_tree.pkl', 'wb') as f:
    pickle.dump(dec_tree, f)
    
with open('sentiment_model_gaussian_naive_bayes.pkl', 'wb') as f:
    pickle.dump(gnb, f)
    
with open('sentiment_model_lda.pkl', 'wb') as f:
    pickle.dump(lda, f)

In [60]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [70]:
with open('preprocess.pkl', 'wb') as f:
    pickle.dump(preprocess, f)

In [19]:
#Multinomial Naive Bayes
print("Multinomial Naive Bayes evaluation")
recall = recall_score(test_y, svc.predict(test_X_vector.toarray()))
print('Recall: %f' % recall)
precision = precision_score(test_y, svc.predict(test_X_vector.toarray()))
print('Precision: %f' % precision)
print("---------------------------------------------------")

Multinomial Naive Bayes evaluation
Recall: 0.888655
Precision: 0.852823
---------------------------------------------------


In [1]:
import sklearn
sklearn.__version__

'0.24.2'

In [3]:
import nltk
nltk.__version__

'3.6.5'