## Imports

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk import word_tokenize 
from nltk.corpus import stopwords
import multiprocessing
from string import digits
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
import re
from nltk.tag import pos_tag
import pickle
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from xgboost import XGBClassifier
import string
from nltk.corpus import stopwords
import nltk.corpus.util

## Train Data
Out of the 25000 total reviews, I have taken 10000 reviews at random with an almost equal distribution between positive and negative.

In [2]:
train=pd.read_csv('train.csv',encoding='utf8')
train.describe()

Unnamed: 0,rating
count,10000.0
mean,5.5146
std,3.465947
min,1.0
25%,2.0
50%,7.0
75%,9.0
max,10.0


In [3]:
print('Count of Positive and Negative Reviews in Train set-')
print(train.sentiment.value_counts())
train.head(2)

Count of Positive and Negative Reviews in Train set-
positive    5046
negative    4954
Name: sentiment, dtype: int64


Unnamed: 0,rating,review,sentiment
0,10,A tour deforce! OK the kid that plays Oliver i...,positive
1,4,Funny that I find myself forced to review this...,negative


## Test Data
Out of the 25000 total reviews, I have taken 2500 reviews at random with an almost equal distribution between positive and negative.

In [4]:
test=pd.read_csv('test.csv')
test.describe()

Unnamed: 0,rating
count,2500.0
mean,5.6132
std,3.495436
min,1.0
25%,2.0
50%,7.0
75%,9.0
max,10.0


In [5]:
print('Count of Positive and Negative Reviews-')
print(test.sentiment.value_counts())
test.head(2)

Count of Positive and Negative Reviews-
positive    1273
negative    1227
Name: sentiment, dtype: int64


Unnamed: 0,rating,review,sentiment
0,1,I was unfortunate enough to see this movie at ...,negative
1,7,Sandra Bullock paints a believable picture as ...,positive


# Preprocessing

In [6]:
#convert to lowercase
train['review']=train['review'].apply(lambda x:x.lower())
test['review']=test['review'].apply(lambda x:x.lower())

#remove digits
train['review']=train['review'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
test['review']=test['review'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

In [7]:
#tokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [8]:
train['tokens']=train['review'].apply(lambda x:tokenizer.tokenize(x))
test['tokens']=test['review'].apply(lambda x:tokenizer.tokenize(x))
train.head()

Unnamed: 0,rating,review,sentiment,tokens
0,10,a tour deforce! ok the kid that plays oliver i...,positive,"[a, tour, deforce, ok, the, kid, that, plays, ..."
1,4,funny that i find myself forced to review this...,negative,"[funny, that, i, find, myself, forced, to, rev..."
2,1,unless you are already familiar with the pop s...,negative,"[unless, you, are, already, familiar, with, th..."
3,10,you could stage a version of charles dickens' ...,positive,"[you, could, stage, a, version, of, charles, d..."
4,3,it's a really cheesy parody of tomb raider and...,negative,"[it, s, a, really, cheesy, parody, of, tomb, r..."


In [9]:
#stopwords
stop_words=stopwords.words('english')
stop_words.extend(['could','would','br'])

In [10]:
#remove stopwords and join tokens afterwards
train['tokens']=train['tokens'].apply(lambda x: [word for word in x if not word in stop_words])
train['joined']=train['tokens'].apply(' '.join)

test['tokens']=test['tokens'].apply(lambda x: [word for word in x if not word in stop_words])
test['joined']=test['tokens'].apply(' '.join)

In [11]:
#lemmatize words in the review
def lemmatize_sentence(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        else:
            yield word
            

In [12]:
train['joined']=train['joined'].apply(lambda x: " ".join(lemmatize_sentence(x)))
test['joined']=test['joined'].apply(lambda x: " ".join(lemmatize_sentence(x)))

In [13]:
train.head(2)

Unnamed: 0,rating,review,sentiment,tokens,joined
0,10,a tour deforce! ok the kid that plays oliver i...,positive,"[tour, deforce, ok, kid, plays, oliver, bit, t...",tour deforce ok kid play oliver bit toooooo sw...
1,4,funny that i find myself forced to review this...,negative,"[funny, find, forced, review, movie, reviewing...",funny find forced review movie review recently...


In [14]:
#CountVectorizer with ngram 1
cv=CountVectorizer(ngram_range=(1,1))

In [15]:
train_cv=cv.fit_transform(train['joined'])
test_cv=cv.transform(test['joined'])

In [16]:
train_cv.shape

(10000, 43164)

In [17]:
#TFIDF transformer
tfidf=TfidfTransformer()

In [18]:
train_tfid=tfidf.fit_transform(train_cv)
train_tfid=train_tfid.todense()

test_tfid=tfidf.transform(test_cv)
test_tfid=test_tfid.todense()

In [19]:
X_train=train_tfid
X_train

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
y_train=train.sentiment
y_train

0       positive
1       negative
2       negative
3       positive
4       negative
          ...   
9995    positive
9996    positive
9997    positive
9998    negative
9999    positive
Name: sentiment, Length: 10000, dtype: object

In [21]:
X_test=test_tfid
X_test

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
y_test=test.sentiment
y_test

0       negative
1       positive
2       negative
3       positive
4       positive
          ...   
2495    positive
2496    positive
2497    positive
2498    negative
2499    positive
Name: sentiment, Length: 2500, dtype: object

# Naive Bayes Models

## MultinomialNB

In [23]:
mnb=MultinomialNB()
mnb.fit(X_train,y_train)

MultinomialNB()

In [24]:
pred1=mnb.predict(X_test)

In [25]:
print("Accuracy Score of MultinomialNB: ",accuracy_score(y_test, pred1)*100)
print("Confusion Matrix of MultinomialNB:\n",confusion_matrix(y_test,pred1))

Accuracy Score of MultinomialNB:  83.08
Confusion Matrix of MultinomialNB:
 [[1055  172]
 [ 251 1022]]


## BernoulliNB

In [26]:
bnb=BernoulliNB()
bnb.fit(X_train,y_train)

BernoulliNB()

In [27]:
pred2=bnb.predict(X_test)

In [30]:
print("Accuracy Score of BernoulliNB:",accuracy_score(y_test, pred2)*100)
print("Confusion Matrix of BernoulliNB:\n",confusion_matrix(y_test,pred2))

Accuracy Score of BernoulliNB: 83.2
Confusion Matrix of BernoulliNB:
 [[1080  147]
 [ 273 1000]]


## GaussianNB

In [31]:
gnb=GaussianNB()
gnb.fit(X_train,y_train)

GaussianNB()

In [32]:
pred3=gnb.predict(X_test)

In [33]:
print("Accuracy Score of GaussianNB: ",accuracy_score(y_test, pred3)*100)
print("Confusion Matrix of GaussianNB:\n",confusion_matrix(y_test,pred3))

Accuracy Score of GaussianNB:  57.56
Confusion Matrix of GaussianNB:
 [[817 410]
 [651 622]]


## XGBoost

In [41]:
cores = multiprocessing.cpu_count() - 1
xgb1 = XGBClassifier(random_state = 21,n_jobs = cores)
xgb1.fit(X_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=7, num_parallel_tree=1, random_state=21,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
pred4=xgb1.predict(X_test)

In [43]:
print("Accuracy Score of XGB: ",accuracy_score(y_test, pred4)*100)
print("Confusion Matrix of XGB:\n",confusion_matrix(y_test,pred4))

Accuracy Score of XGB:  83.56
Confusion Matrix of XGB:
 [[ 994  233]
 [ 178 1095]]


## LinearSVC

In [34]:
clfSVM = LinearSVC()
clfSVM.fit(X_train,y_train)

LinearSVC()

In [35]:
pred5=clfSVM.predict(X_test)

In [36]:
print("Accuracy Score of LinearSVC: ",accuracy_score(y_test, pred5)*100)
print("Confusion Matrix of LinearSVC:\n",confusion_matrix(y_test,pred5))

Accuracy Score of LinearSVC:  86.6
Confusion Matrix of LinearSVC:
 [[1059  168]
 [ 167 1106]]


## Logistic Regression

In [47]:
lr=LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [48]:
pred6=lr.predict(X_test)

In [49]:
print("Accuracy Score of Logistic Regression: ",accuracy_score(y_test, pred6)*100)
print("Confusion Matrix of  Logistic Regression:\n",confusion_matrix(y_test,pred6))

Accuracy Score of Logistic Regression:  87.2
Confusion Matrix of  Logistic Regression:
 [[1053  174]
 [ 146 1127]]


## RandomForestClassifier

In [37]:
rf = RandomForestClassifier(max_features=21, n_estimators=80)
rf.fit(X_train,y_train)

RandomForestClassifier(max_features=21, n_estimators=80)

In [38]:
pred7= rf.predict(X_test)

In [39]:
print("Accuracy Score of RandomForestClassifier: ",accuracy_score(y_test, pred7)*100)
print("Confusion Matrix of RandomForestClassifier:\n",confusion_matrix(y_test,pred7))

Accuracy Score of RandomForestClassifier:  80.92
Confusion Matrix of RandomForestClassifier:
 [[ 991  236]
 [ 241 1032]]
