# IMPORT PACKAGES AND DATA

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [2]:
#Checking dataframes Shape
df.shape

(2000, 2)

In [3]:
#Checking a sample of review
print(len(df['review'][0]),'\n')
print(df['review'][0])

2250 

how do films like mouse hunt get into theatres ? 
isn't there a law or something ? 
this diabolical load of claptrap from steven speilberg's dreamworks studio is hollywood family fare at its deadly worst . 
mouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . 
writer adam rifkin and director gore verbinski are the names chiefly responsible for this swill . 
the plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . 
deciding to check out the long-abandoned house , they soon learn that it's worth a fortune and set about selling it in auction to the highest bidder . 
but battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . 
the story al

In [4]:
#Counting Null values
df.isnull().sum()

label      0
review    35
dtype: int64

In [5]:
#Dropping Null Values
df.dropna(inplace=True)
print(df.shape)

(1965, 2)


In [6]:
#Dropping reviews which contain only whitespaces
#First declare a array
blanks=[]
#Now itertuples returns all three column iterators.Therefoe i,lb and rv as used for iteration.
for i,lb,rv in df.itertuples():
    if type(rv)==str: #checking whether it is not null
        if rv.isspace()==True:#now if contains only whitespace
            blanks.append(i)#appending on array
print(blanks)    

[57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [7]:
#removing blank spaces
df.drop(blanks,inplace=True)
print(df.shape)

(1938, 2)


In [8]:
#checking value counts of label
df['label'].value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [9]:
#Splitting the data in train and test data
from sklearn.model_selection import train_test_split
x=df['review']
y=df['label']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=42)

In [10]:
#Using Pipepline to use TFIDF with Navie Bayes and SVM
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

text_class_nb=Pipeline([('Tfidf',TfidfVectorizer()),('Naive_Bayes',MultinomialNB())])
text_class_svm=Pipeline([('Tfidf',TfidfVectorizer()),('SVM',LinearSVC())])

In [11]:
#Fitting 
text_class_nb.fit(x_train,y=y_train)
text_class_svm.fit(x_train,y=y_train)

Pipeline(memory=None,
     steps=[('Tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [12]:
#Forming Prediction
prediction_nb=text_class_nb.predict(x_test)
prediction_svm=text_class_svm.predict(x_test)

In [13]:
#PRINTING the confusion matrix ,classification report and accuracy score
from sklearn import metrics
print('\nThe confusion matrix of NB is \n',metrics.confusion_matrix(prediction_nb,y_test))
print('\n\nThe classification report of NB \n',metrics.classification_report(prediction_nb,y_test))
print('\n\n Accuracy Score NB',metrics.accuracy_score(prediction_nb,y_test)*100,'%')
print('\n\nThe confusion matrix of SVM is \n',metrics.confusion_matrix(prediction_svm,y_test))
print('\n\nThe classification report of SVM \n',metrics.classification_report(prediction_svm,y_test))
print('\n\n Accuracy Score SVM',metrics.accuracy_score(prediction_svm,y_test)*100,'%')


The confusion matrix of NB is 
 [[287 130]
 [ 21 202]]


The classification report of NB 
              precision    recall  f1-score   support

        neg       0.93      0.69      0.79       417
        pos       0.61      0.91      0.73       223

avg / total       0.82      0.76      0.77       640



 Accuracy Score NB 76.40625 %


The confusion matrix of SVM is 
 [[259  49]
 [ 49 283]]


The classification report of SVM 
              precision    recall  f1-score   support

        neg       0.84      0.84      0.84       308
        pos       0.85      0.85      0.85       332

avg / total       0.85      0.85      0.85       640



 Accuracy Score SVM 84.6875 %


In [14]:
#Now as we didn't use Stopwords in TFIDFVectorizer 

#Now we use stopwords

text_class_nb=Pipeline([('Tfidf',TfidfVectorizer(stop_words='english')),('Naive_Bayes',MultinomialNB())])
text_class_svm=Pipeline([('Tfidf',TfidfVectorizer(stop_words='english')),('SVM',LinearSVC())])

#Fitting

text_class_nb.fit(x_train,y=y_train)
text_class_svm.fit(x_train,y=y_train)

#Forming Prediction

prediction_nb=text_class_nb.predict(x_test)
prediction_svm=text_class_svm.predict(x_test)

#PRINTING the confusion matrix ,classification report and accuracy score

from sklearn import metrics
print('\nThe confusion matrix of NB is \n',metrics.confusion_matrix(prediction_nb,y_test))
print('\n\nThe classification report of NB \n',metrics.classification_report(prediction_nb,y_test))
print('\n\n Accuracy Score NB',metrics.accuracy_score(prediction_nb,y_test)*100,'%')
print('\n\nThe confusion matrix of SVM is \n',metrics.confusion_matrix(prediction_svm,y_test))
print('\n\nThe classification report of SVM \n',metrics.classification_report(prediction_svm,y_test))
print('\n\n Accuracy Score SVM',metrics.accuracy_score(prediction_svm,y_test)*100,'%')


The confusion matrix of NB is 
 [[274  94]
 [ 34 238]]


The classification report of NB 
              precision    recall  f1-score   support

        neg       0.89      0.74      0.81       368
        pos       0.72      0.88      0.79       272

avg / total       0.82      0.80      0.80       640



 Accuracy Score NB 80.0 %


The confusion matrix of SVM is 
 [[252  52]
 [ 56 280]]


The classification report of SVM 
              precision    recall  f1-score   support

        neg       0.82      0.83      0.82       304
        pos       0.84      0.83      0.84       336

avg / total       0.83      0.83      0.83       640



 Accuracy Score SVM 83.125 %


### As we saw using stop_words filter increased Accuracy Score in NB from 76.4% to 80.0% 
### But decreased SVM Accuracy _Score from  84.6% to 83.125%

# Now using it check my own written Review

In [18]:
my_review="Movie was not what i expected from Christopher NOlan Movie"

In [19]:
print('SENTIMENT FROM NB ')
print(text_class_nb.predict([my_review]))   
print('\n\nSENTIMENT FROM SVM')
print(text_class_svm.predict([my_review]))   

SENTIMENT FROM NB 
['neg']


SENTIMENT FROM SVM
['neg']
