In [35]:
import numpy as np
import pandas as pd
import pickle
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
dataset = pd.read_csv('deceptive-opinion.csv')
dataset

Unnamed: 0,verdict,hotel,polarity,source,review
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...
...,...,...,...,...,...
1596,deceptive,intercontinental,negative,MTurk,Problems started when I booked the InterContin...
1597,deceptive,amalfi,negative,MTurk,The Amalfi Hotel has a beautiful website and i...
1598,deceptive,intercontinental,negative,MTurk,The Intercontinental Chicago Magnificent Mile ...
1599,deceptive,palmer,negative,MTurk,"The Palmer House Hilton, while it looks good i..."


In [37]:
required_dataset = dataset[['verdict', 'review']]
required_dataset

Unnamed: 0,verdict,review
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...
2,truthful,This comes a little late as I'm finally catchi...
3,truthful,The Omni Chicago really delivers on all fronts...
4,truthful,I asked for a high floor away from the elevato...
...,...,...
1596,deceptive,Problems started when I booked the InterContin...
1597,deceptive,The Amalfi Hotel has a beautiful website and i...
1598,deceptive,The Intercontinental Chicago Magnificent Mile ...
1599,deceptive,"The Palmer House Hilton, while it looks good i..."


In [38]:
required_dataset.loc[required_dataset['verdict'] == 'deceptive', 'verdict'] = 0
required_dataset.loc[required_dataset['verdict'] == 'truthful', 'verdict'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_dataset.loc[required_dataset['verdict'] == 'deceptive', 'verdict'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_dataset.loc[required_dataset['verdict'] == 'truthful', 'verdict'] = 1


In [39]:
required_dataset

Unnamed: 0,verdict,review
0,1,We stayed for a one night getaway with family ...
1,1,Triple A rate with upgrade to view room was le...
2,1,This comes a little late as I'm finally catchi...
3,1,The Omni Chicago really delivers on all fronts...
4,1,I asked for a high floor away from the elevato...
...,...,...
1596,0,Problems started when I booked the InterContin...
1597,0,The Amalfi Hotel has a beautiful website and i...
1598,0,The Intercontinental Chicago Magnificent Mile ...
1599,0,"The Palmer House Hilton, while it looks good i..."


In [40]:
X = required_dataset['review']
Y = np.asarray(required_dataset['verdict'], dtype = int)

In [41]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42) # 75% training and 25% test

In [42]:
X_test

526    I wouldn't know how to begin to write somethin...
354    Stayed at the InterContinental for an entire w...
168    This is a great find for downtown Chicago. The...
135    Stayed there three nights from 4/17/09 through...
937    Was one of the worst travel experiences of qui...
                             ...                        
621    I recently stayed at the Hard Rock Hotel in Ch...
839    I was very disappointed with the hotel this ti...
767    Everything experienced at this hotel was grand...
857    Thank god I got this hotel through priceline. ...
327    We stayed at the Palmer House Hilton in early ...
Name: review, Length: 401, dtype: object

In [43]:
Y_test

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,

In [44]:
cv = CountVectorizer()
x = cv.fit_transform(X_train)
y = cv.transform(X_test)

In [45]:
mnb = MultinomialNB()
gnb = GaussianNB()

In [48]:
# Fitting the model
mnb.fit(x, Y_train)
pickle.dump(mnb, open('model_mnb.pkl', 'wb'))
model = pickle.load(open('model_mnb.pkl','rb'))

In [49]:
# Training accuracy
mnb.score(x, Y_train)

0.9758333333333333

In [51]:
# Testing accuracy
mnb.score(y, Y_test)

0.8703241895261845