In [98]:
import pandas as pd
import numpy as np


In [99]:
## reading the dataset
df = pd.read_csv('Restaurant_Reviews.tsv',sep='\t')

In [100]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [101]:
df.shape ## shape of the dataset

(1000, 2)

In [102]:
df.info() ## info of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [103]:
df.isnull().sum() ## checking if null values exists

Review    0
Liked     0
dtype: int64

### Data Preprocessing

In [104]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [105]:
# Cleaning the reviews
corpus = []
for i in range(0,1000):

  # Cleaning special character from the reviews
  review = re.sub(pattern='[^a-zA-Z]',repl=' ', string=df['Review'][i])

  # Converting the entire review into lower case
  review = review.lower()

  # Tokenizing the review by words
  review_words = review.split()

  # Removing the stop words
  review_words = [word for word in review_words if not word in set(stopwords.words('english'))]

  # Stemming the words
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review_words]

  # Joining the stemmed words
  review = ' '.join(review)

  # Creating a corpus
  corpus.append(review)

In [106]:
corpus[0:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [107]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=9000)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values

In [108]:
import pickle
# Creating a pickle file for the CountVectorizer
pickle.dump(cv, open('countvector.pkl', 'wb'))

## model building

In [109]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=12)
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [110]:
from sklearn.model_selection import train_test_split

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Naive Bayes

In [112]:
#GaussianNB
gnb = GaussianNB(var_smoothing=1e-2)
cv = cross_val_score(gnb,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
gnb.fit(X_train,y_train)
y_pred_gnb=gnb.predict(X_test)
print('The accuracy of the Naive Bayes is', metrics.accuracy_score(y_pred_gnb,y_test)*100)
cm=confusion_matrix(y_test, y_pred_gnb)
print(cm)

[0.68656716 0.68656716 0.76119403 0.70149254 0.68656716 0.64179104
 0.73134328 0.7761194  0.63636364 0.63636364 0.78787879 0.72727273]
70.49600482436304
The accuracy of the Naive Bayes is 73.5
[[59 38]
 [15 88]]


In [113]:
#MultinomialNB
mnb = MultinomialNB(alpha=2)
cv = cross_val_score(mnb,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
mnb.fit(X_train,y_train)
y_pred_mnb=mnb.predict(X_test)
print('The accuracy of the Naive Bayes is', metrics.accuracy_score(y_pred_mnb,y_test)*100)
cm=confusion_matrix(y_test, y_pred_mnb)
print(cm)

[0.74626866 0.80597015 0.71641791 0.7761194  0.79104478 0.70149254
 0.76119403 0.7761194  0.77272727 0.72727273 0.83333333 0.81818182]
76.88451680988994
The accuracy of the Naive Bayes is 76.5
[[73 24]
 [23 80]]


In [114]:
#Bernoulli NB
bnb = BernoulliNB(alpha=10)
cv = cross_val_score(bnb,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
mnb.fit(X_train,y_train)
y_pred_bnb=mnb.predict(X_test)
print('The accuracy of the Naive Bayes is', metrics.accuracy_score(y_pred_bnb,y_test)*100)
cm=confusion_matrix(y_test, y_pred_bnb)
print(cm)

[0.73134328 0.80597015 0.73134328 0.80597015 0.74626866 0.7761194
 0.7761194  0.71641791 0.78787879 0.75757576 0.72727273 0.75757576]
75.99879390924168
The accuracy of the Naive Bayes is 76.5
[[73 24]
 [23 80]]


### Random Forest Classifier

In [115]:
#Random Forest Classifier
rf = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=30,
                       max_features='log2', min_samples_leaf=2,
                       n_estimators=500, random_state=0)
rf.fit(X_train, y_train)
cv = cross_val_score(rf,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
y_pred_rf = rf.predict(X_test)
print('The accuracy of the RandomForestClassifier is',metrics.accuracy_score(y_pred_rf,y_test)*100)
cm=confusion_matrix(y_test, y_pred_rf)
print(cm)

[0.79104478 0.79104478 0.76119403 0.79104478 0.79104478 0.76119403
 0.7761194  0.76119403 0.78787879 0.75757576 0.8030303  0.86363636]
78.6333484094678
The accuracy of the RandomForestClassifier is 76.5
[[86 11]
 [36 67]]


### SVC

In [116]:
#Linear SVC
svcl = SVC(kernel = 'linear', random_state = 0, probability=True)
svcl.fit(X_train, y_train)
cv = cross_val_score(svcl,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
y_pred_svcl = svcl.predict(X_test)
print('The accuracy of the Linear SVC is',metrics.accuracy_score(y_pred_svcl,y_test)*100)
cm=confusion_matrix(y_test, y_pred_svcl)
print(cm)

[0.73134328 0.79104478 0.7761194  0.7761194  0.71641791 0.7761194
 0.79104478 0.79104478 0.74242424 0.71212121 0.74242424 0.75757576]
75.86499321573949
The accuracy of the Linear SVC is 72.0
[[74 23]
 [33 70]]


In [117]:
#rbf SVC
from sklearn.svm import SVC
svck = SVC(kernel = 'rbf', random_state = 0, probability=True)
svck.fit(X_train, y_train)
cv = cross_val_score(svck,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
y_pred_svck = svck.predict(X_test)
print('The accuracy of the Kernel SVC is',metrics.accuracy_score(y_pred_svck,y_test)*100)
cm=confusion_matrix(y_test, y_pred_svck)
print(cm)

[0.73134328 0.79104478 0.73134328 0.82089552 0.71641791 0.80597015
 0.76119403 0.74626866 0.77272727 0.6969697  0.77272727 0.81818182]
76.37569727121965
The accuracy of the Kernel SVC is 73.0
[[90  7]
 [47 56]]


### Voting Classifier

In [118]:
#VCLF 
voting_clf = VotingClassifier(estimators = [('bnb',bnb),('mnb',mnb),('gnb', gnb),
                                            ('rf',rf),('svck',svck),('svcl',svcl)], voting = 'soft') 
voting_clf.fit(X_train, y_train)
cv = cross_val_score(voting_clf,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
y_pred_vclf = voting_clf.predict(X_test)
print('The accuracy of the Voting Classifier is',metrics.accuracy_score(y_pred_vclf,y_test)*100)
cm=confusion_matrix(y_test, y_pred_vclf)
print(cm)
print(classification_report(y_test, y_pred_vclf))

[0.71641791 0.79104478 0.80597015 0.7761194  0.76119403 0.73134328
 0.76119403 0.82089552 0.71212121 0.74242424 0.83333333 0.8030303 ]
77.12573496155585
The accuracy of the Voting Classifier is 77.5
[[73 24]
 [21 82]]
              precision    recall  f1-score   support

           0       0.78      0.75      0.76        97
           1       0.77      0.80      0.78       103

    accuracy                           0.78       200
   macro avg       0.78      0.77      0.77       200
weighted avg       0.78      0.78      0.77       200



#### Note : Voting classifier is having only highest accuracy w.r.t to the rest of the algos

In [119]:
# Creating a pickle file for the Multinomial Naive Bayes model
filename = 'voting_clf.pkl'
pickle.dump(voting_clf, open(filename, 'wb'))