In [2]:
import numpy as np
import re
import nltk
import pickle 
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.datasets import load_files

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Thakur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
pwd

'C:\\Users\\Thakur\\Restaurant Reviews'

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv('Restaurant_Reviews.tsv',delimiter ='\t',quoting = 3)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
data.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [7]:
data.columns

Index(['Review', 'Liked'], dtype='object')

In [8]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [9]:
corpus = []
for i in range(len(data)):
    review = re.sub(r'\W',' ',data['Review'][i])
    review = re.sub(pattern='[^a-zA-Z]',repl=' ', string=data['Review'][i])
    review = review.lower()
    reviews = review.split()
    reviews = [word for word in reviews if not word in set(stopwords.words('english'))]
    review = [ps.stem(word) for word in reviews]
    review = ' '.join(review)
    corpus.append(review)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1600)
x=cv.fit_transform(corpus).toarray()
Y = data.iloc[:, 1].values

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,Y_train,Y_test = train_test_split(x,Y,test_size=0.2,random_state=42)

In [12]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.5)
classifier.fit(x_train,Y_train)

MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)

In [13]:
predictions = classifier.predict(x_test)
predictions

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1], dtype=int64)

In [14]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(Y_test,predictions)
accuracy = accuracy_score(Y_test,predictions)

In [15]:
print(cm)
print(accuracy)

[[71 25]
 [27 77]]
0.74


In [16]:
best_accuracy = 0.0
alpha_val = 0.0
for i in np.arange(0.1,1.1,0.1):
  temp_classifier = MultinomialNB(alpha=i)
  temp_classifier.fit(x_train, Y_train)
  temp_y_pred = temp_classifier.predict(x_test)
  score = accuracy_score(Y_test,temp_y_pred)
  print("Accuracy score for alpha={} is: {}%".format(round(i,1), round(score*100,2)))
  if score>best_accuracy:
    best_accuracy = score
    alpha_val = i
print('--------------------------------------------')
print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for alpha=0.1 is: 73.5%
Accuracy score for alpha=0.2 is: 74.0%
Accuracy score for alpha=0.3 is: 74.0%
Accuracy score for alpha=0.4 is: 74.5%
Accuracy score for alpha=0.5 is: 74.0%
Accuracy score for alpha=0.6 is: 74.0%
Accuracy score for alpha=0.7 is: 74.0%
Accuracy score for alpha=0.8 is: 74.0%
Accuracy score for alpha=0.9 is: 74.0%
Accuracy score for alpha=1.0 is: 74.0%
--------------------------------------------
The best accuracy is 74.5% with alpha value as 0.4


In [17]:
classifier = MultinomialNB(alpha=0.4)
classifier.fit(x_train,Y_train)

MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)

In [18]:
from sklearn.metrics import classification_report
report= classification_report(Y_test,predictions)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.74      0.73        96
           1       0.75      0.74      0.75       104

    accuracy                           0.74       200
   macro avg       0.74      0.74      0.74       200
weighted avg       0.74      0.74      0.74       200



In [19]:
sample_1 = ["i love the food here"]
sample_2 = ["I love the food here but the service is slow"]

In [26]:
# Pickling the classifier
with open('classifier.pkl','wb') as f:
    pickle.dump(classifier,f)

In [27]:
# Pickling the Vectorizer
with open('BOW.pkl','wb') as f:
          pickle.dump(cv,f)

In [22]:
# Unpickling the classifier and vectorizer
#with open('classifier.pickle','rb') as f:
 #   clf = pickle.load(f)
    
#with open('BOW.pickle','rb') as f:
 #   cv = pickle.load(f)

In [23]:
#sample_1 = cv.transform(sample_1).toarray()

In [24]:
#print(clf.predict(sample_1))

In [25]:
sample_2 = cv.transform(sample_2).toarray()
print(clf.predict(sample_2))

NameError: name 'clf' is not defined