In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are kept as ordinary characters.
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter='\t', quoting=3)

# Matrix view of the frame: column 0 is the review text, column 1 the label.
# NOTE: np.matrix is kept deliberately -- later cells slice it as
# Data[0:, 1] and then index the result with [:, 0], which relies on
# np.matrix always returning 2-D slices.
Data = np.matrix(dataset)

# Show a bounded preview instead of dumping the whole frame.
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [3]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

# Row and column counts of the review matrix; `r` drives the cleaning loop.
r, c = Data.shape

# Accumulator for the cleaned review strings, filled by the next cell.
cleaned_text = list()


[nltk_data] Downloading package stopwords to /home/aayush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Loop-invariant helpers, built once instead of once per word / per row.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

# Start from an empty list so that re-running this cell does not append a
# second copy of every review (which would misalign features and labels).
cleaned_text = []
for i in range(r):
    review = Data[i, 0]
    # Replace every non-letter with a space, lowercase, split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    # Drop stop words and stem what is left.
    # NOTE(review): the stop-word list removes negations such as "not"
    # ("Crust is not good." becomes "crust good"), which discards sentiment
    # information -- consider keeping negations.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    cleaned_text.append(' '.join(stems))
print("Cleaned Reviews")
pd.DataFrame(cleaned_text)

Cleaned Reviews


Unnamed: 0,0
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price
5,get angri want damn pho
6,honeslti tast fresh
7,potato like rubber could tell made ahead time ...
8,fri great
9,great touch


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: fit the vocabulary on the cleaned reviews and
# materialise the resulting count matrix as a dense array.
cv = CountVectorizer()
term_counts = cv.fit_transform(cleaned_text)
X = term_counts.toarray()

# Labels: column 1 of Data cast to int64. Slicing the matrix keeps it 2-D,
# so y is a column of labels rather than a flat vector.
label_column = Data[0:, 1].astype(np.int64)
y = np.array(label_column)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=54,
)

# The label arrays are indexed as 2-D here; take column 0 to get the flat
# label vectors the classifiers are fitted on.
y_train, y_test = y_train[:, 0], y_test[:, 0]

In [7]:
print("Classification Model : Naive Bayes")
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred1 = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_test, y_pred1)
print(cm1)
# Correct predictions sit on the diagonal of the confusion matrix; the total
# is taken from the matrix itself rather than a hard-coded 250, so the
# report stays right if the test-set size changes.
correct1 = cm1.trace()
total1 = cm1.sum()
print("No of correct = " + str(correct1) + "/" + str(total1))
print("Accuracy = " + str(100 * correct1 / total1))

Classification Model : Naive Bayes
[[ 69  52]
 [ 19 110]]
No of correct = 179/250
Accuracy = 71.6


In [8]:
print("Classification Model : SVM")
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear')
classifier.fit(X_train,y_train)

y_pred2 = classifier.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred2)
print(cm2)
# Correct predictions sit on the diagonal of the confusion matrix; the total
# is taken from the matrix itself rather than a hard-coded 250, so the
# report stays right if the test-set size changes.
correct2 = cm2.trace()
total2 = cm2.sum()
print("No of correct = " + str(correct2) + "/" + str(total2))
print("Accuracy = " + str(100 * correct2 / total2))

Classification Model : SVM
[[96 25]
 [31 98]]
No of correct = 194/250
Accuracy = 77.6


In [9]:
# Title typo fixed ("Regressuin" -> "Regression").
print("Classification Model : Logistic Regression")
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0 , solver = 'liblinear')
classifier.fit(X_train, y_train)

y_pred3 = classifier.predict(X_test)

cm3 = confusion_matrix(y_test, y_pred3)
print(cm3)
# Correct predictions sit on the diagonal of the confusion matrix; the total
# is taken from the matrix itself rather than a hard-coded 250, so the
# report stays right if the test-set size changes.
correct3 = cm3.trace()
total3 = cm3.sum()
print("No of correct = " + str(correct3) + "/" + str(total3))
print("Accuracy = " + str(100 * correct3 / total3))

Classification Model : Logistic Regression
[[99 22]
 [30 99]]
No of correct = 198/250
Accuracy = 79.2


In [10]:
print("Classification Model : KNN")
from sklearn.neighbors import KNeighborsClassifier
# Minkowski metric with p = 2 is plain Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
y_pred4 = classifier.predict(X_test)

cm4 = confusion_matrix(y_test, y_pred4)
print(cm4)
# Correct predictions sit on the diagonal of the confusion matrix; the total
# is taken from the matrix itself rather than a hard-coded 250, so the
# report stays right if the test-set size changes.
correct4 = cm4.trace()
total4 = cm4.sum()
print("No of correct = " + str(correct4) + "/" + str(total4))
print("Accuracy = " + str(100 * correct4 / total4))

Classification Model : KNN
[[77 44]
 [50 79]]
No of correct = 156/250
Accuracy = 62.4
