In [1]:
# import the libraries
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated restaurant reviews file into a DataFrame.
df = pd.read_csv(
    "datasets/Restaurant_Reviews.tsv",
    sep="\t",
)

In [3]:
# Preview the first rows of the loaded reviews frame (rendered as the cell output).
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Hand-written probe sentences used to spot-check each fitted model.
# Desired labels, in order: [1 0 1 0 1 1 1 0 0]
test = [
    "I really liked this ham",
    "This burger tastes rubbish",
    "Wowwww! I like this food",
    "We can get the same food at a low cost somewhere else",
    "This restaurant is good",
    "This restaurant is really good",
    "You will not believe this, wait what!! the taste just changed",
    "I don't know what i'm eating, this taste awful",
    "This restaurant is not good",
]

In [5]:
# Report the frame dimensions, then the per-column count of missing values.
print(df.shape, df.isna().sum(), sep="\n")

(1000, 2)
Review    0
Liked     0
dtype: int64


In [6]:
# Locate reviews that carry no text: either an empty string or whitespace only.
# This cell only *reports* the index labels of such rows; nothing is dropped here.
# Iterating the "Review" column directly (instead of unpacking whole rows) keeps
# the scan working if the frame ever gains additional columns, and the
# comprehension avoids leaking loop variables into the notebook namespace.
empty_loc = [
    idx
    for idx, review in df["Review"].items()
    # Non-string cells (e.g. NaN) are skipped; strip() leaves "" for blank text.
    if isinstance(review, str) and not review.strip()
]
print(empty_loc)

[]


###### LinearSVC with 40% test data

In [38]:
# Features are the raw review text; labels are the binary "Liked" column.
x, y = df["Review"], df["Liked"]

# Hold out 40% of the rows for testing, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=25,
)

In [39]:
# TF-IDF text features feeding a linear support-vector classifier.
Classifier_svc = Pipeline(
    [
        ("tfIdf", TfidfVectorizer()),
        ("cl", LinearSVC()),
    ]
)
# Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained.
pred = Classifier_svc.fit(x_train, y_train).predict(x_test)

In [40]:
# Evaluate on the held-out split: confusion matrix first, then overall accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print(cm)
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=pred))

[[170  34]
 [ 39 157]]
Accuracy :  0.8175


In [10]:
# Predict labels for the hand-written probe sentences in `test` using the
# pipeline as currently fitted.
# NOTE(review): this cell's execution count (10) is lower than that of the fit
# cell above it (39), so the saved output may predate the latest fit — re-run to confirm.
Classifier_svc.predict(test)
# Desired output: [1 0 1 0 1 1 1 0 0]

array([1, 0, 0, 0, 1, 1, 1, 0, 1], dtype=int64)

###### LinearSVC with 30% test data

In [41]:
# Re-split with 30% held out (same seed) and refit the existing pipeline on it.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=25,
)
pred = Classifier_svc.fit(x_train, y_train).predict(x_test)

In [42]:
# Evaluate on the held-out split: labelled confusion matrix, then overall accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print("confusion matrix : ", cm, sep="\n")
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=pred))

confusion matrix : 
[[133  23]
 [ 31 113]]
Accuracy :  0.82


In [13]:
# Predict labels for the hand-written probe sentences in `test` using the
# pipeline as currently fitted.
# NOTE(review): this cell's execution count (13) is lower than that of the fit
# cell above it (41), so the saved output may predate the latest fit — re-run to confirm.
Classifier_svc.predict(test)
# Desired output: [1 0 1 0 1 1 1 0 0]

array([1, 0, 0, 0, 1, 1, 1, 0, 1], dtype=int64)

###### LinearSVC with 20% test data

In [67]:
# Re-split with 20% held out (same seed) and refit the existing pipeline on it.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=25,
)
pred = Classifier_svc.fit(x_train, y_train).predict(x_test)

In [68]:
# Evaluate on the held-out split: labelled confusion matrix, then overall accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print("confusion matrix : ", cm, sep="\n")
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=pred))

confusion matrix : 
[[86 18]
 [19 77]]
Accuracy :  0.815


In [16]:
# Predict labels for the hand-written probe sentences in `test` using the
# pipeline as currently fitted.
# NOTE(review): this cell's execution count (16) is lower than that of the fit
# cell above it (67), so the saved output may predate the latest fit — re-run to confirm.
Classifier_svc.predict(test)
# Desired output: [1 0 1 0 1 1 1 0 0]

array([1, 0, 1, 0, 1, 1, 1, 0, 1], dtype=int64)

##### LinearSVC with 10% test data.

In [32]:
# Re-split with 10% held out (same seed), i.e. 90% of the data is used for training.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=25,
)
# Show how many training rows fall into each class before refitting.
print(y_train.value_counts())
pred = Classifier_svc.fit(x_train, y_train).predict(x_test)

0    451
1    449
Name: Liked, dtype: int64


In [33]:
# Evaluate on the held-out split: labelled confusion matrix, then overall accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print("confusion matrix : ", cm, sep="\n")
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=pred))

confusion matrix : 
[[43  6]
 [ 9 42]]
Accuracy :  0.85


In [19]:
# Re-declaration of the probe sentences (identical to the list defined earlier
# in the notebook).
test = [
    "I really liked this ham",
    "This burger tastes rubbish",
    "Wowwww! I like this food",
    "We can get the same food at a low cost somewhere else",
    "This restaurant is good",
    "This restaurant is really good",
    "You will not believe this, wait what!! the taste just changed",
    "I don't know what i'm eating, this taste awful",
    "This restaurant is not good",
]

In [20]:
# Predict labels for the hand-written probe sentences in `test` using the
# pipeline as currently fitted.
# NOTE(review): this cell's execution count (20) is lower than that of the fit
# cell above it (32), so the saved output may predate the latest fit — re-run to confirm.
Classifier_svc.predict(test)
# Desired output: [1 0 1 0 1 1 1 0 0]

array([1, 0, 1, 0, 1, 1, 0, 0, 1], dtype=int64)

##### SVC with a linear kernel (10% test data)

In [73]:
# Same 10% hold-out split as before, now feeding TF-IDF features to a
# kernelised SVC configured with a linear kernel.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.10,
    random_state=25,
)
Clf_svc = Pipeline(
    [
        ("tfIdf", TfidfVectorizer()),
        ("cl", SVC(kernel="linear")),
    ]
)
pred = Clf_svc.fit(x_train, y_train).predict(x_test)

In [74]:
# Evaluate on the held-out split: labelled confusion matrix, then overall accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print("confusion matrix : ", cm, sep="\n")
print("Accuracy : ", accuracy_score(y_true=y_test, y_pred=pred))

confusion matrix : 
[[45  4]
 [11 40]]
Accuracy :  0.85


In [23]:
# Predict labels for the hand-written probe sentences with the linear-kernel SVC
# pipeline trained in this section. The cell previously called `Classifier_svc`
# (the LinearSVC pipeline), so it re-reported that model's predictions instead
# of evaluating `Clf_svc`.
Clf_svc.predict(test)
# Desired output: [1 0 1 0 1 1 1 0 0]

array([1, 0, 1, 0, 1, 1, 0, 0, 1], dtype=int64)