In [12]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import re
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [13]:
# Load the tab-separated reviews. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text.
features = pd.read_csv(
    r"Restaurant_Reviews.tsv",
    sep='\t',
    quoting=3,
)

In [14]:
# Give the review text column a generic name used by the rest of the notebook.
features = features.rename({"Review": "description"}, axis="columns")

In [15]:
# Preview the first five rows of the raw data.
features.head(n=5)

Unnamed: 0,description,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [16]:
## Preprocessing

# Stopword set used when cleaning the reviews. 'not' is taken out of the set so
# it survives filtering (presumably because negation carries sentiment).
sw = set(stopwords.words('english')).difference(['not'])
def remove_pun_stopwords(text, stop_words=None):
    """Lower-case ``text``, strip punctuation and drop stopwords.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Lower-case words to remove. Defaults to the notebook-level ``sw`` set.

    Returns
    -------
    str
        The remaining lower-cased tokens joined by single spaces.

    Notes
    -----
    Punctuation is deleted (not replaced by a space) before tokenising, so
    "don't" becomes "dont" and is compared to the stopword list in that form.
    """
    if stop_words is None:
        stop_words = sw
    # Anything that is neither a word character nor whitespace is removed.
    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    return ' '.join(tok for tok in cleaned.split() if tok not in stop_words)

In [17]:
# Clean every review: lower-case, strip punctuation, drop stopwords.
features['description'] = features['description'].map(remove_pun_stopwords)

In [18]:
# Split the reviews by the value of the Liked label (0 -> negative, 1 -> positive).
negative = features.loc[features['Liked'].eq(0)]
positive = features.loc[features['Liked'].eq(1)]

In [23]:
### Length of each negative review
# Despite the column name, this is the number of characters in the cleaned
# text, not the number of words.
# assign() builds a new frame, so nothing is written into a filtered slice of
# `features` (which would raise pandas' SettingWithCopyWarning).
negative = negative.assign(len_of_words=negative['description'].str.len())

### Length of each positive review (same character-count definition)
positive = positive.assign(len_of_words=positive['description'].str.len())

In [24]:
## Word frequencies over all negative and over all positive reviews
negative_tokens = ' '.join(negative['description']).split()
positive_tokens = ' '.join(positive['description']).split()
freq_n = pd.Series(negative_tokens).value_counts()
freq_p = pd.Series(positive_tokens).value_counts()

In [31]:
# value_counts() sorts by descending frequency, so the first five index
# entries are the five most frequent words of each class.
top_positive_review_wrds = freq_p.index[:5].tolist()
top_negative_review_wrds = freq_n.index[:5].tolist()

In [34]:
def _count_word_hits(text, words):
    """Sum, over every token of ``text``, how often each of ``words`` occurs in it."""
    # str.count matches substrings, so "place" is also counted inside "places".
    return sum(token.count(word) for token in text.split() for word in words)


def positive_word_count(text, words=None):
    """Count occurrences of frequent positive-review words in ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated review text.
    words : iterable of str, optional
        Words to look for. Defaults to the notebook-level
        ``top_positive_review_wrds`` list.

    Returns
    -------
    int
        Total number of (substring) matches across all tokens.
    """
    if words is None:
        words = top_positive_review_wrds
    return _count_word_hits(text, words)


def negative_word_count(text, words=None):
    """Count occurrences of frequent negative-review words in ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated review text.
    words : iterable of str, optional
        Words to look for. Defaults to the notebook-level
        ``top_negative_review_wrds`` list.

    Returns
    -------
    int
        Total number of (substring) matches across all tokens.
    """
    if words is None:
        words = top_negative_review_wrds
    return _count_word_hits(text, words)

In [36]:
### Per-review counts of the frequent positive and negative words
features['good_words_count'] = features['description'].map(positive_word_count)
features['bad_word_count'] = features['description'].map(negative_word_count)

In [37]:
# len() of a string counts characters, so this column holds the character
# length of each cleaned review (the column name notwithstanding).
features['len_of_words'] = features['description'].map(len)

In [38]:
## Stemming the words
ps = SnowballStemmer('english')


def stem_words(text):
    """Return ``text`` with every whitespace-separated token replaced by its stem."""
    return ' '.join(map(ps.stem, text.split()))

In [39]:
# Replace each review by its stemmed form.
features['description'] = features['description'].map(stem_words)

In [40]:
# Preview the cleaned text together with the engineered columns.
features.head(n=5)

Unnamed: 0,description,Liked,good_words_count,bad_word_count,len_of_words
0,wow love place,1,1,1,15
1,crust not good,0,1,1,14
2,not tasti textur nasti,0,0,1,23
3,stop late may bank holiday rick steve recommen...,1,0,0,61
4,select menu great price,1,1,0,27


In [49]:
### Convert the text into count vectors:
### one row per review, one column per vocabulary term
cv = CountVectorizer()
X = pd.DataFrame(cv.fit_transform(features['description']).toarray())

In [50]:
## Merging the count vectors with the features created above
# Pick the engineered columns by name rather than by position, so the merge
# cannot silently grab other columns if the layout of `features` changes.
engineered_cols = ['good_words_count', 'bad_word_count', 'len_of_words']
X = pd.concat([X, features[engineered_cols]], axis=1)
# The vectorizer columns are labelled with integers and the engineered ones
# with strings; scikit-learn >= 1.2 rejects frames whose column labels mix
# types, so make every label a string.
X.columns = X.columns.astype(str)

In [51]:
# Preview the combined feature matrix.
X.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1608,1609,1610,1611,1612,1613,1614,good_words_count,bad_word_count,len_of_words
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,15
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,14
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,61
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,27


In [52]:
# Target labels, then an 80/20 train/test split with a fixed seed.
y = features['Liked']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
### We will do prediction using various algorithms

In [54]:
## Algo_1 = XG Boost

classifier1 = xgb.XGBClassifier(n_estimators=100)
classifier1.fit(X_train, y_train)

y_pred_1 = classifier1.predict(X_test)
cm_1 = confusion_matrix(y_test, y_pred_1)

# Report the test-set metrics, one per line.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(metric_label + ' Score : ' + str(metric_fn(y_test, y_pred_1)))

Accuracy Score : 0.715
Precision Score : 0.896551724137931
Recall Score : 0.5048543689320388
F1 Score : 0.6459627329192547


In [68]:
## Algo_2 = Random Forest

# Fix the seed: the forest's bootstrap sampling and feature sub-sampling are
# random, so without it the scores below change on every run.
classifier2 = RandomForestClassifier(n_estimators=300, random_state=0)
classifier2.fit(X_train, y_train)

y_pred_2 = classifier2.predict(X_test)

cm_2 = confusion_matrix(y_test, y_pred_2)

print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_2)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_2)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_2)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_2)))

Accuracy Score : 0.82
Precision Score : 0.8383838383838383
Recall Score : 0.8058252427184466
F1 Score : 0.8217821782178217


In [70]:
## Algo_3 = ANN

# Two hidden layers of 100 sigmoid units, then a single sigmoid output unit.
classifier = Sequential([
    Dense(units=100, activation='sigmoid', input_shape=(X_train.shape[1],)),
    Dense(units=100, activation='sigmoid'),
    Dense(units=1, activation='sigmoid'),
])

classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# validation_split=0.1 holds out 10% of the training rows for validation.
classifier.fit(X_train, y_train, epochs=20, batch_size=10, validation_split=0.1)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
y_pred_3 = [1 if score > 0.5 else 0 for score in classifier.predict(X_test)]

cm_3 = confusion_matrix(y_test, y_pred_3)



Train on 720 samples, validate on 80 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [71]:
# Report the test-set metrics of the ANN, one per line.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(metric_label + ' Score : ' + str(metric_fn(y_test, y_pred_3)))

Accuracy Score : 0.835
Precision Score : 0.8240740740740741
Recall Score : 0.8640776699029126
F1 Score : 0.8436018957345971


In [49]:
#### Same operation using TF-IDF vectors

### Convert the text into TF-IDF vectors:
### one row per review, one column per vocabulary term
cv = TfidfVectorizer()
X = pd.DataFrame(cv.fit_transform(features['description']).toarray())

In [50]:
## Merging the TF-IDF vectors with the features created above
# Pick the engineered columns by name rather than by position, so the merge
# cannot silently grab other columns if the layout of `features` changes.
engineered_cols = ['good_words_count', 'bad_word_count', 'len_of_words']
X = pd.concat([X, features[engineered_cols]], axis=1)
# The vectorizer columns are labelled with integers and the engineered ones
# with strings; scikit-learn >= 1.2 rejects frames whose column labels mix
# types, so make every label a string.
X.columns = X.columns.astype(str)

In [51]:
# Preview the combined feature matrix.
X.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1608,1609,1610,1611,1612,1613,1614,good_words_count,bad_word_count,len_of_words
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,15
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,14
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,61
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,27


In [52]:
# Target labels, then an 80/20 train/test split with a fixed seed.
y = features['Liked']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
### We will do prediction using various algorithms

In [54]:
## Algo_1 = XG Boost

classifier1 = xgb.XGBClassifier(n_estimators=100)
classifier1.fit(X_train, y_train)

y_pred_1 = classifier1.predict(X_test)
cm_1 = confusion_matrix(y_test, y_pred_1)

# Report the test-set metrics, one per line.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(metric_label + ' Score : ' + str(metric_fn(y_test, y_pred_1)))

Accuracy Score : 0.715
Precision Score : 0.896551724137931
Recall Score : 0.5048543689320388
F1 Score : 0.6459627329192547


In [68]:
## Algo_2 = Random Forest

# Fix the seed: the forest's bootstrap sampling and feature sub-sampling are
# random, so without it the scores below change on every run.
classifier2 = RandomForestClassifier(n_estimators=300, random_state=0)
classifier2.fit(X_train, y_train)

y_pred_2 = classifier2.predict(X_test)

cm_2 = confusion_matrix(y_test, y_pred_2)

print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_2)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_2)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_2)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_2)))

Accuracy Score : 0.82
Precision Score : 0.8383838383838383
Recall Score : 0.8058252427184466
F1 Score : 0.8217821782178217


In [70]:
## Algo_3 = ANN

# Two hidden layers of 100 sigmoid units, then a single sigmoid output unit.
classifier = Sequential([
    Dense(units=100, activation='sigmoid', input_shape=(X_train.shape[1],)),
    Dense(units=100, activation='sigmoid'),
    Dense(units=1, activation='sigmoid'),
])

classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# validation_split=0.1 holds out 10% of the training rows for validation.
classifier.fit(X_train, y_train, epochs=20, batch_size=10, validation_split=0.1)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
y_pred_3 = [1 if score > 0.5 else 0 for score in classifier.predict(X_test)]

cm_3 = confusion_matrix(y_test, y_pred_3)



Train on 720 samples, validate on 80 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [71]:
# Report the test-set metrics of the ANN, one per line.
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(metric_label + ' Score : ' + str(metric_fn(y_test, y_pred_3)))

Accuracy Score : 0.835
Precision Score : 0.8240740740740741
Recall Score : 0.8640776699029126
F1 Score : 0.8436018957345971
