In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

In [2]:
df= pd.read_csv('last_2_years_restaurant_reviews.csv',nrows=10000)
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,review_id,user_id,stars,date,text,useful,funny,cool,count
0,-0aIra_B6iALlfqAriBSYA,"""Tea2go""",Restaurants;Food;Tea Rooms;Coffee & Tea,4.5,gqcbCEitsgE-3s7oAc9g-A,9oFkHW8IicDrJAuQmGYo3Q,5,2017-03-11,"If you like iced tea, this is the place to go!...",0,0,0,26
1,-0aIra_B6iALlfqAriBSYA,"""Tea2go""",Restaurants;Food;Tea Rooms;Coffee & Tea,4.5,JAJnV-A8rZZFRjTpfcGeDA,gWc0VxrzNh4qN6AuXreWGw,3,2017-09-15,My girlfriend and I had never been to the shop...,0,0,0,26
2,-0aIra_B6iALlfqAriBSYA,"""Tea2go""",Restaurants;Food;Tea Rooms;Coffee & Tea,4.5,MFdkDjiFHADe9oLyhhFmBg,7QVNLrJDkjsDsCtODlPdsw,5,2017-12-08,Closed up shop as of yesterday.\n\nShelves wer...,0,0,0,26
3,-0aIra_B6iALlfqAriBSYA,"""Tea2go""",Restaurants;Food;Tea Rooms;Coffee & Tea,4.5,Fby1UMZdcPh2vkgBBd4uxA,v8dYMrl50FFjAiOuVNEK6A,2,2017-07-08,Products and Service good based on previous vi...,0,0,0,26
4,-0aIra_B6iALlfqAriBSYA,"""Tea2go""",Restaurants;Food;Tea Rooms;Coffee & Tea,4.5,gfsLAtETZxsz5S_JjchUNg,qDOVAoVBObWOHAcCjsC_OA,5,2016-03-17,Really good spicy chai latte. Worker was kind ...,0,0,0,26


In [3]:
#Define my feature variables, here is the text of the review
documents = df['text']
documents.shape


(10000,)

In [4]:
#Define my target variable.predicting users' rating based on reviews. The target variable is rating.

target = df['stars']
df['stars'].value_counts()

5    4415
4    2339
1    1192
3    1162
2     892
Name: stars, dtype: int64

In [5]:
#Create training dataset and test dataset
from sklearn.model_selection import train_test_split
documents_train, documents_test, target_train, target_test = \
train_test_split(documents, target, random_state = 1, test_size=0.3)


In [6]:
documents_train.iloc[1]

'The younger 20 year old ish crack head looking Mexican girl with the yellow jacked up teeth and glasses absolutely cannot follow directions on orders. She has no comprehension of what "on the side" means and proceeded to literally put the sour cream on the side of my bowl... Some other workers here are almost as sad and bad. Why this location hires such stupidity is beyond me! But the food is standard for chipotle.'

NLP representation of the documents

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True, max_features = 5000
                            )
# Train the model with my training data
documents_train_vec = vectorizer.fit_transform(documents_train).toarray()
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()
# Use the trained model to transform your test data
documents_test_vec = vectorizer.transform(documents_test).toarray()

Similar review search engine

In [8]:
def get_top_values(lst, n, labels):
    #Given a list of values, find the indices with the highest n values.
    #Return the labels for each of these indices.
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  

def get_bottom_values(lst, n, labels):
    #Given a list of values, find the indices with the lowest n values.
    #Return the labels for each of these indices.
    return [labels[i] for i in np.argsort(lst)[:n]]

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
# Draw an arbitrary review from test (unseen in training) documents
arbitrary_review = np.random.choice(documents_test, 1)
arbitrary_review[0]

"Pleasantly surprised...the staff was very nice and attentive...but wait, you're not reading this to hear about the staff...you wanna know about the food. Its a fresh\\/Asian take on assembly line food...but its actually quite good. Vegetables come in a heaping serving, portions are nice and big for lunch, and even though it came with only 4 shrimps, they were.cooked to order and cooked perfectly...I plan to return."

In [10]:
# Transform the drawn review(s) to vector(s)
arbitrary_review_vec = vectorizer.transform([arbitrary_review[0]]).toarray()
# Calculate the similarity score(s) between vector(s) and training vectors
similarity_score = cosine_similarity(arbitrary_review_vec, documents_train_vec)

# Let me find top 5 similar reviews
n = 5
similar_reviews = get_top_values(similarity_score[0], n, list(documents_train))

print('My search query: \n%s\n' % (arbitrary_review[0]))
print('Top %s similar reviews:' % n)
for i in range(n):
    print('No. %d review is %s.\n' % (i+1, similar_reviews[i]))

My search query: 
Pleasantly surprised...the staff was very nice and attentive...but wait, you're not reading this to hear about the staff...you wanna know about the food. Its a fresh\/Asian take on assembly line food...but its actually quite good. Vegetables come in a heaping serving, portions are nice and big for lunch, and even though it came with only 4 shrimps, they were.cooked to order and cooked perfectly...I plan to return.

Top 5 similar reviews:
No. 1 review is Very tasty, very fresh.  I was quite surprised as I don't think much of Asian food in Phoenix, but this was good.  A little pricey for lunch I thought, but the quality of the food and service made up for it..

No. 2 review is I have been coming here for many many years. Still as good as always. I love all of their food. Great gyros and chicken shawarma, falafel and everything on their menu. Cooked to order fresh ingredients and an attentive staff. Plus very inexpensive with large portions..

No. 3 review is This is the

The simple search engine seems working well. Its result makes sense as it will give similar positive reviews if my search query is positive.

Build model to predict rate based on reviews

1. Naive-Bayes Classifier
This is the simplest model as it assumes that there is no dependence between words. 

In [11]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

index = ['NaiveBayes','LogisticRegression','RandomForestClassifier','SVM','XGBoost','MLP','GradientBoostingClassifier']
score_table = pd.DataFrame(index = index, columns= ['precision_score','recall_score','f1_score','accuracy_score'])

# define function for logging the results
def compute_log_result(algo, pred_train, pred_test):
    """compute and log the performance into the score_table for both training and test sets"""
    #global precision_score,recall_score,f1_score,accuracy_score
    # compute the performance
    pre_score=precision_score(pred_test,pred_train, average="macro")
    rec_score=recall_score(pred_test,pred_train, average="macro")
    f1score=f1_score(pred_test,pred_train, average="macro")
    acc_score=round(accuracy_score(pred_test,pred_train)*100,2)
    
    # log the performance
    score_table.loc[algo,:] = pre_score,rec_score,f1score,acc_score

In [12]:
# Build a Naive-Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB()
clf_NB.fit(documents_train_vec, target_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
print('The accuracy score for train data set is %f, for test data set is: %f' % \
     (clf_NB.score(documents_train_vec, target_train), clf_NB.score(documents_test_vec, target_test)))

The accuracy score for train data set is 0.580857, for test data set is: 0.530000


In [14]:
predmnb = clf_NB.predict(documents_test_vec)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(target_test,predmnb))
print("Score:",round(accuracy_score(target_test,predmnb)*100,2))
print("Classification Report:",classification_report(target_test,predmnb))

Confusion Matrix for Multinomial Naive Bayes:
[[ 197    2    5   20  128]
 [  59    1    4   34  164]
 [  21    0    2   63  261]
 [   2    0    0   65  626]
 [   4    0    0   17 1325]]
Score: 53.0
Classification Report:               precision    recall  f1-score   support

           1       0.70      0.56      0.62       352
           2       0.33      0.00      0.01       262
           3       0.18      0.01      0.01       347
           4       0.33      0.09      0.15       693
           5       0.53      0.98      0.69      1346

    accuracy                           0.53      3000
   macro avg       0.41      0.33      0.29      3000
weighted avg       0.44      0.53      0.42      3000



The performance of Naive-Bayes Classifier is

not good enough. Thus, we will try Logistic Regression Classifier.

In [15]:
compute_log_result("NaiveBayes",predmnb,target_test)

In [16]:
score_table

Unnamed: 0,precision_score,recall_score,f1_score,accuracy_score
NaiveBayes,0.41341,0.329487,0.294649,53.0
LogisticRegression,,,,
RandomForestClassifier,,,,
SVM,,,,
XGBoost,,,,
MLP,,,,
GradientBoostingClassifier,,,,


In [17]:
# Build a Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
clf_LR = LogisticRegression()
clf_LR.fit(documents_train_vec, target_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
print('The accuracy score for train data set is %f, for test data set is: %f' % \
     (clf_LR.score(documents_train_vec, target_train), clf_LR.score(documents_test_vec, target_test)))

The accuracy score for train data set is 0.785286, for test data set is: 0.598667


In [19]:
predlr = clf_LR.predict(documents_test_vec)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(target_test,predlr))
print("Score:",round(accuracy_score(target_test,predlr)*100,2))
print("Classification Report:",classification_report(target_test,predlr))

Confusion Matrix for Multinomial Naive Bayes:
[[ 248   32   23   16   33]
 [  94   42   42   34   50]
 [  33   21   89  117   87]
 [  15    6   36  250  386]
 [  13    5    2  159 1167]]
Score: 59.87
Classification Report:               precision    recall  f1-score   support

           1       0.62      0.70      0.66       352
           2       0.40      0.16      0.23       262
           3       0.46      0.26      0.33       347
           4       0.43      0.36      0.39       693
           5       0.68      0.87      0.76      1346

    accuracy                           0.60      3000
   macro avg       0.52      0.47      0.47      3000
weighted avg       0.56      0.60      0.57      3000



Compared with Naive-Bayes model, Logistic Regression model improces a little bit. Let me find out the top 20 most important words given by Logistic Regression model.

In [20]:
compute_log_result("LogisticRegression",predlr,target_test)

In [21]:
# Let me find it out by ranking
n = 20
print ('Top 20 words by ranking are %s.' % (", ".join(i for i in get_top_values(clf_LR.coef_[0], n, words))))

Top 20 words by ranking are worst, bad, horrible, terrible, money, minutes, told, manager, management, business, asked, awful, rude, disgusting, don, zero, gross, nasty, wrong, avoid.


The top 20 important words are related with negative reviews. Why is it? I think customers may use word good, excellent to express their satisfaction for restaurants. From EDA process, we know the average rate is 3.47, which means there is no hard line to tell between average, good, perfect restaurants. However, there is a hard line (negative word) for people to comment a bad restaurants. Therefore, the negative words shows high significance in the model.

In [22]:
#What are the key features(words) that make the negative prediction?

print ('Top 20 words for negative prediction are %s.' % (", ".join(i for i in get_top_values(clf_LR.coef_[0], n, words))))

Top 20 words for negative prediction are worst, bad, horrible, terrible, money, minutes, told, manager, management, business, asked, awful, rude, disgusting, don, zero, gross, nasty, wrong, avoid.


In [23]:
#What are the key features(words) that make the positive prediction?

# Let's find it out by ranking
print ('Top 20 words for positive prediction are %s.' % (", ".join(i for i in get_bottom_values(clf_LR.coef_[0], n, words))))

Top 20 words for positive prediction are good, great, delicious, love, friendly, amazing, nice, best, little, bit, awesome, try, tasty, fresh, excellent, lot, pretty, really, clean, definitely.


The performance of Logistic Regression model is still not good enough. Lets Try decision tree models.

3. Random Forest Classifier


In [24]:
# Build a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(n_estimators=100, max_depth = 25, min_samples_leaf= 5, 
                             min_samples_split=5, random_state = 1)
clf_RF.fit(documents_train_vec, target_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [25]:
print('The accuracy score for train data set is %f, for test data set is: %f' % \
     (clf_RF.score(documents_train_vec, target_train), clf_RF.score(documents_test_vec, target_test)))

The accuracy score for train data set is 0.562571, for test data set is: 0.500333


In [26]:
predrf = clf_RF.predict(documents_test_vec)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(target_test,predrf))
print("Score:",round(accuracy_score(target_test,predrf)*100,2))
print("Classification Report:",classification_report(target_test,predrf))

Confusion Matrix for Multinomial Naive Bayes:
[[ 135    1    1    8  207]
 [  42    0    3   12  205]
 [  19    0    6   38  284]
 [   2    0    0   34  657]
 [   8    0    0   12 1326]]
Score: 50.03
Classification Report:               precision    recall  f1-score   support

           1       0.66      0.38      0.48       352
           2       0.00      0.00      0.00       262
           3       0.60      0.02      0.03       347
           4       0.33      0.05      0.09       693
           5       0.49      0.99      0.66      1346

    accuracy                           0.50      3000
   macro avg       0.42      0.29      0.25      3000
weighted avg       0.44      0.50      0.38      3000



In [27]:
compute_log_result("RandomForestClassifier",predrf,target_test)

In [28]:
#What are important features (words) by inspecting the Random Forest model?

n = 20
print ('Top 20 words by ranking are %s.' % (", ".join(i for i in get_top_values(clf_RF.feature_importances_, n, words))))


Top 20 words by ranking are great, amazing, good, worst, delicious, love, best, asked, ok, minutes, pretty, awesome, didn, order, favorite, fresh, told, just, friendly, wasn.


The random forest model performs the worst. 

Summary:
The object is to build a model to predict the users' rating based on users' reviews.
I define rate (1-5 stars) as target variable and use NLP method to vectorize reviews.
I check the rationality of NLP treatment by similarity measurement.
I used Naive Bayes, Logistic Regression and Random Forest to build multi-target classification models.

In [29]:
# Support Vector Machine
from sklearn.svm import SVC
svm = SVC(random_state=101)
svm.fit(documents_train_vec, target_train)
predsvm = svm.predict(documents_test_vec)
print("Confusion Matrix for Support Vector Machines:")
print(confusion_matrix(target_test,predsvm))
print("Score:",round(accuracy_score(target_test,predsvm)*100,2))
print("Classification Report:",classification_report(target_test,predsvm))

Confusion Matrix for Support Vector Machines:
[[ 264   17   18   16   37]
 [ 100   28   45   31   58]
 [  37   20   72  110  108]
 [  16    5   22  216  434]
 [  13    3    2  113 1215]]
Score: 59.83
Classification Report:               precision    recall  f1-score   support

           1       0.61      0.75      0.68       352
           2       0.38      0.11      0.17       262
           3       0.45      0.21      0.28       347
           4       0.44      0.31      0.37       693
           5       0.66      0.90      0.76      1346

    accuracy                           0.60      3000
   macro avg       0.51      0.46      0.45      3000
weighted avg       0.55      0.60      0.55      3000



In [30]:
compute_log_result("SVM",predsvm,target_test)

In [31]:
# XGBoost Classifier
import xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(documents_train_vec, target_train)
predxgb = xgb.predict(documents_test_vec)
print("Confusion Matrix for XGBoost Classifier:")
print(confusion_matrix(target_test,predxgb))
print("Score: ",round(accuracy_score(target_test,predxgb)*100,2))
print("Classification Report:")
print(classification_report(target_test,predxgb))

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


Confusion Matrix for XGBoost Classifier:
[[ 233   37   26   29   27]
 [  79   51   49   40   43]
 [  32   29   72  133   81]
 [  19    6   52  240  376]
 [  28    6   20  169 1123]]
Score:  57.3
Classification Report:
              precision    recall  f1-score   support

           1       0.60      0.66      0.63       352
           2       0.40      0.19      0.26       262
           3       0.33      0.21      0.25       347
           4       0.39      0.35      0.37       693
           5       0.68      0.83      0.75      1346

    accuracy                           0.57      3000
   macro avg       0.48      0.45      0.45      3000
weighted avg       0.54      0.57      0.55      3000



In [32]:
compute_log_result("XGBoost",predxgb,target_test)

In [33]:
# MULTILAYER PERCEPTRON CLASSIFIER
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(documents_train_vec, target_train)
predmlp = mlp.predict(documents_test_vec)
print("Confusion Matrix for Multilayer Perceptron Classifier:")
print(confusion_matrix(target_test,predmlp))
print("Score:",round(accuracy_score(target_test,predmlp)*100,2))
print("Classification Report:")
print(classification_report(target_test,predmlp))

Confusion Matrix for Multilayer Perceptron Classifier:
[[204  80  29  21  18]
 [ 73  78  52  35  24]
 [ 30  48  99 100  70]
 [ 15  22  74 266 316]
 [ 17  20  42 324 943]]
Score: 53.0
Classification Report:
              precision    recall  f1-score   support

           1       0.60      0.58      0.59       352
           2       0.31      0.30      0.31       262
           3       0.33      0.29      0.31       347
           4       0.36      0.38      0.37       693
           5       0.69      0.70      0.69      1346

    accuracy                           0.53      3000
   macro avg       0.46      0.45      0.45      3000
weighted avg       0.53      0.53      0.53      3000



In [34]:
compute_log_result("MLP",predmlp,target_test)

In [36]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbi = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,max_features=0.5,random_state=999999)
gbi.fit(documents_train_vec, target_train)
predgbi = gbi.predict(documents_test_vec)
print("Confusion Matrix for Gradient Boosting Classifier:")
print(confusion_matrix(target_test,predgbi))
print("Score:",round(accuracy_score(target_test,predgbi)*100,2))
print("Classification Report:",classification_report(target_test,predgbi))

Confusion Matrix for Gradient Boosting Classifier:
[[ 204   36   20   34   58]
 [  70   27   40   60   65]
 [  26   16   65  130  110]
 [  15    3   36  231  408]
 [  13    8    9  148 1168]]
Score: 56.5
Classification Report:               precision    recall  f1-score   support

           1       0.62      0.58      0.60       352
           2       0.30      0.10      0.15       262
           3       0.38      0.19      0.25       347
           4       0.38      0.33      0.36       693
           5       0.65      0.87      0.74      1346

    accuracy                           0.56      3000
   macro avg       0.47      0.41      0.42      3000
weighted avg       0.52      0.56      0.53      3000



In [37]:
compute_log_result("GradientBoostingClassifier",predgbi,target_test)

In [38]:
score_table

Unnamed: 0,precision_score,recall_score,f1_score,accuracy_score
NaiveBayes,0.41341,0.329487,0.294649,53.0
LogisticRegression,0.517297,0.46982,0.473995,59.87
RandomForestClassifier,0.415445,0.287003,0.252337,50.03
SVM,0.510167,0.455745,0.450641,59.83
XGBoost,0.478686,0.448945,0.452048,57.3
MLP,0.459027,0.449398,0.453622,53.0
GradientBoostingClassifier,0.46661,0.414202,0.420351,56.5
