In [155]:
import pandas as pd 
import numpy as np
import os
import re # regualr expression module

### 1. Load csv data

In [156]:
inp_review = pd.read_csv('Zomato_reviews.csv')
inp_review.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [157]:
inp_review.describe(include='all')

Unnamed: 0,rating,review_text
count,27762.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665784,
std,1.284573,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


In [158]:
inp_review = inp_review[~inp_review.review_text.isnull()].copy()
inp_review.reset_index(inplace=True, drop=True)

In [159]:
inp_review.describe(include='all')

Unnamed: 0,rating,review_text
count,27748.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665291,
std,1.28463,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


### 2. Get the reviews into a list, for easy text clean up and manipulation

In [160]:
reviews = inp_review.review_text.values
reviews[0:5]

array(['Their service is worst, pricing in menu is different from bill. They can give you a bill with increased pricing. Even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
       "really appreciate their quality and timing . I have tried the thattil kutti dosa I've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
       'Went there on a Friday night, the place was surprisingly empty. Interesting menu which is almost fully made of dosas. I had bullseye dosa and cheese masala dosa. The bullseye Dosa was really good, with the egg perfectly cooked to a half boiled state. The masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. The chutney was good, the sambar was average. The dishes are reasonably priced.',
       'A very decent place serving good food.\r\nOrdered Chilli fish, Chicken & Pork sizzler.\r\nEverything tasted good but Pork could

In [161]:
# Normalise to lower case
reviews = [r.lower() for r in reviews]
reviews[0:5]

['their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . i have tried the thattil kutti dosa i've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
 'went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly bett

In [162]:
# Remove extra line breaks
reviews = [' '.join(txt.split()) for txt in reviews]
reviews[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better cooked. tried 2 beverages, both were very sweet.']

### Tokenze using Tweet Tokenizer from NLTK

In [163]:
from nltk.tokenize import word_tokenize
print(word_tokenize(reviews[1]))

['really', 'appreciate', 'their', 'quality', 'and', 'timing', '.', 'i', 'have', 'tried', 'the', 'thattil', 'kutti', 'dosa', 'i', "'ve", 'been', 'addicted', 'to', 'the', 'dosa', 'really', 'and', 'the', 'chutney', '...', 'really', 'good', 'and', 'money', 'worth', 'much', 'better', 'than', 'a', 'thattukada', 'must', 'try', 'it']


In [164]:
reviewToken = [word_tokenize(tkn) for tkn in reviews]

### Remove stop words and punctuations

In [165]:
from nltk.corpus import stopwords
from string import punctuation

In [166]:
stop_nltk = stopwords.words('english')
stop_punctn = list(punctuation)

In [167]:
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [168]:
stop_nltk.remove('no')
stop_nltk.remove('not')
stop_nltk.remove('won')
stop_nltk.remove('don')

In [169]:
print('no' in stop_nltk)
print('not' in stop_nltk)
print('won' in stop_nltk)
print('don' in stop_nltk)

False
False
False
False


In [170]:
stop_final = stop_nltk + stop_punctn + ["...", "``","''", "====", "must"]

In [171]:
def del_stop(sent):
    return [term for term in sent if term not in stop_final]

In [172]:
reviewClean = [del_stop(tkn) for tkn in reviewToken]

In [173]:
reviewClean[1]

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [174]:
reviewFinal = [' '.join(tkn) for tkn in reviewClean]

In [175]:
reviewFinal[1]

"really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada try"

### Separate X and Y and perform train test split, 70-30

In [176]:
X = reviewFinal
y = inp_review.rating.values

In [177]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Use TF-IDF values for the terms as a feature to get into a vector space model

In [178]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

In [179]:
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.fit_transform(X_test)

In [180]:
X_train_bow.shape, X_test_bow.shape

((19423, 5000), (8325, 5000))

### Model building: Random Forest

In [181]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [182]:
modelRF = RandomForestRegressor(random_state=42)
modelRF.fit(X_train_bow, y_train)

RandomForestRegressor(random_state=42)

In [183]:
y_train_pred = modelRF.predict(X_train_bow)

In [184]:
# Accuracy test
from sklearn.metrics import mean_squared_error
mean_squared_error(y_train, y_train_pred)**0.5

0.2375118642849858

### Increase number of trees

In [185]:
modelRF = RandomForestRegressor(random_state=42, n_estimators=20)
modelRF.fit(X_train_bow, y_train)
y_train_pred = modelRF.predict(X_train_bow)
mean_squared_error(y_train, y_train_pred)**0.5

0.25099759228559737

### Hyper-parameter tuning

In [186]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [187]:
param_grid = {
    'max_features' : [500, 'auto', 'sqrt', 'log2'],
    'max_depth' : [10, 15, 20, 25]
}

In [188]:
grid_search = GridSearchCV(estimator = modelRF, 
                           param_grid = param_grid,
                           cv = 5, 
                           n_jobs = -1, 
                           verbose = 1, 
                           scoring = 'neg_mean_squared_error')
grid_search.fit(X_train_bow, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(n_estimators=20, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20, 25],
                         'max_features': [500, 'auto', 'sqrt', 'log2']},
             scoring='neg_mean_squared_error', verbose=1)

In [192]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=25, max_features=500, n_estimators=20,
                      random_state=42)

### Predict and evaluate using the best estimator

In [193]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [194]:
mean_squared_error(y_train, y_train_pred)**0.5

0.5905371942625472

In [195]:
mean_squared_error(y_test, y_test_pred)**0.5

1.4912201808352477

### Identifying mismatch cases

In [196]:
res_df = pd.DataFrame({'review' : X_test, 'rating' : y_test, 'rating_pred' : y_test_pred})

In [197]:
res_df[(res_df.rating - res_df.rating_pred) >= 2].shape

(666, 3)

In [198]:
res_df[(res_df.rating - res_df.rating_pred) >= 2]

Unnamed: 0,review,rating,rating_pred
3,real hyderabadi biriyani lovers restaurant wen...,4.5,2.443815
15,looking new place gang 15 catch ordered couple...,5.0,2.182775
16,place really mind blowing idly sambar vada get...,4.5,1.328547
18,nandhini restaurant biriyni chill chicken nice...,5.0,1.749014
21,everything written cupcake box true fresh soft...,5.0,1.280630
...,...,...,...
8281,amazing totally impressed place give place ful...,5.0,1.966824
8294,great prices amazing customer services great s...,5.0,1.280630
8300,okay start loved food love service yes scope i...,5.0,2.826876
8307,good place nice ambience good food brunch opti...,4.0,1.823410
