In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import time
import glob
import datetime
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Load the prepared Yelp data set from its CSV export.
manip_data_path = '../../yelp-data/new_data/final_data/manip_data.csv'
data = pd.read_csv(manip_data_path)

In [3]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,business_id,city,latitude,longitude,name,open,review_count,BusinessStars,state,date,...,user_fans,user_name,user_review_count,user_votes_cool,user_votes_funny,user_votes_useful,user_yelping_since,rating1,rating3,rating5
0,5UmKMjUEUNdYWqANhGckJw,Dravosburg,40.354327,-79.900706,Mr Hoagie,True,7,3.5,PA,2016-04-08,...,1,Jim,26,3,10,26,2012-10,0,0,0
1,5UmKMjUEUNdYWqANhGckJw,Dravosburg,40.354327,-79.900706,Mr Hoagie,True,7,3.5,PA,2016-04-10,...,0,Jennifer,3,0,0,1,2016-04,1,0,0
2,UsFtqoBl7naz8AVUBZMjQQ,Dravosburg,40.350553,-79.886814,Clancy's Pub,True,5,3.0,PA,2014-11-28,...,10,Molly,158,80,46,161,2012-06,0,0,0
3,buOw4-D2dVxOS_DOC4q8OQ,Homestead,40.409064,-79.915797,Yokoso Japanese Steakhouse,True,99,3.0,PA,2014-10-02,...,10,Molly,158,80,46,161,2012-06,0,1,0
4,SsGNAc9U-aKPZccnaDtFkA,Pittsburgh,40.443145,-80.001104,Meat & Potatoes,True,1175,4.0,PA,2014-09-13,...,10,Molly,158,80,46,161,2012-06,0,0,1


In [4]:
# Dimensions of the loaded table as (rows, columns).
data.shape

(156695, 35)

Goal: find the most important words for distinguishing rating 1 from rating 5, and rating 3 from rating 5.

In [5]:
# Keep only the rows flagged as rating 1 or rating 5 (two-class subset).
is_rating_1_or_5 = (data['rating1'] == 1) | (data['rating5'] == 1)
data_rating1 = data.loc[is_rating_1_or_5]

In [6]:
# Keep only the rows flagged as rating 3 or rating 5 (two-class subset).
is_rating_3_or_5 = (data['rating3'] == 1) | (data['rating5'] == 1)
data_rating3 = data.loc[is_rating_3_or_5]

In [7]:
# (rows, columns) of the rating 1 / rating 5 subset.
data_rating1.shape

(74090, 35)

In [8]:
# (rows, columns) of the rating 3 / rating 5 subset.
data_rating3.shape

(82716, 35)

## Build a random forest model to predict review rating of 1 using the review text features

In [9]:
# Replace missing review text with '' so every entry handed to the
# vectorizer is a string.
texts = data_rating1['text'].fillna('')

# Count-based bag of words over unigrams and bigrams, capped at 1000
# features, with English stop words removed. `ngram_range` is passed as a
# tuple, which is the documented type; recent scikit-learn releases reject
# a list here.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(1, 2),
                             stop_words='english',
                             binary=False)

In [10]:
# Fixed seed so the cross-validation scores and the feature importances
# are reproducible from one run to the next.
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Learn the vocabulary and build the sample x n-gram count matrix in one
# pass - one column per feature (word or n-gram). The matrix stays sparse:
# the random forest accepts sparse input, and a dense copy of this matrix
# is large and (as `np.matrix`) rejected by recent scikit-learn releases.
X = vectorizer.fit_transform(texts)
y = data_rating1['rating1']

# `sklearn.cross_validation` was deprecated in favour of
# `sklearn.model_selection` and later removed; fall back only for old
# installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# cv=3 is spelled out so the number of folds does not depend on the
# library's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))



CV AUC [ 0.97379081  0.97691817  0.98190105], Average AUC 0.977536677793


In [12]:
# Fit on every row of X so the importances describe the full sample.
model.fit(X, y)

# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})

# print() with parentheses behaves the same on Python 2 and Python 3.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

          Features  Importance Score
985          worst          0.040963
409       horrible          0.033543
882       terrible          0.028744
369          great          0.026528
26         amazing          0.020993
213      delicious          0.020407
38           asked          0.017961
517        manager          0.015752
730           rude          0.014998
76            best          0.014978
52           awful          0.014082
55             bad          0.013582
544        minutes          0.013532
898           told          0.012020
162           cold          0.011811
88           bland          0.011427
658           poor          0.011145
628        perfect          0.010438
733           said          0.009448
269      excellent          0.008570
553          money          0.007982
528       mediocre          0.007692
232  disappointing          0.007517
961          waste          0.007482
630      perfectly          0.006818
601        ordered          0.006569
5

#### How about bigrams?

In [13]:
# Bigrams only: same settings as before, but every feature is a two-word
# phrase. `ngram_range` is a tuple, the documented type.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(2, 2),
                             stop_words='english',
                             binary=False)

# Learn the vocabulary and build the sample x bigram count matrix in one
# pass. The matrix is left sparse rather than expanded with `.todense()`,
# which saves memory and avoids handing an `np.matrix` to the estimator.
X = vectorizer.fit_transform(texts)
y = data_rating1['rating1']

# Reuses the `model` object already in the kernel. cv=3 is spelled out so
# the number of folds does not depend on the library's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.88045894  0.89104816  0.89771035], Average AUC 0.889739150836


In [15]:
# Fit on every row of X so the importances describe the full sample.
model.fit(X, y)

# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})

# print() with parentheses behaves the same on Python 2 and Python 3.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

              Features  Importance Score
870        tasted like          0.020736
2           20 minutes          0.012416
566      minutes later          0.012254
516        looked like          0.010875
958        waste money          0.010335
875   terrible service          0.009262
904       took forever          0.009217
446   horrible service          0.009088
982      worst service          0.008735
1           15 minutes          0.008702
324     food poisoning          0.008571
651       poor service          0.008518
0           10 minutes          0.008324
762   service horrible          0.008124
959         waste time          0.008047
773   service terrible          0.007802
3           30 minutes          0.007329
441   highly recommend          0.006870
729         save money          0.006644
39            bad food          0.006503
805          stay away          0.006310
41         bad service          0.005900
5           45 minutes          0.005845
950        walke

#### How about bigrams and trigrams together (2-3 word phrases)?

In [17]:
# Bigrams and trigrams: every feature is a two- or three-word phrase.
# `ngram_range` is a tuple, the documented type.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(2, 3),
                             stop_words='english',
                             binary=False)

# Learn the vocabulary and build the sample x n-gram count matrix in one
# pass. The matrix is left sparse rather than expanded with `.todense()`,
# which saves memory and avoids handing an `np.matrix` to the estimator.
X = vectorizer.fit_transform(texts)
y = data_rating1['rating1']

# Reuses the `model` object already in the kernel. cv=3 is spelled out so
# the number of folds does not depend on the library's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.87982498  0.89053495  0.89701698], Average AUC 0.88912563991


In [18]:
# Fit on every row of X so the importances describe the full sample.
model.fit(X, y)

# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})

# print() with parentheses behaves the same on Python 2 and Python 3.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

              Features  Importance Score
871        tasted like          0.019834
568      minutes later          0.013159
2           20 minutes          0.012823
958        waste money          0.010803
518        looked like          0.010628
903       took forever          0.009898
981      worst service          0.009527
442   horrible service          0.009520
0           10 minutes          0.008985
1           15 minutes          0.008979
876   terrible service          0.008636
759   service horrible          0.008579
657       poor service          0.008286
320     food poisoning          0.008270
770   service terrible          0.007870
3           30 minutes          0.007367
959         waste time          0.007177
437   highly recommend          0.006933
802          stay away          0.006770
36            bad food          0.006401
38         bad service          0.006337
724         save money          0.006262
951        walked away          0.006198
887         time

## Build a random forest model to predict review rating of 3 using the review text features

In [19]:
# convert rows with empty review text to texts that are of the string variable type
texts = data_rating3['text'].fillna('')

vectorizer = CountVectorizer(max_features = 1000,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=False)

In [20]:
# Fixed seed so the cross-validation scores and the feature importances
# are reproducible from one run to the next.
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Learn the vocabulary and build the sample x n-gram count matrix in one
# pass - one column per feature (word or n-gram). The matrix is left
# sparse rather than expanded with `.todense()`, which saves memory and
# avoids handing an `np.matrix` to the estimator.
X = vectorizer.fit_transform(texts)
y = data_rating3['rating3']

# cv=3 is spelled out so the number of folds does not depend on the
# library's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.89859042  0.91018894  0.90847058], Average AUC 0.905749980375


In [21]:
# Fit on every row of X so the importances describe the full sample.
model.fit(X, y)

# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})

# print() with parentheses behaves the same on Python 2 and Python 3.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

          Features  Importance Score
588             ok          0.031197
369           good          0.022528
218         decent          0.021808
25         amazing          0.014353
963           wasn          0.013568
668         pretty          0.012586
49         average          0.012062
77            best          0.011559
589           okay          0.011172
55             bad          0.010806
610        overall          0.009565
222      delicious          0.009485
80          better          0.008588
378          great          0.008451
629        perfect          0.007376
831          stars          0.007247
229           didn          0.007053
533       mediocre          0.006755
669    pretty good          0.006327
89           bland          0.006316
804           slow          0.005833
242  disappointing          0.005818
527          maybe          0.005773
452        just ok          0.005485
451           just          0.005052
332      food good          0.004864
3

#### How about bigrams?

In [22]:
# Bigrams only: same settings as before, but every feature is a two-word
# phrase. `ngram_range` is a tuple, the documented type.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(2, 2),
                             stop_words='english',
                             binary=False)

# Learn the vocabulary and build the sample x bigram count matrix in one
# pass. The matrix is left sparse rather than expanded with `.todense()`,
# which saves memory and avoids handing an `np.matrix` to the estimator.
X = vectorizer.fit_transform(texts)
y = data_rating3['rating3']

# Reuses the `model` object already in the kernel. cv=3 is spelled out so
# the number of folds does not depend on the library's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.79163955  0.81182261  0.80785771], Average AUC 0.803773287276


In [23]:
# Fit on every row of X so the importances describe the full sample.
model.fit(X, y)

# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})

# print() with parentheses behaves the same on Python 2 and Python 3.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

              Features  Importance Score
665        pretty good          0.024973
469            just ok          0.017386
298          food good          0.014241
304            food ok          0.009893
438   highly recommend          0.007771
773       service good          0.006958
470          just okay          0.006957
290        food decent          0.006393
882        tasted like          0.005988
929          ve better          0.005537
958           wasn bad          0.005261
399         great food          0.005132
429         happy hour          0.004883
989         write home          0.004868
785       service slow          0.004708
461          just didn          0.004606
779         service ok          0.004563
526         love place          0.004481
359         good great          0.004474
605       overall good          0.004464
440           hit miss          0.004307
673          prime rib          0.004149
476          just wasn          0.004103
135  cooked perf

## Build a random forest model to predict review rating of 5 using the review text features (vs. rating of 1)

In [26]:
# Keep only the rows flagged as rating 5 or rating 1. This is the same row
# filter used to build `data_rating1`; only the target column differs later.
is_rating_5_or_1 = (data['rating5'] == 1) | (data['rating1'] == 1)
data_rating5 = data.loc[is_rating_5_or_1]

In [29]:
# Replace missing review text with '' so every entry handed to the
# vectorizer is a string.
texts = data_rating5['text'].fillna('')

# Count-based bag of words over unigrams and bigrams, capped at 1000
# features, with English stop words removed. `ngram_range` is passed as a
# tuple, which is the documented type; recent scikit-learn releases reject
# a list here.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(1, 2),
                             stop_words='english',
                             binary=False)

In [30]:
# Fixed seed so the cross-validation scores and the feature importances
# are reproducible from one run to the next.
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Learn the vocabulary and build the sample x n-gram count matrix in one
# pass - one column per feature (word or n-gram). The matrix is left
# sparse rather than expanded with `.todense()`, which saves memory and
# avoids handing an `np.matrix` to the estimator.
X = vectorizer.fit_transform(texts)
y = data_rating5['rating5']

# cv=3 is spelled out so the number of folds does not depend on the
# library's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.97300971  0.97595978  0.9822415 ], Average AUC 0.977070331065


In [31]:
# Fit on every row of X so the importances describe the full sample.
model.fit(X, y)

# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})

# print() with parentheses behaves the same on Python 2 and Python 3.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

          Features  Importance Score
985          worst          0.047057
882       terrible          0.027668
409       horrible          0.026447
369          great          0.025806
213      delicious          0.021186
38           asked          0.019083
26         amazing          0.018706
55             bad          0.017699
517        manager          0.017416
544        minutes          0.017036
76            best          0.016461
52           awful          0.015954
730           rude          0.014999
898           told          0.013819
162           cold          0.010581
88           bland          0.010364
628        perfect          0.010074
232  disappointing          0.009908
269      excellent          0.009576
658           poor          0.009389
733           said          0.007607
961          waste          0.006752
601        ordered          0.006557
51         awesome          0.006539
948         waited          0.006421
528       mediocre          0.006373
5

#### How about bigrams?

In [32]:
# Bigrams only: same settings as before, but every feature is a two-word
# phrase. `ngram_range` is a tuple, the documented type.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(2, 2),
                             stop_words='english',
                             binary=False)

# Learn the vocabulary and build the sample x bigram count matrix in one
# pass. The matrix is left sparse rather than expanded with `.todense()`,
# which saves memory and avoids handing an `np.matrix` to the estimator.
X = vectorizer.fit_transform(texts)
y = data_rating5['rating5']

# Reuses the `model` object already in the kernel. cv=3 is spelled out so
# the number of folds does not depend on the library's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.88046813  0.89077663  0.89883119], Average AUC 0.890025318711


In [33]:
# Fit on every row of X so the importances describe the full sample.
model.fit(X, y)

# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_importances = pd.DataFrame({'Features': feature_names,
                                    'Importance Score': model.feature_importances_})

# print() with parentheses behaves the same on Python 2 and Python 3.
print(feature_importances.sort_values('Importance Score', ascending=False).head(50))

              Features  Importance Score
870        tasted like          0.019556
2           20 minutes          0.012483
566      minutes later          0.012194
446   horrible service          0.011172
516        looked like          0.010799
958        waste money          0.010659
904       took forever          0.010091
0           10 minutes          0.010079
982      worst service          0.009291
875   terrible service          0.009055
762   service horrible          0.008983
773   service terrible          0.008566
651       poor service          0.008160
324     food poisoning          0.007733
1           15 minutes          0.007450
729         save money          0.007266
441   highly recommend          0.006968
3           30 minutes          0.006893
959         waste time          0.006687
39            bad food          0.006257
805          stay away          0.006076
950        walked away          0.006021
770       service slow          0.005762
41         bad s