In [3]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import time
import glob
import datetime
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from textblob.sentiments import NaiveBayesAnalyzer

In [4]:
data = pd.read_csv('../../yelp-data/new_data/final_data/manip_data.csv')

In [3]:
data.head()

Unnamed: 0,business_id,city,latitude,longitude,name,open,review_count,BusinessStars,state,date,...,user_fans,user_name,user_review_count,user_votes_cool,user_votes_funny,user_votes_useful,user_yelping_since,rating1,rating3,rating5
0,5UmKMjUEUNdYWqANhGckJw,Dravosburg,40.354327,-79.900706,Mr Hoagie,True,7,3.5,PA,2016-04-08,...,1,Jim,26,3,10,26,2012-10,0,0,0
1,5UmKMjUEUNdYWqANhGckJw,Dravosburg,40.354327,-79.900706,Mr Hoagie,True,7,3.5,PA,2016-04-10,...,0,Jennifer,3,0,0,1,2016-04,1,0,0
2,UsFtqoBl7naz8AVUBZMjQQ,Dravosburg,40.350553,-79.886814,Clancy's Pub,True,5,3.0,PA,2014-11-28,...,10,Molly,158,80,46,161,2012-06,0,0,0
3,buOw4-D2dVxOS_DOC4q8OQ,Homestead,40.409064,-79.915797,Yokoso Japanese Steakhouse,True,99,3.0,PA,2014-10-02,...,10,Molly,158,80,46,161,2012-06,0,1,0
4,SsGNAc9U-aKPZccnaDtFkA,Pittsburgh,40.443145,-80.001104,Meat & Potatoes,True,1175,4.0,PA,2014-09-13,...,10,Molly,158,80,46,161,2012-06,0,0,1


In [4]:
data.shape

(156695, 35)

Most important words for rating1 vs rating 5, rating3 vs rating 5...

In [5]:
data_rating1 = data[(data.rating1 == 1) | (data.rating5 == 1)]

In [6]:
data_rating3 = data[(data.rating3 == 1) | (data.rating5 == 1)]

In [7]:
data_rating1.shape

(74090, 35)

In [8]:
data_rating3.shape

(82716, 35)

## Build a random forest model to predict review rating of 1 using the review text features

#### https://www.researchgate.net/post/What_is_the_value_of_the_area_under_the_roc_curve_AUC_to_conclude_that_a_classifier_is_excellent

In [9]:
# convert rows with empty review text to texts that are of the string variable type
texts = data_rating1['text'].fillna('')

vectorizer = CountVectorizer(max_features = 1000,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=True)


# max_features=2000,
# max_df=0.20,
# min_df=5,
# ngram_range=[1, 2], 
# stop_words='english',
# binary=True -- To use a dummy column as the entry (1 or 0, as opposed to the count). 
# This is useful if you think a word appearing 10 times is not more important than whether the word appears at all.


# n_estimators could also be 100, ran out of memory when trying to do so 

In [10]:
model = RandomForestClassifier(n_estimators = 20)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_rating1['rating1']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.96861776  0.97195236  0.97843628], Average AUC 0.97300212956


In [11]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(20)

      Features  Importance Score
413   horrible          0.043114
985      worst          0.040867
881   terrible          0.029769
371      great          0.023061
213  delicious          0.019085
27     amazing          0.018778
77        best          0.017088
41       asked          0.014942
57         bad          0.014642
54       awful          0.014021
728       rude          0.013759
656       poor          0.013026
547    minutes          0.012529
898       told          0.012014
519    manager          0.011316
270  excellent          0.010249
162       cold          0.010172
628    perfect          0.009577
731       said          0.009428
88       bland          0.009169


#### Repeat with binary = False, bump up estimators

In [None]:
## vectorizer = CountVectorizer(max_features = 1000,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=False)

model = RandomForestClassifier(n_estimators = 50)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_rating1['rating1']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

In [13]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(20)

      Features  Importance Score
985      worst          0.043077
410   horrible          0.033616
370      great          0.025379
882   terrible          0.024603
38       asked          0.022382
213  delicious          0.020570
26     amazing          0.019495
55         bad          0.018560
545    minutes          0.016521
76        best          0.015262
52       awful          0.014768
518    manager          0.013533
162       cold          0.012635
731       rude          0.012273
898       told          0.010592
659       poor          0.010452
88       bland          0.010370
629    perfect          0.010112
269  excellent          0.009848
554      money          0.009289


said gone, money added

## Build a random forest model to predict review rating of 3 using the review text features

In [25]:
# convert rows with empty review text to texts that are of the string variable type
texts = data_rating3['text'].fillna('')

vectorizer = CountVectorizer(max_features = 1000,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=True)

In [26]:
model = RandomForestClassifier(n_estimators = 20)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_rating3['rating3']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.88131437  0.89380882  0.89345099], Average AUC 0.889524726956


In [28]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(20)

        Features  Importance Score
586           ok          0.028155
210       decent          0.016926
23       amazing          0.015991
961         wasn          0.015130
363         good          0.013573
587         okay          0.013371
46       average          0.013308
73          best          0.011095
52           bad          0.011017
667  pretty good          0.010860
215    delicious          0.008926
666       pretty          0.008631
84         bland          0.008040
608      overall          0.007835
524        maybe          0.007516
829        stars          0.007381
478         like          0.007133
629      perfect          0.006988
531     mediocre          0.006883
372        great          0.006795


#### Repeat with binary = False, bump up estimators

In [29]:
vectorizer = CountVectorizer(max_features = 1000,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=False)

model = RandomForestClassifier(n_estimators = 50)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_rating3['rating3']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.89788537  0.90972421  0.90674561], Average AUC 0.904785063752


In [30]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(20)

        Features  Importance Score
588           ok          0.036288
370         good          0.021795
219       decent          0.019052
25       amazing          0.016553
963         wasn          0.013723
49       average          0.012129
77          best          0.011411
668       pretty          0.010879
589         okay          0.010012
831        stars          0.009311
223    delicious          0.009014
89         bland          0.008973
55           bad          0.008716
379        great          0.008350
610      overall          0.008112
669  pretty good          0.008029
629      perfect          0.007666
80        better          0.007197
534     mediocre          0.006923
230         didn          0.006780


### Import additional dataset

In [5]:
data_votes = pd.read_csv('../../yelp-data/new_data/final_data/data_votes.csv')

In [3]:
data_votes.head()

Unnamed: 0,business_id,city,latitude,longitude,name,open,review_count,BusinessStars,state,date,...,user_review_count,user_votes_cool,user_votes_funny,user_votes_useful,user_yelping_since,rating1,rating3,rating5,funny,useful
0,5UmKMjUEUNdYWqANhGckJw,Dravosburg,40.354327,-79.900706,Mr Hoagie,True,7,3.5,PA,2016-04-08,...,26,3,10,26,2012-10,0,0,0,0,0
1,5UmKMjUEUNdYWqANhGckJw,Dravosburg,40.354327,-79.900706,Mr Hoagie,True,7,3.5,PA,2016-04-10,...,3,0,0,1,2016-04,1,0,0,0,0
2,UsFtqoBl7naz8AVUBZMjQQ,Dravosburg,40.350553,-79.886814,Clancy's Pub,True,5,3.0,PA,2014-11-28,...,158,80,46,161,2012-06,0,0,0,0,0
3,buOw4-D2dVxOS_DOC4q8OQ,Homestead,40.409064,-79.915797,Yokoso Japanese Steakhouse,True,99,3.0,PA,2014-10-02,...,158,80,46,161,2012-06,0,1,0,0,0
4,SsGNAc9U-aKPZccnaDtFkA,Pittsburgh,40.443145,-80.001104,Meat & Potatoes,True,1175,4.0,PA,2014-09-13,...,158,80,46,161,2012-06,0,0,1,0,0


## Build a random forest model to predict review funniness using the review text features

In [6]:
# convert rows with empty review text to texts that are of the string variable type
texts = data_votes['text'].fillna('')

vectorizer = CountVectorizer(max_features = 1000,
                             ngram_range=[1, 2], 
                             stop_words='english',
                             binary=False)


In [6]:
model = RandomForestClassifier(n_estimators = 20)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_votes['funny']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.69009641  0.74525723  0.71308221], Average AUC 0.71614528169


In [7]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(30)

       Features  Importance Score
475        like          0.005735
444        just          0.005093
995        yelp          0.004872
360        good          0.004270
321        food          0.004106
838       steak          0.004043
641       place          0.003620
454        know          0.003604
161         com          0.003578
941       vegas          0.003410
242         don          0.003133
537        menu          0.003105
689      really          0.002998
528        meat          0.002966
71         beef          0.002873
940          ve          0.002842
961        want          0.002795
996         yes          0.002779
708  restaurant          0.002737
163        come          0.002722
775     service          0.002691
119        came          0.002664
898        time          0.002606
315      flavor          0.002571
134      cheese          0.002558
231      dinner          0.002552
867       sweet          0.002535
369       great          0.002533
594       orde

#### Repeat with higher # of estimators

In [7]:
model = RandomForestClassifier(n_estimators = 50)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_votes['funny']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.73170678  0.78163023  0.7634611 ], Average AUC 0.758932703726


In [8]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(30)

       Features  Importance Score
475        like          0.007186
995        yelp          0.005950
444        just          0.004696
321        food          0.004435
838       steak          0.004309
360        good          0.003990
161         com          0.003753
641       place          0.003458
242         don          0.003367
941       vegas          0.003349
689      really          0.003181
454        know          0.003093
898        time          0.003065
775     service          0.002950
708  restaurant          0.002828
940          ve          0.002777
71         beef          0.002762
163        come          0.002740
134      cheese          0.002711
369       great          0.002653
537        menu          0.002648
60          bar          0.002645
481      little          0.002562
961        want          0.002514
231      dinner          0.002511
528        meat          0.002510
594       order          0.002495
733       salad          0.002459
719       righ

## Build a random forest model to predict review usefulness using the review text features

In [8]:
model = RandomForestClassifier(n_estimators = 20)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_votes['useful']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.72850532  0.75059615  0.73052828], Average AUC 0.736543249353


In [9]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(30)

       Features  Importance Score
475        like          0.006006
444        just          0.005926
360        good          0.004374
838       steak          0.004357
321        food          0.004244
641       place          0.004136
537        menu          0.004109
746       sauce          0.004050
689      really          0.003954
708  restaurant          0.003918
898        time          0.003787
995        yelp          0.003613
242         don          0.003452
594       order          0.003451
775     service          0.003441
369       great          0.003293
134      cheese          0.003199
867       sweet          0.003195
71         beef          0.003171
60          bar          0.003139
940          ve          0.003012
941       vegas          0.002888
484          ll          0.002861
77         best          0.002736
528        meat          0.002712
968         way          0.002698
922         try          0.002698
163        come          0.002666
595     ordere

#### Repeat with higher # of estimators

In [9]:
model = RandomForestClassifier(n_estimators = 50)

# Use `fit` to learn the vocabulary of the review text
vectorizer.fit(texts)

# Use `tranform` to generate the sample X word matrix - 
# one column per feature (word or n-grams)
X = vectorizer.transform(texts).todense()
y = data_votes['funny']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.72803247  0.7830376   0.77517395], Average AUC 0.762081339938


In [10]:
model.fit(X, y)
feature_importances = pd.DataFrame({'Features' : vectorizer.get_feature_names(), 
                                    'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(30)

       Features  Importance Score
475        like          0.006500
995        yelp          0.005643
444        just          0.004493
360        good          0.004186
321        food          0.004149
161         com          0.003990
838       steak          0.003876
454        know          0.003494
641       place          0.003489
941       vegas          0.003317
689      really          0.003222
242         don          0.003176
708  restaurant          0.002906
775     service          0.002879
369       great          0.002866
71         beef          0.002858
163        come          0.002840
898        time          0.002836
537        menu          0.002803
60          bar          0.002744
134      cheese          0.002596
484          ll          0.002573
940          ve          0.002555
719       right          0.002538
528        meat          0.002524
365         got          0.002495
119        came          0.002486
746       sauce          0.002484
257         ea

### What if we added some quantitative features?

In [14]:
data_rating1.columns

Index([u'business_id', u'city', u'latitude', u'longitude', u'name', u'open',
       u'review_count', u'BusinessStars', u'state', u'date', u'review_id',
       u'ReviewStars', u'text', u'user_id', u'votes_cool', u'votes_funny',
       u'votes_useful', u'user_average_stars', u'user_compliments_cool',
       u'user_compliments_cute', u'user_compliments_funny',
       u'user_compliments_hot', u'user_compliments_photos',
       u'user_compliments_profile', u'user_compliments_writer', u'user_fans',
       u'user_name', u'user_review_count', u'user_votes_cool',
       u'user_votes_funny', u'user_votes_useful', u'user_yelping_since',
       u'rating1', u'rating3', u'rating5'],
      dtype='object')

#### Do we care about user stuff or biz stuff? User info (avg stars, fans, review count, useful votes) may be helpful. Let's focus on biz for now.
- votes_cool  
- votes_funny  
- votes_useful   
- BusinessStars

In [15]:
model = RandomForestClassifier(n_estimators = 50)

In [18]:
data_rating1_new = data_rating1

In [20]:
del data_rating1_new['business_id']
del data_rating1_new['city']
del data_rating1_new['latitude']
del data_rating1_new['longitude']
del data_rating1_new['name']
del data_rating1_new['open']
del data_rating1_new['review_count']
del data_rating1_new['state']
del data_rating1_new['date']
del data_rating1_new['ReviewStars']
del data_rating1_new['user_id']
del data_rating1_new['user_average_stars']
del data_rating1_new['user_compliments_cool']
del data_rating1_new['user_compliments_cute']
del data_rating1_new['user_compliments_funny']
del data_rating1_new['user_compliments_hot']
del data_rating1_new['user_compliments_photos']
del data_rating1_new['user_compliments_profile']
del data_rating1_new['user_compliments_writer']
del data_rating1_new['user_fans']
del data_rating1_new['user_name']
del data_rating1_new['user_review_count']
del data_rating1_new['user_votes_cool']
del data_rating1_new['user_votes_funny']
del data_rating1_new['user_votes_useful']
del data_rating1_new['user_yelping_since']

In [21]:
data_rating1_new.head()

Unnamed: 0,BusinessStars,review_id,text,votes_cool,votes_funny,votes_useful,rating1,rating3,rating5
1,3.5,V-bqYx62zpxfH2oFkzXPzw,"Normally, I do not do reviews of an establishm...",0,0,0,1,0,0
4,4.0,Ly-zAhTQgfp2ZDVdrernhw,Best brunch in the area. Interesting selection...,0,0,1,0,0,1
5,3.5,PYOZ47RyypAOsK-wZ6X7yg,Too. Much. Meat. \nI'm a meat person. I'll pic...,0,1,1,1,0,0
6,4.5,6w6gMZ3iBLGcUM4RBIuifQ,This place was DELICIOUS!! My parents saw a r...,0,0,5,0,0,1
7,4.5,uf61rPucuICXhSPXlZ1hIQ,"After a morning of Thrift Store hunting, a fri...",2,0,3,0,0,1


In [22]:
# data_rating1_new.dropna()

Unnamed: 0,BusinessStars,review_id,text,votes_cool,votes_funny,votes_useful,rating1,rating3,rating5
1,3.5,V-bqYx62zpxfH2oFkzXPzw,"Normally, I do not do reviews of an establishm...",0,0,0,1,0,0
4,4.0,Ly-zAhTQgfp2ZDVdrernhw,Best brunch in the area. Interesting selection...,0,0,1,0,0,1
5,3.5,PYOZ47RyypAOsK-wZ6X7yg,Too. Much. Meat. \nI'm a meat person. I'll pic...,0,1,1,1,0,0
6,4.5,6w6gMZ3iBLGcUM4RBIuifQ,This place was DELICIOUS!! My parents saw a r...,0,0,5,0,0,1
7,4.5,uf61rPucuICXhSPXlZ1hIQ,"After a morning of Thrift Store hunting, a fri...",2,0,3,0,0,1
16,4.0,Hhs7093PTAZ72OR8Lk0qXw,"As a specialty market, this really is as good ...",0,0,2,0,0,1
17,3.5,sPW1m1seOC9eBopMK3zD-w,"No-carb diet...meat, meat, meat...how do I lov...",0,1,3,0,0,1
24,4.0,8nML3g4sYSiJycs_81k9Kw,"You mix the lime and the coconut, you shake it...",0,0,1,0,0,1
27,4.0,6Y4IdkneW2P3uhxoug1nfw,YUM-ME\n\nMe and a buddy decided to explore do...,0,0,1,0,0,1
28,4.0,0YS8HcqIaCse9kkblUG_NQ,Met some friends from out of town and we were ...,0,0,0,0,0,1


In [23]:
texts = data_rating1_new['text'].fillna('')
y = data_rating1_new['rating1']

In [24]:
# Use `transform` to generate the sample X word matrix - one column per feature (word or n-grams)
texts_transformed = vectorizer.transform(texts)
X_text_features = pd.DataFrame(texts_transformed.todense(), columns=vectorizer.get_feature_names())

# Identify the features you want from the original dataset
other_features_columns = ['BusinessStars', 
          'votes_cool', 
          'votes_funny',
          'votes_useful']

# predictor_list = np.hstack([other_features_columns, dummy_categories])
other_features = data_rating1_new[other_features_columns]

# Stack them horizontally together
# This takes all of the word/n-gram columns and appends on two more columns for `html_ratio` and `image_ratio`
X = pd.concat([X_text_features, other_features], axis=1)

scores = cross_val_score(model, X.values, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

ValueError: Found arrays with inconsistent numbers of samples: [ 74090 117224]

##### http://stackoverflow.com/questions/30813044/sklearn-found-arrays-with-inconsistent-numbers-of-samples-when-calling-linearre

Keep same vectorizer (with binary = False)

In [None]:
# What features of these are most important?
model.fit(X, y)

# create a dataframe out of the predictors used to fit the model
all_feature_names = vectorizer.get_feature_names() + other_features_columns
feature_importances = pd.DataFrame({'Features' : all_feature_names, 'Importance Score': model.feature_importances_})
print feature_importances.sort_values('Importance Score', ascending=False).head(20)