In [49]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse

# SK-learn for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer 

#Dimensionality Reduction 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

#Encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import StandardScaler

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report


import sys
print sys.version_info[0]
print pd.__version__

2
0.21.0


In [50]:
def glue_jsons(user, review, bsnes, checkin):
    #Convert all checkin times to their own columns 
    checkin = pd.concat([checkin.drop(["checkin_info"], axis=1), 
                         checkin["checkin_info"].apply(pd.Series)], axis=1)

    user.rename(columns={'name': 'user_name', 
                          'review_count': 'user_review_count'}, inplace=True)

    bsnes.rename(columns={'name': 'business_name', 
                      'review_count': 'business_review_count',
                      'stars': 'business_stars'}, inplace=True)
    
    xtrain = pd.merge(review, user, how='left', on=['user_id'])
    xtrain = pd.merge(xtrain, bsnes, how='left', on=['business_id'])
    xtrain = pd.merge(xtrain, checkin, how='left', on=['business_id'])
    del xtrain["type_x"]
    del xtrain["type_y"]
    del xtrain["neighborhoods"]
    xtrain.fillna(0, inplace=True)
    return xtrain

def mutually_exclusive_elements(set1, set2):
    set1ExclusiveElements = set(set1).symmetric_difference(set2).intersection(set1)
    set2ExclusiveElements = set(set1).symmetric_difference(set2).intersection(set2)
    print "first: ", len(set1ExclusiveElements)
    print "second: ", len(set2ExclusiveElements)
    return {"first": set1ExclusiveElements, "second": set2ExclusiveElements}

In [51]:
bsnes = pd.read_json(open("data/yelp_training_set_business.json"), lines=True)
checkin = pd.read_json(open("data/yelp_training_set_checkin.json"), lines=True)
user = pd.read_json(open("data/yelp_training_set_user.json"), lines=True)
review = pd.read_json(open("data/yelp_training_set_review.json"), lines=True)

bsnes_test = pd.read_json(open("data/yelp_test_set_business.json"), lines=True)
checkin_test = pd.read_json(open("data/yelp_test_set_checkin.json"), lines=True)
user_test = pd.read_json(open("data/yelp_test_set_user.json"), lines=True)
review_test = pd.read_json(open("data/yelp_test_set_review.json"), lines=True)



#Manually joining the user votes data from train into test where it's missing 
user = pd.concat([user.drop(["votes"], axis=1), user["votes"].apply(pd.Series)], axis=1)
#Need to merge user level vote data to the test data by joining on user_id and imputing the rest. 
user.rename(columns={ "funny": "funny_votes_business", 
                      "useful": "useful_votes_business",
                      "cool": "cool_votes_business"}, inplace=True)

user_test = pd.merge(user_test, user[["funny","useful","cool"]], how='left', on['user_id'])

xtest = glue_jsons(user_test, review_test, bsnes_test, checkin_test)
xtrain = glue_jsons(user, review, bsnes, checkin)
ytrain = [i['useful'] for i in review.votes]


In [52]:
categorical_variables = list(set(xtrain.select_dtypes(include=["object","bool"]).columns) - set(["text"]))
numeric_variables = list(set(xtrain.columns) - set(categorical_variables) - set(["text", "date"]))

In [53]:
#  Encode Categorical Data
from collections import defaultdict
leD = defaultdict(LabelEncoder)
oheD = defaultdict(OneHotEncoder)
ohe = OneHotEncoder()

#Brilliiant, this both fits, transforms, and retains the fit in the dictionary 
categoricalTrainDataLE = xtrain[categorical_variables].apply(lambda x: leD[x.name].fit_transform(x))
ohe = ohe.fit(categoricalTrainDataLE)
categoricalTrainDataOHE = ohe.transform(categoricalTrainDataLE)


In [54]:
categoriesSvd = TruncatedSVD(n_components=200)
categoriesSvd.fit(categoricalTrainDataOHE)
categoricalTrainDataOHESvd = categoriesSvd.transform(categoricalTrainDataOHE)
categoriesSvd.explained_variance_ratio_

array([ 0.0053496 ,  0.03917613,  0.02096237,  0.01549227,  0.01063187,
        0.00807126,  0.00731319,  0.00690965,  0.00494638,  0.00459248,
        0.00402336,  0.00386649,  0.0028704 ,  0.00268211,  0.00247562,
        0.00240892,  0.0022879 ,  0.00212588,  0.00199901,  0.00193747,
        0.00189519,  0.00185844,  0.00182885,  0.00177708,  0.00166503,
        0.00163475,  0.00161188,  0.00160095,  0.0015648 ,  0.00152245,
        0.00146319,  0.00144717,  0.00143393,  0.00142274,  0.00141463,
        0.00139121,  0.00135745,  0.00131679,  0.00131298,  0.0012835 ,
        0.00123112,  0.00116519,  0.00113645,  0.00112108,  0.00111705,
        0.00111215,  0.00108944,  0.00104616,  0.0010023 ,  0.00099707,
        0.00099574,  0.00098065,  0.00096262,  0.00094827,  0.00093895,
        0.00093573,  0.00092838,  0.00092367,  0.00091658,  0.00091342,
        0.00090197,  0.00086907,  0.00086566,  0.00085337,  0.00085028,
        0.00084086,  0.00083482,  0.00083065,  0.00082784,  0.00

In [57]:
sum(categoriesSvd.explained_variance_ratio_)

0.28343742376465231

In [58]:
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word',
                     token_pattern=r'\w{1,}', ngram_range=(1,2), use_idf=1,
                     smooth_idf=1, sublinear_tf=1, stop_words='english')
textFeatures = tfv.fit_transform(xtrain.text)
svd = TruncatedSVD(n_components=120)
svd.fit(textFeatures)
textFeaturesSVD = svd.transform(textFeatures)
svd.explained_variance_ratio_

array([ 0.00085878,  0.00149577,  0.00111431,  0.0009416 ,  0.00088758,
        0.00084887,  0.00078751,  0.00077761,  0.00071704,  0.00069703,
        0.0006692 ,  0.00063505,  0.00060891,  0.0006009 ,  0.00057535,
        0.00056297,  0.00055195,  0.00051869,  0.00051556,  0.00050494,
        0.00049736,  0.00049136,  0.00048019,  0.00048008,  0.00047129,
        0.00046275,  0.00045498,  0.00044752,  0.00044104,  0.00043785,
        0.00042467,  0.00042049,  0.00041108,  0.00040786,  0.00039945,
        0.00039693,  0.00039319,  0.00039047,  0.00038647,  0.00038207,
        0.00037908,  0.00037457,  0.000371  ,  0.00036581,  0.00035883,
        0.00035698,  0.00035336,  0.00035154,  0.00034848,  0.00034655,
        0.00034522,  0.00034356,  0.00033966,  0.00033868,  0.00033509,
        0.00033408,  0.00033071,  0.00032959,  0.00032559,  0.00032493,
        0.00032147,  0.00032108,  0.00031595,  0.00031459,  0.00031264,
        0.00031051,  0.00030727,  0.00030563,  0.00030402,  0.00

In [None]:
sum(svd.explained_variance_ratio_)

In [59]:
scaler = StandardScaler()
scaler.fit(xtrain[numeric_variables])
xtrain[numeric_variables] = scaler.transform(xtrain[numeric_variables])

In [42]:
print textFeaturesSVD
print textFeaturesSVD.shape
print categoricalTrainDataOHESvd.shape
print xtrain[numeric_variables].shape

[[ 0.10822675 -0.00035359 -0.03582709 ...,  0.00903369  0.00539432
  -0.00918362]
 [ 0.11138171 -0.02405241 -0.01843058 ..., -0.01014245 -0.01171632
   0.01615613]
 [ 0.04084059  0.02322308 -0.01916287 ...,  0.03630289 -0.00520356
   0.01950582]
 ..., 
 [ 0.08576055  0.12638387  0.00428202 ..., -0.02038182 -0.02039968
   0.03669999]
 [ 0.07850521  0.00916698 -0.0178492  ..., -0.01431194 -0.00397014
  -0.02569238]
 [ 0.01991111 -0.0027785   0.02411887 ...,  0.02361712 -0.06427272
  -0.01346345]]
(229907, 120)
(229907, 200)
(229907, 178)


In [61]:
#Stack features like a ballerrrrr
# xtrain_sparse = sparse.hstack((textFeaturesSVD,
#                                categoricalTrainDataOHESvd, 
#                                xtrain[numeric_variables].values),format='csr')

#No longer doing sparse matrices, so this is kosher 
xtrain_joined = np.hstack((textFeaturesSVD,
                               categoricalTrainDataOHESvd, 
                               xtrain[numeric_variables].values))

del textFeatures
del textFeaturesSVD
del categoricalTrainDataOHESvd
del xtrain

In [62]:
xtrain_joined.shape

(229907, 498)

In [65]:
rfc = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rfc.fit(xtrain_joined, ytrain)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [71]:
rfc.predict(xtrain_joined[2:20])

array([1, 2, 0, 3, 7, 1, 0, 1, 3, 1, 2, 2, 1, 2, 4, 0, 6, 1])

In [10]:
print bsnes.head()
print checkin.head()
print user.head()
review.head()

              business_id                                         categories  \
0  rncjoVoEFUJGCUoC1JgnUA  [Accountants, Professional Services, Tax Servi...   
1  0FNFSzCFP_rGUoJx8W7tJg                  [Sporting Goods, Bikes, Shopping]   
2  3f_lyB6vFK48ukH6ScvLHg                                                 []   
3  usAsSV36QmUej8--yvN-dg                                    [Food, Grocery]   
4  PzOqRohWw7F7YEPBz6AubA                 [Food, Bagels, Delis, Restaurants]   

          city                                       full_address   latitude  \
0       Peoria         8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345  33.581867   
1      Phoenix                  2149 W Wood Dr\nPhoenix, AZ 85029  33.604054   
2      Phoenix              1134 N Central Ave\nPhoenix, AZ 85004  33.460526   
3      Phoenix              845 W Southern Ave\nPhoenix, AZ 85041  33.392210   
4  Glendale Az  6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...  33.712797   

    longitude                         

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,"{u'funny': 0, u'useful': 5, u'cool': 2}"
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,"{u'funny': 0, u'useful': 0, u'cool': 0}"
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,"{u'funny': 0, u'useful': 1, u'cool': 0}"
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,"{u'funny': 0, u'useful': 2, u'cool': 1}"
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,"{u'funny': 0, u'useful': 0, u'cool': 0}"


In [61]:
#All reviews have a business to join to, and we have extra checkins that remain unused 
mutually_exclusive_elements(bsnes.business_id, review.business_id)
mutually_exclusive_elements(checkin.business_id, bsnes.business_id)
hm = mutually_exclusive_elements(checkin.business_id, review.business_id)

first:  0
second:  0
first:  0
second:  3255
first:  0
second:  3255


In [76]:
#Merge all the data into a single data frame



#Apply text transformations on the review text
#   Done, now it's a sparse matrix 
#Do oneHotEncoding or LabelEncoder stuff to categorical variables
#   i. OHE takes all potential values for a category and turns them into their own binary feature
#   ii. LE makes everything {1,2,...,N} for the # of categories, which is bad if ordinal relationship isn't intended
#   iii. OHE is almost always better; only pitfalls are when there are lots of categories and space is an issue, but we
#        can solve this by using a PCA on the OHE 
#Use Normalizer or StandardScaler (sklearn) on the linear variables
#   i. I'm not sure why we need to do this.  Look this up 
#Perform a PCA or TruncatedSVD to reduce dimensionality (probably only necessary due to the text features)
#Do feature selection; tons of ways to do this, choose one
#Choose a model, train it, test it 

Index([      u'business_id',              u'date',         u'review_id',
                   u'stars',              u'text',            u'type_x',
                 u'user_id',      u'useful_votes',     u'average_stars',
               u'user_name', u'user_review_count',            u'type_y',
              u'user_votes'],
      dtype='object')
Index([          u'business_id',                  u'date',
                   u'review_id',                 u'stars',
                        u'text',                u'type_x',
                     u'user_id',          u'useful_votes',
               u'average_stars',             u'user_name',
           u'user_review_count',                u'type_y',
                  u'user_votes',            u'categories',
                        u'city',          u'full_address',
                    u'latitude',             u'longitude',
               u'business_name',         u'neighborhoods',
                        u'open', u'business_review_count',
       

In [None]:
xtest_le = xtest[categorical_variables].apply(lambda x: leD[x.name].transform(x))
xtest_ohe = ohe.transform(xtest_le)
xtest_ohe = categoriesSvd.transform(xtest_ohe)
test_text_features = tfv.transform(xtest.text)
test_text_features = svd.transform(test_text_features)
xtest[numeric_variables] = scaler.transform(xtest[numeric_variables])