In [49]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse

# SK-learn for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer 

#Dimensionality Reduction 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

#Encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import StandardScaler

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report


import sys
print sys.version_info[0]
print pd.__version__

2
0.21.0


In [50]:
def mutually_exclusive_elements(set1, set2):
    set1ExclusiveElements = set(set1).symmetric_difference(set2).intersection(set1)
    set2ExclusiveElements = set(set1).symmetric_difference(set2).intersection(set2)
    print "first: ", len(set1ExclusiveElements)
    print "second: ", len(set2ExclusiveElements)
    return {"first": set1ExclusiveElements, "second": set2ExclusiveElements}

In [51]:
bsnes = pd.read_json(open("data/yelp_training_set_business.json"), lines=True)
checkin = pd.read_json(open("data/yelp_training_set_checkin.json"), lines=True)
user = pd.read_json(open("data/yelp_training_set_user.json"), lines=True)
review = pd.read_json(open("data/yelp_training_set_review.json"), lines=True)

user = pd.concat([user.drop(["votes"], axis=1), user["votes"].apply(pd.Series)], axis=1)

#Convert all checkin times to their own columns 
checkin = pd.concat([checkin.drop(["checkin_info"], axis=1), checkin["checkin_info"].apply(pd.Series)], axis=1)

#Need to merge user level vote data to the test data by joining on user_id and imputing the rest. 
bsnes.rename(columns={'name': 'business_name', 
                      'review_count': 'business_review_count',
                      'stars': 'business_stars'}, inplace=True)

user.rename(columns={'name': 'user_name', 
                      'review_count': 'user_review_count',
                      "funny": "funny_votes_business", 
                      "useful": "useful_votes_business",
                      "cool": "cool_votes_business"}, inplace=True)

review.rename(columns={'votes': 'useful_votes'}, inplace=True)

xtrain = pd.merge(review, user, how='left', on=['user_id'])
xtrain = pd.merge(xtrain, bsnes, how='left', on=['business_id'])
xtrain = pd.merge(xtrain, checkin, how='left', on=['business_id'])
xtrain.useful_votes = [i['useful'] for i in review.useful_votes]
ytrain = xtrain.useful_votes
del xtrain["useful_votes"]
del xtrain["type_x"]
del xtrain["type_y"]
del xtrain["neighborhoods"]
xtrain.fillna(0, inplace=True)

In [52]:
categorical_variables = list(set(xtrain.select_dtypes(include=["object","bool"]).columns) - set(["text"]))
numeric_variables = list(set(xtrain.columns) - set(categorical_variables) - set(["text", "date"]))

In [None]:
#  Encode Categorical Data
from collections import defaultdict
leD = defaultdict(LabelEncoder)
oheD = defaultdict(OneHotEncoder)
ohe = OneHotEncoder()

#Brilliiant, this both fits, transforms, and retains the fit in the dictionary 
categoricalTrainDataLE = xtrain[categorical_variables].apply(lambda x: leD[x.name].fit_transform(x))
ohe = ohe.fit(categoricalTrainDataLE)
categoricalTrainDataOHE = ohe.transform(categoricalTrainDataLE)


In [None]:
categoriesSvd = TruncatedSVD(n_components=200)
categoriesSvd.fit(categoricalTrainDataOHE)
categoricalTrainDataOHESvd = categoriesSvd.transform(categoricalTrainDataOHE)
categoriesSvd.explained_variance_ratio_

In [None]:
sum(categoriesSvd.explained_variance_ratio_)

In [None]:
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word',
                     token_pattern=r'\w{1,}', ngram_range=(1,2), use_idf=1,
                     smooth_idf=1, sublinear_tf=1, stop_words='english')
textFeatures = tfv.fit_transform(xtrain.text)
svd = TruncatedSVD(n_components=120)
svd.fit(textFeatures)
textFeaturesSVD = svd.transform(textFeatures)
svd.explained_variance_ratio_

In [None]:
sum(svd.explained_variance_ratio_)

In [None]:
scaler = StandardScaler()
scaler.fit(xtrain[numeric_variables])
xtrain[numeric_variables] = scaler.transform(xtrain[numeric_variables])

In [42]:
print textFeaturesSVD
print textFeaturesSVD.shape
print categoricalTrainDataOHESvd.shape
print xtrain[numeric_variables].shape

[[ 0.10822675 -0.00035359 -0.03582709 ...,  0.00903369  0.00539432
  -0.00918362]
 [ 0.11138171 -0.02405241 -0.01843058 ..., -0.01014245 -0.01171632
   0.01615613]
 [ 0.04084059  0.02322308 -0.01916287 ...,  0.03630289 -0.00520356
   0.01950582]
 ..., 
 [ 0.08576055  0.12638387  0.00428202 ..., -0.02038182 -0.02039968
   0.03669999]
 [ 0.07850521  0.00916698 -0.0178492  ..., -0.01431194 -0.00397014
  -0.02569238]
 [ 0.01991111 -0.0027785   0.02411887 ...,  0.02361712 -0.06427272
  -0.01346345]]
(229907, 120)
(229907, 200)
(229907, 178)


In [47]:
#Stack features like a ballerrrrr
# xtrain_sparse = sparse.hstack((textFeaturesSVD,
#                                categoricalTrainDataOHESvd, 
#                                xtrain[numeric_variables].values),format='csr')

#No longer doing sparse matrices, so this is kosher 
xtrain_joined = np.hstack((textFeaturesSVD,
                               categoricalTrainDataOHESvd, 
                               xtrain[numeric_variables].values))


In [44]:
xtrain_joined.shape

(229907, 498)

In [48]:
rfc = RandomForestClassifier(n_estimators=2, n_jobs=-1)
rfc.fit(xtrain_joined, ytrain)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
print textFeatures.shape
print categoricalTrainDataOHE.shape
print xtrain[numeric_variables].shape
print xtrain_sparse.shape
print xtrain_sparse[1,:]

(229907, 779714)
(229907, 316529)
(229907, 178)
(229907, 1096421)
  (0, 482)	0.0812243340281
  (0, 2475)	0.0501251182175
  (0, 2631)	0.112597709649
  (0, 10220)	0.0775195567373
  (0, 10545)	0.0521383900644
  (0, 10605)	0.0744493394891
  (0, 11467)	0.0432799112168
  (0, 11819)	0.103073283617
  (0, 12069)	0.0791233926149
  (0, 12265)	0.0674240526336
  (0, 12272)	0.112597709649
  (0, 36814)	0.0438607006676
  (0, 36835)	0.0816543266098
  (0, 47696)	0.0331906540948
  (0, 48120)	0.0909184079132
  (0, 51028)	0.0677673854462
  (0, 51804)	0.171855176722
  (0, 51805)	0.0729746170426
  (0, 52938)	0.0488439684356
  (0, 53093)	0.09217563283
  (0, 62063)	0.038829741185
  (0, 62527)	0.099545469434
  (0, 67769)	0.0491657067593
  (0, 67990)	0.112597709649
  (0, 68669)	0.101500435813
  :	:
  (0, 1096396)	-0.191088257068
  (0, 1096397)	-0.153391205149
  (0, 1096398)	-0.170857969492
  (0, 1096399)	-0.177459884754
  (0, 1096400)	-0.195334284418
  (0, 1096401)	-0.192975646388
  (0, 1096402)	-0.19577211688
 

In [16]:
pd.DataFrame(textFeatures.A, columns=tfv.get_feature_names())

Unnamed: 0,amazing,awesome,came,delicious,didn,didn t,evening,food,good,great,like,love,menu,outside,place,s,t,try,ve,wait
0,0.46128,0.0,0.272439,0.272439,0.0,0.0,0.0,0.272439,0.0,0.0,0.242218,0.0,0.242218,0.272439,0.217525,0.0,0.196648,0.0,0.46128,0.272439
1,0.0,0.263062,0.263062,0.0,0.0,0.0,0.263062,0.0,0.210038,0.263062,0.490825,0.0,0.0,0.0,0.210038,0.355625,0.0,0.445402,0.0,0.263062
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.62395,0.0,0.0,0.781464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.781464,0.0,0.0,0.0,0.62395,0.0,0.0,0.0,0.0
4,0.0,0.46694,0.0,0.0,0.0,0.0,0.0,0.0,0.372822,0.0,0.415144,0.0,0.0,0.0,0.0,0.372822,0.33704,0.0,0.46694,0.0
5,0.0,0.0,0.194909,0.194909,0.465111,0.465111,0.194909,0.194909,0.263492,0.0,0.173288,0.0,0.173288,0.0,0.155623,0.263492,0.392763,0.194909,0.0,0.0
6,0.212444,0.212444,0.0,0.359699,0.0,0.0,0.0,0.359699,0.287197,0.212444,0.0,0.212444,0.188878,0.212444,0.169623,0.355973,0.259633,0.359699,0.212444,0.0
7,0.0,0.0,0.0,0.0,0.629815,0.629815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454604,0.0,0.0,0.0
8,0.350102,0.0,0.0,0.0,0.350102,0.350102,0.350102,0.0,0.0,0.350102,0.0,0.0,0.0,0.350102,0.279534,0.0,0.252706,0.0,0.0,0.350102
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
print bsnes.head()
print checkin.head()
print user.head()
review.head()

              business_id                                         categories  \
0  rncjoVoEFUJGCUoC1JgnUA  [Accountants, Professional Services, Tax Servi...   
1  0FNFSzCFP_rGUoJx8W7tJg                  [Sporting Goods, Bikes, Shopping]   
2  3f_lyB6vFK48ukH6ScvLHg                                                 []   
3  usAsSV36QmUej8--yvN-dg                                    [Food, Grocery]   
4  PzOqRohWw7F7YEPBz6AubA                 [Food, Bagels, Delis, Restaurants]   

          city                                       full_address   latitude  \
0       Peoria         8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345  33.581867   
1      Phoenix                  2149 W Wood Dr\nPhoenix, AZ 85029  33.604054   
2      Phoenix              1134 N Central Ave\nPhoenix, AZ 85004  33.460526   
3      Phoenix              845 W Southern Ave\nPhoenix, AZ 85041  33.392210   
4  Glendale Az  6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...  33.712797   

    longitude                         

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,"{u'funny': 0, u'useful': 5, u'cool': 2}"
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,"{u'funny': 0, u'useful': 0, u'cool': 0}"
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,"{u'funny': 0, u'useful': 1, u'cool': 0}"
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,"{u'funny': 0, u'useful': 2, u'cool': 1}"
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,"{u'funny': 0, u'useful': 0, u'cool': 0}"


In [61]:
#All reviews have a business to join to, and we have extra checkins that remain unused 
mutually_exclusive_elements(bsnes.business_id, review.business_id)
mutually_exclusive_elements(checkin.business_id, bsnes.business_id)
hm = mutually_exclusive_elements(checkin.business_id, review.business_id)

first:  0
second:  0
first:  0
second:  3255
first:  0
second:  3255


In [76]:
#Merge all the data into a single data frame



#Apply text transformations on the review text
#   Done, now it's a sparse matrix 
#Do oneHotEncoding or LabelEncoder stuff to categorical variables
#   i. OHE takes all potential values for a category and turns them into their own binary feature
#   ii. LE makes everything {1,2,...,N} for the # of categories, which is bad if ordinal relationship isn't intended
#   iii. OHE is almost always better; only pitfalls are when there are lots of categories and space is an issue, but we
#        can solve this by using a PCA on the OHE 
#Use Normalizer or StandardScaler (sklearn) on the linear variables
#   i. I'm not sure why we need to do this.  Look this up 
#Perform a PCA or TruncatedSVD to reduce dimensionality (probably only necessary due to the text features)
#Do feature selection; tons of ways to do this, choose one
#Choose a model, train it, test it 

Index([      u'business_id',              u'date',         u'review_id',
                   u'stars',              u'text',            u'type_x',
                 u'user_id',      u'useful_votes',     u'average_stars',
               u'user_name', u'user_review_count',            u'type_y',
              u'user_votes'],
      dtype='object')
Index([          u'business_id',                  u'date',
                   u'review_id',                 u'stars',
                        u'text',                u'type_x',
                     u'user_id',          u'useful_votes',
               u'average_stars',             u'user_name',
           u'user_review_count',                u'type_y',
                  u'user_votes',            u'categories',
                        u'city',          u'full_address',
                    u'latitude',             u'longitude',
               u'business_name',         u'neighborhoods',
                        u'open', u'business_review_count',
       