In [41]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse

# SK-learn for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer 

#Encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder 

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
import sys
print sys.version_info[0]
print pd.__version__

2
0.21.0


In [2]:
def mutually_exclusive_elements(set1, set2):
    set1ExclusiveElements = set(set1).symmetric_difference(set2).intersection(set1)
    set2ExclusiveElements = set(set1).symmetric_difference(set2).intersection(set2)
    print "first: ", len(set1ExclusiveElements)
    print "second: ", len(set2ExclusiveElements)
    return {"first": set1ExclusiveElements, "second": set2ExclusiveElements}

In [93]:
bsnes = pd.read_json(open("data/yelp_training_set_business.json"), lines=True)
checkin = pd.read_json(open("data/yelp_training_set_checkin.json"), lines=True)
user = pd.read_json(open("data/yelp_training_set_user.json"), lines=True)
review = pd.read_json(open("data/yelp_training_set_review.json"), lines=True)

user = pd.concat([user.drop(["votes"], axis=1), user["votes"].apply(pd.Series)], axis=1)

#Convert all checkin times to their own columns 
checkin = pd.concat([checkin.drop(["checkin_info"], axis=1), checkin["checkin_info"].apply(pd.Series)], axis=1)

#Need to merge user level vote data to the test data by joining on user_id and imputing the rest. 
bsnes.rename(columns={'name': 'business_name', 
                      'review_count': 'business_review_count',
                      'stars': 'business_stars'}, inplace=True)

user.rename(columns={'name': 'user_name', 
                      'review_count': 'user_review_count',
                      "funny": "funny_votes_business", 
                      "useful": "useful_votes_business",
                      "cool": "cool_votes_business"}, inplace=True)

review.rename(columns={'votes': 'useful_votes'}, inplace=True)

totalRev = pd.merge(review, user, how='left', on=['user_id'])
totalRev = pd.merge(totalRev, bsnes, how='left', on=['business_id'])
totalRev = pd.merge(totalRev, checkin, how='left', on=['business_id'])
totalRev.useful_votes = [i['useful'] for i in review.useful_votes]
del totalRev["type_x"]
del totalRev["type_y"]
del totalRev["neighborhoods"]


In [94]:
categorical_variables = list(set(totalRev.select_dtypes(include=["object","bool"]).columns) - set(["text"]))
numeric_variables = list(set(totalRev.columns) - set(categorical_variables) - set(["text"]))

In [97]:
#
#  Encode Categorical Data
#
from collections import defaultdict
leD = defaultdict(LabelEncoder)
oheD = defaultdict(OneHotEncoder)
ohe = OneHotEncoder()

#Brilliiant, this both fits, transforms, and retains the fit in the dictionary 
categoricalTrainDataLE = totalRev[categorical_variables].apply(lambda x: leD[x.name].fit_transform(x))
ohe = ohe.fit(categoricalTrainDataLE)
categoricalTrainDataOHE = ohe.transform(categoricalTrainDataLE)

In [63]:
print leTesting
print oheTesting

   date  user_id  user_name
0     4        4          2
1     5        0          6
2     7        1          5
3     2        7          9
4     6        8          7
5     0        6          0
6     1        9          4
7     8        2          1
8     9        3          8
9     3        5          3
[[ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
   0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

In [8]:
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word',
                     token_pattern=r'\w{1,}', ngram_range=(1,2), use_idf=1,
                     smooth_idf=1, sublinear_tf=1, stop_words='english')
textFeatures = tfv.fit_transform(totalRev.text)

In [100]:
#This works, but it's only one of the columns
#hmmm = sparse.hstack((np.array(totalRev.stars)[:,None], textFeatures))
#X = sparse.hstack((textFeatures,totalRev[numeric_variables].values),format='csr')


#Still broken; need to replace Nan's with 0's, also guarantee totalRev is all numeric
xtrain_sparse = sparse.hstack((textFeatures,
                               categoricalTrainDataOHE, 
                               totalRev[numeric_variables].values),format='csr')


TypeError: no supported conversion for types: (dtype('float64'), dtype('float64'), dtype('O'))

In [102]:
totalRev[numeric_variables].values[0]

array([3.72, nan, nan, nan, 1034.0, nan, nan, nan, 33.390792, 2.0, nan,
       nan, nan, nan, nan, nan, nan, nan, 14.0, nan, 19.0, nan, nan, nan,
       nan, 8.0, 9.0, 1.0, 2.0, nan, nan, nan, nan, 116, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, 1.0, nan, 3.0, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, 1.0, nan, nan, nan, nan,
       nan, 1.0, 6.0, 2.0, nan, 1.0, 1.0, 2.0, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, 1.0, nan, nan, nan, nan, nan, 5, nan, nan, nan,
       nan, nan, nan, nan, 2.0, nan, 1.0, nan, nan, nan, nan, nan, nan,
       nan, 12.0, nan, nan, nan, nan, nan, nan, -112.012504, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, 2.0, nan, nan, 1.0, 10.0, 11.0, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, 1.0, nan, nan, nan, nan,
       nan, nan, nan, nan, 5, nan, nan, Timestamp('2011-01-26 00:00:00'),
       331.0, 4.0, nan, nan, nan, nan, nan, nan, na

In [None]:
pd.DataFrame(hmmm.A, columns=tfv.get_feature_names())

In [24]:
xtrain_sparse.shape

(229907, 27)

In [25]:
textFeatures.shape

(229907, 779714)

In [26]:
textFeatures[0]

<1x779714 sparse matrix of type '<type 'numpy.float64'>'
	with 122 stored elements in Compressed Sparse Row format>

In [13]:
totalRev.text[0:7]

0    My wife took me here on my birthday for breakf...
1    I have no idea why some people give bad review...
2    love the gyro plate. Rice is so good and I als...
3    Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4    General Manager Scott Petello is a good egg!!!...
5    Quiessence is, simply put, beautiful.  Full wi...
6    Drop what you're doing and drive here. After I...
Name: text, dtype: object

In [2]:
textFeatures = tfv.fit_transform(totalRev.text[0:10])

NameError: name 'tfv' is not defined

In [16]:
pd.DataFrame(textFeatures.A, columns=tfv.get_feature_names())

Unnamed: 0,amazing,awesome,came,delicious,didn,didn t,evening,food,good,great,like,love,menu,outside,place,s,t,try,ve,wait
0,0.46128,0.0,0.272439,0.272439,0.0,0.0,0.0,0.272439,0.0,0.0,0.242218,0.0,0.242218,0.272439,0.217525,0.0,0.196648,0.0,0.46128,0.272439
1,0.0,0.263062,0.263062,0.0,0.0,0.0,0.263062,0.0,0.210038,0.263062,0.490825,0.0,0.0,0.0,0.210038,0.355625,0.0,0.445402,0.0,0.263062
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.62395,0.0,0.0,0.781464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.781464,0.0,0.0,0.0,0.62395,0.0,0.0,0.0,0.0
4,0.0,0.46694,0.0,0.0,0.0,0.0,0.0,0.0,0.372822,0.0,0.415144,0.0,0.0,0.0,0.0,0.372822,0.33704,0.0,0.46694,0.0
5,0.0,0.0,0.194909,0.194909,0.465111,0.465111,0.194909,0.194909,0.263492,0.0,0.173288,0.0,0.173288,0.0,0.155623,0.263492,0.392763,0.194909,0.0,0.0
6,0.212444,0.212444,0.0,0.359699,0.0,0.0,0.0,0.359699,0.287197,0.212444,0.0,0.212444,0.188878,0.212444,0.169623,0.355973,0.259633,0.359699,0.212444,0.0
7,0.0,0.0,0.0,0.0,0.629815,0.629815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454604,0.0,0.0,0.0
8,0.350102,0.0,0.0,0.0,0.350102,0.350102,0.350102,0.0,0.0,0.350102,0.0,0.0,0.0,0.350102,0.279534,0.0,0.252706,0.0,0.0,0.350102
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
print bsnes.head()
print checkin.head()
print user.head()
review.head()

              business_id                                         categories  \
0  rncjoVoEFUJGCUoC1JgnUA  [Accountants, Professional Services, Tax Servi...   
1  0FNFSzCFP_rGUoJx8W7tJg                  [Sporting Goods, Bikes, Shopping]   
2  3f_lyB6vFK48ukH6ScvLHg                                                 []   
3  usAsSV36QmUej8--yvN-dg                                    [Food, Grocery]   
4  PzOqRohWw7F7YEPBz6AubA                 [Food, Bagels, Delis, Restaurants]   

          city                                       full_address   latitude  \
0       Peoria         8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345  33.581867   
1      Phoenix                  2149 W Wood Dr\nPhoenix, AZ 85029  33.604054   
2      Phoenix              1134 N Central Ave\nPhoenix, AZ 85004  33.460526   
3      Phoenix              845 W Southern Ave\nPhoenix, AZ 85041  33.392210   
4  Glendale Az  6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...  33.712797   

    longitude                         

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,"{u'funny': 0, u'useful': 5, u'cool': 2}"
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,"{u'funny': 0, u'useful': 0, u'cool': 0}"
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,"{u'funny': 0, u'useful': 1, u'cool': 0}"
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,"{u'funny': 0, u'useful': 2, u'cool': 1}"
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,"{u'funny': 0, u'useful': 0, u'cool': 0}"


In [35]:
#Is there user data for every review? No, will have to impute reviews w/ no reviewer data
print user.user_id.unique().size
print review.user_id.unique().size
s1 = pd.merge(review, user, how='inner', on=["user_id"])
s1.user_id.unique().size

43873
45981


43873

In [61]:
#All reviews have a business to join to, and we have extra checkins that remain unused 
mutually_exclusive_elements(bsnes.business_id, review.business_id)
mutually_exclusive_elements(checkin.business_id, bsnes.business_id)
hm = mutually_exclusive_elements(checkin.business_id, review.business_id)

first:  0
second:  0
first:  0
second:  3255
first:  0
second:  3255


In [76]:
#Merge all the data into a single data frame



#Apply text transformations on the review text
#   Done, now it's a sparse matrix 
#Do oneHotEncoding or LabelEncoder stuff to categorical variables
#   i. OHE takes all potential values for a category and turns them into their own binary feature
#   ii. LE makes everything {1,2,...,N} for the # of categories, which is bad if ordinal relationship isn't intended
#   iii. OHE is almost always better; only pitfalls are when there are lots of categories and space is an issue, but we
#        can solve this by using a PCA on the OHE 
#Use Normalizer or StandardScaler (sklearn) on the linear variables
#   i. I'm not sure why we need to do this.  Look this up 
#Perform a PCA or TruncatedSVD to reduce dimensionality (probably only necessary due to the text features)
#Do feature selection; tons of ways to do this, choose one
#Choose a model, train it, test it 

Index([      u'business_id',              u'date',         u'review_id',
                   u'stars',              u'text',            u'type_x',
                 u'user_id',      u'useful_votes',     u'average_stars',
               u'user_name', u'user_review_count',            u'type_y',
              u'user_votes'],
      dtype='object')
Index([          u'business_id',                  u'date',
                   u'review_id',                 u'stars',
                        u'text',                u'type_x',
                     u'user_id',          u'useful_votes',
               u'average_stars',             u'user_name',
           u'user_review_count',                u'type_y',
                  u'user_votes',            u'categories',
                        u'city',          u'full_address',
                    u'latitude',             u'longitude',
               u'business_name',         u'neighborhoods',
                        u'open', u'business_review_count',
       