In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
import sys
# Record the interpreter major version and the pandas version used for this run.
# Single-argument print(...) emits the same text under Python 2 and Python 3.
print(sys.version_info[0])
print(pd.__version__)

In [57]:
def mutually_exclusive_elements(set1, set2):
    """Report which elements appear in only one of two collections.

    Prints the number of elements exclusive to each input, then returns
    the exclusive elements themselves.

    Args:
        set1: Iterable of hashable values (for example a column of ids).
        set2: Iterable of hashable values.

    Returns:
        dict with two keys: "first" holds the set of elements found only in
        ``set1``; "second" holds the set of elements found only in ``set2``.
    """
    # Materialise each input exactly once so one-shot iterators are handled
    # correctly and the inputs are not re-scanned.
    first_items = set(set1)
    second_items = set(set2)

    # Plain set difference is equivalent to intersecting the symmetric
    # difference with the original input, without the extra passes.
    set1ExclusiveElements = first_items - second_items
    set2ExclusiveElements = second_items - first_items

    # %-formatting keeps the printed text identical whether print is the
    # Python 2 statement or the Python 3 function.
    print("first:  %d" % len(set1ExclusiveElements))
    print("second:  %d" % len(set2ExclusiveElements))
    return {"first": set1ExclusiveElements, "second": set2ExclusiveElements}

In [None]:
# Load the four Yelp training tables; each file holds one JSON record per line.
# The paths go straight to pandas so it opens and closes every file itself,
# instead of leaving the handles returned by open(...) unclosed.
bsnes = pd.read_json("data/yelp_training_set_business.json", lines=True)
checkin = pd.read_json("data/yelp_training_set_checkin.json", lines=True)
user = pd.read_json("data/yelp_training_set_user.json", lines=True)
review = pd.read_json("data/yelp_training_set_review.json", lines=True)

In [41]:
# Keep only the "useful" count in the votes column and drop the funny/cool
# counts, because those features are not available in the test set.
# Values that are no longer dicts (i.e. already flattened) pass through
# unchanged, so re-running this cell does not raise.
review["votes"] = [
    vote["useful"] if isinstance(vote, dict) else vote
    for vote in review["votes"]
]

In [10]:
# Preview the first rows of each table. The first three are printed as plain
# text; the review preview is the cell's final expression and is rendered as
# the cell output.
print(bsnes.head())
print(checkin.head())
print(user.head())
review.head()

              business_id                                         categories  \
0  rncjoVoEFUJGCUoC1JgnUA  [Accountants, Professional Services, Tax Servi...   
1  0FNFSzCFP_rGUoJx8W7tJg                  [Sporting Goods, Bikes, Shopping]   
2  3f_lyB6vFK48ukH6ScvLHg                                                 []   
3  usAsSV36QmUej8--yvN-dg                                    [Food, Grocery]   
4  PzOqRohWw7F7YEPBz6AubA                 [Food, Bagels, Delis, Restaurants]   

          city                                       full_address   latitude  \
0       Peoria         8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345  33.581867   
1      Phoenix                  2149 W Wood Dr\nPhoenix, AZ 85029  33.604054   
2      Phoenix              1134 N Central Ave\nPhoenix, AZ 85004  33.460526   
3      Phoenix              845 W Southern Ave\nPhoenix, AZ 85041  33.392210   
4  Glendale Az  6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...  33.712797   

    longitude                         

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,"{u'funny': 0, u'useful': 5, u'cool': 2}"
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,"{u'funny': 0, u'useful': 0, u'cool': 0}"
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,"{u'funny': 0, u'useful': 1, u'cool': 0}"
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,"{u'funny': 0, u'useful': 2, u'cool': 1}"
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,"{u'funny': 0, u'useful': 0, u'cool': 0}"


In [19]:
# Distinct business ids in the checkin table, then in the business table.
print(len(checkin["business_id"].unique()))
print(len(bsnes["business_id"].unique()))

8282
11537


In [35]:
# Is there user data for every review? No: reviews whose author is missing
# from the user table will need imputed reviewer features.
print(len(user["user_id"].unique()))
print(len(review["user_id"].unique()))
# Inner join keeps only the reviews whose author has a row in the user table.
s1 = review.merge(user, how="inner", on=["user_id"])
len(s1["user_id"].unique())

43873
45981


43873

In [61]:
# Compare business ids across tables. Per the printed counts, the business and
# review tables cover the same ids, and every checkin id is a known business;
# however 3255 business ids have no checkin record at all.
mutually_exclusive_elements(bsnes["business_id"], review["business_id"])
mutually_exclusive_elements(checkin["business_id"], bsnes["business_id"])
hm = mutually_exclusive_elements(checkin["business_id"], review["business_id"])

first:  0
second:  0
first:  0
second:  3255
first:  0
second:  3255


In [None]:
#Merge all the data into a single data frame
#Apply text transformations on the review text
#Encode the categorical variables with one-hot encoding (OneHotEncoder) or label encoding (LabelEncoder)
#   i. OHE takes every possible value of a category and turns each one into its own binary feature
#   ii. LE maps the categories to the integers {0,1,...,N-1}, which is bad if no ordinal relationship is intended
#   iii. OHE is almost always better; the only pitfall is when there are lots of categories and space is an issue, but we
#        can mitigate this by running a PCA on the one-hot encoded features
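#Use Normalizer or StandardScaler (sklearn) on the numeric (continuous) variables
#   i. Scaling matters because distance-based models (KNN) and regularized models (LogisticRegression) are sensitive to
#      feature magnitude. StandardScaler rescales each column; Normalizer rescales each row to unit norm. TODO: confirm
#      which one fits these features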
#Perform a PCA or TruncatedSVD to reduce dimensionality (probably only necessary due to the text features)
#Do feature selection; tons of ways to do this, choose one
#Choose a model, train it, test it 