## Machine Learning to Predict Yelp Ratings from Attributes

The goal of this project is to explore the power of Yelp metadata attributes to predict the rating of a venue. Yelp collects a lot of data on businesses (See [Yelp Developer Docs](https://www.yelp.com/developers/documentation/v2/business)). I will focus on:

* city - city in which the business resides
* longitude & latitude - coordinates of business
* categories - provides a list of catogories the business is associated with
* attributes - a list of various features (Take Out, Waiter Service, Alcohol, etc.)



In [1]:
import re
import gzip
import simplejson
import pandas as pd
import numpy as np

In [2]:
data=gzip.open('yelp_train_academic_dataset_business.json.gz')
data_contents = data.read()
data.close()
lines = re.split('\n',data_contents)
json_data = [simplejson.loads(line) for line in lines[:-1]]
df = pd.DataFrame(json_data)
data_dict = [dict(df.iloc[i]) for i in xrange(len(df))]
df.head(2)

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37938 entries, 0 to 37937
Data columns (total 15 columns):
attributes       37938 non-null object
business_id      37938 non-null object
categories       37938 non-null object
city             37938 non-null object
full_address     37938 non-null object
hours            37938 non-null object
latitude         37938 non-null float64
longitude        37938 non-null float64
name             37938 non-null object
neighborhoods    37938 non-null object
open             37938 non-null bool
review_count     37938 non-null int64
stars            37938 non-null float64
state            37938 non-null object
type             37938 non-null object
dtypes: bool(1), float64(3), int64(1), object(10)
memory usage: 4.4+ MB


In [18]:
from sklearn.base import TransformerMixin

class ColumnSelectTransformer(TransformerMixin):
    def __init__(self, columns=None):
        if columns:
            self.columns = columns
        else:
            self.columns = ['business_id']
            
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self,record, *args, **transform_params):
        if isinstance(record,dict):
            return [record[arg] for arg in self.columns]
        elif isinstance(record,list):
            return [[rec[arg] for arg in self.columns] for rec in record]
        
tf_city = ColumnSelectTransformer(columns=['city'])
print 'City:', tf_city.transform(data_dict[0])
tf_latlong = ColumnSelectTransformer(columns=['latitude','longitude'])
print 'Lat. and Long.:', tf_latlong.transform(data_dict[0])
tf_stars = ColumnSelectTransformer(columns=['stars'])
print 'Stars:', tf_stars.transform(data_dict[0])

City: ['Phoenix']
Lat. and Long.: [33.499313000000001, -111.98375799999999]
Stars: [3.5]


In [80]:
df.to_dict

<bound method DataFrame.to_dict of                                               attributes  \
0                         {u'By Appointment Only': True}   
1      {u'Take-out': True, u'Price Range': 1, u'Outdo...   
2      {u'Take-out': True, u'Outdoor Seating': False,...   
3      {u'Take-out': True, u'Accepts Credit Cards': T...   
4      {u'Take-out': True, u'Has TV': False, u'Outdoo...   
5                                                     {}   
6                                                     {}   
7                                    {u'Wi-Fi': u'free'}   
8                                                     {}   
9      {u'Alcohol': u'full_bar', u'Noise Level': u'lo...   
10     {u'Accepts Credit Cards': True, u'Wi-Fi': u'fr...   
11                              {u'Good for Kids': True}   
12     {u'Take-out': True, u'Price Range': 1, u'Outdo...   
13     {u'Take-out': True, u'Accepts Credit Cards': T...   
14     {u'Take-out': True, u'Caters': True, u'Attire'...   
15   

In [38]:
#from sklearn.cross_validation import train_test_split
#data_train, data_test, stars_train, stars_test = train_test_split(data_dict,df['stars'].astype(float))
#print 'Training size:', len(data_train)
#print 'Test size:', len(data_test)
columns=['city', 'stars']
dg = pd.DataFrame(df[columns])
dg.groupby('city').mean()

Unnamed: 0_level_0,stars
city,Unnamed: 1_level_1
Ahwatukee,3.687500
Anthem,3.781818
Apache Junction,3.637500
Arcadia,5.000000
Atlanta,3.500000
Avondale,3.538627
Black Canyon City,3.000000
Bonnyrigg,3.750000
Boulder City,4.136364
Buckeye,3.408451


In [57]:
from sklearn.base import BaseEstimator, RegressorMixin
class CityMeanEstimator(BaseEstimator, RegressorMixin):
    def __init__(self):
        pass
    
    def fit(self,X,y):
        df=pd.DataFrame(np.vstack([X,y]).T,columns=['city','stars'])
        df['stars']=df['stars'].astype(float)
        self.f = df.groupby('city').mean()#.to_dict
        return self
    
    def predict(self, X):
        print X
        if X in self.f.index:
            val = self.f.loc[X].values
        else:
            val = 3.7
        return val
    
#pd.DataFrame()

In [17]:
#test_data = pd.DataFrame([tf_city.transform(data_train),stars_train],columns=['city','stars'])
#type(tf_city.transform(data_train))
type(stars_train)

pandas.core.series.Series

In [28]:
data_dict[0].keys()

['city',
 'review_count',
 'name',
 'neighborhoods',
 'type',
 'business_id',
 'full_address',
 'hours',
 'state',
 'longitude',
 'stars',
 'latitude',
 'attributes',
 'open',
 'categories']

In [29]:
data_train[0]

{'attributes': {},
 'business_id': 'Urnid7YUwfMuoFAwYT7PFw',
 'categories': ['Lighting Fixtures & Equipment', 'Home Services'],
 'city': 'Madison',
 'full_address': '929 S Park St\nBay Creek\nMadison, WI 53715',
 'hours': {'Friday': {'close': '17:00', 'open': '07:30'},
  'Monday': {'close': '17:00', 'open': '07:30'},
  'Thursday': {'close': '17:00', 'open': '07:30'},
  'Tuesday': {'close': '17:00', 'open': '07:30'},
  'Wednesday': {'close': '17:00', 'open': '07:30'}},
 'latitude': 43.057748699999998,
 'longitude': -89.399107799999996,
 'name': 'Light House Lamp Repair',
 'neighborhoods': ['Bay Creek'],
 'open': False,
 'review_count': 3,
 'stars': 5.0,
 'state': 'WI',
 'type': 'business'}

In [32]:
type(stars_train.as_matrix())

numpy.ndarray

In [39]:
from sklearn import neighbors
kn = neighbors.KNeighborsRegressor()
latlon_cols = ['latitude','longitude']
kn.fit(df[latlon_cols],df['stars'])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_neighbors=5, p=2, weights='uniform')

In [41]:
kn.predict(df[latlon_cols].ix[0])

array([ 4.])

In [42]:
df['stars'].ix[0]

3.5

In [43]:
kn.score(df[latlon_cols],df['stars'])

0.25222497582039283

In [59]:
est = CityMeanEstimator()
est.fit(df['city'],df['stars'])

CityMeanEstimator()

In [77]:
est.predict(df['city'].ix[1])
#est.f
print df['city'].ix[1:6]
est.f.loc[df['city'].ix[1:6]].values

De Forest
1     De Forest
2     De Forest
3     De Forest
4     De Forest
5    Mc Farland
6    Mc Farland
Name: city, dtype: object


array([[ 3.75],
       [ 3.75],
       [ 3.75],
       [ 3.75],
       [ 3.1 ],
       [ 3.1 ]])

In [79]:
[est.predict(x) for x in df['city'].ix[1:6]]

De Forest
De Forest
De Forest
De Forest
Mc Farland
Mc Farland


[array([ 3.75]),
 array([ 3.75]),
 array([ 3.75]),
 array([ 3.75]),
 array([ 3.1]),
 array([ 3.1])]

In [52]:
good_columns=df._get_numeric_data()
type(good_columns)

pandas.core.frame.DataFrame