In [2]:
import numpy as np
import pandas as pd

In [3]:
yelp = pd.read_csv('../datasets/yelp_data_austin.csv')

In [4]:
yelp.columns

Index(['Unnamed: 0', 'business_id', 'name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
       'attributes', 'categories', 'hours', 'RestaurantsPriceRange2',
       'ByAppointmentOnly', 'DogsAllowed', 'RestaurantsDelivery',
       'RestaurantsTakeOut', 'WheelchairAccessible',
       'RestaurantsGoodForGroups', 'OutdoorSeating', 'NoiseLevel', 'Ambience',
       'GoodForKids', 'HappyHour', 'Alcohol', 'MonHours', 'TuesHours',
       'WedHours', 'ThursHours', 'FriHours', 'SatHours', 'SunHours'],
      dtype='object')

In [5]:
yelp.describe(include='object')

Unnamed: 0,business_id,name,address,city,state,attributes,categories,hours,RestaurantsPriceRange2,RestaurantsGoodForGroups,...,NoiseLevel,Ambience,Alcohol,MonHours,TuesHours,WedHours,ThursHours,FriHours,SatHours,SunHours
count,5512,5512,5385,5512,5512,5512,5512,5512,5512,4047,...,5512,3695,3966,1025,1043,1078,1106,1105,1071,920
unique,5512,4183,4224,2,3,5383,4160,2947,5,2,...,4,304,7,209,238,243,262,269,269,224
top,1Eq16r_MSnvPsnIykBdy9w,Subway,1720 Barton Springs Rd,Austin,TX,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Mexican, Restaurants",{},1,True,...,average,"{'romantic': False, 'intimate': False, 'touris...",u'none',0:0-0:0,11:0-21:0,11:0-21:0,11:0-22:0,11:0-22:0,11:0-22:0,11:0-21:0
freq,1,52,13,5510,5509,11,87,874,2938,3600,...,4594,712,1449,275,104,106,97,94,95,92


## Flatten Ambience

In [6]:
import json

In [7]:
def load_bad_json(s):
  t = s.replace("\'","\"").replace("F","f").replace("T","t").replace("None", "null")
  try:
    ret = json.loads(t)
    if ret == False:
      print(t)
    return ret
  except Exception as e:
    print(e)
    print(t)

In [8]:
len(yelp[yelp.Ambience=='False'])

44

In [9]:
yelp.Ambience = yelp.Ambience.apply(lambda x : '{}' if pd.isna(x) or x == 'False' else x).apply(load_bad_json)

In [10]:
yelp = yelp.join(pd.json_normalize(yelp.Ambience))

In [11]:
yelp.describe(include='object')

Unnamed: 0,business_id,name,address,city,state,attributes,categories,hours,RestaurantsPriceRange2,RestaurantsGoodForGroups,...,SunHours,romantic,intimate,classy,hipster,divey,touristy,trendy,upscale,casual
count,5512,5512,5385,5512,5512,5512,5512,5512,5512,4047,...,920,3543,3505,3551,3439,3426,3535,3436,3551,3592
unique,5512,4183,4224,2,3,5383,4160,2947,5,2,...,224,2,2,2,2,2,2,2,2,2
top,1Eq16r_MSnvPsnIykBdy9w,Subway,1720 Barton Springs Rd,Austin,TX,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Mexican, Restaurants",{},1,True,...,11:0-21:0,False,False,False,False,False,False,False,False,True
freq,1,52,13,5510,5509,11,87,874,2938,3600,...,92,3491,3455,2902,3239,3290,3523,3145,3502,2215


## Make things right type

In [12]:
## Drop restaurants with no price range
yelp = yelp[yelp.RestaurantsPriceRange2 != 'False']

In [13]:
yelp = yelp.astype({
  'RestaurantsPriceRange2': 'int64',
  'RestaurantsGoodForGroups': 'bool', 
  'OutdoorSeating': 'bool',
  'romantic': 'bool',
  'intimate': 'bool',
  'classy': 'bool',
  'hipster': 'bool',
  'divey': 'bool',
  'touristy': 'bool',
  'trendy': 'bool',
  'upscale': 'bool',
  'casual': 'bool',
})

## Process alcohol

In [14]:
## alcohol
yelp.Alcohol.unique()

array([nan, "u'full_bar'", "u'none'", "u'beer_and_wine'", "'none'",
       "'full_bar'", "'beer_and_wine'", 'False'], dtype=object)

In [15]:
# make full_bar another attribute
fb = yelp.Alcohol.map(lambda s: (not pd.isna(s)) and 'full_bar' in s)
yelp.Alcohol = yelp.Alcohol.apply(lambda s: (not pd.isna(s)) and ('full_bar' in s or 'beer_and_wine' in s))

In [16]:
fb = fb.rename('FullBar')
yelp = yelp.join(fb)

In [17]:
yelp.describe(include='bool')

Unnamed: 0,ByAppointmentOnly,DogsAllowed,RestaurantsDelivery,RestaurantsTakeOut,WheelchairAccessible,RestaurantsGoodForGroups,OutdoorSeating,GoodForKids,HappyHour,Alcohol,romantic,intimate,classy,hipster,divey,touristy,trendy,upscale,casual,FullBar
count,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510,5510
unique,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
top,False,False,False,True,False,True,True,True,False,False,False,False,False,False,False,False,False,False,True,False
freq,5488,4761,3116,4411,3974,5063,3789,3416,4325,3445,3598,3600,3001,3450,3514,3638,3359,3592,4075,4340


## Process categories

In [18]:
# categories: split and map category -> business_id's
category_list = []

for row in yelp.itertuples():
    cats = row.categories.split(", ")
    for c in cats:
        category_list.append([row.business_id, c])
print(len(category_list))

category_table =  pd.DataFrame(category_list, columns=['business_id', 'category'])

26843


In [19]:
## cols to drop
to_drop = [
  'city', 'state', ## cause all Austin, TX 
  'attributes', ## flattened
  'categories', ## in separate categories table 
  'Ambience', ## flattened
  'NoiseLevel',
]

In [20]:
## remove city, state
yelp = yelp.drop(to_drop, axis=1)

In [21]:
yelp.to_csv('../datasets/yelp_data_austin2.csv')

In [22]:
category_table.to_csv('../datasets/yelp_categories2.csv')

In [23]:
yelp.dtypes

Unnamed: 0                    int64
business_id                  object
name                         object
address                      object
postal_code                 float64
latitude                    float64
longitude                   float64
stars                       float64
review_count                  int64
hours                        object
RestaurantsPriceRange2        int64
ByAppointmentOnly              bool
DogsAllowed                    bool
RestaurantsDelivery            bool
RestaurantsTakeOut             bool
WheelchairAccessible           bool
RestaurantsGoodForGroups       bool
OutdoorSeating                 bool
GoodForKids                    bool
HappyHour                      bool
Alcohol                        bool
MonHours                     object
TuesHours                    object
WedHours                     object
ThursHours                   object
FriHours                     object
SatHours                     object
SunHours                    