In [13]:
import pandas as pd

## Processing the Yelp data
- Load into a dataframe
- Choose the city with the most Yelp businesses among the cities we have Airbnb data for
- Filter by city
- [ ] TODO: Remove unwanted columns
- Save to CSV


In [14]:
# NOTE: the file was too big for GitHub; the dataset was downloaded from: https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_business.json
# lines=True: the file is parsed as one JSON object per line.
yelp_business_path = './datasets/yelp_academic_dataset_business.json'
yelp_data = pd.read_json(yelp_business_path, lines=True)

# 1. Filter for city

In [15]:
# Distinct values of the `city` column in the Yelp business data.
yelp_cities = yelp_data['city'].unique()

In [16]:
# Cities for which we have Airbnb data.
airbnb_cities = [
    'Austin',
    'Boston',
    'Cambridge',
    'Chicago',
    'Los Angeles',
    'Oakland',
    'San Francisco',
    'Seattle',
]

In [17]:
# Keep only the Airbnb cities whose name also appears (exact match) among the Yelp cities.
potential_cities = [city for city in airbnb_cities if city in yelp_cities]

In [18]:
def city_yelp_count(c, data=None):
  """Return the number of rows whose `city` value equals `c` exactly.

  Parameters
  ----------
  c : str
      City name compared against the `city` column (exact, case-sensitive match).
  data : pandas.DataFrame, optional
      Frame with a `city` column. When omitted, the notebook-level
      `yelp_data` frame is used, which preserves the original behaviour.

  Returns
  -------
  int
      Number of matching rows.
  """
  if data is None:
    data = yelp_data
  # Summing the boolean mask counts matches without building the filtered frame.
  return int((data['city'] == c).sum())

In [19]:
# Map each candidate city to its number of Yelp businesses.
city_sizes = dict(zip(potential_cities, map(city_yelp_count, potential_cities)))

In [20]:
# Display the per-city business counts.
city_sizes

{'Austin': 22416, 'Boston': 8263, 'Cambridge': 2433, 'Oakland': 11}

We choose Austin since it has the largest amount of available data of the candidate cities (22,416 businesses).

In [None]:
# List every Yelp city value that contains 'aus' or 'Aus'.
[city for city in yelp_cities if 'aus' in city or 'Aus' in city]

In [35]:
# Keep every business whose city contains 'aus' or 'Aus'.
# The vectorised .str.contains replaces the per-row Python loop, and na=False
# treats a missing city as "no match" instead of raising a TypeError.
yelp_data_austin = yelp_data[yelp_data.city.str.contains('aus|Aus', na=False)]

In [36]:
# Sanity check: list the states present among ALL matched rows.
# (Previously this went through .head(), so only the first five rows were checked.)
yelp_data_austin.state.unique()

array(['TX'], dtype=object)

In [37]:
# Renumber the rows 0..n-1 after filtering. drop=True discards the old index directly;
# the earlier .drop('index', 1) passed `axis` positionally, which pandas 2.0 no longer accepts.
yelp_data_austin = yelp_data_austin.reset_index(drop=True)

In [39]:
# Preview the first rows of the filtered frame.
yelp_data_austin.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,N3_Gs3DnX4k9SgpwJxdEfw,Lane Wells Jewelry Repair,"7801 N Lamar Blvd, Ste A140",Austin,TX,78752,30.346169,-97.711458,5.0,30,1,"{'RestaurantsPriceRange2': '1', 'ByAppointment...","Shopping, Jewelry Repair, Appraisal Services, ...","{'Monday': '12:15-17:0', 'Tuesday': '12:15-17:..."
1,tXvdYGvlEceDljN8gt2_3Q,Capital City Barber Shop,"615 W Slaughter Ln, Ste 113",Austin,TX,78748,30.172706,-97.79992,4.0,5,0,"{'BusinessAcceptsCreditCards': 'False', 'Resta...","Barbers, Beauty & Spas","{'Monday': '9:0-17:0', 'Tuesday': '9:0-19:0', ..."
2,nTIhpR7MhsALPwg_Hh14EA,DoubleTree by Hilton Hotel Austin,6505 N Interstate 35,Austin,TX,78752,30.326377,-97.704543,3.0,139,1,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...","Hotels, Hotels & Travel, Event Planning & Serv...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
3,8XyEpVdAO0o6iVkVxkWosQ,PS Property Management Company,"2506 S Lamar Blvd, Ste 2",Austin,TX,78704,30.246465,-97.778738,4.5,9,1,{'BusinessAcceptsCreditCards': 'True'},"Home Services, Real Estate, Property Management","{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."
4,NVfOn7TdnHbaGH97CVB_Qg,McKinley Chiropractic,"5625 Eiger Rd, Ste 160",Austin,TX,78735,30.244902,-97.857409,5.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Accept...","Chiropractors, Health & Medical","{'Monday': '9:0-17:45', 'Tuesday': '12:0-17:45..."


In [44]:
# Inspect the raw `attributes` value of the row labelled 0.
yelp_data_austin['attributes'].loc[0]

{'RestaurantsPriceRange2': '1',
 'ByAppointmentOnly': 'False',
 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
 'BusinessAcceptsCreditCards': 'True',
 'DogsAllowed': 'True',
 'RestaurantsDelivery': 'None',
 'BusinessAcceptsBitcoin': 'False',
 'BikeParking': 'True',
 'RestaurantsTakeOut': 'None',
 'WheelchairAccessible': 'True'}

## Flattening attributes

In [46]:
# Replace missing `attributes` values with an empty dict; other values are kept as-is.
yelp_data_austin['attributes'] = yelp_data_austin['attributes'].apply(
    lambda attrs: {} if pd.isna(attrs) else attrs
)

In [50]:
# Expand each `attributes` dict into its own columns and append them to the frame.
# join() aligns on the index: json_normalize yields a fresh 0..n-1 index, which matches
# the frame's index after the earlier reset_index.
# (Removed a stray trailing '.' that made this cell a SyntaxError.)
attribute_columns = pd.json_normalize(yelp_data_austin.attributes.tolist())
yelp_data_austin = yelp_data_austin.join(attribute_columns)

In [51]:
# List the columns now that the attribute keys have been added.
yelp_data_austin.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'RestaurantsPriceRange2',
       'ByAppointmentOnly', 'BusinessParking', 'BusinessAcceptsCreditCards',
       'DogsAllowed', 'RestaurantsDelivery', 'BusinessAcceptsBitcoin',
       'BikeParking', 'RestaurantsTakeOut', 'WheelchairAccessible', 'WiFi',
       'AcceptsInsurance', 'RestaurantsGoodForGroups', 'HasTV',
       'RestaurantsReservations', 'OutdoorSeating', 'NoiseLevel', 'Ambience',
       'GoodForKids', 'RestaurantsAttire', 'CoatCheck', 'GoodForDancing',
       'Caters', 'RestaurantsTableService', 'HappyHour', 'Music', 'Alcohol',
       'GoodForMeal', 'Smoking', 'BestNights', 'BYOBCorkage',
       'HairSpecializesIn', 'DriveThru', 'BYOB', 'Corkage',
       'RestaurantsCounterService', 'Open24Hours', 'AgesAllowed',
       'DietaryRestrictions'],
      dtype='object')

In [53]:
# Preview the first rows of the frame including the flattened attribute columns.
yelp_data_austin.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,BestNights,BYOBCorkage,HairSpecializesIn,DriveThru,BYOB,Corkage,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
0,N3_Gs3DnX4k9SgpwJxdEfw,Lane Wells Jewelry Repair,"7801 N Lamar Blvd, Ste A140",Austin,TX,78752,30.346169,-97.711458,5.0,30,...,,,,,,,,,,
1,tXvdYGvlEceDljN8gt2_3Q,Capital City Barber Shop,"615 W Slaughter Ln, Ste 113",Austin,TX,78748,30.172706,-97.79992,4.0,5,...,,,,,,,,,,
2,nTIhpR7MhsALPwg_Hh14EA,DoubleTree by Hilton Hotel Austin,6505 N Interstate 35,Austin,TX,78752,30.326377,-97.704543,3.0,139,...,,,,,,,,,,
3,8XyEpVdAO0o6iVkVxkWosQ,PS Property Management Company,"2506 S Lamar Blvd, Ste 2",Austin,TX,78704,30.246465,-97.778738,4.5,9,...,,,,,,,,,,
4,NVfOn7TdnHbaGH97CVB_Qg,McKinley Chiropractic,"5625 Eiger Rd, Ste 160",Austin,TX,78735,30.244902,-97.857409,5.0,5,...,,,,,,,,,,


In [57]:
# Number of rows that have a non-null RestaurantsPriceRange2 value.
int(yelp_data_austin.RestaurantsPriceRange2.notna().sum())

10262

## Next steps
- [ ] Figure out how to flatten the `hours` column sensibly
- [ ] Consider dropping or renaming some of the attribute columns
  - e.g. does anyone really care whether a business accepts Bitcoin (`BusinessAcceptsBitcoin`)?

In [None]:
# Persist the Austin subset. The row index is written as well (to_csv default),
# so it appears as an unnamed first column in the CSV.
austin_csv_path = './datasets/yelp_data_austin.csv'
yelp_data_austin.to_csv(austin_csv_path)