In [2]:
import pandas as pd

## Processing the yelp data
- Load into dataframe
- Choose city with largest size that we have Airbnb data for
- Filter by city
- [ ] TODO: Remove unwanted columns
- Save to CSV


In [3]:
# NOTE: file was too big for github, dataset was downloaded from: https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_business.json
yelp_data = pd.read_json('./datasets/yelp_academic_dataset_business.json', lines=True) 

# 1. Filter for city

In [4]:
yelp_cities = yelp_data.city.unique()

In [5]:
airbnb_cities = ['Austin', 'Boston', 'Cambridge', 'Chicago', 'Los Angeles', 'Oakland', 'San Francisco', 'Seattle']

In [6]:
potential_cities = list(filter(lambda c: c in yelp_cities, airbnb_cities))

In [7]:
def city_yelp_count(c):
  return len(yelp_data[yelp_data.city == c])

In [8]:
city_sizes = {c: city_yelp_count(c) for c in potential_cities}

In [9]:
city_sizes

{'Austin': 22416, 'Boston': 8263, 'Cambridge': 2433, 'Oakland': 11}

We choose Austin since it has the largest available amount of data

In [10]:
list(filter(lambda c: 'aus' in c or 'Aus' in c, yelp_cities))

['Austin',
 'austin',
 'West Austin',
 'Austin SW',
 'Austin ',
 'Greater Austin',
 'Austin - Westlake Hills',
 'Austell',
 'Austin texas',
 'Southeast Austin',
 'Auston',
 'Austin.',
 'Austin, TX',
 'Lake Austin',
 'Downtown Austin',
 'Austin Tx']

In [11]:
yelp_data_austin = yelp_data[['aus' in c or 'Aus' in c for c in yelp_data.city]]

In [12]:
yelp_data_austin.head().state.unique()

array(['TX'], dtype=object)

In [13]:
yelp_data_austin = yelp_data_austin.reset_index().drop('index', 1)

  yelp_data_austin = yelp_data_austin.reset_index().drop('index', 1)


In [14]:
yelp_data_austin.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,N3_Gs3DnX4k9SgpwJxdEfw,Lane Wells Jewelry Repair,"7801 N Lamar Blvd, Ste A140",Austin,TX,78752,30.346169,-97.711458,5.0,30,1,"{'RestaurantsPriceRange2': '1', 'ByAppointment...","Shopping, Jewelry Repair, Appraisal Services, ...","{'Monday': '12:15-17:0', 'Tuesday': '12:15-17:..."
1,tXvdYGvlEceDljN8gt2_3Q,Capital City Barber Shop,"615 W Slaughter Ln, Ste 113",Austin,TX,78748,30.172706,-97.79992,4.0,5,0,"{'BusinessAcceptsCreditCards': 'False', 'Resta...","Barbers, Beauty & Spas","{'Monday': '9:0-17:0', 'Tuesday': '9:0-19:0', ..."
2,nTIhpR7MhsALPwg_Hh14EA,DoubleTree by Hilton Hotel Austin,6505 N Interstate 35,Austin,TX,78752,30.326377,-97.704543,3.0,139,1,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...","Hotels, Hotels & Travel, Event Planning & Serv...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
3,8XyEpVdAO0o6iVkVxkWosQ,PS Property Management Company,"2506 S Lamar Blvd, Ste 2",Austin,TX,78704,30.246465,-97.778738,4.5,9,1,{'BusinessAcceptsCreditCards': 'True'},"Home Services, Real Estate, Property Management","{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."
4,NVfOn7TdnHbaGH97CVB_Qg,McKinley Chiropractic,"5625 Eiger Rd, Ste 160",Austin,TX,78735,30.244902,-97.857409,5.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Accept...","Chiropractors, Health & Medical","{'Monday': '9:0-17:45', 'Tuesday': '12:0-17:45..."


In [15]:
yelp_data_austin.attributes[0]

{'RestaurantsPriceRange2': '1',
 'ByAppointmentOnly': 'False',
 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
 'BusinessAcceptsCreditCards': 'True',
 'DogsAllowed': 'True',
 'RestaurantsDelivery': 'None',
 'BusinessAcceptsBitcoin': 'False',
 'BikeParking': 'True',
 'RestaurantsTakeOut': 'None',
 'WheelchairAccessible': 'True'}

## Flattening attributes

In [16]:
yelp_data_austin.attributes=yelp_data_austin.attributes.apply(lambda x: {} if pd.isna(x) else x)

In [17]:
yelp_data_austin = yelp_data_austin.join(pd.json_normalize(yelp_data_austin.attributes.tolist()))

In [18]:
yelp_data_austin.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'RestaurantsPriceRange2',
       'ByAppointmentOnly', 'BusinessParking', 'BusinessAcceptsCreditCards',
       'DogsAllowed', 'RestaurantsDelivery', 'BusinessAcceptsBitcoin',
       'BikeParking', 'RestaurantsTakeOut', 'WheelchairAccessible', 'WiFi',
       'AcceptsInsurance', 'RestaurantsGoodForGroups', 'HasTV',
       'RestaurantsReservations', 'OutdoorSeating', 'NoiseLevel', 'Ambience',
       'GoodForKids', 'RestaurantsAttire', 'CoatCheck', 'GoodForDancing',
       'Caters', 'RestaurantsTableService', 'HappyHour', 'Music', 'Alcohol',
       'GoodForMeal', 'Smoking', 'BestNights', 'BYOBCorkage',
       'HairSpecializesIn', 'DriveThru', 'BYOB', 'Corkage',
       'RestaurantsCounterService', 'Open24Hours', 'AgesAllowed',
       'DietaryRestrictions'],
      dtype='object')

In [19]:
yelp_data_austin.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,BestNights,BYOBCorkage,HairSpecializesIn,DriveThru,BYOB,Corkage,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
0,N3_Gs3DnX4k9SgpwJxdEfw,Lane Wells Jewelry Repair,"7801 N Lamar Blvd, Ste A140",Austin,TX,78752,30.346169,-97.711458,5.0,30,...,,,,,,,,,,
1,tXvdYGvlEceDljN8gt2_3Q,Capital City Barber Shop,"615 W Slaughter Ln, Ste 113",Austin,TX,78748,30.172706,-97.79992,4.0,5,...,,,,,,,,,,
2,nTIhpR7MhsALPwg_Hh14EA,DoubleTree by Hilton Hotel Austin,6505 N Interstate 35,Austin,TX,78752,30.326377,-97.704543,3.0,139,...,,,,,,,,,,
3,8XyEpVdAO0o6iVkVxkWosQ,PS Property Management Company,"2506 S Lamar Blvd, Ste 2",Austin,TX,78704,30.246465,-97.778738,4.5,9,...,,,,,,,,,,
4,NVfOn7TdnHbaGH97CVB_Qg,McKinley Chiropractic,"5625 Eiger Rd, Ste 160",Austin,TX,78735,30.244902,-97.857409,5.0,5,...,,,,,,,,,,


In [20]:
len(yelp_data_austin[~yelp_data_austin.RestaurantsPriceRange2.isnull()])

10262

## Next steps
- [ ] figure out how to flatten the hours sensibly
- [ ] maybe want to drop some of these attribute columns/rename
  - like does anyone really care if it accepts bitcoin

In [21]:
yelp_data_austin.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'RestaurantsPriceRange2',
       'ByAppointmentOnly', 'BusinessParking', 'BusinessAcceptsCreditCards',
       'DogsAllowed', 'RestaurantsDelivery', 'BusinessAcceptsBitcoin',
       'BikeParking', 'RestaurantsTakeOut', 'WheelchairAccessible', 'WiFi',
       'AcceptsInsurance', 'RestaurantsGoodForGroups', 'HasTV',
       'RestaurantsReservations', 'OutdoorSeating', 'NoiseLevel', 'Ambience',
       'GoodForKids', 'RestaurantsAttire', 'CoatCheck', 'GoodForDancing',
       'Caters', 'RestaurantsTableService', 'HappyHour', 'Music', 'Alcohol',
       'GoodForMeal', 'Smoking', 'BestNights', 'BYOBCorkage',
       'HairSpecializesIn', 'DriveThru', 'BYOB', 'Corkage',
       'RestaurantsCounterService', 'Open24Hours', 'AgesAllowed',
       'DietaryRestrictions'],
      dtype='object')

In [57]:
# remove all non-food places
yelp_data_austin = yelp_data_austin[~yelp_data_austin.categories.isnull()]
yelp_data_austin = yelp_data_austin[yelp_data_austin['categories'].str.contains("Restaurant|Food")]

yelp_data_austin.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,BestNights,BYOBCorkage,HairSpecializesIn,DriveThru,BYOB,Corkage,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
5,Xw8tuI30T-xihpzwBV-zJg,El Pollo Rey,1725 E Riverside Dr,Austin,TX,78741,30.243493,-97.730141,5.0,11,...,,,,,,,,,,
7,NRPemqVb4qpWFF0Avq_6OQ,Eurasia Sushi Bar & Seafood,"7101 W Hwy 71, Ste C-13",Austin,TX,78735,30.234533,-97.877262,4.5,395,...,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,,
10,bRsDZ44CD3uhGnRY3NeQhQ,Wendy's,6247 Mcneil Drive,Austin,TX,78729,30.441875,-97.746581,2.0,46,...,,,,,,,,,,
11,z-0oY7VxQMQw3JHvdPejrA,Olmecas Mexican Restaurant,2121 E Oltorf St,Austin,TX,78741,30.230016,-97.732639,3.0,21,...,,'no',,,,,,,,
13,gjMCRTs0IZbw9bZLG837Bg,CaveMan Cuisine,,Austin,TX,78704,30.245673,-97.768836,1.5,6,...,,,,,,,,,,


In [58]:
# categories: split and map category -> business_id ** hard


5        Food Trucks, Restaurants, Specialty Food, Food...
7        Bars, Nightlife, Cocktail Bars, Seafood, Resta...
10                         Fast Food, Restaurants, Burgers
11       Coffee & Tea, Tex-Mex, Restaurants, Mexican, Food
13                            Food, Food Delivery Services
                               ...                        
22420                        Thai, Restaurants, Vegetarian
22423    Fashion, Department Stores, Grocery, Electroni...
22424                                 Restaurants, Chinese
22425    Restaurants, Bubble Tea, Coffee & Tea, Food, N...
22428                                   Food, Coffee & Tea
Name: categories, Length: 6355, dtype: object

In [67]:
# remove all rows without Rest. Price Range
yelp_data_austin = yelp_data_austin[~yelp_data_austin.RestaurantsPriceRange2.isnull()]

5512

In [68]:
# handle hours: come back to this
len(yelp_data_austin[~yelp_data_austin.hours.isnull()])

4638

In [164]:
# Set default values based on common practice

# By appt only: replace all N/A's with false
yelp_data_austin['ByAppointmentOnly'].fillna(False, inplace=True)

# Dogs allowed: set default to false
yelp_data_austin['DogsAllowed'].fillna(False, inplace=True)

# NoiseLevel
yelp_data_austin['NoiseLevel'].fillna('average', inplace=True)

# Happy Hour
yelp_data_austin['HappyHour'].fillna(False, inplace=True)

# Good For Kids
yelp_data_austin['GoodForKids'].fillna(False, inplace=True)

# Wheelchair Accessible
yelp_data_austin['WheelchairAccessible'].fillna(False, inplace=True)

# Wheelchair Accessible
yelp_data_austin['RestaurantsTakeOut'].fillna(False, inplace=True)


# Wheelchair Accessible
yelp_data_austin['RestaurantsDelivery'].fillna(False, inplace=True)

In [163]:
value = 'RestaurantsDelivery'
yelp_data_austin[~yelp_data_austin[value].isnull()]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,BestNights,BYOBCorkage,HairSpecializesIn,DriveThru,BYOB,Corkage,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
5,Xw8tuI30T-xihpzwBV-zJg,El Pollo Rey,1725 E Riverside Dr,Austin,TX,78741,30.243493,-97.730141,5.0,11,...,,,,,,,,,,
7,NRPemqVb4qpWFF0Avq_6OQ,Eurasia Sushi Bar & Seafood,"7101 W Hwy 71, Ste C-13",Austin,TX,78735,30.234533,-97.877262,4.5,395,...,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,,
10,bRsDZ44CD3uhGnRY3NeQhQ,Wendy's,6247 Mcneil Drive,Austin,TX,78729,30.441875,-97.746581,2.0,46,...,,,,,,,,,,
11,z-0oY7VxQMQw3JHvdPejrA,Olmecas Mexican Restaurant,2121 E Oltorf St,Austin,TX,78741,30.230016,-97.732639,3.0,21,...,,'no',,,,,,,,
16,qTdz29Nf3zoALiDDEKfWlw,Pho Oanh,"2121 E Oltorf St, Ste 12B",Austin,TX,78741,30.229913,-97.732649,3.5,15,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22419,erLD01SfdUbGWurNoJTAzw,The Vegan Yacht,5212 Manchaca Rd,Austin,TX,78745,30.218816,-97.795414,4.5,205,...,,,,,False,,,,,
22420,bsltkBbgcAUTaW3awpJpmg,Trai Mai Thai,603 W Live Oak,Austin,TX,78704,30.244371,-97.758238,3.0,6,...,,,,,,,,,,
22423,dO-GVMx-WaxDZ4TN6gBvVg,Target,5621 N I H 35,Austin,TX,78723,30.316057,-97.706760,3.0,70,...,,,,,,,,,,
22424,io_UQS_WwonlCPCfKy3Hdw,Super Bowl,"719 W William Cannon Dr, Ste 103",Austin,TX,78745,30.196750,-97.788452,3.5,214,...,,,,,,,,,,


In [165]:
# Columns to drop
cols = ['is_open', 'BusinessParking', 'BusinessAcceptsBitcoin', 
        'BusinessAcceptsCreditCards', 'Corkage', 'BikeParking', 
        'WiFi', 'AcceptsInsurance', 'BestNights', 'RestaurantsTableService', 
        'HairSpecializesIn', 'HasTV', 'RestaurantsReservations',
        'RestaurantsCounterService', 'DietaryRestrictions', 
        'AgesAllowed', 'Open24Hours', 'BYOB', 'DriveThru',
        'BYOBCorkage', 'Smoking', 'GoodForMeal', 'Music', 
        'Caters', 'GoodForDancing', 'CoatCheck', 'RestaurantsAttire'
        ]

# undecided: OutdoorSeating, RestaurantsGoodForGroups, Ambience, 

yelp_data_austin_clean = yelp_data_austin
for c in cols:
    yelp_data_austin_clean = yelp_data_austin_clean.drop(c, axis=1)

yelp_data_austin_clean.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'attributes',
       'categories', 'hours', 'RestaurantsPriceRange2', 'ByAppointmentOnly',
       'DogsAllowed', 'RestaurantsDelivery', 'RestaurantsTakeOut',
       'WheelchairAccessible', 'RestaurantsGoodForGroups', 'OutdoorSeating',
       'NoiseLevel', 'Ambience', 'GoodForKids', 'HappyHour', 'Alcohol'],
      dtype='object')

In [None]:
yelp_data_austin.to_csv('./datasets/yelp_data_austin.csv')