In [15]:
import numpy as np
import pandas as pd
import os
import json
from collections import Counter
from ast import literal_eval

In [16]:
# Load the Yelp business records. lines=True makes pandas read the file as
# JSON Lines (one JSON object per line) instead of a single JSON document.
business_df = pd.read_json('data/yelp_reviews/yelp_academic_dataset_business.json', lines=True)

In [17]:
# Keep only businesses whose category string mentions "food" or "restaurants"
# (matched as a case-insensitive substring). Rows without a category string
# are treated as non-matches via na=False.
is_food = business_df['categories'].str.lower().str.contains('food|restaurants', na=False)
food_data = business_df[is_food]

In [18]:


# Running tally: attribute name -> number of businesses that list it.
counter = Counter()


def count(d, counter):
    """Add one occurrence of every key of ``d`` to ``counter``.

    Args:
        d: Mapping whose keys are tallied, or ``None`` (nothing is counted).
        counter: ``collections.Counter`` updated in place.

    Returns:
        None.
    """
    if d is None:
        return
    counter.update(d.keys())
        
# Tally attribute names over every food business. A plain loop is used because
# count() is called only for its side effect on `counter`.
for business_attributes in food_data['attributes']:
    count(business_attributes, counter)

# Show (attribute, count) pairs from most to least common.
counter.most_common()


[('RestaurantsTakeOut', 57816),
 ('BusinessParking', 56465),
 ('BusinessAcceptsCreditCards', 56166),
 ('RestaurantsPriceRange2', 54604),
 ('RestaurantsDelivery', 53775),
 ('OutdoorSeating', 46134),
 ('BikeParking', 44291),
 ('RestaurantsReservations', 43577),
 ('HasTV', 42426),
 ('GoodForKids', 42028),
 ('Ambience', 41940),
 ('WiFi', 41840),
 ('RestaurantsGoodForGroups', 41828),
 ('Alcohol', 40829),
 ('Caters', 39854),
 ('RestaurantsAttire', 39058),
 ('NoiseLevel', 35872),
 ('GoodForMeal', 29074),
 ('RestaurantsTableService', 19858),
 ('WheelchairAccessible', 16605),
 ('HappyHour', 13717),
 ('DogsAllowed', 13270),
 ('BusinessAcceptsBitcoin', 7916),
 ('DriveThru', 7675),
 ('Music', 5828),
 ('ByAppointmentOnly', 4938),
 ('BestNights', 4672),
 ('CoatCheck', 4590),
 ('BYOB', 4298),
 ('GoodForDancing', 3772),
 ('Smoking', 3654),
 ('Corkage', 3474),
 ('BYOBCorkage', 1442),
 ('AgesAllowed', 88),
 ('AcceptsInsurance', 77),
 ('DietaryRestrictions', 31),
 ('Open24Hours', 24),
 ('RestaurantsCount

In [19]:

# Running tally: "attribute: value" label -> number of businesses with that option.
attribute_counter = Counter()


def count_options_explicit(d, counter):
    """Tally every concrete attribute option of one business into ``counter``.

    Each entry of ``d`` contributes one or more labels:

    * a flat value adds ``"<attribute>: <value>"``;
    * a value holding a dict -- either a real dict or its string literal such
      as ``"{'garage': False, 'lot': True}"`` -- adds one
      ``"<attribute>: <sub_key> : <sub_value>"`` label per sub-key.

    A leading ``u`` in front of a quoted string (``"u'free'"``, presumably a
    left-over unicode-literal prefix) is dropped so that ``"u'free'"`` and
    ``"'free'"`` are counted as the same option. The cleaned value is also
    written back into ``d``; later code that reads the same dict objects will
    see the cleaned form.

    Args:
        d: Mapping of attribute name to value, or ``None`` (nothing is
            counted).
        counter: ``collections.Counter`` updated in place.

    Returns:
        None.

    Raises:
        ValueError, SyntaxError: if a string value containing ``{`` is not a
            valid Python literal (raised by ``ast.literal_eval``).
    """
    if d is None:
        return

    labels = []
    # Snapshot the items: values may be reassigned in `d` while looping.
    for entry, value in list(d.items()):
        if isinstance(value, str) and "{" in value:
            # Nested options are stored as the string form of a dict.
            nested = literal_eval(value)
        elif isinstance(value, dict):
            nested = value
        else:
            nested = None

        if nested is not None:
            labels.extend(
                f'{entry}: {sub_key} : {nested[sub_key]}' for sub_key in nested.keys()
            )
            continue

        # Only strip a *leading* u-prefix. Testing for "u'" anywhere in the
        # string would also hit values such as "'menu'" and chop their
        # opening quote.
        if isinstance(value, str) and value.startswith("u'"):
            value = value[1:]
            d[entry] = value
        labels.append(f'{entry}: {value}')

    counter.update(labels)
        
# Tally every attribute option over all food businesses. The function is called
# only for its side effects, so a plain loop is used; a loop also leaves the
# cell with no value to display.
for business_attributes in food_data['attributes']:
    count_options_explicit(business_attributes, attribute_counter)


In [20]:
# Number of distinct option labels ("attribute: value" and
# "attribute: sub_key : value") tallied in attribute_counter.
len(attribute_counter)

221

In [41]:
# Keep the most frequently listed 75% of attribute names, as a plain list of
# names ordered from most to least common.
top_fraction = .75
keep = int(len(counter) * top_fraction)
attributes = [name for name, _ in counter.most_common()[:keep]]
attributes

['RestaurantsTakeOut',
 'BusinessParking',
 'BusinessAcceptsCreditCards',
 'RestaurantsPriceRange2',
 'RestaurantsDelivery',
 'OutdoorSeating',
 'BikeParking',
 'RestaurantsReservations',
 'HasTV',
 'GoodForKids',
 'Ambience',
 'WiFi',
 'RestaurantsGoodForGroups',
 'Alcohol',
 'Caters',
 'RestaurantsAttire',
 'NoiseLevel',
 'GoodForMeal',
 'RestaurantsTableService',
 'WheelchairAccessible',
 'HappyHour',
 'DogsAllowed',
 'BusinessAcceptsBitcoin',
 'DriveThru',
 'Music',
 'ByAppointmentOnly',
 'BestNights',
 'CoatCheck',
 'BYOB']

In [48]:
# Build one flat row per food business: its id, the value of each of the most
# common attributes, and the parking attribute expanded into one column per
# parking option.

# Recompute the top-75% attribute names here instead of reusing `attributes`:
# this cell rebinds `attributes` to the output column names at the end, so
# starting from `counter` keeps the cell safe to re-run.
top_attributes = [name for name, _ in counter.most_common()[:int(len(counter) * .75)]]

parking_options = ['garage', 'street', 'validated', 'lot', 'valet']

# Find the parking attribute by name rather than by a fixed position, so the
# expansion does not depend on how the counter happens to be ordered. Raises
# ValueError if the attribute did not make the top-75% cut.
parking_attribute = 'BusinessParking'
parking_index = top_attributes.index(parking_attribute)

# Placeholder for an attribute (or parking option) a business does not list.
unkown = "NaN"

# Cells per row, excluding the id: every top attribute, with the single parking
# attribute replaced by one cell per parking option.
row_width = len(top_attributes) - 1 + len(parking_options)

attribute_data = []
none_count = 0
for business_id, row in zip(food_data.business_id, food_data.attributes):
    if row is None:
        none_count += 1
        # Must be as wide as the expanded rows below; a shorter row leaves the
        # trailing columns to be padded by pandas instead of holding "unknown".
        # NOTE(review): this placeholder ("unknown") differs from `unkown`
        # ("NaN") used for individual missing attributes -- confirm that the
        # distinction is intended.
        values = ["unknown"] * row_width
    else:
        values = [row.get(att, unkown) for att in top_attributes]

        # The parking attribute is stored as the string form of a dict; any
        # other content (absent, "None", ...) counts as no information.
        raw_parking = values[parking_index]
        if isinstance(raw_parking, str) and "{" in raw_parking:
            parking = literal_eval(raw_parking)
        else:
            parking = {}

        expanded = [parking.get(option, unkown) for option in parking_options]
        values = values[:parking_index] + expanded + values[parking_index + 1:]
    attribute_data.append((business_id, *values))

# Column names matching the row layout above.
# NOTE(review): "buisness_id" is misspelled (the source column is
# `business_id`); kept unchanged because the saved attribute JSON uses this
# header -- confirm with consumers before renaming.
attributes = (
    ["buisness_id"]
    + top_attributes[:parking_index]
    + parking_options
    + top_attributes[parking_index + 1:]
)
print(none_count)

876


In [49]:
# Assemble the per-business rows into a table; `attributes` holds the column
# names (id column, then attribute and parking-option columns).
attributes_df = pd.DataFrame(attribute_data, columns=attributes)

In [50]:
# Preview the attribute table (pandas abbreviates long/wide frames on display).
attributes_df

Unnamed: 0,buisness_id,RestaurantsTakeOut,garage,street,validated,lot,valet,BusinessAcceptsCreditCards,RestaurantsPriceRange2,RestaurantsDelivery,...,WheelchairAccessible,HappyHour,DogsAllowed,BusinessAcceptsBitcoin,DriveThru,Music,ByAppointmentOnly,BestNights,CoatCheck,BYOB
0,MTSW4McQd7CbVtyjqoe9mw,True,False,True,False,False,False,False,1,False,...,,,,,,,False,,,
1,mWMc6_wTdE0EUBKIGXDVfA,True,,,,True,False,True,,,...,True,,,,,,,,,
2,CF33F8-E6oudUQ46HnavjQ,True,,,,,,True,1,True,...,True,False,False,,True,,False,,False,
3,k0hlBqXX-Bt0vf1op7Jr1w,True,False,False,False,True,False,True,1,False,...,,,,,,,,,,
4,bBDDEgkFA1Otx9Lfe7BZUQ,True,False,False,False,False,False,True,1,True,...,True,False,False,,True,,False,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64611,cM6V90ExQD6KMSU3rRB5ZA,False,False,False,False,True,False,True,1,False,...,,,,,,,,,,
64612,1jx1sfgjgVg0nM6n3p0xWA,True,False,False,False,True,False,True,2,False,...,True,,False,False,,,,,,
64613,WnT9NIzQgLlILjPT0kEcsQ,True,,True,,False,False,True,2,True,...,False,False,,,,,,,,
64614,2O2K6SXPWv56amqxCECd4w,True,False,True,False,False,False,True,1,False,...,True,,,False,,,,,,


In [None]:
# Sanity check: confirm the type of the object returned by pd.read_json.
type(business_df)


pandas.core.frame.DataFrame

In [None]:
# Persist the filtered food/restaurant businesses. The frame keeps the row
# labels of business_df, so the saved index is not contiguous.
food_data.to_json("data/cleaned/yelp_academic_dataset_restaurant.json")

In [None]:
# Read the saved file back to verify that it round-trips.
check_df = pd.read_json('data/cleaned/yelp_academic_dataset_restaurant.json')
check_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': ''full_bar'', 'R...","Pubs, Restaurants, Italian, Bars, American (Tr...",
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.768170,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,1181 N Milwaukee St,Boise,ID,83704,43.615401,-116.284689,4.0,33,1,"{'WiFi': ''free'', 'RestaurantsGoodForGroups':...","Cafes, Juice Bars & Smoothies, Coffee & Tea, R...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-17:0', '..."
150328,1jx1sfgjgVg0nM6n3p0xWA,Savaya Coffee Market,11177 N Oracle Rd,Oro Valley,AZ,85737,32.409552,-110.943073,4.5,41,1,"{'BusinessParking': '{'garage': False, 'street...","Specialty Food, Food, Coffee & Tea, Coffee Roa...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-14:0', '..."
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,"{'WheelchairAccessible': 'False', 'Restaurants...","Restaurants, Mexican","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
150339,2O2K6SXPWv56amqxCECd4w,The Plum Pit,4405 Pennell Rd,Aston,DE,19014,39.856185,-75.427725,4.5,14,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."


In [54]:
# Persist the flattened attribute table next to the cleaned business file.
attributes_df.to_json("data/cleaned/yelp_academic_dataset_restaurant_att.json")

In [55]:
# Read the saved attribute table back to verify that it round-trips.
pd.read_json('data/cleaned/yelp_academic_dataset_restaurant_att.json') 


Unnamed: 0,buisness_id,RestaurantsTakeOut,garage,street,validated,lot,valet,BusinessAcceptsCreditCards,RestaurantsPriceRange2,RestaurantsDelivery,...,WheelchairAccessible,HappyHour,DogsAllowed,BusinessAcceptsBitcoin,DriveThru,Music,ByAppointmentOnly,BestNights,CoatCheck,BYOB
0,MTSW4McQd7CbVtyjqoe9mw,True,False,True,False,False,False,False,1,False,...,,,,,,,False,,,
1,mWMc6_wTdE0EUBKIGXDVfA,True,,,,True,False,True,,,...,True,,,,,,,,,
2,CF33F8-E6oudUQ46HnavjQ,True,,,,,,True,1,True,...,True,False,False,,True,,False,,False,
3,k0hlBqXX-Bt0vf1op7Jr1w,True,False,False,False,True,False,True,1,False,...,,,,,,,,,,
4,bBDDEgkFA1Otx9Lfe7BZUQ,True,False,False,False,False,False,True,1,True,...,True,False,False,,True,,False,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64611,cM6V90ExQD6KMSU3rRB5ZA,False,False,False,False,True,False,True,1,False,...,,,,,,,,,,
64612,1jx1sfgjgVg0nM6n3p0xWA,True,False,False,False,True,False,True,2,False,...,True,,False,False,,,,,,
64613,WnT9NIzQgLlILjPT0kEcsQ,True,,True,,False,False,True,2,True,...,False,False,,,,,,,,
64614,2O2K6SXPWv56amqxCECd4w,True,False,True,False,False,False,True,1,False,...,True,,,False,,,,,,
