In [1]:
import pandas as pd
import numpy as np
import math
import requests
from pandas.io.json import json_normalize
import pandas_profiling

### Yelp API

To get more details (price range, rating, cuisine,...) on the establishments, we use the Yelp Fusion API and follow these two steps:


1. **Business Match** : The Business Match Endpoint lets us match business data from other sources against businesses on Yelp. We send data from Chicago Food Inspection (namely name and address of establishments) and get back the Yelp id of the establishment. 


2. **Business Details** : The Business Details Endpoint returns detailed business content based on their id (which we have obtained in the previous step). 

**API Keys**

Along with the parameters we send with each request, we need to send an API key with the headers. This allows Yelp to make sure that we do not exceed 5'000 requests per API Key. 
To get these, we simply need to visit the [Yelp Fusion](https://www.yelp.com/developers/v3/manage_app) and create a project. 

In [7]:
API_KEY = 'ZvQ095-O2_NVNygRjjsFK8NhruSE-V51GV5oBUoEvGh52RnsRBHG_9CTx6aoZ1fpQgp8Si2WEZJv-flkhaIRuyKCw9g5OUStUTY9SNhO5BasXGa7KWhWeB80GIvaXXYx'
HEADERS = {'Authorization': 'bearer %s' % API_KEY}

#### BUSINESS MATCH ENDPOINT: Make requests about establishment from the Chicago Food Inspections dataset

In [3]:
ENDPOINT = 'https://api.yelp.com/v3/businesses/matches'

#### Loading the inspections dataset
This dataframe contains all establishments from the Chicago Food Inspections

In [10]:
estab = pd.read_pickle("insp_businesses_pickle")

In [11]:
estab = estab[['DBA Name', 'AKA Name', 'Address']]
estab.head()

Unnamed: 0,DBA Name,AKA Name,Address
0,YOUNG SCHOLARS ACADEMY,YOUNG SCHOLARS ACADEMY,10926-10928 S WESTERN AVE
1,OSITO'S TAP,OSITO'S TAP,2553 S RIDGEWAY AVE
2,LAS SISTERS INC.,LAS SISTERS INC,2700 S TRUMBULL AVE
3,CARNICERIA Y FRUTERIA 'LOS ALTOS' INC.,CARNICERIA Y FRUTERIA LOS ALTOS,2959 W 40TH ST
4,FOODA- 150 N RIVERSIDE,FOODA 150 N RIVERSIDE,150 N RIVERSIDE PLZ


In [12]:
# to this dataframe, we add the columns where we will record the Yelp answers
estab['response_id'] = None
estab['response_alias'] = None
estab['response_name'] =  None
estab['response_latitude'] =  None
estab['response_longitude'] =  None
estab['response_address1'] =  None

In [13]:
def fill_df(dataframe, ENDPOINT):
    """
    Given a dataframe with restaurants
    and Given a list of responses
    Returns the same dataframe with additional columns coming from Yelp
    (condition : If there was 1 match with Yelp)
    And fills up the list of reponses with all responses
    """
    result = dataframe.copy()
    resps =  []
    for req_nb in range(len(result)):
        if(req_nb%100 == 0):
            print(f'req {req_nb} over {len(result)}')
        PARAM = {
        'name' : result.loc[req_nb]['DBA_Name'],
        'address1':result.loc[req_nb]['Address'],
        'city': 'Chicago',
        'state': 'IL',
        'country': 'US'}
        response = requests.get(url=ENDPOINT, params=PARAM, headers = HEADERS)
        resps.append(response)
        response_df = json_normalize(response.json()['businesses'])
        #if we get a match, we fill the dataframe 
        if (len(response_df) == 1):
            result['response_id'][req_nb] = response_df['id'][0]
            result['response_alias'][req_nb] = response_df['alias'][0]
            result['response_name'][req_nb] = response_df['name'][0]
            result['response_latitude'][req_nb] = response_df['coordinates.latitude'][0]
            result['response_longitude'][req_nb] = response_df['coordinates.longitude'][0]
            result['response_address1'][req_nb] = response_df['location.address1'][0]
    return result, resps

In [None]:
yelp_resp_df_0_5000 = estab[0:5000].copy()
#yelp_resp_df_0_5000, yelp_resp_0_5000 = fill_df(yelp_resp_0_5000, ENDPOINT)

In [27]:
yelp_resp_df_0_5000 =  pd.read_pickle('yelp_match_0_4999.pickle')
yelp_resp_df_0_5000

Unnamed: 0,DBA_Name,AKA_Name,Address,response_id,response_alias,response_name,response_latitude,response_longitude,response_address1
0,YOUNG SCHOLARS ACADEMY,YOUNG SCHOLARS ACADEMY,10926-10928 S WESTERN AVE,,,,,,
1,OSITO'S TAP,OSITO'S TAP,2553 S RIDGEWAY AVE,,,,,,
2,LAS SISTERS INC.,LAS SISTERS INC,2700 S TRUMBULL AVE,,,,,,
3,CARNICERIA Y FRUTERIA 'LOS ALTOS' INC.,CARNICERIA Y FRUTERIA LOS ALTOS,2959 W 40TH ST,jleGfLhZuxvRUdqAFstVGg,carniceria-y-fruteria-los-altos-chicago,Carniceria Y Fruteria Los Altos,41.8206,-87.6993,2959 W 40th St
4,FOODA- 150 N RIVERSIDE,FOODA 150 N RIVERSIDE,150 N RIVERSIDE PLZ,,,,,,
...,...,...,...,...,...,...,...,...,...
4995,CONGAS,CONGAS,7021 W HIGGINS AVE,qSrQDsG2BYdCmphu3-qrvA,congas-chicago,Congas,41.9802,-87.8033,7021 W Higgins Ave
4996,"EL POLLO GRILL, INC.","EL POLLO GRILL, INC.",2839 S PULASKI RD,,,,,,
4997,WRIGLEYVILLE ROOFTOPS,WRIGLEYVILLE ROOFTOPS,3637 N SHEFFIELD AVE,R4wLWK510K8VpYi_WJzX9g,wrigley-rooftops-chicago-9,Wrigley Rooftops,41.9478,-87.654,3609 N Sheffield
4998,CHEF XIONG,CHEF XIONG,2143 S ARCHER AVE,Ydn0Q4uqtokhlHTIwOHeTA,chef-xiong-taste-of-szechuan-chicago,Chef Xiong - Taste of Szechuan,41.8534,-87.6333,2143 S Archer Ave


In [17]:
#yelp_resp_df_0_5000.isna().sum().apply(lambda x: '{}% missing values'.format(100 * x/len(yelp_resp_df_0_5000)))

DBA_Name                0.0% missing values
AKA_Name               0.64% missing values
Address                 0.0% missing values
response_id           30.72% missing values
response_alias        30.72% missing values
response_name         30.72% missing values
response_latitude     30.72% missing values
response_longitude    30.72% missing values
response_address1     30.72% missing values
dtype: object

#### BUSINESS DETAILS ENDPOINT: extract price range, rating and cuisine
Now that we have done the match requests and obtained the establishments' ids, we can do more requests to get details. 

In [18]:
# we get rid of the unmatched establishments
matched_responses_5000 = yelp_resp_df_0_5000.dropna(subset=['response_id']).reset_index(drop=True)

In [19]:
# we send the ids and fill the details_5000 dataframe with the reponses
details_5000 = pd.DataFrame()
for req_nb in range(len(matched_responses_5000)):
    if (req_nb%100 == 0):
        print(f'Request {req_nb} over {len(matched_responses_5000)}')
    #REQUEST_URL = 'https://api.yelp.com/v3/businesses/' + matched_responses_5000['response_id'][req_nb]
    #response = requests.get(url=REQUEST_URL, headers = HEADERS)
    #details_5000 = details_5000.append(json_normalize(response.json()))

Request 0 over 3464


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Request 100 over 3464
Request 200 over 3464
Request 300 over 3464
Request 400 over 3464
Request 500 over 3464
Request 600 over 3464
Request 700 over 3464
Request 800 over 3464
Request 900 over 3464
Request 1000 over 3464
Request 1100 over 3464
Request 1200 over 3464
Request 1300 over 3464
Request 1400 over 3464
Request 1500 over 3464
Request 1600 over 3464
Request 1700 over 3464
Request 1800 over 3464
Request 1900 over 3464
Request 2000 over 3464
Request 2100 over 3464
Request 2200 over 3464
Request 2300 over 3464
Request 2400 over 3464
Request 2500 over 3464
Request 2600 over 3464
Request 2700 over 3464
Request 2800 over 3464
Request 2900 over 3464
Request 3000 over 3464
Request 3100 over 3464
Request 3200 over 3464
Request 3300 over 3464
Request 3400 over 3464


In [29]:
details.columns

Index(['alias', 'categories', 'coordinates.latitude', 'coordinates.longitude',
       'display_phone', 'error.code', 'error.description', 'hours', 'id',
       'image_url', 'is_claimed', 'is_closed', 'location.address1',
       'location.address2', 'location.address3', 'location.city',
       'location.country', 'location.cross_streets',
       'location.display_address', 'location.state', 'location.zip_code',
       'messaging.url', 'messaging.use_case_text', 'name', 'phone', 'photos',
       'price', 'rating', 'review_count', 'special_hours', 'transactions',
       'url'],
      dtype='object')

In [36]:
#details_5000 = details_5000[['alias', 'categories', 'hours', 'id', 'is_claimed', 'is_closed', 'coordinates.latitude', 'coordinates.longitude', 'name', 'price', 'rating', 'review_count', 'special_hours']]
details_5000= pd.read_pickle('details_0_5000.pickle')
details_5000

Unnamed: 0,alias,categories,hours,id,is_claimed,is_closed,coordinates.latitude,coordinates.longitude,name,price,rating,review_count,special_hours
0,carniceria-y-fruteria-los-altos-chicago,"[{'alias': 'grocery', 'title': 'Grocery'}]","[{'open': [{'is_overnight': False, 'start': '0...",jleGfLhZuxvRUdqAFstVGg,True,False,41.820629,-87.699310,Carniceria Y Fruteria Los Altos,$,3.5,2.0,
0,als-beef-chicago-19,"[{'alias': 'italian', 'title': 'Italian'}, {'a...","[{'open': [{'is_overnight': False, 'start': '1...",KwnuxV_YLkHUfP3dhe_qZg,True,True,41.878517,-87.626351,Al's Beef,$,3.0,196.0,
0,peking-mandarin-chicago,"[{'alias': 'chinese', 'title': 'Chinese'}, {'a...","[{'open': [{'is_overnight': False, 'start': '1...",4xQ7x-Td1nFshx7J0Bd6pw,True,False,41.968245,-87.715123,Peking Mandarin,$,4.0,166.0,
0,la-humita-chicago,"[{'alias': 'latin', 'title': 'Latin American'}]","[{'open': [{'is_overnight': False, 'start': '1...",bh55xTHTQGLx95rPIj79ug,True,False,41.944680,-87.727680,La Humita,$$,4.0,81.0,
0,chicago-produce-chicago,"[{'alias': 'markets', 'title': 'Fruits & Veggi...","[{'open': [{'is_overnight': False, 'start': '0...",WzXgyBi6K36UVEiluU7Rlg,False,False,41.968636,-87.716194,Chicago Produce,$,3.5,24.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,forno-rosso-pizzeria-napoletana-chicago-3,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...","[{'open': [{'is_overnight': False, 'start': '1...",2cDt9At556hf146QH_XkhQ,True,False,41.884570,-87.654060,Forno Rosso Pizzeria Napoletana,$$,4.5,403.0,
0,congas-chicago,"[{'alias': 'colombian', 'title': 'Colombian'},...","[{'open': [{'is_overnight': True, 'start': '17...",qSrQDsG2BYdCmphu3-qrvA,True,False,41.980191,-87.803283,Congas,$$,2.5,67.0,
0,wrigley-rooftops-chicago-9,"[{'alias': 'venues', 'title': 'Venues & Event ...","[{'open': [{'is_overnight': False, 'start': '1...",R4wLWK510K8VpYi_WJzX9g,True,False,41.947837,-87.654044,Wrigley Rooftops,,4.0,163.0,
0,chef-xiong-taste-of-szechuan-chicago,"[{'alias': 'szechuan', 'title': 'Szechuan'}]","[{'open': [{'is_overnight': False, 'start': '1...",Ydn0Q4uqtokhlHTIwOHeTA,True,False,41.853382,-87.633317,Chef Xiong - Taste of Szechuan,,4.0,29.0,


In [35]:
#details_5000.to_pickle('details_0_5000.pickle')