In [1]:
#Import dependencies
import pandas as pd
import numpy as np
import requests
import json
import warnings

from yelpapi import YelpAPI
from pprint import pprint

warnings.filterwarnings('ignore')

In [2]:
# Sources: 
#https://github.com/davecom/yelp-api-v3/blob/master/docs/api-references/businesses-id-reviews.md
#https://python.gotrained.com/yelp-fusion-api-tutorial/
#https://github.com/gfairchild/yelpapi

#To get the below parameters, you must visit https://www.yelp.com/developers/v3/manage_app and create an account
#Parameters for connecting Yelp API
Client_ID = '-bO633o8iXUQpt5NQpfM5A'
API_Key = 'Qkz_aqKy7I6YxaTOxpPOgh73GNXNw-eM8Pzjiel-Ffm6aGmrMnxrSbilTI0Ax7riPvNyyoQXvaauKS11-F5KZiyCcSOjFt-yCY9ckQwmLfiZC1C6P6bJvIThzRvVXXYx'

headers = {'Authorization': 'Bearer %s' % API_Key}
headers

{'Authorization': 'Bearer Qkz_aqKy7I6YxaTOxpPOgh73GNXNw-eM8Pzjiel-Ffm6aGmrMnxrSbilTI0Ax7riPvNyyoQXvaauKS11-F5KZiyCcSOjFt-yCY9ckQwmLfiZC1C6P6bJvIThzRvVXXYx'}

In [3]:
yelp_api = YelpAPI(API_Key)
yelp_api

<yelpapi.yelpapi.YelpAPI at 0x117ff0950>

In [4]:
#Read the data
cfi_data = pd.read_csv('datasets/food-inspections.csv')

In [5]:
#Keep only the columns needed for sending request
cfi_data_for_yelp = cfi_data[['DBA Name', 'Address', 'City', 'State']]

In [6]:
#Add another column which contains country information since it is needed in the business_match_query below
cfi_data_for_yelp['country'] = 'US'

Look if there is null value in data 

In [7]:
cfi_data_for_yelp.isnull().sum()

DBA Name      0
Address       0
City        138
State        42
country       0
dtype: int64

In [8]:
cfi_data_for_yelp = cfi_data_for_yelp.dropna()

In [9]:
cfi_data_for_yelp.isnull().sum()

DBA Name    0
Address     0
City        0
State       0
country     0
dtype: int64

In [10]:
cfi_data_for_yelp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195818 entries, 0 to 195978
Data columns (total 5 columns):
DBA Name    195818 non-null object
Address     195818 non-null object
City        195818 non-null object
State       195818 non-null object
country     195818 non-null object
dtypes: object(5)
memory usage: 9.0+ MB


In [11]:
cfi_data_for_yelp = cfi_data_for_yelp.drop_duplicates()
cfi_data_for_yelp.reset_index(drop=True)

Unnamed: 0,DBA Name,Address,City,State,country
0,"NEW KNOWLEDGE LEARNING CENTER, INC.",8440 S KEDZIE AVE,CHICAGO,IL,US
1,TACOS MONTANAS,3254 W LAWRENCE AVE,CHICAGO,IL,US
2,MONGOLIAN CUISINE,4640 N CUMBERLAND AVE,CHICAGO,IL,US
3,ALCOCER'S LOCAL SHOP,3413 W 51ST ST,CHICAGO,IL,US
4,GABY'S PANADERIA Y PIZZERIA,5050-5054 W FULLERTON AVE,CHICAGO,IL,US
...,...,...,...,...,...
31615,BUTTER,130 S GREEN ST,CHICAGO,IL,US
31616,SAFAH FOOD & LIQUOR INC,7105 S RACINE AVE,CHICAGO,IL,US
31617,DUNKIN DONUTS,970 W PERSHING RD,CHICAGO,IL,US
31618,RAINBOW GROCERY,2003 W 69TH ST,CHICAGO,IL,US


Split data according to daily request number which is 5000 requests per day

In [12]:
cfi_0_5000      = cfi_data_for_yelp.iloc[:5000].reset_index(drop=True)
cfi_5000_10000  = cfi_data_for_yelp.iloc[5000:10000].reset_index(drop=True)
cfi_10000_15000 = cfi_data_for_yelp.iloc[10000:15000].reset_index(drop=True)
cfi_15000_20000 = cfi_data_for_yelp.iloc[15000:20000].reset_index(drop=True)
cfi_20000_25000 = cfi_data_for_yelp.iloc[20000:25000].reset_index(drop=True)
cfi_25000_30000 = cfi_data_for_yelp.iloc[25000:30000].reset_index(drop=True)
cfi_30000_up    = cfi_data_for_yelp.iloc[30000:].reset_index(drop=True)

In [13]:
list_of_splitted_cfi_df = [cfi_0_5000, cfi_5000_10000, cfi_10000_15000, cfi_15000_20000, 
                           cfi_20000_25000, cfi_25000_30000, cfi_30000_up]    

#add new columns to the splitted data frames to fill after scraping
for df in list_of_splitted_cfi_df:
    df['Yelp_name'] = np.NaN
    df['Yelp_review_rating'] = np.NaN
    df['Yelp_review_count'] = np.NaN #Request returns always 3 reviews per request
    df['Yelp_rating'] = np.NaN
    df['Yelp_total_review_count'] = np.NaN
    df['Yelp_review_text'] = np.NaN
    df['Yelp_review_time_created'] = np.NaN


In [15]:

df = cfi_30000_up #for example
for i in range(len(df)):
    """
    Run this code by changing df to one of the below dataframes in each run
    cfi_0_5000     
    cfi_5000_10000 
    cfi_10000_15000
    cfi_15000_20000
    cfi_20000_25000
    cfi_25000_30000
    cfi_30000_up   
    """
    
    df_new = df.iloc[i]
    #Business search query with selected parameters
    response = yelp_api.business_match_query(name=df_new['DBA Name'],address1=df_new['Address'],city=df_new['City'],state=df_new['State'],country='US') 
    #pprint(response)

    #Example response
#     {'businesses': [{'alias': 'panaderia-y-pizzeria-la-villa-chicago',
#                  'coordinates': {'latitude': 41.92445, 'longitude': -87.75329},
#                  'display_phone': '(773) 622-6489',
#                  'id': 'jiZ-MHkjV1xUa_iUChhPGg',
#                  'location': {'address1': '5050 W Fullerton',
#                               'address2': '',
#                               'address3': '',
#                               'city': 'Chicago',
#                               'country': 'US',
#                               'display_address': ['5050 W Fullerton',
#                                                   'Chicago, IL 60639'],
#                               'state': 'IL',
#                               'zip_code': '60639'},
#                  'name': 'Panaderia Y Pizzeria La Villa',
#                  'phone': '+17736226489'}]}

    #Do following only if the business is not empty in the response and gather review informations for each business id
    if response['businesses'] != []:
        #pprint(response)
        df['Yelp_name'].iloc[i] = response['businesses'][0]['name']

        url = "https://api.yelp.com/v3/businesses/" + response['businesses'][0]['id'] + "/reviews"
        req = requests.get(url, headers=headers)

        parsed = json.loads(req.text)
        #pprint(parsed)
        ratings_given = [parsed['reviews'][i]['rating'] for i in range(len(parsed['reviews']))]
        #users' rating for searched business id
        df['Yelp_review_rating'].iloc[i] = str(ratings_given) 
        #Request returns always 3 reviews per request if there are more than 3 reviews
        df['Yelp_review_count'].iloc[i] = len(ratings_given) 
        df['Yelp_rating'].iloc[i] = np.mean(ratings_given)
        df['Yelp_total_review_count'].iloc[i] = parsed['total']
        #users' review as text for searched business id
        df['Yelp_review_text'].iloc[i] = str([parsed['reviews'][i]['text'] for i in range(len(parsed['reviews']))])
        df['Yelp_review_time_created'] = str([parsed['reviews'][i]['time_created'] for i in range(len(parsed['reviews']))])


In [16]:
#change the name of the output file based on the df that you used above
df.to_csv("datasets/cfi_30000_up"+"_scraped"+".csv") 


In [17]:
df_cfi_30000_up_scraped = pd.read_csv("datasets/cfi_30000_up_scraped.csv")
df_cfi_30000_up_scraped.head()

Unnamed: 0.1,Unnamed: 0,DBA Name,Address,City,State,country,Yelp_name,Yelp_review_rating,Yelp_review_text,Yelp_review_count,Yelp_total_review_count,Yelp_review_time_created,Yelp_rating
0,0,GRACIES,1119-1121 W WEBSTER AVE,CHICAGO,IL,US,,,,,,"['2019-07-30 06:30:37', '2018-10-20 17:53:20',...",
1,1,SWEET CAROLINE'S,324 N LEAVITT ST,CHICAGO,IL,US,Sweet Ride,"[5, 1, 3]","[""Based on the other reviews, I don't know wha...",3.0,49.0,"['2019-07-30 06:30:37', '2018-10-20 17:53:20',...",3.0
2,2,LA BAHIA,4111 N LINCOLN AVE,CHICAGO,IL,US,La Bahia,"[5, 5, 4]",['We live across the street and decided to giv...,3.0,18.0,"['2019-07-30 06:30:37', '2018-10-20 17:53:20',...",4.666667
3,3,Nancy B. Jefferson Alternative School,1100 S Hamilton ST,CHICAGO,IL,US,,,,,,"['2019-07-30 06:30:37', '2018-10-20 17:53:20',...",
4,4,"ERNESTO'S GROCERY STORE, INC.",2758 S RIDGEWAY AVE,CHICAGO,IL,US,,,,,,"['2019-07-30 06:30:37', '2018-10-20 17:53:20',...",


In [19]:
str(df_cfi_30000_up_scraped['Yelp_review_text'][2]).split(',')

["['We live across the street and decided to give our new neighborhood restaurant a shot.  We were very impressed by the wide array of items on the menu - and...'",
 " 'So this place was really great. We were driving around looking for a place to eat and came upon La Bahia. There was even street parking. Sold.\\n\\nWe were the...'",
 ' "Well it\'s about time I try out this place',
 ' as I live right around the corner. I hadn\'t eaten all day and was riding my bike around the neighborhood when La..."]']