# Searching for Sushi in New York
- Andrea Cohen
- 03.01.2023

## Imports

In [1]:
import pandas as pd
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

## Credentials and Accessing the API

In [2]:
# Load API credentials from the ".secret" folder in the current user's home
# directory. expanduser() resolves "~" for whoever runs the notebook, so the
# cell is not tied to one machine's absolute path.
with open(os.path.expanduser('~/.secret/yelp_api.json')) as f:
    login = json.load(f)
# Instantiate the Yelp client with the stored key (timeout_s=5.0 per request)
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

## Define Search

In [3]:
# Search parameters sent with every Yelp request in this notebook
LOCATION = "New York, NY"   # where to search
TERM = "Sushi"              # what to search for

## Create a results-in-progress JSON file, but only if it doesn't exist

In [4]:
# Path of the results-in-progress file (folder + filename).
# The location and search term are part of the name so different
# searches do not overwrite each other.
JSON_FILE = 'Data/results_in_progress_NewYork_sushi.json'
JSON_FILE

'Data/results_in_progress_NewYork_sushi.json'

In [5]:
## Does the results-in-progress file already exist?
file_exists = os.path.isfile(JSON_FILE)
if file_exists:
    # Nothing to create; just tell the user
    print(f"[i] {JSON_FILE} already exists.")
else:
    # Build the parent folder first when the path includes one
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # Tell the user, then start the file off as an empty JSON list
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

[i] Data/results_in_progress_NewYork_sushi.json already exists.


## Determine how many results are already in the file

In [6]:
## Read back whatever has been saved so far
with open(JSON_FILE) as f:
    previous_results = json.load(f)
## The number of saved results doubles as the offset for the next request
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


## Figure out how many pages of results we will need

In [7]:
# Request one page of matches, starting just past what is already saved
search_results = yelp_api.search_query(
    location=LOCATION,
    term=TERM,
    offset=n_results,
)
# Show which top-level fields the response carries
search_results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
## Total number of matches reported for this search
total_results = search_results['total']
## Number of businesses actually returned on this page
results_per_page = len(search_results['businesses'])
# Pages needed for the results not yet saved, rounded up with math.ceil.
# A page with no businesses would make the divisor zero, so treat that case
# as "nothing to fetch" instead of raising ZeroDivisionError.
if results_per_page > 0:
    n_pages = math.ceil((total_results - n_results) / results_per_page)
else:
    n_pages = 0
n_pages

260

## Add this page of results to .json file

In [9]:
# Fold this page's businesses into the running list, then write it back out
previous_results += search_results['businesses']
with open(JSON_FILE, mode='w') as f:
    json.dump(previous_results, f)

## For Loop to call each page, including a progress bar

In [10]:
# First attempt at fetching every page: one request per iteration, using the
# number of results already saved as the offset.
# NOTE: nothing here stops the offset from growing past what the API accepts,
# so this run ends with the YelpAPIError shown in the output below
# ("limit+offset must be <= 1000").
for i in tqdm_notebook(range(1,n_pages+1)):
    ## read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results to use as offset
    n_results = len(previous_results)
    ## use n_results as the OFFSET
    search_results = yelp_api.search_query(location=LOCATION,
                                   term=TERM,
                                   offset=n_results)
    ## append new results and save to file
    ## (saving every iteration keeps the pages fetched so far if a later request fails)
    previous_results.extend(search_results['businesses'])
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)
    # add a 200ms pause between requests
    time.sleep(.2)

  0%|          | 0/260 [00:00<?, ?it/s]

YelpAPIError: VALIDATION_ERROR: Too many results requested, limit+offset must be <= 1000.

## Define a function to delete the previous results file and create a new one

In [11]:
def create_json_file(JSON_FILE, delete_if_exists=False):
    """Make sure JSON_FILE exists, optionally resetting it to an empty list.

    Args:
        JSON_FILE: Path of the JSON file; may include a folder, which is
            created if it is missing.
        delete_if_exists: When truthy, an existing file is deleted and
            replaced by a new file holding an empty list. When falsy
            (the default), an existing file is left untouched.

    Returns:
        None. Progress messages are printed.
    """
    if os.path.isfile(JSON_FILE):
        if not delete_if_exists:
            # Keep the existing file and its contents
            print(f"[i] {JSON_FILE} already exists.")
            return
        print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
        os.remove(JSON_FILE)

    ## The file is missing (or was just deleted): start it off as an empty list
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")
    # Create the parent folder only when the path actually includes one
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

## Use the new function 

In [12]:
## Create a new empty json file (deleting the previous one if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)
## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')
# use yelp_api variable's search_query method to perform API call
results = yelp_api.search_query(location=LOCATION,
                               term=TERM,
                               offset=n_results)
## How many results total?
total_results = results['total']
## How many did we get the details for?
results_per_page = len(results['businesses'])
# Use math.ceil to round up for the total number of pages of results.
# A page with no businesses would make the divisor zero, so treat that case
# as "nothing to fetch" instead of raising ZeroDivisionError.
if results_per_page > 0:
    n_pages = math.ceil((total_results - n_results) / results_per_page)
else:
    n_pages = 0
n_pages

[!] Data/results_in_progress_NewYork_sushi.json already exists. Deleting previous file...
[i] Data/results_in_progress_NewYork_sushi.json not found. Saving empty list to new file.
- 0 previous results found.


260

## For Loop to call each page, including a progress bar and a break if too many results are requested

In [13]:
# The API rejects any request where limit + offset is greater than this value
MAX_RESULTS = 1000

for i in tqdm_notebook(range(1,n_pages+1)):
    ## read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results to use as offset
    n_results = len(previous_results)
    ## stop before requesting a page that would go past the cap
    if (n_results + results_per_page) > MAX_RESULTS:
        print('Exceeded 1000 api calls. Stopping loop.')
        break
    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=LOCATION,
                                   term=TERM,
                                   offset=n_results)
    ## An empty page would leave the offset unchanged, so every remaining
    ## iteration would repeat the same request; stop instead.
    if not results['businesses']:
        print('No more results returned. Stopping loop.')
        break
    ## append new results and save to file
    previous_results.extend(results['businesses'])
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)
    # add a 200ms pause between requests
    time.sleep(.2)

  0%|          | 0/260 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


## After the loop has finished, convert .json to dataframe

In [14]:
# Read the accumulated JSON results into a DataFrame
final_df = pd.read_json(JSON_FILE)
# Preview both ends of the table
display(final_df.head())
display(final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
0,PjjpgjY_sdawJU1JHbyNTQ,temakase-new-york-4,Temakase,https://s3-media3.fl.yelpcdn.com/bphoto/mPHfCH...,False,https://www.yelp.com/biz/temakase-new-york-4?a...,619,"[{'alias': 'sushi', 'title': 'Sushi Bars'}]",4.5,"{'latitude': 40.729711, 'longitude': -73.987127}","[delivery, pickup]","{'address1': '157 Second Ave', 'address2': Non...",12123811456,(212) 381-1456,2619.986445,
1,BPZfuPvSxCfoppU4lPFm0Q,domodomo-new-york-new-york-4,DOMODOMO - New York,https://s3-media1.fl.yelpcdn.com/bphoto/YX89PD...,False,https://www.yelp.com/biz/domodomo-new-york-new...,1527,"[{'alias': 'japanese', 'title': 'Japanese'}, {...",4.0,"{'latitude': 40.7280299, 'longitude': -74.00167}",[delivery],"{'address1': '140 W Houston St', 'address2': N...",16467070301,(646) 707-0301,3007.453428,$$$
2,KrJ6m_TkxBAPPSNH-G7rvQ,u-gu-brooklyn-4,U-gu,https://s3-media3.fl.yelpcdn.com/bphoto/xi4Z31...,False,https://www.yelp.com/biz/u-gu-brooklyn-4?adjus...,179,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.5,"{'latitude': 40.69393, 'longitude': -73.96284}","[delivery, pickup]","{'address1': '541 Myrtle Ave', 'address2': 'St...",17188570222,(718) 857-0222,2003.832021,$$
3,pqqhHTz_ZpApsaWGa_zMTA,douska-new-york,Douska,https://s3-media1.fl.yelpcdn.com/bphoto/mVPExH...,False,https://www.yelp.com/biz/douska-new-york?adjus...,192,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.5,"{'latitude': 40.7191369, 'longitude': -73.9908...",[],"{'address1': '63 Delancey St', 'address2': '',...",16466570908,(646) 657-0908,1662.674529,
4,W0YGwDTYRjUCBgtpqHXSAg,iwak-brooklyn,IWak,https://s3-media1.fl.yelpcdn.com/bphoto/JesScm...,False,https://www.yelp.com/biz/iwak-brooklyn?adjust_...,44,"[{'alias': 'sushi', 'title': 'Sushi Bars'}]",5.0,"{'latitude': 40.71244591155105, 'longitude': -...","[delivery, pickup]","{'address1': '340 Grand St', 'address2': None,...",19299924925,(929) 992-4925,2028.677522,


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
995,G9SWgxH65OBQaJaE8dHkbw,zest-ramen-new-york-8,Zest Ramen,https://s3-media4.fl.yelpcdn.com/bphoto/WedjEl...,False,https://www.yelp.com/biz/zest-ramen-new-york-8...,317,"[{'alias': 'ramen', 'title': 'Ramen'}, {'alias...",4.5,"{'latitude': 40.71816, 'longitude': -73.99186}","[delivery, restaurant_reservation, pickup]","{'address1': '112 Eldridge St', 'address2': No...",12122262801,(212) 226-2801,1630.2861,$$
996,g4VEbY00UvMdu9X9JWgOBQ,inatome-japanese-steak-sushi-valley-stream,Inatome Japanese Steak + Sushi,https://s3-media3.fl.yelpcdn.com/bphoto/lrCnnz...,False,https://www.yelp.com/biz/inatome-japanese-stea...,281,"[{'alias': 'japanese', 'title': 'Japanese'}, {...",4.0,"{'latitude': 40.65671, 'longitude': -73.70005}","[delivery, pickup]","{'address1': '6 5th St', 'address2': '', 'addr...",15168720419,(516) 872-0419,24186.397466,$$
997,XsrgJT6dl06kAaecLD5ECA,asagao-sushi-croton-on-hudson,Asagao Sushi,https://s3-media4.fl.yelpcdn.com/bphoto/B77b0N...,False,https://www.yelp.com/biz/asagao-sushi-croton-o...,143,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.0,"{'latitude': 41.2005096, 'longitude': -73.8852...",[delivery],"{'address1': '8 Maple St', 'address2': None, '...",19142710770,(914) 271-0770,55443.023224,$$
998,JCWMlkBqLxbAAK9-FQXTmg,kushi-asian-fusion-brooklyn-3,Kushi Asian Fusion,https://s3-media2.fl.yelpcdn.com/bphoto/tXz__4...,False,https://www.yelp.com/biz/kushi-asian-fusion-br...,138,"[{'alias': 'asianfusion', 'title': 'Asian Fusi...",3.5,"{'latitude': 40.6053, 'longitude': -73.99972}","[delivery, pickup]","{'address1': '1934 86st', 'address2': None, 'a...",17189753588,(718) 975-3588,11447.916804,$$
999,9xRIcn0jTyvWNH9aBO3oAA,kenka-new-york,Kenka,https://s3-media2.fl.yelpcdn.com/bphoto/sl7IDI...,False,https://www.yelp.com/biz/kenka-new-york?adjust...,1646,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",3.5,"{'latitude': 40.72913, 'longitude': -73.988465}",[delivery],"{'address1': '25 St Marks Pl', 'address2': '',...",12122546363,(212) 254-6363,2584.349321,$$


## Check for duplicates

In [15]:
# Count rows whose business id already appeared earlier in the table
final_df['id'].duplicated().sum()

3

In [16]:
## Keep only the first row seen for each business id
final_df = final_df.drop_duplicates(subset=['id'])
## Recount to confirm that no duplicate ids remain
final_df['id'].duplicated().sum()

0

## Save the final DataFrame to a .csv (or a .csv.gz if it's too big for the GitHub file size limit).

In [17]:
# Write the de-duplicated results to a gzip-compressed CSV, leaving out the index
final_df.to_csv(
    'Data/final_results_NewYork_sushi.csv.gz',
    index=False,
    compression='gzip',
)