# Searching for Sushi in Honolulu
- Andrea Cohen
- 03.01.2023

## Imports

In [1]:
import pandas as pd
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

## Credentials and Accessing the API

In [2]:
# Load API credentials from a JSON file in the current user's home directory.
# os.path.expanduser resolves "~" for whoever runs the notebook, so the path
# is no longer tied to one machine-specific absolute location.
CREDENTIALS_PATH = os.path.expanduser('~/.secret/yelp_api.json')
with open(CREDENTIALS_PATH) as f:
    login = json.load(f)
# Instantiate the YelpAPI client used by every search below.
# The credentials file must contain an 'api-key' entry.
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

## Define Search

In [3]:
# Search parameters for the Yelp API calls: the place to search in and the
# search term. Both are passed to yelp_api.search_query further down.
LOCATION = 'Honolulu, HI'
TERM = 'Sushi'

## Create a results-in-progress JSON file, but only if it doesn't exist

In [4]:
# Path of the results-in-progress file (the path may include a folder).
# The filename includes the search terms (Honolulu, sushi).
# The bare name on the last line displays the path as the cell output.
JSON_FILE = "Data/results_in_progress_Honolulu_sushi.json"
JSON_FILE

'Data/results_in_progress_Honolulu_sushi.json'

In [5]:
## Create JSON_FILE holding an empty list, but only when it is missing,
## so a file written by an earlier run is left untouched.
file_exists = os.path.isfile(JSON_FILE)
if not file_exists:
    ## Make sure the parent folder exists before writing.
    folder = os.path.dirname(JSON_FILE)
    # dirname is an empty string when JSON_FILE has no folder component
    if folder:
        os.makedirs(folder, exist_ok=True)
    ## Tell the user, then seed the file with an empty list
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    with open(JSON_FILE, 'w') as new_file:
        json.dump([], new_file)
else:
    # Nothing to create: report and keep the existing contents
    print(f"[i] {JSON_FILE} already exists.")

[i] Data/results_in_progress_Honolulu_sushi.json not found. Saving empty list to file.


## Determine how many results are already in the file

In [6]:
## Read back everything saved so far
with open(JSON_FILE, 'r') as results_file:
    previous_results = json.load(results_file)
## The count of stored results is used as the offset for the next API call
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


## Figure out how many pages of results we will need

In [7]:
# First API call: request the page that starts right after the results
# already stored (offset = number of previous results).
search_results = yelp_api.search_query(
    location=LOCATION,
    term=TERM,
    offset=n_results,
)
# Show the top-level keys of the response
search_results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
## Total number of matches reported by the API (the 'total' field of the response)
total_results = search_results['total']
total_results

817

In [9]:
## Number of businesses actually returned in this one response;
## used below as the page size when computing the number of pages.
results_per_page = len(search_results['businesses'])
results_per_page

20

In [10]:
# Number of pages needed to cover the results not yet stored, rounded up with
# math.ceil so a partially filled last page still counts as a page.
remaining_results = search_results['total'] - n_results
# Guard: a search that returns no businesses gives results_per_page == 0,
# and dividing by it would raise ZeroDivisionError. Nothing to page through.
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
n_pages

41

## Add this page of results to .json file

In [11]:
# Append this page's businesses to the stored list (in place) and write the
# combined list back to the results file.
previous_results += search_results['businesses']
with open(JSON_FILE, 'w') as results_file:
    json.dump(previous_results, results_file)

## For Loop to call each page, including a progress bar

In [13]:
# Fetch the remaining pages one request at a time, saving after every page so
# an interrupted run can resume from the file.
#
# The first page was already fetched and saved before this loop, so n_pages
# iterations are one more than needed. The two early exits below stop the loop
# instead of sending a request past the end of the results. The progress bar
# may therefore finish below its total.
for i in tqdm_notebook(range(1, n_pages + 1)):
    ## read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## number of stored results doubles as the OFFSET of the next page
    n_results = len(previous_results)
    ## stop before calling the API once everything reported is stored;
    ## search_results here is the most recent response (initial call or last page)
    if n_results >= search_results['total']:
        break
    search_results = yelp_api.search_query(location=LOCATION,
                                           term=TERM,
                                           offset=n_results)
    new_businesses = search_results['businesses']
    ## an empty page means there is nothing further to collect
    if not new_businesses:
        break
    ## append new results and save to file
    previous_results.extend(new_businesses)
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)
    # 200ms pause between requests
    time.sleep(.2)

  0%|          | 0/41 [00:00<?, ?it/s]

## After the loop has finished, convert .json to dataframe

In [14]:
# Read the accumulated results file into a DataFrame.
# NOTE(review): the displayed `phone` values have no leading '+', which may be
# pd.read_json's dtype inference turning the column numeric; confirm against
# the raw JSON if the original strings matter.
final_df = pd.read_json(JSON_FILE)
# Preview the first and last rows
for preview in (final_df.head(), final_df.tail()):
    display(preview)

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,iaHVxo0VI0ekdbehlEgTvg,doraku-sushi-waikiki-honolulu-2,Doraku Sushi Waikiki,https://s3-media2.fl.yelpcdn.com/bphoto/bR4N0G...,False,https://www.yelp.com/biz/doraku-sushi-waikiki-...,3098,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.0,"{'latitude': 21.278638, 'longitude': -157.829006}","[delivery, pickup]",$$,"{'address1': '2233 Kalakaua Ave', 'address2': ...",18089223323,(808) 922-3323,4117.076621
1,qp9QUYs-8Ice2Xz5O6EBCg,ginza-sushi-honolulu-2,Ginza Sushi,https://s3-media1.fl.yelpcdn.com/bphoto/gO7kOp...,False,https://www.yelp.com/biz/ginza-sushi-honolulu-...,648,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.5,"{'latitude': 21.292727, 'longitude': -157.852452}",[pickup],$$,"{'address1': '1200 Ala Moana Blvd', 'address2'...",18085939797,(808) 593-9797,2293.353144
2,763KaVsX7VjQLVCruvb9AA,izakaya-torae-torae-honolulu,Izakaya Torae Torae,https://s3-media4.fl.yelpcdn.com/bphoto/-x-5Te...,False,https://www.yelp.com/biz/izakaya-torae-torae-h...,975,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.5,"{'latitude': 21.29593, 'longitude': -157.82958}",[delivery],$$,"{'address1': '1111 McCully St', 'address2': ''...",18089495959,(808) 949-5959,2450.845111
3,7lYS4dPB-QblFrX2kZWbWQ,zigu-honolulu-2,Zigu,https://s3-media3.fl.yelpcdn.com/bphoto/AwQcXC...,False,https://www.yelp.com/biz/zigu-honolulu-2?adjus...,563,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.5,"{'latitude': 21.28043, 'longitude': -157.82658}",[delivery],$$$,"{'address1': '413 Seaside Ave', 'address2': 'S...",18082129252,(808) 212-9252,4065.401169
4,hqIVEF1r-tpB785QnfIilA,katsumidori-sushi-honolulu,Katsumidori Sushi,https://s3-media3.fl.yelpcdn.com/bphoto/upHqZL...,False,https://www.yelp.com/biz/katsumidori-sushi-hon...,915,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.0,"{'latitude': 21.2861463283552, 'longitude': -1...",[delivery],$$$,"{'address1': '100 Holomoana St', 'address2': '...",18089467603,(808) 946-7603,2966.117835


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
812,fu-wEdjWbjwnOtlDx-xLEw,far-east-chop-suey-waipahu-2,Far East Chop Suey,https://s3-media1.fl.yelpcdn.com/bphoto/my74LK...,False,https://www.yelp.com/biz/far-east-chop-suey-wa...,114,"[{'alias': 'cantonese', 'title': 'Cantonese'}]",3.0,"{'latitude': 21.380474, 'longitude': -158.019791}",[],$,"{'address1': '94-300 Farrington Hwy', 'address...",18086717233,(808) 671-7233,19585.912551
813,JjOuJVIZ997U70HIJrewVg,kfc-kaneohe,KFC,https://s3-media1.fl.yelpcdn.com/bphoto/K67DZx...,False,https://www.yelp.com/biz/kfc-kaneohe?adjust_cr...,105,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",1.5,"{'latitude': 21.4025995027581, 'longitude': -1...","[delivery, pickup]",$,"{'address1': '45-480 Kaneohe Bay Dr', 'address...",18082360313,(808) 236-0313,11132.781929
814,tc_hpd_zOszjGd7csRZXeA,starbucks-waipahu-3,Starbucks,https://s3-media3.fl.yelpcdn.com/bphoto/06RbiH...,False,https://www.yelp.com/biz/starbucks-waipahu-3?a...,235,"[{'alias': 'coffee', 'title': 'Coffee & Tea'}]",3.5,"{'latitude': 21.42698103, 'longitude': -158.00...",[delivery],$$,"{'address1': '94-1221 Ka Uka Blvd', 'address2'...",18086809213,(808) 680-9213,20598.005468
815,LUombmn-HkWK4jM5EOX-iQ,panda-express-waipahu-3,Panda Express,https://s3-media4.fl.yelpcdn.com/bphoto/DPAWsB...,False,https://www.yelp.com/biz/panda-express-waipahu...,72,"[{'alias': 'chinese', 'title': 'Chinese'}, {'a...",3.5,"{'latitude': 21.4005263170856, 'longitude': -1...","[delivery, pickup]",$,"{'address1': '94-799 Lumiaina Street', 'addres...",18086768899,(808) 676-8899,19559.059281
816,VxLMdMiVDfN5dYLtJiKyPg,five-guys-mililani-3,Five Guys,https://s3-media4.fl.yelpcdn.com/bphoto/uK1izL...,False,https://www.yelp.com/biz/five-guys-mililani-3?...,464,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",3.0,"{'latitude': 21.4526833, 'longitude': -158.006...",[delivery],$$,"{'address1': '95-1249 Meheula Pkwy', 'address2...",18083123407,(808) 312-3407,22840.349833


## Check for duplicates

In [15]:
# Count rows whose business id already appeared in an earlier row
final_df['id'].duplicated().sum()

2

In [16]:
## Keep only the first row for each business id
final_df = final_df.drop_duplicates(subset=['id'])
## Re-count repeated ids; this should now be zero
final_df['id'].duplicated().sum()

0

## Save the final DataFrame to a .csv (or a .csv.gz if it's too big for the GitHub file size limit).

In [17]:
# Write the de-duplicated results to a gzip-compressed CSV, without the index column
final_df.to_csv(
    'Data/final_results_Honolulu_sushi.csv.gz',
    index=False,
    compression='gzip',
)