# Searching for Bakeries in San Francisco
- Andrea Cohen
- 03.01.23

## Imports

In [1]:
import pandas as pd
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

## Credentials and Accessing the API

In [2]:
# Load API credentials.
# The path is resolved from the current user's home directory rather than a
# hard-coded absolute path, so the notebook is not tied to a single account.
credentials_path = os.path.expanduser('~/.secret/yelp_api.json')
with open(credentials_path) as f:
    login = json.load(f)
# Instantiate the YelpAPI client using the key stored under 'api-key'
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

## Define the Search

In [3]:
# Search parameters reused by every Yelp API call in this notebook
TERM = "Bakery"
LOCATION = "San Francisco, CA"

## Name the results-in-progress JSON file (it is created later, only if it doesn't already exist)

In [4]:
# Path of the results-in-progress file (a folder component is allowed).
# The search terms are included in the filename.
JSON_FILE = 'Data/results_in_progress_SanFrancisco_bakery.json'
JSON_FILE

'Data/results_in_progress_SanFrancisco_bakery.json'

## Define a function to create the results-in-progress file (optionally deleting an existing one first)

In [5]:
def create_json_file(JSON_FILE, delete_if_exists=False):
    """Ensure a JSON file containing an empty list exists at JSON_FILE.

    Parameters
    ----------
    JSON_FILE : str
        Path of the file to create. It may include a folder, which is
        created if it does not exist yet.
    delete_if_exists : bool, default False
        If truthy and the file already exists, the existing file is deleted
        and replaced by a fresh empty list. If falsy, an existing file is
        left untouched.

    Returns
    -------
    None
        The function works through side effects only (file system + prints).
    """
    if os.path.isfile(JSON_FILE):
        if not delete_if_exists:
            print(f"[i] {JSON_FILE} already exists.")
            return
        print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
        os.remove(JSON_FILE)

    ## From here on the file does not exist (never did, or was just deleted)
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")
    ## Create any needed folders; dirname is '' when no folder was given
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)
    ## Save an empty list to start the json file
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

## Create a results-in-progress JSON file, determine how many results are already in the file, and figure out how many pages of results we will need

In [6]:
## Create a new empty json file (deleting the previous one if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)
## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')
# use yelp_api variable's search_query method to perform API call
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                                offset=n_results)
## How many results total?
total_results = results['total']
## How many did we get the details for?
results_per_page = len(results['businesses'])
# Use math.ceil to round up for the total number of pages of results.
# Guard against an empty first page, which would otherwise divide by zero;
# zero pages makes the paging loop below a no-op.
if results_per_page > 0:
    n_pages = math.ceil((total_results - n_results) / results_per_page)
else:
    n_pages = 0
n_pages

[i] Data/results_in_progress_SanFrancisco_bakery.json not found. Saving empty list to new file.
- 0 previous results found.


85

## For Loop to call each page, including a progress bar and a break if too many results are requested

In [7]:
for i in tqdm_notebook(range(1,n_pages+1)):
    ## read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results to use as offset
    n_results = len(previous_results)
    ## Stop before the next page would push the running total past 1000 results.
    ## NOTE(review): the message says "api calls" but the check counts results;
    ## presumably this mirrors a cap on retrievable results -- confirm against
    ## the Yelp API documentation.
    if (n_results + results_per_page) >1000:
        print('Exceeded 1000 api calls. Stopping loop.')
        break
    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)
    new_businesses = results['businesses']
    ## An empty page means the offset would never advance, so every remaining
    ## iteration would repeat the same request; stop instead.
    if not new_businesses:
        print('No more results returned. Stopping loop.')
        break
    ## append new results and save to file
    previous_results.extend(new_businesses)
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)
    ## short pause between consecutive requests
    time.sleep(.2)

  0%|          | 0/85 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


## After the loop has finished, convert .json to dataframe

In [8]:
# Read the accumulated results back from disk into a DataFrame
final_df = pd.read_json(JSON_FILE)
# NOTE(review): in the recorded output the phone column displays as floats
# (e.g. 14157509460.0), presumably from read_json type inference -- confirm
# whether it should be kept as text instead.
# Preview both ends of the table
display(final_df.head())
display(final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,1U9_ZNruMLf4EL0pMoxb_w,arsicault-bakery-san-francisco,Arsicault Bakery,https://s3-media3.fl.yelpcdn.com/bphoto/edKSWJ...,False,https://www.yelp.com/biz/arsicault-bakery-san-...,2050,"[{'alias': 'bakeries', 'title': 'Bakeries'}]",4.5,"{'latitude': 37.783429, 'longitude': -122.459307}",[delivery],$$,"{'address1': '397 Arguello Blvd', 'address2': ...",14157509460.0,(415) 750-9460,3211.384137
1,2XQm-uFcTS7oc8MFP-8olA,b-patisserie-san-francisco-2,B Patisserie,https://s3-media1.fl.yelpcdn.com/bphoto/c9hNm5...,False,https://www.yelp.com/biz/b-patisserie-san-fran...,3247,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",4.5,"{'latitude': 37.787873, 'longitude': -122.440882}",[delivery],$$,"{'address1': '2821 California St', 'address2':...",14154401700.0,(415) 440-1700,3024.692513
2,Yb7cibCAku1zztMjiGuQHw,jane-the-bakery-san-francisco,Jane The Bakery,https://s3-media3.fl.yelpcdn.com/bphoto/haLvOB...,False,https://www.yelp.com/biz/jane-the-bakery-san-f...,587,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",4.5,"{'latitude': 37.7838, 'longitude': -122.43411}","[pickup, delivery]",$,"{'address1': '1881 Geary Blvd', 'address2': ''...",14156587971.0,(415) 658-7971,2548.361058
3,JHDwFuHBw_PsPdZu196q3w,jina-bakes-san-francisco,Jina Bakes,https://s3-media4.fl.yelpcdn.com/bphoto/6FTSET...,False,https://www.yelp.com/biz/jina-bakes-san-franci...,307,"[{'alias': 'bakeries', 'title': 'Bakeries'}]",4.5,"{'latitude': 37.78522, 'longitude': -122.43157}",[],,"{'address1': '1581 Webster St', 'address2': 'S...",,,2717.175933
4,ri7UUYmx21AgSpRsf4-9QA,tartine-bakery-san-francisco-3,Tartine Bakery,https://s3-media4.fl.yelpcdn.com/bphoto/QRbC0T...,False,https://www.yelp.com/biz/tartine-bakery-san-fr...,8700,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",4.0,"{'latitude': 37.76131, 'longitude': -122.42431}",[delivery],$$,"{'address1': '600 Guerrero St', 'address2': ''...",14154872600.0,(415) 487-2600,1087.638933


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
995,o86gyUxsNYnPIF-kAlznKA,aura-tea-and-coffee-san-francisco-3,Aura Tea & Coffee,https://s3-media3.fl.yelpcdn.com/bphoto/RdDscd...,False,https://www.yelp.com/biz/aura-tea-and-coffee-s...,112,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.0,"{'latitude': 37.79233593640378, 'longitude': -...","[pickup, delivery]",$,"{'address1': '121 Spear St', 'address2': 'Ste ...",14087171881.0,(408) 717-1881,5195.870657
996,18kCA436sevf4njM5YOxug,cibo-pop-up-bakery-sausalito,Cibo Pop Up Bakery,,False,https://www.yelp.com/biz/cibo-pop-up-bakery-sa...,4,"[{'alias': 'bakeries', 'title': 'Bakeries'}]",4.0,"{'latitude': 37.87014, 'longitude': -122.50126}",[],,"{'address1': '200 Gate 5 Rd', 'address2': '', ...",,,13414.843678
997,QLG2cYwd26JRWzCQQngcXA,the-italian-homemade-company-berkeley-3,The Italian Homemade Company,https://s3-media2.fl.yelpcdn.com/bphoto/Dw0vYU...,False,https://www.yelp.com/biz/the-italian-homemade-...,519,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 37.85815, 'longitude': -122.25284}","[pickup, delivery]",$$,"{'address1': '2905 College Ave', 'address2': '...",15106491495.0,(510) 649-1495,19413.845476
998,GiSvj5G8TgOlKPtunkduKA,the-posh-bagel-san-francisco-4,The Posh Bagel,https://s3-media2.fl.yelpcdn.com/bphoto/-fTZ4e...,False,https://www.yelp.com/biz/the-posh-bagel-san-fr...,420,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",3.5,"{'latitude': 37.7899148, 'longitude': -122.405...","[pickup, delivery]",$$,"{'address1': '270 Sutter St', 'address2': '', ...",14159510133.0,(415) 951-0133,4245.559936
999,KhV4RMq9X3LaKa_nRrFUVw,signal-coffee-roasters-alameda,SIGNAL Coffee Roasters,https://s3-media2.fl.yelpcdn.com/bphoto/ptfIqz...,False,https://www.yelp.com/biz/signal-coffee-roaster...,96,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.5,"{'latitude': 37.77462527998164, 'longitude': -...",[],$$,"{'address1': '1536 Webster St', 'address2': ''...",15102178739.0,(510) 217-8739,14133.597201


## Check for duplicates

In [9]:
# Count rows whose business id already appeared in an earlier row
final_df['id'].duplicated().sum()

8

In [10]:
## Keep only the first row seen for each id, then re-count duplicates (expect 0)
final_df = final_df[~final_df.duplicated(subset='id')]
final_df['id'].duplicated().sum()

0

## Save the final DataFrame to a .csv (or a .csv.gz if it's too big for the GitHub file size limit).

In [11]:
# Write the final results as a gzip-compressed CSV, leaving out the index column
final_df.to_csv(
    'Data/final_results_SanFrancisco_bakery.csv.gz',
    index=False,
    compression='gzip',
)