In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

**Credentials and Accessing the API**

In [2]:
# Load API Credentials
# The credentials file location can be overridden with the YELP_API_CREDENTIALS
# environment variable; otherwise it defaults to ~/.secret/yelp_api.json, so the
# notebook does not depend on one user's absolute path.
credentials_path = os.environ.get(
    "YELP_API_CREDENTIALS",
    os.path.join(os.path.expanduser("~"), ".secret", "yelp_api.json"),
)
with open(credentials_path) as f:
    login = json.load(f)
# Instantiate YelpAPI Variable using the 'api-key' entry of the credentials file
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

**Define Search**

To allow us to easily perform different searches in the future, we will define variables for LOCATION and TERM set for our particular search conditions. Then when we want to use a different location or term, we can just redefine the variables. This streamlines our code and makes it more readable and reproducible. 

In [3]:
# Search parameters, defined once so a different search only needs these
# two values changed before the first API call.
TERM = 'Crab Cakes'
LOCATION = 'Baltimore, MD,21202'

**Create a results-in-progress JSON file, but only if it doesn't exist**

This is the file where your results will be saved. Note: you must rename your JSON_FILE for different queries to prevent confusing results from other searches. We recommend you include your search terms in the filename.

In [4]:
# Path of the results-in-progress file (a folder prefix is allowed).
# Keep the search terms in the name so different searches do not share a file.
JSON_FILE = "Data/results_in_progress_Crab_cakes.json"
JSON_FILE

'Data/results_in_progress_Crab_cakes.json'

In [5]:
## Create the results-in-progress file only when it is missing
file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    ## Nothing to create; tell the user the file is already there
    print(f"[i] {JSON_FILE} already exists.")
else:
    ## Make sure the parent folder exists (only when JSON_FILE names one)
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Inform the user, then seed the file with an empty list
    print(f"[i] {JSON_FILE} not found. Saving empty list to file.")
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

[i] Data/results_in_progress_Crab_cakes.json not found. Saving empty list to file.


**Determine how many results are already in the file**

Load the results file to determine the number of results we have previously retrieved. If you just created the file, you would expect it to be empty.

We will use this as our offset parameter for our API call. Even if this is your first API call, and the number is 0, we want to define 'n_results' based on the length of 'previous_results'

In [6]:
## Read back whatever has been saved to the results file so far
with open(JSON_FILE) as saved_file:
    previous_results = json.load(saved_file)

## The number of saved results is used as the offset for the next API call
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


**Figure out how many pages of results we will need**

We will perform our first query to get our first page of results and the total number of results. We will then (via code) calculate how many pages we will need to retrieve all of our results.

In [7]:
# First API call: request one page of results starting at the current offset
search_kwargs = dict(location=LOCATION, term=TERM, offset=n_results)
results = yelp_api.search_query(**search_kwargs)
# Show the top-level keys of the response
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
## Total number of matches reported by the API for this search
total_results = results["total"]
total_results

434

In [9]:
## Number of businesses returned in this single response (one "page")
results_per_page = len(results["businesses"])
results_per_page

20

There are over 400 businesses to retrieve from our API and we can get 20 results at a time per 'page'
- We can calculate the # of results remaining by subtracting our offset (length of our previous results) from our total.
- Then we can determine how many pages we will need by dividing the results by 20 (or whatever the value happens to be for results per page)
- Note that we need to round up the number of pages in order to get all of the results. Even if there is only 1 result on the last page, we want to include that page! 

In [10]:
# Pages still needed = remaining results / results per page, rounded up with
# math.ceil (math is already imported in the first cell).
# Clamp at 0 in case more results are already saved than the API now reports.
remaining_results = max(results['total'] - n_results, 0)
if results_per_page > 0:
    n_pages = math.ceil(remaining_results / results_per_page)
else:
    # An empty page means there is nothing to page through; this also avoids
    # a ZeroDivisionError.
    n_pages = 0
n_pages

22

When this example was written, there were 437 results and 20 results per page. 437/20=21.85. Rounding up gives us 22 pages. We expect the first 21 pages to have 20 results and the last page to have the final 17. (The total reported by the API changes over time, so the output above may show a slightly different number.) Notice that we have assigned the number of pages as n_pages here. We will use this value in our next segment of code.

You can see that having to manually go through 22 pages would be quite time consuming and inefficient. First, we are going to save the first page into our file, and then we will add on to it with our for loop.

**Add this page of results to .json file**

Our API returns our results in JSON format, with the businesses in a list of dictionaries. We will append the first page of businesses to our previous_results (which is very likely empty) and then save to disk.

In [11]:
# Append the businesses from this page to the running list, then write the
# whole list back to the results file
previous_results += results['businesses']
with open(JSON_FILE, 'w') as out_file:
    json.dump(previous_results, out_file)

**Set up a progress bar in our for loop**

TQDM is a package designed for adding animated progress bars to Python processes. **It is not currently included in your dojo-env, so you are going to install it manually**, either by opening a new Terminal/GitBash window and running `pip install tqdm`, or by running the cell below.

In [12]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [13]:
from tqdm.notebook import tqdm_notebook
import time

# Demo: wrapping the iterable in tqdm_notebook draws a progress bar
page_numbers = range(n_pages)
for i in tqdm_notebook(page_numbers):
    time.sleep(.2)  # 200 ms pause per iteration

  0%|          | 0/22 [00:00<?, ?it/s]

**For loop to call each page**

The loop below will iterate through each page of the results by starting at the appropriate offset. It will then append the results to the previous_results. This may take some time, so check out the progress bar!

In [14]:
# The first page was already fetched and saved above, so only the remaining
# n_pages - 1 pages are requested here.
for i in tqdm_notebook(range(1, n_pages)):
    # short pause between requests
    time.sleep(.2)
    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results to use as offset
    n_results = len(previous_results)
    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)

    ## stop early if the API has no more businesses to return
    new_businesses = results['businesses']
    if not new_businesses:
        print(f"[i] No more results returned at offset {n_results}. Stopping.")
        break

    ## append new results and save to file after every page, so progress
    ## is kept if a later request fails
    previous_results.extend(new_businesses)
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

  0%|          | 0/22 [00:00<?, ?it/s]

**After the loop has finished**

Load the 'results in progress' JSON file into a DataFrame.

In [15]:
# load final results
final_df = pd.read_json(JSON_FILE)
display(final_df.head(), final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,D9A33FM394q99o4QtK5YwA,faidleys-seafood-baltimore-3,Faidley's Seafood,https://s3-media3.fl.yelpcdn.com/bphoto/8j3ynZ...,False,https://www.yelp.com/biz/faidleys-seafood-balt...,1195,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.0,"{'latitude': 39.291696, 'longitude': -76.62224}",[delivery],$$,"{'address1': '203 N Paca St', 'address2': '', ...",14107274898,(410) 727-4898,1349.56072
1,ieS_5zqxDHcWMCm8BKUYbg,thames-street-oyster-house-baltimore,Thames Street Oyster House,https://s3-media1.fl.yelpcdn.com/bphoto/9hGjo5...,False,https://www.yelp.com/biz/thames-street-oyster-...,2747,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.5,"{'latitude': 39.28214, 'longitude': -76.59162}",[delivery],$$$,"{'address1': '1728 Thames St', 'address2': '',...",14434497726,(443) 449-7726,2090.712792
2,u65W69AhbjUlvJJBkEhGNQ,miss-shirleys-cafe-baltimore-9,Miss Shirley's Cafe,https://s3-media4.fl.yelpcdn.com/bphoto/9FsOyV...,False,https://www.yelp.com/biz/miss-shirleys-cafe-ba...,2959,"[{'alias': 'breakfast_brunch', 'title': 'Break...",4.0,"{'latitude': 39.2870995, 'longitude': -76.6053...",[delivery],$$,"{'address1': '750 E Pratt St', 'address2': '',...",14105285373,(410) 528-5373,1028.736468
3,fdo4pqfSYjQyW5_-rElfew,faidley-s-seafood-baltimore-2,Faidley’s Seafood,https://s3-media4.fl.yelpcdn.com/bphoto/7aCi2U...,False,https://www.yelp.com/biz/faidley-s-seafood-bal...,40,"[{'alias': 'seafoodmarkets', 'title': 'Seafood...",4.0,"{'latitude': 39.2915897, 'longitude': -76.6222...",[],$,"{'address1': '203 N Paca St', 'address2': '', ...",14107526461,(410) 752-6461,1342.513104
4,6am8TZAFnvND52MOz-Yctg,mamas-on-the-half-shell-baltimore,Mama's On The Half Shell,https://s3-media2.fl.yelpcdn.com/bphoto/HWY8OF...,False,https://www.yelp.com/biz/mamas-on-the-half-she...,1292,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.0,"{'latitude': 39.27986, 'longitude': -76.5752399}","[delivery, pickup]",$$,"{'address1': '2901 Odonnell St', 'address2': '...",14102763160,(410) 276-3160,3328.825798


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
421,BRJ1HbKcfDg7axFWWhUNzA,golden-west-cafe-baltimore-3,Golden West Cafe,https://s3-media2.fl.yelpcdn.com/bphoto/Tq43MA...,False,https://www.yelp.com/biz/golden-west-cafe-balt...,861,"[{'alias': 'cafes', 'title': 'Cafes'}, {'alias...",3.5,"{'latitude': 39.3308895, 'longitude': -76.6340...","[delivery, pickup]",$$,"{'address1': '1105 W 36th St', 'address2': '',...",14108898891,(410) 889-8891,4475.257151
422,nA8hY_b-yD4U_PQGWGmw9Q,american-wings-and-pizza-baltimore,American Wings & Pizza,https://s3-media2.fl.yelpcdn.com/bphoto/mYgJ5p...,False,https://www.yelp.com/biz/american-wings-and-pi...,59,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",2.5,"{'latitude': 39.3165, 'longitude': -76.61556}","[delivery, pickup]",$,"{'address1': '2400 St Paul St', 'address2': ''...",14102352300,(410) 235-2300,2361.614576
423,WVQqkLAKowuIe51IVNXCZA,outback-steakhouse-baltimore,Outback Steakhouse,https://s3-media2.fl.yelpcdn.com/bphoto/NzwBx9...,False,https://www.yelp.com/biz/outback-steakhouse-ba...,135,"[{'alias': 'steak', 'title': 'Steakhouses'}]",3.0,"{'latitude': 39.2817468, 'longitude': -76.5816...","[delivery, pickup]",$$,"{'address1': '2400 Boston St', 'address2': '',...",14105227757,(410) 522-7757,2755.986468
424,rtlLrtUn35c9Y2eIcxHT8g,kings-pizza-and-subs-baltimore-2,King's Pizza & Subs,https://s3-media2.fl.yelpcdn.com/bphoto/9FEdeU...,False,https://www.yelp.com/biz/kings-pizza-and-subs-...,75,"[{'alias': 'pizza', 'title': 'Pizza'}]",2.5,"{'latitude': 39.33086, 'longitude': -76.63161}","[delivery, pickup]",$$,"{'address1': '907 W 36th St', 'address2': None...",14108893663,(410) 889-3663,4382.210159
425,EWkA_yQ1VPa7cHbkiBbawA,phillys-best-baltimore,Philly's Best,https://s3-media4.fl.yelpcdn.com/bphoto/CXQdUz...,False,https://www.yelp.com/biz/phillys-best-baltimor...,45,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",3.0,"{'latitude': 39.33084, 'longitude': -76.63393}","[delivery, pickup]",$,"{'address1': '1101 W 36th St', 'address2': '',...",14102431301,(410) 243-1301,4467.724059


Check for duplicates

In [16]:
# Count rows whose business 'id' repeats an earlier row
final_df['id'].duplicated().sum()

0

In [17]:
# Write the final results to a gzip-compressed CSV, leaving out the index column
final_df.to_csv(
    'Data/final_results_crab_cakes.csv.gz',
    index=False,
    compression='gzip',
)