<a href="https://colab.research.google.com/github/angelaroanne/ACMDIGW_K32/blob/main/extracting_and_saving_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# most common short-cut keys
# ctrl + / = to comment
# ctrl + enter = run a code block
# ctrl + m, b = create a code block below
# ctrl + m, a = create a code block above

In [4]:
# connect gdrive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# install yelpapi
!pip install yelpapi

Collecting yelpapi
  Downloading yelpapi-2.5.1-py3-none-any.whl (7.4 kB)
Installing collected packages: yelpapi
Successfully installed yelpapi-2.5.1


In [6]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# additional libraries
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

# Loading Credentials and Creating the YelpAPI object


In [7]:
# load yelp API credentials and instantiate the YelpAPI object
with open('/content/drive/MyDrive/Colab Notebooks/Credentials/yelp_api.json') as file:
  yelp_credentials = json.load(file)

# create the yelp api engine
yelp_api = YelpAPI(yelp_credentials ['api-key'], timeout_s = 5.0)

In [8]:
# define API call parameters and output file path
LOCATION = 'Greenville, SC'
TERM = 'Sushi'
JSON_FILE = 'Data/results_SC_sushi.json'

# display the destination file path
print(f'Data will be saved to {JSON_FILE}')

Data will be saved to Data/results_SC_sushi.json


In [9]:
# check if JSON_FILE exists and create if it doesn't
if not os.path.isfile(JSON_FILE):

  # create the directory
  os.makedirs(os.path.dirname(JSON_FILE), exist_ok = True)

  # confirm and save an empty list on the file
  print(f'[i]{JSON_FILE} not found. Saving empty list on file')
  with open (JSON_FILE, 'w') as file:
    json.dump([],file)

else:
  # inform the user if the file already exists
  print(f'[i]{JSON_FILE} already exists.')

[i]Data/results_SC_sushi.json not found. Saving empty list on file


In [10]:
# load previous results and set offset based on the number of results
with open(JSON_FILE,'r') as file:
  previous_results = json.load(file)

n_results = len(previous_results)

print(f'{n_results} previous results found.')

0 previous results found.


In [11]:
# make first API call
results = yelp_api.search_query(location = LOCATION,
                                term = TERM,
                                offset = n_results)
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [12]:
# return how mmany results were found
total_results = results['total']
total_results

110

In [13]:
business_results = results['businesses']

# specify the filename where you want to save the data
json_file_path = JSON_FILE

# save the business data as a JSON file
with open(json_file_path, 'w') as file:
  json.dump(business_results,file,indent =4)
  #business_data was replaced with business results

# indent - 4 means
# save the data in the format
# key: values,
# key: values,
# key: values,
# key: values
# kasi sabi ko = 4

# otherwise, without indent - 4, it will look like:
# key: value,



In [14]:
# how many details did we get?
results_per_page = len(business_results)
# business_data was replaced with business_results
print(f'number of results retrieved per page', results_per_page)

number of results retrieved per page 20


In [15]:
# using the math.ceil to round up the total number of pages
n_pages = math.ceil(total_results/results_per_page)
print(f'total number of pages: {n_pages}')

total number of pages: 6


In [16]:
# create a loop to extract the data
for i in tqdm_notebook(range(1, total_results + 1)):
  try:
    time.sleep(0.2) # short delay to respect API rate limits
    # so that when we call, pause, call pause, it doesn't think we're attacking it

    # load existing results to append new data
    with open(JSON_FILE, 'r') as file:
      previous_results = json.load(file)

    # fetch new results
    new_results = yelp_api.search_query(location = LOCATION,
                                        term = TERM,
                                        offset = len(previous_results))

    # append and save the updated results
    updated_results = previous_results + new_results['businesses']
    with open(JSON_FILE, 'w') as file: # w should not be capital
      json.dump(updated_results, file)

  except Exception as e:
    if 'Too Many Requests for url' in str(e):
      # if you see 'Too many requests for url', use:
      print('Rate limit exceeded. Stopping data-collection')
      break #exit the loop if the rate limit is exceeded

    else:
      print(f'An error occured: {e}')
      continue #continue to the next iteration in case of other errors

  0%|          | 0/110 [00:00<?, ?it/s]

In [17]:
# load the final JSON file
df = pd.read_json(JSON_FILE)

# display the first 5 rows of the data
df.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
0,2jXS4oZkMhAONtd2j7L5Yg,chef-21-sushi-burger-and-korean-bbq-greenville-3,Chef 21 Sushi Burger & Korean BBQ,https://s3-media3.fl.yelpcdn.com/bphoto/8RntNn...,False,https://www.yelp.com/biz/chef-21-sushi-burger-...,38,"[{'alias': 'korean', 'title': 'Korean'}, {'ali...",4.5,"{'latitude': 34.847671, 'longitude': -82.394229}","[delivery, pickup]","{'address1': '500 E McBee Ave', 'address2': 'S...",18642633018,(864) 263-3018,3341.861901,
1,RGRk1ioORwm_FIX8PM732Q,konnichiwa-greenville,Konnichiwa,https://s3-media3.fl.yelpcdn.com/bphoto/p47H0_...,False,https://www.yelp.com/biz/konnichiwa-greenville...,71,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.1,"{'latitude': 34.845952342825115, 'longitude': ...",[],"{'address1': '101 Falls Park Dr', 'address2': ...",18642524436,(864) 252-4436,4184.255183,
2,zG_XOAFi9Y560WJ1RvghBw,sushi-masa-japanese-restaurant-greenville,Sushi-Masa Japanese Restaurant,https://s3-media1.fl.yelpcdn.com/bphoto/zsRavZ...,False,https://www.yelp.com/biz/sushi-masa-japanese-r...,165,"[{'alias': 'sushi', 'title': 'Sushi Bars'}]",4.4,"{'latitude': 34.8512725830078, 'longitude': -8...",[delivery],"{'address1': '8590 Pelham Rd', 'address2': 'St...",18642882227,(864) 288-2227,11481.830881,$$
3,7cJxOV-ANX1qLThK3yV96w,otto-izakaya-greenville-4,Otto Izakaya,https://s3-media1.fl.yelpcdn.com/bphoto/TdPhFy...,False,https://www.yelp.com/biz/otto-izakaya-greenvil...,449,"[{'alias': 'japanese', 'title': 'Japanese'}, {...",4.2,"{'latitude': 34.8228218820722, 'longitude': -8...",[delivery],"{'address1': '15 Market Point Dr', 'address2':...",18645688009,(864) 568-8009,5933.485357,$$
4,Kx1x7Kf6C2gtogQErWSu0A,o-ku-greenville,O-Ku,https://s3-media2.fl.yelpcdn.com/bphoto/7dR0xy...,False,https://www.yelp.com/biz/o-ku-greenville?adjus...,46,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.0,"{'latitude': 34.847954222223294, 'longitude': ...",[],"{'address1': '30 W Broad St', 'address2': None...",18643264812,(864) 326-4812,3931.009612,


#Save the File in the Directory

In [18]:
# specify the directory
directory = '/content/drive/MyDrive/Colab Notebooks/Data'
filename = 'final_results_SC_sushi.csv.gz' #make sure to include the .csv.gz extension
path = os.path.join(directory, filename)

# ensure that the 'Data' directory exists
os.makedirs(directory, exist_ok=True)

# save the dataframe as a compressed csv file (to save space)
df.to_csv(path, compression='gzip', index=False)
# df was defined in the previous code block: df = pd.read_json(JSON_FILE)

In [19]:
# save as JSON file
json_file = '/content/drive/MyDrive/Colab Notebooks/Data/final_results_SC_sushi.json'
# add Data folders copied path before /final_results_SC_sushi.json

# save the json as a dataframe
df.to_json(json_file, orient='records', lines=True)

In [20]:
# save the file as gzip
csv_gz_file = json_file.replace('.json','.csv.gz')

# save the dataframe as a compressed csv without the index
df.to_csv(csv_gz_file, compression='gzip',index=False)