In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint
from scipy.stats import linregress
from datetime import datetime
from yelpapi import YelpAPI

# Impor the API key
from config import api_key

In [None]:
# Old version, hit many issues with offset
# yelp_api = YelpAPI(api_key)
# search_term = 'restaurants'
# location = 'New York City, NY'
# limit = 50  # maximum number of results to return
# daily_results_limit = 12500 # update this by 12500 every day
# offset = 0 # update this by 12500 every day

# # response = yelp_api.search_query(term=search_term, location=location, limit=limit)
# # pprint(response)
# results = []
# api_call_limit = 10  # total API call limit for the day
# calls_made = 0  # counter for API calls made

# while offset < daily_results_limit and calls_made < api_call_limit:
#     response = yelp_api.search_query(term=search_term, location=location, limit=limit, offset=offset)
#     results.extend(response['businesses'])  # Extend the list with new businesses
#     offset += limit  # Increment the offset by the limit
#     calls_made += 1  # Increment the API call counter
    
#     # Print progress
#     print(f'Retrieved {len(results)} results with {calls_made} API calls')

#     # Avoid hitting rate limits by adding a delay
#     time.sleep(1)  # Delay for 1 second

In [2]:
# NYC boundaries (lat, lon)
northwest = (40.893079105643025, -73.9128920326231)
northeast = (40.879497836080574, -73.82796941055598)
southwest = (40.62030456986446, -74.03292689266031)
southeast = (40.65979392491511, -73.74149354297585)

# Step size for approx 2 km
lat_step = 0.018
lon_step = 0.024

# Generate grid points
lat_points = np.arange(southwest[0], northwest[0], lat_step)
lon_points = np.arange(southwest[1], southeast[1], lon_step)

# Create list of points with increased radius
grid_points_with_radius = [{"lat": lat, "lon": lon, "radius": 1000} for lat in lat_points for lon in lon_points]

# Filter points to ensure they fall within the specified quadrilateral
def is_within_boundaries(lat, lon):
    return (southwest[0] <= lat <= northwest[0]) and (southwest[1] <= lon <= southeast[1])
filtered_grid_points = [point for point in grid_points_with_radius if is_within_boundaries(point["lat"], point["lon"])]

# Number of points generated
num_points = len(filtered_grid_points)

# Split points into four parts for four days
num_days = 4
points_per_day = num_points // num_days

# Ensure all points are covered, even if not divisible by 4
split_grid_points = [filtered_grid_points[i:i + points_per_day] for i in range(0, num_points, points_per_day)]

# Handle any remaining points if num_points is not perfectly divisible by num_days
if len(split_grid_points) > num_days:
    split_grid_points[num_days - 1].extend(split_grid_points.pop())

print(f"Generated {num_points} grid points.")
for day in range(num_days):
    starting_point = split_grid_points[day][0]
    print(f"Day {day + 1}, {len(split_grid_points[day])} grid points, starting point: Latitude {starting_point['lat']}, Longitude {starting_point['lon']}")

# split_grid_points

Generated 208 grid points.
Day 1, 52 grid points, starting point: Latitude 40.62030456986446, Longitude -74.03292689266031
Day 2, 52 grid points, starting point: Latitude 40.69230456986446, Longitude -74.03292689266031
Day 3, 52 grid points, starting point: Latitude 40.764304569864464, Longitude -74.03292689266031
Day 4, 52 grid points, starting point: Latitude 40.83630456986447, Longitude -74.03292689266031


In [None]:
#Test API Calls
# headers = {
#     "accept": "application/json",
#     "Authorization": f"Bearer {api_key}"
# }
# url_first_call = f"https://api.yelp.com/v3/businesses/search?latitude=40.62030456986446&longitude=-74.03292689266031&term=restaurants&radius=1000&sort_by=distance&limit=50"
# first_response = requests.get(url_first_call, headers=headers).json()
# pprint(first_response)

In [3]:
day = 1 # change this +1 next day
calls_made = 0 # keep track of calls made for display and to esnure we are not going past limit
search_term = 'restaurants' # variable used for api url
limit = 50 # variable used for api url, makes sure we get 50 restaurants per call
offset_limit = 200 # ensure we check the first 200 restaurants within radius of each point
sort_by = 'distance' # variable used for api url
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {api_key}"
}
results = []
# loop over every coordiante point built above for the selected day
print('-----------Starting with API calls for the Day-----------')  
for coord in split_grid_points[day]:
    offset = 0 # used to make sure we look at different restaurants on next api call at same coord
    # call api 4 times per coord
    while offset < offset_limit:
        url = f"https://api.yelp.com/v3/businesses/search?latitude={coord['lat']}&longitude={coord['lon']}&term={search_term}&radius={coord['radius']}&sort_by={sort_by}&limit={limit}&offset={offset}"
        response = requests.get(url, headers=headers).json()
        results.extend(response['businesses'])
        offset += 50 # increase offset for next call
        calls_made += 1 # keep track of calls made for display
        time.sleep(1) # sleep to not get timed out for too many fast calls
    print(f'Retrieved {len(results)} results with {calls_made} API calls') 
print('-----------Finished with API calls for the Day-----------')  

-----------Starting with API calls for the Day-----------
Retrieved 2 results with 4 API calls
Retrieved 66 results with 8 API calls
Retrieved 266 results with 12 API calls
Retrieved 466 results with 16 API calls
Retrieved 666 results with 20 API calls
Retrieved 866 results with 24 API calls
Retrieved 931 results with 28 API calls
Retrieved 1041 results with 32 API calls
Retrieved 1216 results with 36 API calls
Retrieved 1380 results with 40 API calls
Retrieved 1459 results with 44 API calls
Retrieved 1509 results with 48 API calls
Retrieved 1556 results with 52 API calls
Retrieved 1686 results with 56 API calls
Retrieved 1886 results with 60 API calls
Retrieved 2086 results with 64 API calls
Retrieved 2286 results with 68 API calls
Retrieved 2486 results with 72 API calls
Retrieved 2686 results with 76 API calls
Retrieved 2806 results with 80 API calls
Retrieved 2883 results with 84 API calls
Retrieved 3008 results with 88 API calls
Retrieved 3176 results with 92 API calls
Retrieved 3

In [None]:
# This code was for testing and looking at how we wanted to pull the data

# print(response['businesses'][0]['name'])
# print(f"{response['businesses'][0]['rating']} ({response['businesses'][0]['review_count']})")
# print(response['businesses'][0]['price'])
# print(response['businesses'][0]['display_phone'])
# print(f"Latitiude: {response['businesses'][0]['coordinates']['latitude']}, Longitude: {response['businesses'][0]['coordinates']['longitude']}")

# test_categories = ''
# for alias in response['businesses'][0]['categories']:
#     test_categories = test_categories + '/' + alias['title']
# print(test_categories)

# test_transactions = ''
# for type in response['businesses'][0]['transactions']:
#     test_transactions = test_transactions + '/' + type
# print(test_transactions)

In [None]:
Restaurant_id = 11791 # edit this to match id at end of csv
restaurant_data = [] # List to hold restaurant dictionaries
for restaurant in results:
    name = restaurant['name'] # get restaurant name
    rating = restaurant['rating'] # get restaurant rating
    review_count = restaurant['review_count'] # get restaurant review count
    try:
        price = restaurant['price']  # get restaurant price, if it fails ...
    except KeyError:
        price = '???' # set price to ???
    phone_number = restaurant['display_phone'] # get restaurant phone number
    latitiude = restaurant['coordinates']['latitude'] # get restaurant lat
    longitude = restaurant['coordinates']['longitude'] # get restaurant lon
    Restaurant_id += 1

    # merge all categories into one line seperated by /
    categories = ''
    x = 0
    for alias in restaurant['categories']:
        if x == 0:
            categories = alias['title']
            x += 1
        else:
            categories = categories + '/' + alias['title']

    # merge all transaction types into one line seperated by /
    transactions = ''
    x = 0
    for type in restaurant['transactions']:
        if x == 0:
            transactions = type
            x += 1
        else:
            transactions = transactions + '/' + type

    # Build a dictionary for the restaurant and add it to the list
    restaurant_data.append({'Restaurant_id':Restaurant_id,
                            'Name':name,
                            'Rating':rating,
                            'Review_Count':review_count,
                            'Price':price,
                            'Phone_Number':phone_number,
                            'Latitude':latitiude,
                            'Longitude':longitude,
                            'Categories':categories,
                            'Transactions':transactions})

# Create a DF from list
restaurant_data_df = pd.DataFrame(restaurant_data,index='Restaurant_id')
restaurant_data_df

Unnamed: 0,Name,Rating,Review Count,Price,Phone Number,Latitiude,Longitude,Categories,Transactions
0,Benny’s Burgers,0.0,0,???,,40.701520,-74.043300,Burgers,
1,Ellis Cafe,1.8,36,$$,(212) 363-3180,40.699485,-74.039561,Cafes,
2,Gitano Island,3.5,219,???,(332) 287-0347,40.691539,-74.013978,Mexican/Cocktail Bars,
3,Taco Vista,3.8,86,???,,40.692756,-74.014548,Mexican/Bars/Tex-Mex,
4,Pilot,4.1,346,$$$,,40.693857,-74.003089,Cocktail Bars/Seafood,
...,...,...,...,...,...,...,...,...,...
7059,Eric's Delicatessens,0.0,0,???,(718) 423-4599,40.748400,-73.756515,Delis,pickup
7060,Gowas Bi Sushi,0.0,0,???,,40.754507,-73.738408,Sushi Bars,
7061,Rokstar Chicken,4.0,112,$$,(718) 819-8933,40.754530,-73.738540,Chicken Shop/Korean/Chicken Wings,delivery/pickup
7062,Brooklyn Born - Original Famous Coal Fired Bri...,3.5,437,$$,(718) 819-8889,40.754530,-73.738540,Pizza,delivery/pickup


In [5]:
# Check for duplicates and display them
duplicates = restaurant_data_df.duplicated(keep=False)
restaurant_data_df[duplicates].sort_values('Name')

Unnamed: 0,Name,Rating,Review Count,Price,Phone Number,Latitiude,Longitude,Categories,Transactions
1377,Abdul Halal Food Cart,4.5,8,???,,40.701316,-73.807842,Food Trucks/Halal,delivery
3168,Abdul Halal Food Cart,4.5,8,???,,40.701316,-73.807842,Food Trucks/Halal,delivery
4931,B&B Stationery,0.0,0,???,,40.719423,-73.83792,Delis,delivery/pickup
2999,B&B Stationery,0.0,0,???,,40.719423,-73.83792,Delis,delivery/pickup
1503,B&R Spice Jamaica Restaurant and Cuisine,2.8,5,$$,(718) 264-1000,40.701268,-73.76608,Caribbean,delivery
3442,B&R Spice Jamaica Restaurant and Cuisine,2.8,5,$$,(718) 264-1000,40.701268,-73.76608,Caribbean,delivery
892,Deli Grocery,0.0,0,???,,40.70125,-73.88624,Delis,
2770,Deli Grocery,0.0,0,???,,40.70125,-73.88624,Delis,
1244,Khana Khazana,0.0,0,???,(718) 441-4403,40.69152,-73.82129,Restaurants,
1245,Khana Khazana,0.0,0,???,(718) 441-4403,40.69152,-73.82129,Restaurants,


In [6]:
# Remove second copy of duplicates
restaurant_data_clean = restaurant_data_df.drop_duplicates()
restaurant_data_clean

Unnamed: 0,Name,Rating,Review Count,Price,Phone Number,Latitiude,Longitude,Categories,Transactions
0,Benny’s Burgers,0.0,0,???,,40.701520,-74.043300,Burgers,
1,Ellis Cafe,1.8,36,$$,(212) 363-3180,40.699485,-74.039561,Cafes,
2,Gitano Island,3.5,219,???,(332) 287-0347,40.691539,-74.013978,Mexican/Cocktail Bars,
3,Taco Vista,3.8,86,???,,40.692756,-74.014548,Mexican/Bars/Tex-Mex,
4,Pilot,4.1,346,$$$,,40.693857,-74.003089,Cocktail Bars/Seafood,
...,...,...,...,...,...,...,...,...,...
7059,Eric's Delicatessens,0.0,0,???,(718) 423-4599,40.748400,-73.756515,Delis,pickup
7060,Gowas Bi Sushi,0.0,0,???,,40.754507,-73.738408,Sushi Bars,
7061,Rokstar Chicken,4.0,112,$$,(718) 819-8933,40.754530,-73.738540,Chicken Shop/Korean/Chicken Wings,delivery/pickup
7062,Brooklyn Born - Original Famous Coal Fired Bri...,3.5,437,$$,(718) 819-8889,40.754530,-73.738540,Pizza,delivery/pickup


In [None]:
# Append dataframe data to a csv file
restaurant_data_clean.to_csv("data/restaurant_data.csv",mode='a',header=False,index='Restaurant_id')

In [None]:
# Read in data from the new york city health csv file
# health_df = pd.read_csv("data/DOHMH_New_York_City_Restaurant_Inspection_Results_20241120.csv")
# health_df.head()

In [None]:
# Make sure phones numbers match bewteen both dataframes, rename columns and get rid of columns not needed
# health_df['PHONE'] = health_df['PHONE'].astype(str).str.replace(r'(\d{3})(\d{3})(\d{4})', r'(\1) \2-\3', regex=True)
# health_df[['Phone Number','Health Grade']] = health_df[['PHONE','GRADE']]
# columns_to_keep = ['Phone Number', 'Health Grade']
# health_df_cleaned = health_df.loc[:, columns_to_keep]
# health_df_cleaned

In [None]:
# Get health grade for phone numbers from restaurant_data_clean found in health_df_cleaned
# restaurant_data_clean.loc[:,'Health Grade'] = restaurant_data_clean['Phone Number'].map(health_df_cleaned['Health Grade'])
# restaurant_data_clean

In [29]:
# read back in csv to make sure no duplicates and save to json
restaurant_data_csv = pd.read_csv("data/restaurant_data.csv", index_col='Restaurant_id')
restaurant_data_csv

Unnamed: 0_level_0,Name,Rating,Review Count,Price,Phone Number,Latitiude,Longitude,Categories,Transactions
Restaurant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Nino's Pizza,3.7,234,$,(718) 680-0222,40.619812,-74.032867,Pizza,pickup/delivery
1,Al Horno Lean Mexican Kitchen,4.3,51,$$,(347) 578-7492,40.619730,-74.032760,Mexican,pickup/delivery
2,Foodtown,2.9,130,$$,(718) 680-2884,40.619980,-74.032320,Grocery/Delis,
3,Paneantico Bakery,3.5,475,$$,(718) 680-2347,40.619499,-74.033012,Bakeries/Sandwiches,pickup/delivery
4,Bridgeview Diner,2.5,343,$$,(718) 680-9818,40.620640,-74.031900,Diners/American/Breakfast & Brunch,pickup/delivery
...,...,...,...,...,...,...,...,...,...
11787,Eric's Delicatessens,0.0,0,???,(718) 423-4599,40.748400,-73.756515,Delis,pickup
11788,Gowas Bi Sushi,0.0,0,???,,40.754507,-73.738408,Sushi Bars,
11789,Rokstar Chicken,4.0,112,$$,(718) 819-8933,40.754530,-73.738540,Chicken Shop/Korean/Chicken Wings,delivery/pickup
11790,Brooklyn Born - Original Famous Coal Fired Bri...,3.5,437,$$,(718) 819-8889,40.754530,-73.738540,Pizza,delivery/pickup


In [None]:
# Fixing id issue
# restaurant_data_csv = restaurant_data_csv.rename(columns={'Unnamed: 0':'Restaurant_id'})
# restaurant_data_csv

Unnamed: 0,Restaurant_id,Name,Rating,Review Count,Price,Phone Number,Latitiude,Longitude,Categories,Transactions
0,0,Nino's Pizza,3.7,234,$,(718) 680-0222,40.619812,-74.032867,Pizza,pickup/delivery
1,1,Al Horno Lean Mexican Kitchen,4.3,51,$$,(347) 578-7492,40.619730,-74.032760,Mexican,pickup/delivery
2,2,Foodtown,2.9,130,$$,(718) 680-2884,40.619980,-74.032320,Grocery/Delis,
3,3,Paneantico Bakery,3.5,475,$$,(718) 680-2347,40.619499,-74.033012,Bakeries/Sandwiches,pickup/delivery
4,4,Bridgeview Diner,2.5,343,$$,(718) 680-9818,40.620640,-74.031900,Diners/American/Breakfast & Brunch,pickup/delivery
...,...,...,...,...,...,...,...,...,...,...
11787,7059,Eric's Delicatessens,0.0,0,???,(718) 423-4599,40.748400,-73.756515,Delis,pickup
11788,7060,Gowas Bi Sushi,0.0,0,???,,40.754507,-73.738408,Sushi Bars,
11789,7061,Rokstar Chicken,4.0,112,$$,(718) 819-8933,40.754530,-73.738540,Chicken Shop/Korean/Chicken Wings,delivery/pickup
11790,7062,Brooklyn Born - Original Famous Coal Fired Bri...,3.5,437,$$,(718) 819-8889,40.754530,-73.738540,Pizza,delivery/pickup


In [None]:
# Fixing id issue
# restaurant_data_csv['Restaurant_id'] = range(len(restaurant_data_csv))
# restaurant_data_csv = restaurant_data_csv.set_index('Restaurant_id')
# restaurant_data_csv

Unnamed: 0_level_0,Name,Rating,Review Count,Price,Phone Number,Latitiude,Longitude,Categories,Transactions
Restaurant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Nino's Pizza,3.7,234,$,(718) 680-0222,40.619812,-74.032867,Pizza,pickup/delivery
1,Al Horno Lean Mexican Kitchen,4.3,51,$$,(347) 578-7492,40.619730,-74.032760,Mexican,pickup/delivery
2,Foodtown,2.9,130,$$,(718) 680-2884,40.619980,-74.032320,Grocery/Delis,
3,Paneantico Bakery,3.5,475,$$,(718) 680-2347,40.619499,-74.033012,Bakeries/Sandwiches,pickup/delivery
4,Bridgeview Diner,2.5,343,$$,(718) 680-9818,40.620640,-74.031900,Diners/American/Breakfast & Brunch,pickup/delivery
...,...,...,...,...,...,...,...,...,...
11787,Eric's Delicatessens,0.0,0,???,(718) 423-4599,40.748400,-73.756515,Delis,pickup
11788,Gowas Bi Sushi,0.0,0,???,,40.754507,-73.738408,Sushi Bars,
11789,Rokstar Chicken,4.0,112,$$,(718) 819-8933,40.754530,-73.738540,Chicken Shop/Korean/Chicken Wings,delivery/pickup
11790,Brooklyn Born - Original Famous Coal Fired Bri...,3.5,437,$$,(718) 819-8889,40.754530,-73.738540,Pizza,delivery/pickup


In [None]:
# Fixing id issue
# restaurant_data_csv.to_csv("data/restaurant_data.csv",index='Restaurant_id')

In [None]:
# duplicates = restaurant_data_csv.duplicated(keep=False)
# restaurant_data_csv[duplicates].sort_values('Name')

Unnamed: 0.1,Unnamed: 0,Name,Rating,Review Count,Price,Phone Number,Latitiude,Longitude,Categories,Transactions


In [None]:
# restaurant_data_scv_clean = restaurant_data_csv.drop_duplicates()

In [31]:
# Save data as .json
restaurant_data_json = restaurant_data_csv.reset_index()
restaurant_data_json.to_json('data/restaurant_data.json', orient='records', lines=True)