In [1]:
!pip install fuzzywuzzy python-Levenshtein
!pip install azure-storage-blob

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.1 (from python-Levenshtein)
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)
  Downloading rapidfuzz-3.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fuzzywuzzy, rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.1 fuzzywuzzy-0.18.0 python-Levenshtein-0.25.1 rapidfuzz-3.8.1
Collecting azure-storage-blob
  Downloadin

In [2]:
import pandas as pd
import numpy as np
import requests
import json
from sklearn.cluster import DBSCAN
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import math
from azure.storage.blob import BlobServiceClient
import io
from io import StringIO

In [4]:
#specify the path to your JSON file
token_file_path = 'token.json'

#Load the JSON token file
with open(token_file_path,'r') as token_file:
    token=json.load(token_file)

Define Functions

In [5]:
def scoringTable(nametoevaluate, company_list, fuzzObject):

    scorer_list = ['ratio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio','partial_token_sort_ratio','QRatio','UQRatio','WRatio','UWRatio']
    dataFrame = pd.DataFrame()
    for scorer in scorer_list:
        scorer_function = getattr(fuzz, scorer)
        scorer_result = process.extract(nametoevaluate, company_list, scorer=scorer_function)
        scorer_result_list = [result + (scorer,) for result in scorer_result]
        dataFrame = dataFrame.append(pd.DataFrame(scorer_result_list,columns=['Name', 'Score', 'Scorer']))

    df_sorted = dataFrame.sort_values('Score', ascending=False)
    df_sorted.reset_index(drop=True,inplace=True)
    df_grouped = df_sorted.groupby('Name').agg(Score_Sum=('Score', 'sum'), Score_Count=('Name', 'count'))
    df_grouped['Average'] = df_grouped['Score_Sum'] / df_grouped['Score_Count']
    df_grouped = df_grouped.sort_values('Average', ascending=False)
    return df_grouped.iloc[0, 0]


# Mock function simulating the API call, returning multiple restaurants
# 1st token
def query_api(Latitude, Longitude):
  TOKEN = token["token1"]
  headers = { "accept": "application/json", "Authorization": f"Bearer {TOKEN}"}
  url = f"https://api.yelp.com/v3/businesses/search?location=NYC&latitude={Latitude}&longitude={Longitude}&term=restaurants&radius=100&sort_by=review_count&limit=50&offset=1"
  response = requests.get(url, headers=headers)
  if response.status_code == 200:
    data = response.json()
    business_names = [business['name'] for business in data['businesses']]
  return data['businesses'], business_names

Downloaded the source file for extracting Yelp dataset

In [6]:
config_file_path = 'config.json'

# Load the JSON configuration file
with open(config_file_path, 'r') as config_file:
    config = json.load(config_file)

connection_string = config["connectionString"]

blob_service_client = BlobServiceClient.from_connection_string(connection_string)

container_name = "groupproject"
blob_name = "groupdata2_raw_NYCRest.csv"

blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
data = blob_client.download_blob()
csv_data = data.content_as_text()  # Get CSV data as text

# Process CSV data (example: load into pandas DataFrame)
restaurant_data = pd.read_csv(io.StringIO(csv_data))

In [7]:
restaurant_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221979 entries, 0 to 221978
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  221979 non-null  int64  
 1   dba                    221979 non-null  object 
 2   boro                   221979 non-null  object 
 3   building               221593 non-null  object 
 4   street                 221979 non-null  object 
 5   zipcode                219240 non-null  float64
 6   phone                  221979 non-null  object 
 7   inspection_date        221979 non-null  object 
 8   critical_flag          221979 non-null  object 
 9   record_date            221979 non-null  object 
 10  latitude               221723 non-null  float64
 11  longitude              221723 non-null  float64
 12  cuisine_description    221979 non-null  object 
 13  action                 221979 non-null  object 
 14  score                  213531 non-nu

In [8]:
# Load the CSV file into a DataFrame
df = restaurant_data

# Specify the columns you want to keep
columns_to_keep = [
    'dba', 'boro', 'building', 'street', 'zipcode', 'phone',
    'cuisine_description', 'latitude', 'longitude'
]

# Filter the DataFrame to include only the columns of interest
df_filtered = df[columns_to_keep]

#
df_filtered_nonan = df_filtered[df_filtered['cuisine_description'].notna()]

# Drop duplicates based on the 'DBA' column
df_unique_restaurants = df_filtered_nonan.drop_duplicates(subset=['dba'])

In [9]:
df_brooklyn = df_unique_restaurants[df_unique_restaurants['boro'] == 'Brooklyn']
df_queens = df_unique_restaurants[df_unique_restaurants['boro'] == 'Queens']
df_manhattan = df_unique_restaurants[df_unique_restaurants['boro'] == 'Manhattan']
df_bronx = df_unique_restaurants[df_unique_restaurants['boro'] == 'Bronx']
df_statenisland = df_unique_restaurants[df_unique_restaurants['boro'] == 'Staten Island']

In [10]:
df_bronx_sorted = df_bronx.sort_values(by=['latitude', 'longitude'], ascending=False)
df_bronx_sorted

Unnamed: 0,dba,boro,building,street,zipcode,phone,cuisine_description,latitude,longitude
3290,COLLEGE OF MOUNT SAINT VINCENT (Mag's Kitchen),Bronx,6301,RIVERDALE AVENUE,10471.0,7184053486,Coffee/Tea,40.912822,-73.902504
67407,COLLEGE OF MOUNT ST.VINCENT (HUDSON HEIGHTS; 1...,Bronx,6301,RIVERDALE AVENUE,10471.0,7184053486,American,40.912822,-73.902504
99666,COLLEGE OF MOUNT SAINT VINCENT,Bronx,6301,RIVERDALE AVENUE,10471.0,7184053486,American,40.912822,-73.902504
54193,Marianne Pizza Cafe,Bronx,6100,RIVERDALE AVENUE,10471.0,9144717274,Pizza,40.910473,-73.903108
2281,EJ'S BAKERY & CAFE,Bronx,462,WEST 261 STREET,10471.0,3479490023,Spanish,40.910012,-73.902671
...,...,...,...,...,...,...,...,...,...
72924,7 SPICES,Bronx,906908,E GUNHILL RD,,3473268646,Caribbean,0.000000,0.000000
85660,GOLDEN EAGLE RESTAURANT,Bronx,97579,MORRIS PARK AVENUE,,7188634028,American,0.000000,0.000000
95074,BAR & RESTAURANT EL SALVADORENO,Bronx,451455,MORRIS PK AVE,,3478794891,Latin American,0.000000,0.000000
105731,CHOP-STICKS RESTAURANT,Bronx,7718,21 AVENUE,,7182048119,Chinese,0.000000,0.000000


In [11]:
Latitude = 40.630629
Longitude = -73.966336

In [12]:
# Set to keep track of restaurants to be removed based on API calls or other criteria
restaurants_to_remove = set()
api_response_restaurants_list = []
index_to_skip = []

In [40]:
# Use iterrows() to iterate through DataFrame rows
for index, row in df_bronx_sorted.iterrows():
    if index in index_to_skip and len(index_to_skip) > 0:
      continue
    # Check if the current restaurantName is in the set of restaurants to remove
    if row['dba'] in restaurants_to_remove and len(restaurants_to_remove) > 0:
      for restaurant_name in restaurants_to_remove:
        # Check against each name in the DataFrame
        match_score = fuzz.token_set_ratio(restaurant_name, row['dba'])
        if match_score >= 60:
          matches = [(business['rating'], business['review_count']) for business in api_response_restaurants_list if business['name'] == restaurant_name]
          # Extract the rating and review count if a match is found
          if matches:
            rating, review_count = matches[0]  # Assuming only one match
            df_bronx_sorted.loc[index, 'yelp_rating'] = rating
            df_bronx_sorted.loc[index, 'yelp_review_count'] = review_count
            break
      continue  # Skip the rest of the loop and move to the next row

    # Call the API for each row's latitude and longitude not in restaurants_to_remove
    api_response_restaurants, restaurant_list = query_api(row['latitude'], row['longitude'])
    print(restaurant_list)
    print(api_response_restaurants)
    api_response_restaurants_list.extend(api_response_restaurants)
    restaurants_to_remove.update(restaurant_list)

    # Send to api_response_restaurants to Blob or anything because I want to keep those values for future references

    for restaurant_name in restaurants_to_remove:
      # Check against each name in the DataFrame
      match_score = fuzz.token_set_ratio(restaurant_name, row['dba'])
      if match_score >= 60:
        matches = [(business['rating'], business['review_count']) for business in api_response_restaurants_list if business['name'] == restaurant_name]
        # Extract the rating and review count if a match is found
        if matches:
          rating, review_count = matches[0]  # Assuming only one match
          print(f"Rating for '{restaurant_name}': {rating}")
          print(f"Review Count for '{restaurant_name}': {review_count}")
          df_bronx_sorted.loc[index, 'yelp_rating'] = rating
          df_bronx_sorted.loc[index, 'yelp_review_count'] = review_count
          break
    index_to_skip.append(index)

['XingLong', 'Fulton Cafe', 'City Gourmet Deli']
[{'id': '2S7AZXymi4xgmcM0XW8mTw', 'alias': 'xinglong-bronx-2', 'name': 'XingLong', 'image_url': 'https://s3-media4.fl.yelpcdn.com/bphoto/oIfBLKXGH-z45gzyMlLSEQ/o.jpg', 'is_closed': False, 'url': 'https://www.yelp.com/biz/xinglong-bronx-2?adjust_creative=bwRCjbHZyClcjCZ8VSlXnA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=bwRCjbHZyClcjCZ8VSlXnA', 'review_count': 3, 'categories': [{'alias': 'chinese', 'title': 'Chinese'}], 'rating': 4.0, 'coordinates': {'latitude': 40.8325, 'longitude': -73.90401}, 'transactions': ['delivery'], 'price': '$$$$', 'location': {'address1': '553 E 169th St', 'address2': None, 'address3': None, 'city': 'Bronx', 'zip_code': '10456', 'country': 'US', 'state': 'NY', 'display_address': ['553 E 169th St', 'Bronx, NY 10456']}, 'phone': '+17182938888', 'display_phone': '(718) 293-8888', 'distance': 92.09804335143308, 'attributes': {'business_temp_closed': None, 'menu_url': None, 'open24_hours': 

UnboundLocalError: local variable 'data' referenced before assignment

In [41]:
df_bronx_sorted

Unnamed: 0,dba,boro,building,street,zipcode,phone,cuisine_description,latitude,longitude,yelp_rating,yelp_review_count
3290,COLLEGE OF MOUNT SAINT VINCENT (Mag's Kitchen),Bronx,6301,RIVERDALE AVENUE,10471.0,7184053486,Coffee/Tea,40.912822,-73.902504,,
67407,COLLEGE OF MOUNT ST.VINCENT (HUDSON HEIGHTS; 1...,Bronx,6301,RIVERDALE AVENUE,10471.0,7184053486,American,40.912822,-73.902504,,
99666,COLLEGE OF MOUNT SAINT VINCENT,Bronx,6301,RIVERDALE AVENUE,10471.0,7184053486,American,40.912822,-73.902504,,
54193,Marianne Pizza Cafe,Bronx,6100,RIVERDALE AVENUE,10471.0,9144717274,Pizza,40.910473,-73.903108,,
2281,EJ'S BAKERY & CAFE,Bronx,462,WEST 261 STREET,10471.0,3479490023,Spanish,40.910012,-73.902671,,
...,...,...,...,...,...,...,...,...,...,...,...
72924,7 SPICES,Bronx,906908,E GUNHILL RD,,3473268646,Caribbean,0.000000,0.000000,,
85660,GOLDEN EAGLE RESTAURANT,Bronx,97579,MORRIS PARK AVENUE,,7188634028,American,0.000000,0.000000,,
95074,BAR & RESTAURANT EL SALVADORENO,Bronx,451455,MORRIS PK AVE,,3478794891,Latin American,0.000000,0.000000,,
105731,CHOP-STICKS RESTAURANT,Bronx,7718,21 AVENUE,,7182048119,Chinese,0.000000,0.000000,,


In [42]:
df_bronx_sorted.isna().sum()

dba                      0
boro                     0
building                 1
street                   0
zipcode                 25
phone                    0
cuisine_description      0
latitude                 1
longitude                1
yelp_rating            374
yelp_review_count      374
dtype: int64

In [45]:
df_bronx_final = df_bronx_sorted.dropna(subset=['yelp_rating', 'yelp_review_count'], how='any')
df_bronx_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1351 entries, 66782 to 17810
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dba                  1351 non-null   object 
 1   boro                 1351 non-null   object 
 2   building             1351 non-null   object 
 3   street               1351 non-null   object 
 4   zipcode              1351 non-null   float64
 5   phone                1351 non-null   object 
 6   cuisine_description  1351 non-null   object 
 7   latitude             1351 non-null   float64
 8   longitude            1351 non-null   float64
 9   yelp_rating          1351 non-null   float64
 10  yelp_review_count    1351 non-null   float64
dtypes: float64(5), object(6)
memory usage: 126.7+ KB


In [16]:
# 2nd token
def query_api(Latitude, Longitude):
  TOKEN = token["token2"]
  headers = { "accept": "application/json", "Authorization": f"Bearer {TOKEN}"}
  url = f"https://api.yelp.com/v3/businesses/search?location=NYC&latitude={Latitude}&longitude={Longitude}&term=restaurants&radius=100&sort_by=review_count&limit=50&offset=1"
  response = requests.get(url, headers=headers)
  if response.status_code == 200:
    data = response.json()
    business_names = [business['name'] for business in data['businesses']]
  return data['businesses'], business_names

In [20]:
# 3rd token
def query_api(Latitude, Longitude):
  TOKEN = token["token3"]
  headers = { "accept": "application/json", "Authorization": f"Bearer {TOKEN}"}
  url = f"https://api.yelp.com/v3/businesses/search?location=NYC&latitude={Latitude}&longitude={Longitude}&term=restaurants&radius=100&sort_by=review_count&limit=50&offset=1"
  response = requests.get(url, headers=headers)
  if response.status_code == 200:
    data = response.json()
    business_names = [business['name'] for business in data['businesses']]
  return data['businesses'], business_names

In [34]:
# 4th token
def query_api(Latitude, Longitude):
  TOKEN = token["token4"]
  headers = { "accept": "application/json", "Authorization": f"Bearer {TOKEN}"}
  url = f"https://api.yelp.com/v3/businesses/search?location=NYC&latitude={Latitude}&longitude={Longitude}&term=restaurants&radius=100&sort_by=review_count&limit=50&offset=1"
  response = requests.get(url, headers=headers)
  if response.status_code == 200:
    data = response.json()
    business_names = [business['name'] for business in data['businesses']]
  return data['businesses'], business_names

In [39]:
# 5th token
def query_api(Latitude, Longitude):
  TOKEN = token["token5"]
  headers = { "accept": "application/json", "Authorization": f"Bearer {TOKEN}"}
  url = f"https://api.yelp.com/v3/businesses/search?location=NYC&latitude={Latitude}&longitude={Longitude}&term=restaurants&radius=100&sort_by=review_count&limit=50&offset=1"
  response = requests.get(url, headers=headers)
  if response.status_code == 200:
    data = response.json()
    business_names = [business['name'] for business in data['businesses']]
  return data['businesses'], business_names

In [46]:
config_file_path = 'config.json'

with open(config_file_path, 'r') as config_file:
    config = json.load(config_file)

CONNECTION_STRING_AZURE_STORAGE = config["connectionString"]
CONTAINER_AZURE = "groupproject"
blob_name = "groupdata3_Yelp.csv"

# Convert DataFrame to CSV
output = StringIO()
df_bronx_final.to_csv(output, index=False)
data = output.getvalue()
output.close()

# Create the BlobServiceClient object
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE)

# Get a blob client using the container name and blob name
blob_client = blob_service_client.get_blob_client(container=CONTAINER_AZURE, blob=blob_name)

# Upload the CSV data
blob_client.upload_blob(data, overwrite=True)

print(f"Uploaded {blob_name} to Azure Blob Storage in container {CONTAINER_AZURE}.")


Uploaded groupdata3_Yelp.csv to Azure Blob Storage in container groupproject.
