In [1]:
!pip install azure-storage-blob # Microoft Azure
!pip install pyarrow
!pip install psycopg2 sqlalchemy

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.19.1-py3-none-any.whl (394 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.5/394.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core<2.0.0,>=1.28.0 (from azure-storage-blob)
  Downloading azure_core-1.30.1-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.30.1 azure-storage-blob-12.19.1 isodate-0.6.1


In [2]:
import calendar
import datetime
import json
import os
from io import StringIO
from math import ceil
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from sqlalchemy import create_engine

In [3]:
# Azure Functions

def azure_download_blob(connect_str, container_name, blob_name):
    """Download a blob from Azure Blob Storage and return its full content.

    Args:
        connect_str: Connection string of the Azure Storage account.
        container_name: Name of the container that holds the blob.
        blob_name: Name of the blob inside the container.

    Returns:
        Whatever ``readall()`` yields for the download stream; the caller in
        this notebook decodes it as UTF-8, i.e. treats it as bytes.
    """
    # Use the service client as a context manager so it is closed as soon as
    # the download has been read completely, instead of being left open.
    with BlobServiceClient.from_connection_string(connect_str) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
        return blob_client.download_blob().readall()


In [4]:
# Location of the JSON file that holds the Azure connection settings
config_file_path = 'config.json'

# Parse the settings file into a dict
with open(config_file_path) as config_file:
    config = json.loads(config_file.read())

# Blob to fetch: connection string from the config, plus container and file name
connection_string = config["connectionString"]
container_name = "groupproject"
blob_name = "groupdata4_Merge_df_Bronx.csv"

# Download the CSV, decode it as UTF-8 text and load it into a DataFrame
blob_content = azure_download_blob(connection_string, container_name, blob_name).decode('utf-8')
group_df = pd.read_csv(StringIO(blob_content))
group_df.head()

Unnamed: 0,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,cuisine_description,action,score,inspection_type,violation_code,violation_description,grade,grade_date,latitude,longitude,yelp_rating,yelp_review_count
0,CORKY'S DINER,Bronx,2535,GRAND CONCOURSE,10468.0,7189332484,2024-01-24,Critical,American,Violations were cited in the following area(s).,18.0,Cycle Inspection / Initial Inspection,02B,Hot TCS food item not held at or above 140 °F.,,,40.863278,-73.896514,3.0,113.0
1,PAPA JOHN'S (STAND 310),Bronx,1,EAST 161 STREET,10451.0,9172843260,2017-07-25,Not Applicable,Pizza,No violations were recorded at the time of thi...,0.0,Cycle Inspection / Initial Inspection,,,A,2017-07-25,40.829028,-73.928496,1.9,24.0
2,JADE PALACE,Bronx,163,EINSTEIN LOOP,10475.0,7183201584,2022-03-23,Critical,Chinese,Violations were cited in the following area(s).,31.0,Cycle Inspection / Initial Inspection,02B,Hot food item not held at or above 140º F.,,,40.864063,-73.822546,2.6,11.0
3,PINE BAR & GRILL,Bronx,1634,EASTCHESTER ROAD,10461.0,7183190900,2017-10-12,Critical,Italian,Violations were cited in the following area(s).,12.0,Cycle Inspection / Re-inspection,06F,Wiping cloths soiled or not stored in sanitizi...,A,2017-10-12,40.845277,-73.845095,3.0,2.0
4,LA ROLA RESTAURANT,Bronx,400,EAST 198 STREET,10458.0,9176881449,2024-03-06,Not Applicable,Spanish,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,,Z,2024-03-06,40.866021,-73.886021,0.0,0.0


In [5]:
# Show the column labels of the loaded DataFrame
group_df.columns

Index(['dba', 'boro', 'building', 'street', 'zipcode', 'phone',
       'inspection_date', 'critical_flag', 'cuisine_description', 'action',
       'score', 'inspection_type', 'violation_code', 'violation_description',
       'grade', 'grade_date', 'latitude', 'longitude', 'yelp_rating',
       'yelp_review_count'],
      dtype='object')

In [6]:
# Consolidation: store zipcode with pandas' nullable integer dtype
# (the loaded CSV shows it as a float, e.g. 10468.0)
group_df['zipcode'] = group_df['zipcode'].astype(pd.Int64Dtype())

In [13]:
# Build the location dimension: one row per distinct address / coordinate combination
location_columns = ['boro', 'building', 'street', 'zipcode', 'latitude', 'longitude']
location_df = group_df[location_columns].drop_duplicates()

# Surrogate key: consecutive integers starting at 1, in row order
# (drop_duplicates keeps the original row labels, so the index is not consecutive)
location_df['location_id'] = range(1, len(location_df) + 1)

# Put the key first, followed by the descriptive attributes
location_dimension = location_df[['location_id'] + location_columns]

location_dimension.head()

Unnamed: 0,location_id,boro,building,street,zipcode,latitude,longitude
0,1,Bronx,2535,GRAND CONCOURSE,10468,40.863278,-73.896514
1,2,Bronx,1,EAST 161 STREET,10451,40.829028,-73.928496
2,3,Bronx,163,EINSTEIN LOOP,10475,40.864063,-73.822546
3,4,Bronx,1634,EASTCHESTER ROAD,10461,40.845277,-73.845095
4,5,Bronx,400,EAST 198 STREET,10458,40.866021,-73.886021


In [14]:
# Rename 'boro' to 'borough'; every other column keeps its current name
new_column_names = dict(boro='borough')

location_dimension = location_dimension.rename(columns=new_column_names)
location_dimension

Unnamed: 0,location_id,borough,building,street,zipcode,latitude,longitude
0,1,Bronx,2535,GRAND CONCOURSE,10468,40.863278,-73.896514
1,2,Bronx,1,EAST 161 STREET,10451,40.829028,-73.928496
2,3,Bronx,163,EINSTEIN LOOP,10475,40.864063,-73.822546
3,4,Bronx,1634,EASTCHESTER ROAD,10461,40.845277,-73.845095
4,5,Bronx,400,EAST 198 STREET,10458,40.866021,-73.886021
...,...,...,...,...,...,...,...
10267,1258,Bronx,1544,BOONE AVENUE,10460,40.832998,-73.885099
10885,1259,Bronx,1883,WEBSTER AVENUE,10457,40.847327,-73.901169
10945,1260,Bronx,2559,BOSTON ROAD,10467,40.864550,-73.863699
10955,1261,Bronx,4277,KATONAH AVENUE,10470,40.897845,-73.867251


In [15]:
# Database connection URL
# The password is read from an environment variable instead of being written
# into the notebook, so the credential is not saved or shared with the file.
# Set it before starting the kernel, e.g. `export CIS9440_DW_PASSWORD=...`.
# NOTE(review): the password that used to be hardcoded here should be treated
# as exposed and rotated.
DB_PASSWORD_ENV_VAR = 'CIS9440_DW_PASSWORD'
pwd = os.environ.get(DB_PASSWORD_ENV_VAR)
if not pwd:
    raise RuntimeError(
        f'Environment variable {DB_PASSWORD_ENV_VAR} is not set; '
        'export the database password before running this cell.'
    )

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# break the URL if they appear in the password.
database_url = (
    f'postgresql://group1:{quote_plus(pwd)}'
    '@cis9440-group1-dw.postgres.database.azure.com/postgres'
)

# Create a SQLAlchemy engine
engine = create_engine(database_url)

In [17]:
# Append the location dimension rows to nyc_restaurant_inspection.dim_location.
# index=False keeps the DataFrame index out of the inserted columns.
# NOTE(review): the recorded run of this cell failed with "numeric field
# overflow ... precision 20, scale 20". Presumably the latitude / longitude
# columns of the target table are NUMERIC(20, 20) and cannot hold values such
# as 40.86 or -73.89 — confirm the table DDL and widen those columns.
# NOTE(review): the recorded INSERT targets schema 'nyc_resturant_inspection',
# which differs from the schema name used here — confirm which one exists.
location_dimension.to_sql(
    name='dim_location',
    con=engine,
    schema='nyc_restaurant_inspection',
    if_exists='append',
    index=False,
)

DataError: (psycopg2.errors.NumericValueOutOfRange) numeric field overflow
DETAIL:  A field with precision 20, scale 20 must round to an absolute value less than 1.

[SQL: INSERT INTO nyc_resturant_inspection.dim_location (location_id, borough, building, street, zipcode, latitude, longitude) VALUES (%(location_id__0)s, %(borough__0)s, %(building__0)s, %(street__0)s, %(zipcode__0)s, %(latitude__0)s, %(longitude__0)s), ( ... 134006 characters truncated ... _999)s, %(building__999)s, %(street__999)s, %(zipcode__999)s, %(latitude__999)s, %(longitude__999)s)]
[parameters: {'longitude__0': -73.896514265255, 'latitude__0': 40.863277599272, 'borough__0': 'Bronx', 'street__0': 'GRAND CONCOURSE', 'building__0': '2535', 'location_id__0': 1, 'zipcode__0': 10468, 'longitude__1': -73.928496459645, 'latitude__1': 40.829028434834, 'borough__1': 'Bronx', 'street__1': 'EAST  161 STREET', 'building__1': '1', 'location_id__1': 2, 'zipcode__1': 10451, 'longitude__2': -73.822545666882, 'latitude__2': 40.864062770858, 'borough__2': 'Bronx', 'street__2': 'EINSTEIN LOOP', 'building__2': '163', 'location_id__2': 3, 'zipcode__2': 10475, 'longitude__3': -73.84509543926, 'latitude__3': 40.84527746261, 'borough__3': 'Bronx', 'street__3': 'EASTCHESTER ROAD', 'building__3': '1634', 'location_id__3': 4, 'zipcode__3': 10461, 'longitude__4': -73.886021414235, 'latitude__4': 40.866020638107, 'borough__4': 'Bronx', 'street__4': 'EAST  198 STREET', 'building__4': '400', 'location_id__4': 5, 'zipcode__4': 10458, 'longitude__5': -73.867615847807, 'latitude__5': 40.899920230195, 'borough__5': 'Bronx', 'street__5': 'KATONAH AVENUE', 'building__5': '4336', 'location_id__5': 6, 'zipcode__5': 10470, 'longitude__6': -73.924445355612, 'latitude__6': 40.84205237211, 'borough__6': 'Bronx', 'street__6': 'OGDEN AVENUE', 'building__6': '1381', 'location_id__6': 7, 'zipcode__6': 10452, 'longitude__7': -73.925693054974 ... 6900 parameters truncated ... 
'zipcode__992': 10461, 'longitude__993': -73.863370222821, 'latitude__993': 40.832751693756, 'borough__993': 'Bronx', 'street__993': 'WESTCHESTER AVENUE', 'building__993': '1864', 'location_id__993': 994, 'zipcode__993': 10472, 'longitude__994': -73.921187045504, 'latitude__994': 40.840631154326, 'borough__994': 'Bronx', 'street__994': 'JESUP AVENUE', 'building__994': '1372', 'location_id__994': 995, 'zipcode__994': 10452, 'longitude__995': -73.910588466456, 'latitude__995': 40.850147630028, 'borough__995': 'Bronx', 'street__995': 'JEROME AVENUE', 'building__995': '1898', 'location_id__995': 996, 'zipcode__995': 10453, 'longitude__996': -73.846133336745, 'latitude__996': 40.837247747129, 'borough__996': 'Bronx', 'street__996': 'WESTCHESTER AVENUE', 'building__996': '2401', 'location_id__996': 997, 'zipcode__996': 10461, 'longitude__997': -73.918477868008, 'latitude__997': 40.818921110065, 'borough__997': 'Bronx', 'street__997': 'COURTLANDT AVENUE', 'building__997': '647', 'location_id__997': 998, 'zipcode__997': 10451, 'longitude__998': -73.897631426006, 'latitude__998': 40.905379915681, 'borough__998': 'Bronx', 'street__998': 'MOSHOLU AVENUE', 'building__998': '5790', 'location_id__998': 999, 'zipcode__998': 10471, 'longitude__999': -73.885844527342, 'latitude__999': 40.842783543227, 'borough__999': 'Bronx', 'street__999': 'EAST TREMONT AVENUE', 'building__999': '880', 'location_id__999': 1000, 'zipcode__999': 10460}]
(Background on this error at: https://sqlalche.me/e/20/9h9h)