In [2]:
!pip install azure-storage-blob # Microoft Azure
!pip install pyarrow
!pip install psycopg2 sqlalchemy

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.19.1-py3-none-any.whl (394 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.5/394.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core<2.0.0,>=1.28.0 (from azure-storage-blob)
  Downloading azure_core-1.30.1-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.30.1 azure-storage-blob-12.19.1 isodate-0.6.1


In [3]:
import pandas as pd
import numpy as np
import json
import requests
from io import StringIO
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from math import ceil
import datetime
import calendar
from sqlalchemy import create_engine

In [4]:
# Azure Blob Storage helper functions

def azure_download_blob(connect_str, container_name, blob_name):
    """Fetch one blob from Azure Blob Storage and return its entire content.

    Args:
        connect_str: Connection string handed to
            ``BlobServiceClient.from_connection_string``.
        container_name: Container in which the blob is stored.
        blob_name: Name of the blob to download.

    Returns:
        The complete content produced by the download stream's ``readall()``.
    """
    service = BlobServiceClient.from_connection_string(connect_str)
    client = service.get_blob_client(container=container_name, blob=blob_name)
    # Read the whole stream in one go rather than iterating over chunks.
    return client.download_blob().readall()


In [5]:
# Read the notebook settings from the local JSON configuration file.
config_file_path = 'config.json'
with open(config_file_path, 'r') as config_file:
    config = json.load(config_file)

# Storage coordinates: the connection string comes from the config file,
# the container and blob names are fixed for this notebook.
connection_string = config["connectionString"]
container_name = "groupproject"
blob_name = "groupdata4_Merge_df_Bronx.csv"

# Download the blob, decode it as UTF-8 text, and parse the text as CSV.
blob_content = azure_download_blob(
    connection_string, container_name, blob_name
).decode('utf-8')
group_df = pd.read_csv(StringIO(blob_content))
group_df.head()

Unnamed: 0,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,cuisine_description,action,score,inspection_type,violation_code,violation_description,grade,grade_date,latitude,longitude,yelp_rating,yelp_review_count
0,CORKY'S DINER,Bronx,2535,GRAND CONCOURSE,10468.0,7189332484,2024-01-24,Critical,American,Violations were cited in the following area(s).,18.0,Cycle Inspection / Initial Inspection,02B,Hot TCS food item not held at or above 140 °F.,,,40.863278,-73.896514,3.0,113.0
1,PAPA JOHN'S (STAND 310),Bronx,1,EAST 161 STREET,10451.0,9172843260,2017-07-25,Not Applicable,Pizza,No violations were recorded at the time of thi...,0.0,Cycle Inspection / Initial Inspection,,,A,2017-07-25,40.829028,-73.928496,1.9,24.0
2,JADE PALACE,Bronx,163,EINSTEIN LOOP,10475.0,7183201584,2022-03-23,Critical,Chinese,Violations were cited in the following area(s).,31.0,Cycle Inspection / Initial Inspection,02B,Hot food item not held at or above 140º F.,,,40.864063,-73.822546,2.6,11.0
3,PINE BAR & GRILL,Bronx,1634,EASTCHESTER ROAD,10461.0,7183190900,2017-10-12,Critical,Italian,Violations were cited in the following area(s).,12.0,Cycle Inspection / Re-inspection,06F,Wiping cloths soiled or not stored in sanitizi...,A,2017-10-12,40.845277,-73.845095,3.0,2.0
4,LA ROLA RESTAURANT,Bronx,400,EAST 198 STREET,10458.0,9176881449,2024-03-06,Not Applicable,Spanish,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,,Z,2024-03-06,40.866021,-73.886021,0.0,0.0


In [6]:
# Show the column labels of the loaded DataFrame.
group_df.columns

Index(['dba', 'boro', 'building', 'street', 'zipcode', 'phone',
       'inspection_date', 'critical_flag', 'cuisine_description', 'action',
       'score', 'inspection_type', 'violation_code', 'violation_description',
       'grade', 'grade_date', 'latitude', 'longitude', 'yelp_rating',
       'yelp_review_count'],
      dtype='object')

In [7]:
# Convert zipcode (parsed from the CSV as a float, e.g. 10468.0) to pandas'
# nullable integer dtype so it is stored as a whole number while still
# allowing missing values.
group_df['zipcode'] = group_df['zipcode'].astype(pd.Int64Dtype())

In [9]:
# Build the location dimension: one row per distinct combination of the
# address and coordinate columns, keyed by a sequential id starting at 1.
location_df = (
    group_df[['boro', 'building', 'street', 'zipcode', 'latitude', 'longitude']]
    .drop_duplicates()
    .assign(location_id=lambda frame: range(1, len(frame) + 1))
)

# Move the id to the front; the row index from group_df is left untouched.
location_dimension = location_df[
    ['location_id', 'boro', 'building', 'street', 'zipcode', 'latitude', 'longitude']
]

location_dimension.head()

Unnamed: 0,location_id,boro,building,street,zipcode,latitude,longitude
0,1,Bronx,2535,GRAND CONCOURSE,10468,40.863278,-73.896514
1,2,Bronx,1,EAST 161 STREET,10451,40.829028,-73.928496
2,3,Bronx,163,EINSTEIN LOOP,10475,40.864063,-73.822546
3,4,Bronx,1634,EASTCHESTER ROAD,10461,40.845277,-73.845095
4,5,Bronx,400,EAST 198 STREET,10458,40.866021,-73.886021


In [12]:
# Give the location dimension its presentation column names.
# NOTE(review): the saved output of this cell shows `LocationID`, whereas this
# mapping produces `Location_ID` — the output looks stale; confirm which name
# downstream code expects.
new_column_names = {
    'boro': 'Borough',
    'building': 'Building',
    'street': 'Street',
    'zipcode': 'Zipcode',
    'location_id': 'Location_ID',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
}

# rename returns a new frame; labels absent from the mapping are left as they are.
location_dimension = location_dimension.rename(new_column_names, axis='columns')
location_dimension

Unnamed: 0,LocationID,Borough,Building,Street,Zipcode,Latitude,Longitude
0,1,Bronx,2535,GRAND CONCOURSE,10468,40.863278,-73.896514
1,2,Bronx,1,EAST 161 STREET,10451,40.829028,-73.928496
2,3,Bronx,163,EINSTEIN LOOP,10475,40.864063,-73.822546
3,4,Bronx,1634,EASTCHESTER ROAD,10461,40.845277,-73.845095
4,5,Bronx,400,EAST 198 STREET,10458,40.866021,-73.886021
...,...,...,...,...,...,...,...
11097,1331,Bronx,1544,BOONE AVENUE,10460,40.832998,-73.885099
11750,1332,Bronx,1883,WEBSTER AVENUE,10457,40.847327,-73.901169
11813,1333,Bronx,2559,BOSTON ROAD,10467,40.864550,-73.863699
11824,1334,Bronx,4277,KATONAH AVENUE,10470,40.897845,-73.867251
