In [5]:
!pip install azure-storage-blob
!pip install pyarrow
!pip install psycopg2 sqlalchemy



In [6]:
# import libraries
import pandas as pd
import numpy as np
import json
import requests
from io import StringIO
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from math import ceil
import datetime
import calendar
from sqlalchemy import create_engine

In [7]:
# Function
def azure_download_blob(connect_str, container_name, blob_name):
    """
    Download a blob from Azure Blob Storage and return its content as text.

    Parameters:
    connect_str: Connection string passed to BlobServiceClient.from_connection_string.
    container_name: Name of the container that holds the blob.
    blob_name: Name of the blob to download.

    Returns:
    str: The complete blob content decoded as UTF-8.
    """
    service = BlobServiceClient.from_connection_string(connect_str)
    # Resolve the blob, open a download stream and read it to the end in one chain
    raw_content = (
        service
        .get_blob_client(container=container_name, blob=blob_name)
        .download_blob()
        .readall()
    )
    return raw_content.decode('utf-8')

### Download Bronx DataFrame from Azure

In [9]:
# Relative path of the JSON file that holds the settings read below
config_file_path = 'config/config.json'

# Load the JSON configuration file
with open(config_file_path, 'r') as config_file:
    config = json.load(config_file)

# The connection string is read from the config file instead of being hardcoded in the notebook
connection_string = config["connection_string"]
# Container and blob that identify the Bronx CSV to download
container_name = "groupproject"
blob_name = "groupdata4_Merge_df_Bronx.csv"

In [10]:
# Download the blob as text, then parse it as CSV from an in-memory buffer
blob_content = azure_download_blob(connection_string, container_name, blob_name)
df_Bronx = pd.read_csv(StringIO(blob_content))
# Bare last expression so the notebook renders the DataFrame
df_Bronx

Unnamed: 0,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,cuisine_description,action,score,inspection_type,violation_code,violation_description,grade,grade_date,latitude,longitude,yelp_rating,yelp_review_count
0,CORKY'S DINER,Bronx,2535,GRAND CONCOURSE,10468.0,7189332484,2024-01-24,Critical,American,Violations were cited in the following area(s).,18.0,Cycle Inspection / Initial Inspection,02B,Hot TCS food item not held at or above 140 °F.,,,40.863278,-73.896514,3.0,113.0
1,PAPA JOHN'S (STAND 310),Bronx,1,EAST 161 STREET,10451.0,9172843260,2017-07-25,Not Applicable,Pizza,No violations were recorded at the time of thi...,0.0,Cycle Inspection / Initial Inspection,,,A,2017-07-25,40.829028,-73.928496,1.9,24.0
2,JADE PALACE,Bronx,163,EINSTEIN LOOP,10475.0,7183201584,2022-03-23,Critical,Chinese,Violations were cited in the following area(s).,31.0,Cycle Inspection / Initial Inspection,02B,Hot food item not held at or above 140º F.,,,40.864063,-73.822546,2.6,11.0
3,PINE BAR & GRILL,Bronx,1634,EASTCHESTER ROAD,10461.0,7183190900,2017-10-12,Critical,Italian,Violations were cited in the following area(s).,12.0,Cycle Inspection / Re-inspection,06F,Wiping cloths soiled or not stored in sanitizi...,A,2017-10-12,40.845277,-73.845095,3.0,2.0
4,LA ROLA RESTAURANT,Bronx,400,EAST 198 STREET,10458.0,9176881449,2024-03-06,Not Applicable,Spanish,Establishment re-opened by DOHMH.,0.0,Cycle Inspection / Reopening Inspection,,,Z,2024-03-06,40.866021,-73.886021,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12581,XIN HI CHINESE BUFFETT,Bronx,2053,BARTOW AVENUE,10475.0,7183792200,2023-01-20,Not Critical,Chinese,Violations were cited in the following area(s).,5.0,Cycle Inspection / Re-inspection,28-06,Contract with a pest management professional n...,A,2023-01-20,40.869468,-73.828515,4.0,5.0
12582,PROSPECT COFFEE SHOP,Bronx,1309,PROSPECT AVENUE,10459.0,7189911249,2023-03-20,Not Critical,American,Violations were cited in the following area(s).,9.0,Cycle Inspection / Initial Inspection,10F,Non-food contact surface or equipment made of ...,A,2023-03-20,40.828042,-73.898148,0.0,0.0
12583,"SAKE II JAPANESE RESTAURANT, SUSHI & HIBACHI",Bronx,690,EAST 187 STREET,10458.0,7182200988,2023-01-11,Critical,Japanese,Violations were cited in the following area(s).,30.0,Cycle Inspection / Re-inspection,04H,"Raw, cooked or prepared food is adulterated, c...",C,2023-01-11,40.854192,-73.884643,2.7,3.0
12584,FOO-HING KITCHEN,Bronx,2895,SEDGWICK AVENUE,10468.0,7188846267,2021-09-07,Critical,Chinese,Violations were cited in the following area(s).,22.0,Cycle Inspection / Initial Inspection,06B,"Tobacco use, eating, or drinking from open con...",,,40.874944,-73.901262,0.0,0.0


### Create Date Dimension

In [11]:
# Functions
def get_week_of_year(date_str):
    """
    Calculate the ISO week number of the year for a given date.

    Parameters:
    date_str (str): A date string in the format 'YYYY-MM-DD'.

    Returns:
    int: ISO week number of the year.

    Raises:
    ValueError: If date_str does not match the 'YYYY-MM-DD' format.
    """
    # The notebook does `import datetime` (the module), so the class has to be
    # reached as `datetime.datetime`; `datetime.strptime` raises AttributeError.
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d')

    # isocalendar() yields (ISO year, ISO week number, ISO weekday)
    return date.isocalendar()[1]


def week_of_month(dt):
    """
    Return the week of the month for a date, counted in 7-day blocks from the 1st.

    Days 1-7 are week 1, days 8-14 are week 2, and so on up to week 5 for
    days 29-31. The weekday on which the month starts is not taken into account.

    This is the single definition of `week_of_month`; the cell previously also
    contained an earlier calendar-row based definition that was silently
    overridden by this one.

    Parameters:
    dt: Any object exposing a `day` attribute (e.g. datetime.date, pandas.Timestamp).

    Returns:
    int: Week number within the month, from 1 to 5.
    """
    return (dt.day - 1) // 7 + 1

In [12]:
# Earliest and latest grade dates, ignoring rows where grade_date is missing
min_grade_date = df_Bronx['grade_date'].dropna().min()
max_grade_date = df_Bronx['grade_date'].dropna().max()

In [13]:
# Range the date dimension must cover: earliest and latest value across
# both the inspection dates and the (non-missing) grade dates.
# NOTE(review): both columns appear to be 'YYYY-MM-DD' strings (read_csv is called
# without parse_dates), so min/max compare them as text — confirm the format is uniform.
start_date = min(df_Bronx['inspection_date'].min(), min_grade_date)
end_date = max(df_Bronx['inspection_date'].max(), max_grade_date)
print(start_date, end_date)

2016-05-25 2024-04-13


In [14]:
# One row per calendar day between start_date and end_date (daily frequency)
dim_date_df = pd.DataFrame({'date': pd.date_range(start_date, end_date, freq='D')})
dim_date_df.head(10)

Unnamed: 0,date
0,2016-05-25
1,2016-05-26
2,2016-05-27
3,2016-05-28
4,2016-05-29
5,2016-05-30
6,2016-05-31
7,2016-06-01
8,2016-06-02
9,2016-06-03


In [15]:
# Derive the date-dimension attributes from the 'date' column.
# Columns are added in this exact order because it defines the table layout.

# Identifier formatted as YYYYMMDD and the ISO date text (YYYY-MM-DD)
dim_date_df['Date_ID'] = dim_date_df['date'].dt.strftime('%Y%m%d')
dim_date_df['Date_Iso_Fromat'] = dim_date_df['date'].dt.strftime('%Y-%m-%d')

# Numeric calendar parts
dim_date_df['Year_Number'] = dim_date_df['date'].dt.year
dim_date_df['Quarter_Number'] = dim_date_df['date'].dt.quarter
dim_date_df['Month_Number'] = dim_date_df['date'].dt.month
dim_date_df['Day_Number'] = dim_date_df['date'].dt.day

# Full month and weekday names
dim_date_df['Month_Name'] = dim_date_df['date'].dt.strftime('%B')
dim_date_df['Day_Name'] = dim_date_df['date'].dt.strftime('%A')

# Week numbers: '%U' is the week of the year with Sunday as the first day of the
# week (kept as the formatted string); week of the month comes from week_of_month()
dim_date_df['Week_of_the_Year'] = dim_date_df['date'].dt.strftime('%U')
dim_date_df['Week_of_the_Month'] = dim_date_df['date'].apply(week_of_month)

dim_date_df.head(20)

Unnamed: 0,date,Date_ID,Date_Iso_Fromat,Year_Number,Quarter_Number,Month_Number,Day_Number,Month_Name,Day_Name,Week_of_the_Year,Week_of_the_Month
0,2016-05-25,20160525,2016-05-25,2016,2,5,25,May,Wednesday,21,4
1,2016-05-26,20160526,2016-05-26,2016,2,5,26,May,Thursday,21,4
2,2016-05-27,20160527,2016-05-27,2016,2,5,27,May,Friday,21,4
3,2016-05-28,20160528,2016-05-28,2016,2,5,28,May,Saturday,21,4
4,2016-05-29,20160529,2016-05-29,2016,2,5,29,May,Sunday,22,5
5,2016-05-30,20160530,2016-05-30,2016,2,5,30,May,Monday,22,5
6,2016-05-31,20160531,2016-05-31,2016,2,5,31,May,Tuesday,22,5
7,2016-06-01,20160601,2016-06-01,2016,2,6,1,June,Wednesday,22,1
8,2016-06-02,20160602,2016-06-02,2016,2,6,2,June,Thursday,22,1
9,2016-06-03,20160603,2016-06-03,2016,2,6,3,June,Friday,22,1


In [16]:
# Drop the raw 'date' column to obtain the final date dimension table.
# The column is dropped by name rather than by position so the result does not
# depend on 'date' happening to be the first column.
dim_date_final_df = dim_date_df.drop(columns=['date'])
dim_date_final_df

Unnamed: 0,Date_ID,Date_Iso_Fromat,Year_Number,Quarter_Number,Month_Number,Day_Number,Month_Name,Day_Name,Week_of_the_Year,Week_of_the_Month
0,20160525,2016-05-25,2016,2,5,25,May,Wednesday,21,4
1,20160526,2016-05-26,2016,2,5,26,May,Thursday,21,4
2,20160527,2016-05-27,2016,2,5,27,May,Friday,21,4
3,20160528,2016-05-28,2016,2,5,28,May,Saturday,21,4
4,20160529,2016-05-29,2016,2,5,29,May,Sunday,22,5
...,...,...,...,...,...,...,...,...,...,...
2876,20240409,2024-04-09,2024,2,4,9,April,Tuesday,14,2
2877,20240410,2024-04-10,2024,2,4,10,April,Wednesday,14,2
2878,20240411,2024-04-11,2024,2,4,11,April,Thursday,14,2
2879,20240412,2024-04-12,2024,2,4,12,April,Friday,14,2
