## Import and Install Libraries

In [224]:
# Install altair
# !pip install "altair[all]==5.5.0"



In [225]:
# Install faker
# !pip install faker



In [226]:
# Install psycopg3
# !pip install psycopg[binary]



In [353]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime
import altair as alt
from faker import Faker
import random
from itertools import chain
import psycopg
from psycopg import sql
from io import StringIO
import warnings
warnings.filterwarnings('ignore')

# PART 2(A): ETL

## Import Data from CSV

In [354]:
# Define file paths
FILE_PATHS = [f'..\\Data\\0_NYC311_raw\\NYC311_{i}.csv' for i in range(10)]

# Read all files
dfs = [pd.read_csv(f) for f in FILE_PATHS]

# Concatenate data
df = pd.concat(dfs)

# Check info and head
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 0 to 29999
Data columns (total 41 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   unique_key                      300000 non-null  int64  
 1   created_date                    300000 non-null  object 
 2   agency                          300000 non-null  object 
 3   agency_name                     300000 non-null  object 
 4   complaint_type                  300000 non-null  object 
 5   descriptor                      294808 non-null  object 
 6   location_type                   265981 non-null  object 
 7   incident_zip                    297401 non-null  float64
 8   incident_address                290795 non-null  object 
 9   street_name                     290785 non-null  object 
 10  cross_street_1                  193748 non-null  object 
 11  cross_street_2                  193849 non-null  object 
 12  intersection_street_1 

Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,location_type,incident_zip,incident_address,street_name,...,resolution_description,resolution_action_updated_date,vehicle_type,facility_type,bridge_highway_name,bridge_highway_segment,taxi_company_borough,bridge_highway_direction,road_ramp,due_date
0,66932951,2025-11-24T02:06:15.000,NYPD,New York City Police Department,Noise - Residential,Loud Music/Party,Residential Building/House,10306.0,1742 RICHMOND ROAD,RICHMOND ROAD,...,,,,,,,,,,
1,66927366,2025-11-24T02:05:57.000,NYPD,New York City Police Department,Noise - Residential,Loud Television,Residential Building/House,10002.0,280 MADISON STREET,MADISON STREET,...,,,,,,,,,,
2,66927332,2025-11-24T02:05:27.000,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Store/Commercial,11103.0,42-12 BROADWAY,BROADWAY,...,,,,,,,,,,
3,66927308,2025-11-24T02:04:27.000,NYPD,New York City Police Department,Illegal Parking,Commercial Overnight Parking,Street/Sidewalk,10470.0,343 EAST 238 STREET,EAST 238 STREET,...,,,,,,,,,,
4,66931824,2025-11-24T02:03:41.000,NYPD,New York City Police Department,Noise - Street/Sidewalk,Loud Talking,Street/Sidewalk,11385.0,2025 WOODBINE STREET,WOODBINE STREET,...,,,,,,,,,,


## Data Cleaning

In [355]:
# Convert date fields into datetime format for ingest into DB
date_cols = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']

# define conversion function
def time_nan(x):
  if pd.isna(x):
    return None
  else:
    return datetime.fromisoformat(str(x)) 

# Apply function
for col in date_cols:
  df[col] = df[col].apply(time_nan).replace({pd.NaT: None})
  print(df[df[col].notna()][col].head())

0   2025-11-24 02:06:15
1   2025-11-24 02:05:57
2   2025-11-24 02:05:27
3   2025-11-24 02:04:27
4   2025-11-24 02:03:41
Name: created_date, dtype: datetime64[ns]
13    2025-11-24 02:02:17
14    2025-11-24 01:59:06
21    2025-11-24 01:52:21
49    2025-11-24 01:57:41
53    2025-11-24 01:57:43
Name: closed_date, dtype: object
3908    2025-12-23 15:29:17
4429    2025-12-23 14:15:37
6002    2025-12-23 11:25:12
7067    2025-12-23 08:56:44
7258    2025-12-23 08:28:57
Name: due_date, dtype: object
13    2025-11-24 02:02:26
14    2025-11-24 01:59:12
21    2025-11-24 01:52:25
34    2025-11-24 01:46:22
39    2025-11-24 01:44:02
Name: resolution_action_updated_date, dtype: object


In [356]:
# Drop unneeded columns
drop_cols = ['street_name', # duplicate of incident_address
             'bbl', # duplicate of borough
             'park_borough', # duplicate of borough
             'location', # repeat of x and y coords
             'facility_type', # underused, not related to subtables (below)
             'vehicle_type'] # underused, not related to subtables (below)
df = df.drop(columns=drop_cols)

## Organize Data into Tables



### Agency names table

In [357]:
# Make table of agency_name and agency(pk)
agency_table = df[['agency_name', 'agency']].drop_duplicates().reset_index(drop=True)
print(agency_table.info())
agency_table.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   agency_name  15 non-null     object
 1   agency       15 non-null     object
dtypes: object(2)
memory usage: 372.0+ bytes
None


Unnamed: 0,agency_name,agency
0,New York City Police Department,NYPD
1,Taxi and Limousine Commission,TLC
2,Department of Sanitation,DSNY
3,Department of Consumer and Worker Protection,DCWP
4,Department of Health and Mental Hygiene,DOHMH


In [358]:
# Drop agency_name from main table
df_main = df.drop(columns='agency_name')

### Complaint details table

In [359]:
# Make a table of unique Agency/Compaint/Description/Location Type combinations
complaint_table = df[['agency', 'complaint_type', 'descriptor', 'location_type']].drop_duplicates().reset_index(drop=True)
complaint_table['descriptor'].rename('complaint_descriptor')
print(complaint_table.info())
complaint_table.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1286 entries, 0 to 1285
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   agency          1286 non-null   object
 1   complaint_type  1286 non-null   object
 2   descriptor      1240 non-null   object
 3   location_type   1011 non-null   object
dtypes: object(4)
memory usage: 40.3+ KB
None


Unnamed: 0,agency,complaint_type,descriptor,location_type
0,NYPD,Noise - Residential,Loud Music/Party,Residential Building/House
1,NYPD,Noise - Residential,Loud Television,Residential Building/House
2,NYPD,Noise - Commercial,Loud Music/Party,Store/Commercial
3,NYPD,Illegal Parking,Commercial Overnight Parking,Street/Sidewalk
4,NYPD,Noise - Street/Sidewalk,Loud Talking,Street/Sidewalk


In [360]:
# Add unique key(pk) to complaint table
complaint_table['complaint_id'] = complaint_table['agency'].str.cat(others=complaint_table.index.astype(str))
complaint_table.head()

Unnamed: 0,agency,complaint_type,descriptor,location_type,complaint_id
0,NYPD,Noise - Residential,Loud Music/Party,Residential Building/House,NYPD0
1,NYPD,Noise - Residential,Loud Television,Residential Building/House,NYPD1
2,NYPD,Noise - Commercial,Loud Music/Party,Store/Commercial,NYPD2
3,NYPD,Illegal Parking,Commercial Overnight Parking,Street/Sidewalk,NYPD3
4,NYPD,Noise - Street/Sidewalk,Loud Talking,Street/Sidewalk,NYPD4


In [361]:
# Join complaint_table and df_main
df_main = df_main.merge(complaint_table, on=['agency', 'complaint_type', 'descriptor', 'location_type'])

# Check
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 35 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   unique_key                      300000 non-null  int64         
 1   created_date                    300000 non-null  datetime64[ns]
 2   agency                          300000 non-null  object        
 3   complaint_type                  300000 non-null  object        
 4   descriptor                      294808 non-null  object        
 5   location_type                   265981 non-null  object        
 6   incident_zip                    297401 non-null  float64       
 7   incident_address                290795 non-null  object        
 8   cross_street_1                  193748 non-null  object        
 9   cross_street_2                  193849 non-null  object        
 10  intersection_street_1           182406 non-null  object 

In [362]:
# Now we can drop agency, complaint_type, descriptor, and location_type from df_main
df_main = df_main.drop(columns=['agency', 'complaint_type', 'descriptor', 'location_type'])
df_main.info()
df_main.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   unique_key                      300000 non-null  int64         
 1   created_date                    300000 non-null  datetime64[ns]
 2   incident_zip                    297401 non-null  float64       
 3   incident_address                290795 non-null  object        
 4   cross_street_1                  193748 non-null  object        
 5   cross_street_2                  193849 non-null  object        
 6   intersection_street_1           182406 non-null  object        
 7   intersection_street_2           182567 non-null  object        
 8   address_type                    298727 non-null  object        
 9   city                            288420 non-null  object        
 10  landmark                        165089 non-null  object 

Unnamed: 0,unique_key,created_date,incident_zip,incident_address,cross_street_1,cross_street_2,intersection_street_1,intersection_street_2,address_type,city,...,closed_date,resolution_description,resolution_action_updated_date,bridge_highway_name,bridge_highway_segment,taxi_company_borough,bridge_highway_direction,road_ramp,due_date,complaint_id
0,66932951,2025-11-24 02:06:15,10306.0,1742 RICHMOND ROAD,DONGAN HILLS AVENUE,SEAVER AVENUE,DONGAN HILLS AVENUE,SEAVER AVENUE,ADDRESS,STATEN ISLAND,...,,,,,,,,,,NYPD0
1,66930659,2025-11-24 01:51:19,11226.0,530 PARKSIDE AVENUE,BEDFORD AVENUE,ROGERS AVENUE,BEDFORD AVENUE,ROGERS AVENUE,ADDRESS,BROOKLYN,...,,,,,,,,,,NYPD0
2,66928426,2025-11-24 01:49:36,10468.0,150 WEST 197 STREET,WEBB AVENUE,SEDGWICK AVENUE,WEBB AVENUE,SEDGWICK AVENUE,ADDRESS,BRONX,...,,,,,,,,,,NYPD0
3,66930657,2025-11-24 01:49:36,10468.0,150 WEST 197 STREET,WEBB AVENUE,SEDGWICK AVENUE,WEBB AVENUE,SEDGWICK AVENUE,ADDRESS,BRONX,...,,,,,,,,,,NYPD0
4,66934040,2025-11-24 01:49:36,10466.0,655 EAST 230 STREET,CARPENTER AVENUE,LOWERRE PLACE,CARPENTER AVENUE,LOWERRE PLACE,ADDRESS,BRONX,...,,,,,,,,,,NYPD0


### Taxi information

In [363]:
# Create table of taxi details
taxi_table = df[['unique_key','taxi_company_borough','taxi_pick_up_location']]
taxi_table = taxi_table.dropna(subset=['taxi_company_borough','taxi_pick_up_location'], how='all').reset_index(drop=True)
taxi_table['taxi_id'] = taxi_table['unique_key'].apply(lambda x: str(x) + '_taxi')

print(taxi_table.info())
taxi_table.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3183 entries, 0 to 3182
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   unique_key             3183 non-null   int64 
 1   taxi_company_borough   221 non-null    object
 2   taxi_pick_up_location  3175 non-null   object
 3   taxi_id                3183 non-null   object
dtypes: int64(1), object(3)
memory usage: 99.6+ KB
None


Unnamed: 0,unique_key,taxi_company_borough,taxi_pick_up_location,taxi_id
0,66935031,,"510 HUDSON STREET, MANHATTAN (NEW YORK), NY, 1...",66935031_taxi
1,66930601,,"JOHN F KENNEDY AIRPORT, QUEENS (JAMAICA) ,NY, ...",66930601_taxi
2,66929498,,"JOHN F KENNEDY AIRPORT, QUEENS (JAMAICA) ,NY, ...",66929498_taxi
3,66927281,,"LA GUARDIA AIRPORT, QUEENS (EAST ELMHURST) ,NY...",66927281_taxi
4,66929484,,"7 AVENUE AND WEST 38 STREET, MANHATTAN, NY, ...",66929484_taxi


In [364]:
# Drop taxi information from df_main
df_main = df_main.drop(columns=['taxi_company_borough','taxi_pick_up_location'])

### Road/Highway information

In [365]:
# Create table of road/highway information
road_table = df[['unique_key','bridge_highway_name','bridge_highway_direction','bridge_highway_segment','road_ramp']]
road_table = road_table.dropna(subset=['bridge_highway_name','bridge_highway_direction','bridge_highway_segment','road_ramp'], how='all').reset_index(drop=True)
road_table['road_id'] = road_table['unique_key'].apply(lambda x: str(x) + '_road')

print(road_table.info())
road_table.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   unique_key                1800 non-null   int64 
 1   bridge_highway_name       1799 non-null   object
 2   bridge_highway_direction  801 non-null    object
 3   bridge_highway_segment    1795 non-null   object
 4   road_ramp                 636 non-null    object
 5   road_id                   1800 non-null   object
dtypes: int64(1), object(5)
memory usage: 84.5+ KB
None


Unnamed: 0,unique_key,bridge_highway_name,bridge_highway_direction,bridge_highway_segment,road_ramp,road_id
0,66930741,F,,Other,,66930741_road
1,66930751,J,,Mezzanine,,66930751_road
2,66935182,7,,Mezzanine,,66935182_road
3,66928500,D,,Entrance,,66928500_road
4,66927415,Cross Island Pkwy,North/Bronx Bound,Bell Blvd (Exit 32) - Throgs Neck Br Bronx New...,Roadway,66927415_road


In [366]:
# Drop road/highway information from main
df_main = df_main.drop(columns=['bridge_highway_name','bridge_highway_direction','bridge_highway_segment','road_ramp'])
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 25 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   unique_key                      300000 non-null  int64         
 1   created_date                    300000 non-null  datetime64[ns]
 2   incident_zip                    297401 non-null  float64       
 3   incident_address                290795 non-null  object        
 4   cross_street_1                  193748 non-null  object        
 5   cross_street_2                  193849 non-null  object        
 6   intersection_street_1           182406 non-null  object        
 7   intersection_street_2           182567 non-null  object        
 8   address_type                    298727 non-null  object        
 9   city                            288420 non-null  object        
 10  landmark                        165089 non-null  object 

### Status information

In [367]:
# Create table with status and resolution data
status_table = df[['unique_key','status','resolution_description','resolution_action_updated_date','due_date','closed_date']]
status_table = status_table.dropna(subset=['status','resolution_description','resolution_action_updated_date','due_date'], how='all').reset_index(drop=True)
status_table['status_id'] = status_table['unique_key'].apply(lambda x: str(x) + '_status')

print(status_table.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 7 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   unique_key                      300000 non-null  int64 
 1   status                          300000 non-null  object
 2   resolution_description          283148 non-null  object
 3   resolution_action_updated_date  285506 non-null  object
 4   due_date                        1278 non-null    object
 5   closed_date                     256460 non-null  object
 6   status_id                       300000 non-null  object
dtypes: int64(1), object(6)
memory usage: 16.0+ MB
None


In [368]:
# Drop status columns from df_main
df_main = df_main.drop(columns=['status','resolution_description','resolution_action_updated_date','due_date','closed_date'])
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   unique_key                300000 non-null  int64         
 1   created_date              300000 non-null  datetime64[ns]
 2   incident_zip              297401 non-null  float64       
 3   incident_address          290795 non-null  object        
 4   cross_street_1            193748 non-null  object        
 5   cross_street_2            193849 non-null  object        
 6   intersection_street_1     182406 non-null  object        
 7   intersection_street_2     182567 non-null  object        
 8   address_type              298727 non-null  object        
 9   city                      288420 non-null  object        
 10  landmark                  165089 non-null  object        
 11  community_board           300000 non-null  object        
 12  bo

## Generate Supplemental Information

In [369]:
# Import list of streets in NYC, for more realistic synthetic data
# Source: https://www.theofficeproviders.com/new-york-city-street-address-guide-a-z/
df_streets = pd.read_csv('..\\Data\\1_Supplemental\\street_names.csv')
df_streets.head()

Unnamed: 0,Street Names
0,First Avenue
1,Second Avenue
2,Third Avenue
3,Fourth Avenue
4,Fifth Avenue


In [370]:
# Create a table of synthetic data using Faker & random generator:
# 311 Call operators

# Initialize Faker and numpy random generator
fake = Faker()
rng = np.random.default_rng()

n = 5000 # base number of operators
cc = 500 # base number of call centers

boroughs = ['Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island'] # borough names
# brs = ['MAN', 'BNX', 'BRK', 'QNS', 'STI'] # borough codes

# Note: population from 2020 census, source: https://en.wikipedia.org/wiki/Boroughs_of_New_York_City
b_pops = [1694251, 1472654, 2736074, 2405464, 495747] # population by borough
city_pop = sum(b_pops) # total population

b_prop =[ b / city_pop for b in b_pops] # proportion of population per borough

b_cc = [int(np.round(cc * b)) for b in b_prop]# set proportional number of call centers per borough
b_n = [int(np.round(n * b)) for b in b_prop] # set proportional number of operators per borough

n = sum(b_n) # recount total number of operators, just in case
cc = sum(b_cc) # recount total number of call centers, just in case

# --- GENERATE SYNTHETIC DATA ---

# Create call center IDs (pk)
cc_id = [f'CC_{j}' for j in range(1, cc + 1)]

# Create fake call center street addresses
cc_street = [f'{fake.unique.building_number()} {rng.choice(df_streets["Street Names"])}' for i in range(cc)]

# Randomly assign boroughs, using proportional distribution
cc_br = rng.choice(boroughs, size=cc, p=b_prop)

# Create operator IDs (pk)
op_id = [f'OP_{j}' for j in range(1, n + 1)]

# Assign operators to call centers (assuming equal-sized-ish centers)
op_cc = rng.choice(cc_id, size=n)

# Create fake operator names
op_name = [fake.unique.name() for i in range(n)]

# Create fake operator SSN
op_ssn = [fake.unique.ssn().replace('-','') for i in range(n)]

# Create fake operator salaries, pulling from GlassDoor for a rough estimate
op_salary = rng.uniform(low=32000, high=47000, size=n).round(2)

# --- CREATE DATAFRAMES ---

# Create dataframe of call center information
call_centers = {
    'cc_id': cc_id,
    'cc_street': cc_street,
    'cc_borough': cc_br
}
cc_df = pd.DataFrame(call_centers)
display(cc_df.head())

print('\n')

# Create dataframe of operator information
operators = {
    'op_id': op_id,
    'op_name': op_name,
    'op_ssn': op_ssn,
    'op_salary': op_salary,
    'op_cc': op_cc
}
op_df = pd.DataFrame(operators)
display(op_df.head())

Unnamed: 0,cc_id,cc_street,cc_borough
0,CC_1,167 Avenue C,Queens
1,CC_2,89155 34th Street,Manhattan
2,CC_3,11963 Astor Row,Queens
3,CC_4,739 Ludlow Street,Brooklyn
4,CC_5,774 Astor Row,Manhattan






Unnamed: 0,op_id,op_name,op_ssn,op_salary,op_cc
0,OP_1,Todd Vargas,109504585,32689.15,CC_71
1,OP_2,Melinda Ibarra,230862090,41227.76,CC_363
2,OP_3,Megan Figueroa,782650115,37682.83,CC_254
3,OP_4,Jennifer Shields,265220904,35955.96,CC_134
4,OP_5,Anthony Barnes,144422394,38392.37,CC_465


In [371]:
# Sanity check: observe distributions
print('Call Center Distribution:')
print(cc_df['cc_borough'].value_counts(), '\n')

# Create quick histogram of call center staffing
op_cc_list = op_df['op_cc'].value_counts().reset_index()
op_graph = alt.Chart(op_cc_list).mark_bar().encode(
    x=alt.X('count:Q', title='Staff Size',bin=True),
    y=alt.Y('count()', title='Call Center Count')
).properties(
    title='Call Center Staffing'
)
op_graph

Call Center Distribution:
cc_borough
Brooklyn         160
Queens           136
Bronx             94
Manhattan         85
Staten Island     25
Name: count, dtype: int64 



In [372]:
# Add Operator IDs to the df_main
# Also, let's rename df_main to incidents

df_main['op_id'] = rng.choice(op_id, size=len(df_main))
incidents = df_main

print(incidents.info())
display(incidents.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   unique_key                300000 non-null  int64         
 1   created_date              300000 non-null  datetime64[ns]
 2   incident_zip              297401 non-null  float64       
 3   incident_address          290795 non-null  object        
 4   cross_street_1            193748 non-null  object        
 5   cross_street_2            193849 non-null  object        
 6   intersection_street_1     182406 non-null  object        
 7   intersection_street_2     182567 non-null  object        
 8   address_type              298727 non-null  object        
 9   city                      288420 non-null  object        
 10  landmark                  165089 non-null  object        
 11  community_board           300000 non-null  object        
 12  bo

Unnamed: 0,unique_key,created_date,incident_zip,incident_address,cross_street_1,cross_street_2,intersection_street_1,intersection_street_2,address_type,city,...,community_board,borough,x_coordinate_state_plane,y_coordinate_state_plane,open_data_channel_type,park_facility_name,latitude,longitude,complaint_id,op_id
0,66932951,2025-11-24 02:06:15,10306.0,1742 RICHMOND ROAD,DONGAN HILLS AVENUE,SEAVER AVENUE,DONGAN HILLS AVENUE,SEAVER AVENUE,ADDRESS,STATEN ISLAND,...,02 STATEN ISLAND,STATEN ISLAND,955581.0,153408.0,PHONE,Unspecified,40.587699,-74.10322,NYPD0,OP_411
1,66930659,2025-11-24 01:51:19,11226.0,530 PARKSIDE AVENUE,BEDFORD AVENUE,ROGERS AVENUE,BEDFORD AVENUE,ROGERS AVENUE,ADDRESS,BROOKLYN,...,09 BROOKLYN,BROOKLYN,996619.0,178251.0,PHONE,Unspecified,40.655926,-73.955421,NYPD0,OP_974
2,66928426,2025-11-24 01:49:36,10468.0,150 WEST 197 STREET,WEBB AVENUE,SEDGWICK AVENUE,WEBB AVENUE,SEDGWICK AVENUE,ADDRESS,BRONX,...,08 BRONX,BRONX,1011478.0,257128.0,MOBILE,Unspecified,40.872389,-73.901549,NYPD0,OP_2885
3,66930657,2025-11-24 01:49:36,10468.0,150 WEST 197 STREET,WEBB AVENUE,SEDGWICK AVENUE,WEBB AVENUE,SEDGWICK AVENUE,ADDRESS,BRONX,...,08 BRONX,BRONX,1011478.0,257128.0,MOBILE,Unspecified,40.872389,-73.901549,NYPD0,OP_4644
4,66934040,2025-11-24 01:49:36,10466.0,655 EAST 230 STREET,CARPENTER AVENUE,LOWERRE PLACE,CARPENTER AVENUE,LOWERRE PLACE,ADDRESS,BRONX,...,12 BRONX,BRONX,1022911.0,264242.0,MOBILE,Unspecified,40.891872,-73.860168,NYPD0,OP_1721


## Data Cleaning for DB input

In [373]:
# Try recasting complaint_incident.incident_zip/x_coor/y_coor to integer

# define conversion function
def int_nan(x):
  if pd.isna(x):
    return None
  else:
    return int(x) 

# Apply to relevant cols
int_cols = ['incident_zip', 'x_coordinate_state_plane', 'y_coordinate_state_plane']
for col in int_cols:
    incidents[col] = incidents[col].astype('Int64')

print(incidents[int_cols].head())

   incident_zip  x_coordinate_state_plane  y_coordinate_state_plane
0         10306                    955581                    153408
1         11226                    996619                    178251
2         10468                   1011478                    257128
3         10468                   1011478                    257128
4         10466                   1022911                    264242


In [374]:
# Make sure all columns are in the right order
agency = agency_table[['agency','agency_name']]
call_center = cc_df[['cc_id', 'cc_street', 'cc_borough']]
call_operator = op_df[['op_id', 'op_name', 'op_ssn', 'op_salary', 'op_cc']]
complaint_type = complaint_table[['complaint_id', 'agency', 'complaint_type', 'descriptor', 'location_type']]
complaint_incident = incidents[['unique_key', 'complaint_id', 'op_id', 'created_date', 'incident_zip', 'incident_address', 'cross_street_1', 'cross_street_2', 'intersection_street_1', 'intersection_street_2', 'address_type', 'city', 'landmark', 'community_board', 'borough', 'x_coordinate_state_plane', 'y_coordinate_state_plane', 'open_data_channel_type', 'park_facility_name', 'latitude', 'longitude']]
complaint_status = status_table[['status_id', 'unique_key', 'status', 'resolution_description', 'resolution_action_updated_date', 'due_date', 'closed_date']]
taxi_details = taxi_table[['taxi_id', 'unique_key', 'taxi_company_borough','taxi_pick_up_location']]
road_details = road_table[['road_id', 'unique_key', 'bridge_highway_name', 'bridge_highway_direction', 'bridge_highway_segment','road_ramp']]

In [375]:
# Deal with null text (and int) values 

# Incidents table
incident_text_cols = ['incident_address', 'cross_street_1', 'cross_street_2', 'intersection_street_1', 'intersection_street_2', 'address_type']
for col in incident_text_cols:
    complaint_incident[col] = complaint_incident[col].fillna(value="")
complaint_incident['landmark'] = complaint_incident['landmark'].fillna(value='Unspecified')
complaint_incident['park_facility_name'] = complaint_incident['park_facility_name'].fillna(value='Unspecified')
complaint_incident['city'] = complaint_incident['city'].fillna(complaint_incident['borough'])
int_cols = ['incident_zip', 'x_coordinate_state_plane', 'y_coordinate_state_plane']
for col in int_cols:
    complaint_incident[col] = complaint_incident[col].fillna(value=0)

# Status table
complaint_status['resolution_description'] = complaint_status['resolution_description'].fillna(value="N/A")

# Type table
complaint_type['descriptor'] = complaint_type['descriptor'].fillna(value="N/A")
complaint_type['location_type'] = complaint_type['location_type'].fillna(value="N/A")

# Taxi table
taxi_details['taxi_company_borough'] = taxi_details['taxi_company_borough'].fillna(value="Unspecified")
taxi_details['taxi_pick_up_location'] = taxi_details['taxi_pick_up_location'].fillna(value="Unknown")

# Road table
road_details['bridge_highway_name'] = road_details['bridge_highway_name'].fillna(value='Long Island Expwy') # Looked this one up on a map
road_details['bridge_highway_direction'] = road_details['bridge_highway_direction'].fillna(value='Unspecified')
road_details['road_ramp'] = road_details['road_ramp'].fillna("")

## Copy table data into database

NOTE: before running this code, please execute the SQL files to generate the mesa8413 database and the relevant tables and indexes.

In [376]:
# Define copy function
# With thanks to https://stackoverflow.com/questions/78732362/how-to-upload-pandas-data-frames-fast-with-psycopg3    
def copy_table_from_df(
    conn,
    df_table: pd.DataFrame,
    table_name: str,
    verbosity=1000
) -> None:
    """Upload a single table to the database using the COPY command."""
    t0 = time.time()
    print(f'COPYING {table_name}')

    # Cast df to list of lists
    list_table = df_table.values.tolist()
    
    with conn.cursor() as cur:
        query = f'COPY {table_name} FROM STDIN'
        copy_sql = sql.SQL(query)
        
        with cur.copy(copy_sql) as copy:
            r = 0
            for record in list_table:
                r += 1
                copy.write_row(record)

                if r % verbosity == 0:
                    print(f'...{r} ROWS COPIED: {time.time() - t0} SEC')
    conn.commit()
    print(f'TABLE {table_name} COPIED: {time.time() - t0} SEC\n')

# Define delete function
def drop_table(conn, table_name) -> None:
    """USE WITH CAUTION: Delete data from a given table."""
    print(f'DROPPING ALL DATA FROM TABLE {table_name}\n')
    query = f'TRUNCATE {table_name} RESTART IDENTITY CASCADE'
    query_sql = sql.SQL(query)
    with conn.cursor() as cur:
        cur.execute(query_sql)
    conn.commit()   

# Define list of table soruces, table names, and desired verbosity
df_tables = [
    agency, 
    call_center, 
    call_operator, 
    complaint_type, 
    complaint_incident, 
    complaint_status, 
    taxi_details, 
    road_details
]
table_names = [
    'nyc311.agency',
    'nyc311.call_center',
    'nyc311.call_operator',
    'nyc311.complaint_type',
    'nyc311.complaint_incident',
    'nyc311.complaint_status',
    'nyc311.taxi_details',
    'nyc311.road_details',
]
verbs = [
    10,
    100,
    1000,
    500,
    10000,
    10000,
    500,
    500
]

# Establish database connection
time0 = time.time()
with psycopg.connect(dbname="mesa8413", user="postgres", password="MESA8413") as conn:
    print(f'DATABASE CONNECTED: {time.time() - time0} SEC\n')
    
    # for each table, write to table
    for df_table, table_name, verb in zip(df_tables, table_names, verbs):

        # UNCOMMENT IF RERUNNING PROGRAM
        drop_table(conn, table_name)

        # Run our function
        copy_table_from_df(conn, df_table, table_name, verb)

# Connection should exit when complete
print(f'CONNECTION CLOSED. TOTAL TRANSACTION TIME: {time.time() - time0} SEC')

DATABASE CONNECTED: 0.1317141056060791 SEC

DROPPING ALL DATA FROM TABLE nyc311.agency

COPYING nyc311.agency
...10 ROWS COPIED: 0.0010128021240234375 SEC
TABLE nyc311.agency COPIED: 0.005892753601074219 SEC

DROPPING ALL DATA FROM TABLE nyc311.call_center

COPYING nyc311.call_center
...100 ROWS COPIED: 0.0010089874267578125 SEC
...200 ROWS COPIED: 0.002023458480834961 SEC
...300 ROWS COPIED: 0.002023458480834961 SEC
...400 ROWS COPIED: 0.002023458480834961 SEC
...500 ROWS COPIED: 0.002023458480834961 SEC
TABLE nyc311.call_center COPIED: 0.013154029846191406 SEC

DROPPING ALL DATA FROM TABLE nyc311.call_operator

COPYING nyc311.call_operator
...1000 ROWS COPIED: 0.012711286544799805 SEC
...2000 ROWS COPIED: 0.016580820083618164 SEC
...3000 ROWS COPIED: 0.02111530303955078 SEC
...4000 ROWS COPIED: 0.02660512924194336 SEC
...5000 ROWS COPIED: 0.0301058292388916 SEC
TABLE nyc311.call_operator COPIED: 0.12043547630310059 SEC

DROPPING ALL DATA FROM TABLE nyc311.complaint_type

COPYING nyc3

## Export Table Data as CSV

In [377]:
def csv_tables():
  # incidents
  complaint_incident.to_csv('..\\Data\\2_Preprocessed\\complaint_incident.csv', index=False)

  # agencies
  agency.to_csv('..\\Data\\2_Preprocessed\\agency.csv', index=False)

  # complaints
  complaint_type.to_csv('..\\Data\\2_Preprocessed\\complaint_type.csv', index=False)

  # taxi
  taxi_details.to_csv('..\\Data\\2_Preprocessed\\taxi_details.csv', index=False)

  # roads
  road_details.to_csv('..\\Data\\2_Preprocessed\\road_details.csv', index=False)

  # status
  complaint_status.to_csv('..\\Data\\2_Preprocessed\\complaint_status.csv', index=False)

  # call centers
  call_center.to_csv('..\\Data\\2_Preprocessed\\call_center.csv', index=False)

  # operators
  call_operator.to_csv('..\\Data\\2_Preprocessed\\call_operator.csv', index=False)

# Comment out below if you do not want to export the files.
csv_tables()

# PART 2(B): Data-Quality Notebook

## Missingness

Note: We'll perform an analysis of missing data using the original dataset (before replacing Nulls with empty strings or other values).
The only exception is the missing value for road_table.bridge_highway_name, which seems to be a (fixable) entry error.

In [378]:
# Fix road_table missing highway name
road_table['bridge_highway_name'] = road_table['bridge_highway_name'].fillna(value='Long Island Expwy')

# Define function to report table size & nulls
def null_counter(df, table_name):
  print(f'--- {table_name} ---\n')
  print('TOTAL TABLE LENGTH:', len(df),'\n')
  print('TOTAL NULL VALUES:', df.isnull().sum().sum(), '\n')
  print(f'TOTAL PERCENT OF DATA MISSING: {((df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100):.2f}%', '\n')
  print('MISSING VALUES PER COLUMN:')
  print(df.isnull().sum(), "\n\n")

# Run function on each table
tables = [incidents, status_table, complaint_table, agency_table, taxi_table, road_table, cc_df, op_df]
names = ['INCIDENTS', 'STATUS', 'COMPLAINT TYPE', 'AGENCY', 'TAXI', 'ROAD', 'CALL CENTER', 'OPERATIOR']
for table, name in zip(tables, names):
  null_counter(table, name)

--- INCIDENTS ---

TOTAL TABLE LENGTH: 300000 

TOTAL NULL VALUES: 619869 

TOTAL PERCENT OF DATA MISSING: 9.84% 

MISSING VALUES PER COLUMN:
unique_key                       0
created_date                     0
incident_zip                  2599
incident_address              9205
cross_street_1              106252
cross_street_2              106151
intersection_street_1       117594
intersection_street_2       117433
address_type                  1273
city                         11580
landmark                    134911
community_board                  0
borough                          0
x_coordinate_state_plane      3133
y_coordinate_state_plane      3127
open_data_channel_type           0
park_facility_name             337
latitude                      3137
longitude                     3137
complaint_id                     0
op_id                            0
dtype: int64 


--- STATUS ---

TOTAL TABLE LENGTH: 300000 

TOTAL NULL VALUES: 373608 

TOTAL PERCENT OF DATA MISSING: 17.

In [379]:
# Determine how many rows are missing multiple values

# Define function to count missingness by row
def missing_row_count(df, table_name):
    
    # Count missingness by row
    row_miss = df.isnull().sum(axis=1)

    # Determine value counts
    val_c = row_miss.value_counts().sort_index()
    df_c = val_c.to_frame().reset_index()

    # Set chart size appropriately
    w = max(df_c['index'].max() * 25, 200)
    
    # Display results
    chart = alt.Chart(df_c).mark_bar().encode(
        x=alt.X('index:N', title='Missing Value Count'),
        y=alt.Y('count:Q', title='Frequency')
    ).properties(
        title=f'{table_name}: Missing Values by Row',
        width=w
    )
    return chart

# Run function on each table
tables = [incidents, status_table, complaint_table, agency_table, taxi_table, road_table, cc_df, op_df]
names = ['INCIDENTS', 'STATUS', 'COMPLAINT TYPE', 'AGENCY', 'TAXI', 'ROAD', 'CALL CENTER', 'OPERATIOR']
for table, name in zip(tables, names):
    chart = missing_row_count(table, name)
    chart.display()

## Distribution & Outlier Analysis

In [380]:
# Look at distribution of variables

# Group incidents by borough
incidents_by_borough = incidents.groupby('borough')['unique_key'].count().reset_index()

ibb = alt.Chart(incidents_by_borough).mark_bar().encode(
    x=alt.X('borough:N', title=None),
    y=alt.Y('unique_key:Q', title='Count of Complaints')
).properties(
    title='Distribution of Complaints by Borough',
    width=300
)
ibb.display()

# Get number of incidents over date and over hour (by created_date)
incidents['date'] = incidents['created_date'].dt.floor('D')
incidents['hour'] = incidents['created_date'].dt.floor('H')
i_by_day = incidents.groupby('date').size().reset_index().rename(columns={0:'count'})
i_by_hour = incidents.groupby('hour').size().reset_index().rename(columns={0:'count'})

ibd = alt.Chart(i_by_day).mark_line().encode(
    x=alt.X('date:T'),
    y=alt.Y('count:Q')
).properties(
    title='Complaints per Day',
    width=400
)
ibd.display()

ibh = alt.Chart(i_by_hour).mark_line().encode(
    x=alt.X('hour:T'),
    y=alt.Y('count:Q')
).properties(
    title='Complaints per Hour',
    width=600
)
ibh.display()

# Identify the outlier observed in the graph of incidents_per_hour
outlier_hour = pd.to_datetime(i_by_hour.loc[i_by_hour['count'].idxmax(),'hour'])
outlier_count = i_by_hour['count'].max()
print(f'OUTLIER: {outlier_count} reports on {outlier_hour.strftime("%B %d")} at {outlier_hour.strftime("%H:%M")}')

OUTLIER: 1332 reports on October 30 at 15:00


In [381]:
# Drilling down on our outlier hour specifically

# Try grouping by hour AND borough
i_by_hour = incidents.groupby(['hour','borough']).size().reset_index().rename(columns={0:'count'})
oct30_by_borough = i_by_hour[i_by_hour['hour'] == outlier_hour]

# print('Complaints by Borough')
# print(oct30_by_borough, '\n')

# Recall borough population distribution from before:
b_pop_df = pd.DataFrame(data={
    'borough': ['MANHATTAN', 'BRONX', 'BROOKLYN', 'QUEENS', 'STATEN ISLAND'],
    'b_pops': [1694251, 1472654, 2736074, 2405464, 495747]
})

# Join our two tables
oct30_by_borough_pop = oct30_by_borough.set_index('borough').join(other=b_pop_df.set_index('borough'), on='borough')

# Calculate proportionality of calls and risk ratio
oct30_by_borough_pop['b_proport'] = oct30_by_borough_pop['b_pops'] / oct30_by_borough_pop['b_pops'].sum()
oct30_by_borough_pop['call_proport'] = oct30_by_borough_pop['count'] / oct30_by_borough_pop['count'].sum()
oct30_by_borough_pop['risk_ratio'] = oct30_by_borough_pop['call_proport'] / oct30_by_borough_pop['b_proport']

print('Complaints by Borough:')
print(oct30_by_borough_pop.drop(columns='hour'), '\n')

Complaints by Borough:
               count   b_pops  b_proport  call_proport  risk_ratio
borough                                                           
BRONX            142  1472654   0.167267      0.106607    0.637342
BROOKLYN         702  2736074   0.310770      0.527027    1.695877
MANHATTAN        207  1694251   0.192437      0.155405    0.807566
QUEENS           224  2405464   0.273218      0.168168    0.615509
STATEN ISLAND     57   495747   0.056308      0.042793    0.759976 



Clearly, the disproportionate number of calls were coming from Brooklyn. We will dig deeper in Part 4.

## Transformations

First, we will truncate all of our timestamp columns to be hourly columns, as our later analysis will aggregate over time at the hourly level
(making minute and second values unnecessary). We have already performed some hour-level analysis in this notebook, and
will go further in Part 4.

In [382]:
# Define function to truncate a timestamp to hour precision
def trunc_hour(x):
    if pd.isna(x):
        return None
    else:
        return x.floor('H')
    
# Note: we have already truncated incidents["created_date"] above
date_cols = ['closed_date', 'due_date', 'resolution_action_updated_date']

# Apply function
for col in date_cols:
  status_table[col] = status_table[col].apply(trunc_hour)
  print(status_table[status_table[col].notna()][col].head())

13   2025-11-24 02:00:00
14   2025-11-24 01:00:00
21   2025-11-24 01:00:00
49   2025-11-24 01:00:00
53   2025-11-24 01:00:00
Name: closed_date, dtype: datetime64[ns]
3908   2025-12-23 15:00:00
4429   2025-12-23 14:00:00
6002   2025-12-23 11:00:00
7067   2025-12-23 08:00:00
7258   2025-12-23 08:00:00
Name: due_date, dtype: datetime64[ns]
13   2025-11-24 02:00:00
14   2025-11-24 01:00:00
21   2025-11-24 01:00:00
34   2025-11-24 01:00:00
39   2025-11-24 01:00:00
Name: resolution_action_updated_date, dtype: datetime64[ns]


Next, we will split the call operator name field into first name and last name, as this makes sorting by last name possible.

In [383]:
# Split op_name into op_first and op_last using regular expressions
op_df[['op_first', 'op_last']] = op_df['op_name'].str.extract(r'^(.*)\s(\w+)$')

# Check work 
print(op_df.head())

  op_id           op_name     op_ssn  op_salary   op_cc  op_first   op_last
0  OP_1       Todd Vargas  109504585   32689.15   CC_71      Todd    Vargas
1  OP_2    Melinda Ibarra  230862090   41227.76  CC_363   Melinda    Ibarra
2  OP_3    Megan Figueroa  782650115   37682.83  CC_254     Megan  Figueroa
3  OP_4  Jennifer Shields  265220904   35955.96  CC_134  Jennifer   Shields
4  OP_5    Anthony Barnes  144422394   38392.37  CC_465   Anthony    Barnes
