In [64]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://heartspringhealth.com/wp-content/uploads/2014/05/cars-c.jpg", width=1000, height=250)

The Motor Vehicle Collisions crash table contains details on the crash event. Each row represents a crash event. The Motor Vehicle Collisions data tables contain information from all police reported motor vehicle collisions in NYC. The police report (MV104-AN) is required to be filled out for collisions where someone is injured or killed, or where there is at least $1000 worth of damage

Below are the steps to perform our ETL processes:
1. Extract data daily from NYC Open Data (https://opendata.cityofnewyork.us/) via Socrata API
2. For our initial load, we will only be using data from 2021 - now to populate our table
3. For our daily load, we will perform upsert functions against our data to only inserting newer data to the table
4. Data will be stored in the Amazon S3 bucket
5. We then proceed to perform necessary transformations then load to our Data Warehouse in AWS SQL Server 

In [65]:
#Install required libraries
!pip install sodapy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [66]:
#Import required Libraries 
import pandas as pd 
from sodapy import Socrata 

#**Data Extraction**#

In [75]:
client = Socrata("data.cityofnewyork.us", None)

# Get 10000 results for initial load, then 1000 each day for delta load, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("h9gi-nx95", order='crash_date DESC', limit=1000)

# Convert to pandas DataFrame
df = pd.DataFrame.from_records(results)



In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   crash_date                     10000 non-null  object
 1   crash_time                     10000 non-null  object
 2   latitude                       9022 non-null   object
 3   longitude                      9022 non-null   object
 4   location                       9022 non-null   object
 5   on_street_name                 7221 non-null   object
 6   number_of_persons_injured      10000 non-null  object
 7   number_of_persons_killed       10000 non-null  object
 8   number_of_pedestrians_injured  10000 non-null  object
 9   number_of_pedestrians_killed   10000 non-null  object
 10  number_of_cyclist_injured      10000 non-null  object
 11  number_of_cyclist_killed       10000 non-null  object
 12  number_of_motorist_injured     10000 non-null  object
 13  nu

In [77]:
#See all the columns in the dataframe
pd.set_option('display.max_columns', None)

In [78]:
#Check all the column names 
df.columns

Index(['crash_date', 'crash_time', 'latitude', 'longitude', 'location',
       'on_street_name', 'number_of_persons_injured',
       'number_of_persons_killed', 'number_of_pedestrians_injured',
       'number_of_pedestrians_killed', 'number_of_cyclist_injured',
       'number_of_cyclist_killed', 'number_of_motorist_injured',
       'number_of_motorist_killed', 'contributing_factor_vehicle_1',
       'contributing_factor_vehicle_2', 'collision_id', 'vehicle_type_code1',
       'off_street_name', 'vehicle_type_code2', 'borough', 'zip_code',
       'cross_street_name', 'contributing_factor_vehicle_3',
       'vehicle_type_code_3', 'contributing_factor_vehicle_4',
       'vehicle_type_code_4', 'contributing_factor_vehicle_5',
       'vehicle_type_code_5'],
      dtype='object')

In [79]:
sorted_df = df.sort_values(by=['crash_date', 'crash_time'], ascending=True)

In [80]:
sorted_df = sorted_df[['crash_date', 'crash_time', 'number_of_persons_injured', 'number_of_persons_killed', 'number_of_pedestrians_injured',              
                      'number_of_pedestrians_killed', 'number_of_cyclist_injured', 'number_of_cyclist_killed', 'number_of_motorist_injured',                             
                      'number_of_motorist_killed', 'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2', 'vehicle_type_code1', 
                      'vehicle_type_code2', 'latitude', 'longitude', 'on_street_name', 'borough', 'zip_code',]].reset_index(drop=True)

In [81]:
sorted_df.head()

Unnamed: 0,crash_date,crash_time,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,vehicle_type_code1,vehicle_type_code2,latitude,longitude,on_street_name,borough,zip_code
0,2022-06-10T00:00:00.000,0:00,0,0,0,0,0,0,0,0,Failure to Yield Right-of-Way,Unspecified,PK,Sedan,40.829754,-73.87374,WESTCHESTER AVENUE,,
1,2022-06-10T00:00:00.000,0:00,0,0,0,0,0,0,0,0,Backing Unsafely,Unspecified,Station Wagon/Sport Utility Vehicle,,40.750572,-73.89301,,QUEENS,11372.0
2,2022-06-10T00:00:00.000,0:00,1,0,0,0,0,0,1,0,Driver Inattention/Distraction,Unspecified,Sedan,Station Wagon/Sport Utility Vehicle,40.716343,-73.761406,199 STREET,QUEENS,11423.0
3,2022-06-10T00:00:00.000,0:00,1,0,0,0,0,0,1,0,Following Too Closely,Unspecified,Station Wagon/Sport Utility Vehicle,Sedan,40.63082,-73.88636,ROCKAWAY PARKWAY,,
4,2022-06-10T00:00:00.000,0:00,1,0,0,0,0,0,1,0,Unspecified,Unspecified,Sedan,Sedan,40.60915,-74.14987,STATEN ISLAND EXPRESSWAY,,


In [103]:
#Create individual tables based on the denormalized data
crashes_df = sorted_df[['crash_date', 'crash_time', 'number_of_persons_injured', 'number_of_persons_killed', 'number_of_pedestrians_injured',              
                      'number_of_pedestrians_killed', 'number_of_cyclist_injured', 'number_of_cyclist_killed', 'number_of_motorist_injured',                             
                      'number_of_motorist_killed']].reset_index(drop=True)
contributing_factor_vehicle_df = sorted_df[['contributing_factor_vehicle_1', 'contributing_factor_vehicle_2']].drop_duplicates().reset_index(drop=True)
vehicle_type_code_df = sorted_df[['vehicle_type_code1', 'vehicle_type_code2']].drop_duplicates().reset_index(drop=True)
address_df = sorted_df[['latitude', 'longitude', 'on_street_name','borough', 'zip_code']].drop_duplicates().reset_index(drop=True)

In [104]:
crashes_df

Unnamed: 0,crash_date,crash_time,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed
0,2022-06-10T00:00:00.000,0:00,0,0,0,0,0,0,0,0
1,2022-06-10T00:00:00.000,0:00,0,0,0,0,0,0,0,0
2,2022-06-10T00:00:00.000,0:00,1,0,0,0,0,0,1,0
3,2022-06-10T00:00:00.000,0:00,1,0,0,0,0,0,1,0
4,2022-06-10T00:00:00.000,0:00,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
9995,2022-07-15T00:00:00.000,9:04,1,0,0,0,0,0,1,0
9996,2022-07-15T00:00:00.000,9:10,1,0,0,0,0,0,1,0
9997,2022-07-15T00:00:00.000,9:20,1,0,0,0,0,0,1,0
9998,2022-07-15T00:00:00.000,9:30,0,0,0,0,0,0,0,0


In [97]:
contributing_factor_vehicle_df

Unnamed: 0,contributing_factor_vehicle_1,contributing_factor_vehicle_2
0,Failure to Yield Right-of-Way,Unspecified
1,Backing Unsafely,Unspecified
2,Driver Inattention/Distraction,Unspecified
3,Following Too Closely,Unspecified
4,Unspecified,Unspecified
...,...,...
9515,Reaction to Uninvolved Vehicle,Aggressive Driving/Road Rage
9568,Failure to Yield Right-of-Way,Turning Improperly
9585,Unsafe Speed,Backing Unsafely
9657,Eating or Drinking,Unspecified


In [83]:
sorted_df.to_csv('motor_vehicle_collisions.csv')