In [1]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://heartspringhealth.com/wp-content/uploads/2014/05/cars-c.jpg", width=1000, height=250)

The Motor Vehicle Collisions crash table contains details on the crash event. Each row represents a crash event. The Motor Vehicle Collisions data tables contain information from all police reported motor vehicle collisions in NYC. The police report (MV104-AN) is required to be filled out for collisions where someone is injured or killed, or where there is at least $1000 worth of damage

Below are the steps to perform our ETL processes:
1. Extract data daily from NYC Open Data (https://opendata.cityofnewyork.us/) via Socrata API
2. For our initial load, we will only be using data from 2021 - now to populate our table
3. For our daily load, we will perform upsert functions against our data to only inserting newer data to the table
4. Data will be stored in the Amazon S3 bucket
5. We then proceed to perform necessary transformations then load to our Data Warehouse in AWS SQL Server 

In [2]:
#Install required libraries
!pip install sodapy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sodapy
  Downloading sodapy-2.1.1-py2.py3-none-any.whl (14 kB)
Installing collected packages: sodapy
Successfully installed sodapy-2.1.1


In [52]:
#Import required Libraries 
import pandas as pd 
from sodapy import Socrata 
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter

#**Data Extraction**#

In [53]:
client = Socrata("data.cityofnewyork.us", None)

# Get 1000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("h9gi-nx95", order='crash_date DESC', limit=2000)

# Convert to pandas DataFrame
df = pd.DataFrame.from_records(results)



In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   crash_date                     2000 non-null   object
 1   crash_time                     2000 non-null   object
 2   latitude                       1832 non-null   object
 3   longitude                      1832 non-null   object
 4   location                       1832 non-null   object
 5   on_street_name                 1452 non-null   object
 6   number_of_persons_injured      2000 non-null   object
 7   number_of_persons_killed       2000 non-null   object
 8   number_of_pedestrians_injured  2000 non-null   object
 9   number_of_pedestrians_killed   2000 non-null   object
 10  number_of_cyclist_injured      2000 non-null   object
 11  number_of_cyclist_killed       2000 non-null   object
 12  number_of_motorist_injured     2000 non-null   object
 13  num

In [55]:
#See all the columns in the dataframe
pd.set_option('display.max_columns', None)

In [56]:
#Check all the column names 
df.columns

Index(['crash_date', 'crash_time', 'latitude', 'longitude', 'location',
       'on_street_name', 'number_of_persons_injured',
       'number_of_persons_killed', 'number_of_pedestrians_injured',
       'number_of_pedestrians_killed', 'number_of_cyclist_injured',
       'number_of_cyclist_killed', 'number_of_motorist_injured',
       'number_of_motorist_killed', 'contributing_factor_vehicle_1',
       'contributing_factor_vehicle_2', 'collision_id', 'vehicle_type_code1',
       'off_street_name', 'vehicle_type_code2', 'borough', 'zip_code',
       'cross_street_name', 'contributing_factor_vehicle_3',
       'vehicle_type_code_3', 'contributing_factor_vehicle_4',
       'vehicle_type_code_4', 'contributing_factor_vehicle_5',
       'vehicle_type_code_5'],
      dtype='object')

In [57]:
sorted_df = df.sort_values(by=['crash_date', 'crash_time'], ascending=True)

In [62]:
sorted_df = sorted_df[['crash_date', 'crash_time', 'number_of_persons_injured', 'number_of_persons_killed', 'number_of_pedestrians_injured',              
                      'number_of_pedestrians_killed', 'number_of_cyclist_injured', 'number_of_cyclist_killed', 'number_of_motorist_injured',                             
                      'number_of_motorist_killed', 'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2', 'vehicle_type_code1', 
                      'vehicle_type_code2', 'latitude', 'longitude', 'on_street_name', 'borough', 'zip_code',]].reset_index(drop=True)

In [63]:
sorted_df.head()

Unnamed: 0,crash_date,crash_time,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,vehicle_type_code1,vehicle_type_code2,latitude,longitude,on_street_name,borough,zip_code
0,2022-07-06T00:00:00.000,0:35,0,0,0,0,0,0,0,0,Unsafe Lane Changing,Unspecified,Tractor Truck Diesel,Station Wagon/Sport Utility Vehicle,40.730442,-73.91367,LONG ISLAND EXPRESSWAY,,
1,2022-07-06T00:00:00.000,10:00,0,0,0,0,0,0,0,0,Unspecified,,Sedan,,40.751854,-74.002396,,MANHATTAN,10001.0
2,2022-07-06T00:00:00.000,10:02,0,0,0,0,0,0,0,0,Unspecified,,Garbage or Refuse,,40.869843,-73.91588,BROADWAY,,
3,2022-07-06T00:00:00.000,10:15,1,0,0,0,1,0,0,0,Illnes,,Bike,,,,,,
4,2022-07-06T00:00:00.000,10:25,1,0,0,0,0,0,1,0,Unspecified,,Station Wagon/Sport Utility Vehicle,,40.668648,-73.92835,EASTERN PARKWAY,,
