In [1]:
#Import necessary functions
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from datetime import datetime as dt

%matplotlib inline

In [2]:
#Import the dataset and merge Crash Date and Crash Time columns
df = pd.read_csv('data/Motor_Vehicle_Collisions_-_Crashes.csv', low_memory=False, parse_dates=[['CRASH DATE', 'CRASH TIME']])

#Initial look into dataset
df.head()

Unnamed: 0,CRASH DATE_CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2021-09-11 02:39:00,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,2.0,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,2022-03-26 11:45:00,,,,,,QUEENSBORO BRIDGE UPPER,,,1.0,...,,,,,4513547,Sedan,,,,
2,2022-06-29 06:55:00,,,,,,THROGS NECK BRIDGE,,,0.0,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,2021-09-11 09:35:00,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,0.0,...,,,,,4456314,Sedan,,,,
4,2021-12-14 08:13:00,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,0.0,...,,,,,4486609,,,,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979921 entries, 0 to 1979920
Data columns (total 28 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   CRASH DATE_CRASH TIME          datetime64[ns]
 1   BOROUGH                        object        
 2   ZIP CODE                       object        
 3   LATITUDE                       float64       
 4   LONGITUDE                      float64       
 5   LOCATION                       object        
 6   ON STREET NAME                 object        
 7   CROSS STREET NAME              object        
 8   OFF STREET NAME                object        
 9   NUMBER OF PERSONS INJURED      float64       
 10  NUMBER OF PERSONS KILLED       float64       
 11  NUMBER OF PEDESTRIANS INJURED  int64         
 12  NUMBER OF PEDESTRIANS KILLED   int64         
 13  NUMBER OF CYCLIST INJURED      int64         
 14  NUMBER OF CYCLIST KILLED       int64         
 15  NUMBER OF MOTOR

In [4]:
#Check what the dates range from
df['CRASH DATE_CRASH TIME'].describe()

  df['CRASH DATE_CRASH TIME'].describe()


count                 1979921
unique                1067748
top       2015-01-18 08:00:00
freq                       51
first     2012-07-01 00:05:00
last      2023-03-27 23:46:00
Name: CRASH DATE_CRASH TIME, dtype: object

## Data Preparation

### Changing DateTime Index

In [5]:
#Rename crash date column
df.rename(columns = {'CRASH DATE_CRASH TIME':'CRASH DATE TIME'}, inplace = True)

#Set to datetime index
df.set_index('CRASH DATE TIME', inplace=True)

The collisions beyond 2023 will be considered future data. It will not be considered within the modeling process.

In [6]:
#Create dataframe with only 2023 data - to be considered future
df_2023 = df['2023':]

The collisions within the last full 5 years will be considered in the model only.

In [7]:
#Filter to crashes between 2018-2023
df = df['2018-01-01':'2022-12-31']

In [8]:
#Sanity check of the modeling data time frame
print (df.index.min())
print (df.index.max())

2018-01-01 00:00:00
2022-12-31 23:50:00


In [9]:
#Sanity check of the future data time frame
print (df_2023.index.min())
print (df_2023.index.max())

2023-01-01 00:00:00
2023-03-27 23:46:00


In [10]:
#Sort the datetime indexes in ascending order
df = df.sort_index()
df_2023 = df_2023.sort_index()

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 770281 entries, 2018-01-01 00:00:00 to 2022-12-31 23:50:00
Data columns (total 26 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   BOROUGH                        500874 non-null  object 
 1   ZIP CODE                       500755 non-null  object 
 2   LATITUDE                       711151 non-null  float64
 3   LONGITUDE                      711151 non-null  float64
 4   LOCATION                       711151 non-null  object 
 5   ON STREET NAME                 576100 non-null  object 
 6   CROSS STREET NAME              372257 non-null  object 
 7   OFF STREET NAME                193437 non-null  object 
 8   NUMBER OF PERSONS INJURED      770275 non-null  float64
 9   NUMBER OF PERSONS KILLED       770267 non-null  float64
 10  NUMBER OF PEDESTRIANS INJURED  770281 non-null  int64  
 11  NUMBER OF PEDESTRIANS KILLED   770281 non-null  int64  
 

In [15]:
#Check for any duplicated rows
df.duplicated().value_counts()

False    735372
True      34909
dtype: int64

In [16]:
df_2023.duplicated().value_counts()

False    20714
True        82
dtype: int64

### Borough Selection

The two of the five NYC boroughs with the largest populations will be analyzed: Queens and Brooklyn.

In [18]:
#Filter for collisions in Brooklyn and Queens borough
df = df.loc[(df['BOROUGH'] == 'BROOKLYN') | (df['BOROUGH'] == 'QUEENS')]

In [17]:
#Filter for collisions in Brooklyn and Queens borough
df_2023 = df_2023.loc[(df_2023['BOROUGH'] == 'BROOKLYN') | (df_2023['BOROUGH'] == 'QUEENS')]

In [24]:
#Sanity check to ensure only two boroughs are included
print(df['BOROUGH'].unique())
print(df_2023['BOROUGH'].unique())

['BROOKLYN' 'QUEENS']
['QUEENS' 'BROOKLYN']


### Dropping Columns/Values

In [21]:
#Check for NaN values in street name columns
print(df['ON STREET NAME'].isnull().sum())
print(df['CROSS STREET NAME'].isnull().sum())
print(df['OFF STREET NAME'].isnull().sum())

111932
112132
193629


There are 6 columns dedicated to where the collision took place. The 'LOCATION' column is to be dropped as it contains repetitive information from the 'LATITUDE' and 'LONGITUDE' columns. The columns regarding street name will be dropped as well due to the significant amount of missing values. 

There are a total of 8 columns dedicated to the number of people injured and killed as a result of the collision. The columns with the total number of injured and killed, whereas the columns specifying the type of person will be dropped. 

In [22]:
print(df['LOCATION'].isnull().sum())
print(df_2023['LOCATION'].isnull().sum())

6935
98


In [25]:
#Drop non-relevant and repetitive location columns
df.drop(['LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME'], axis=1, inplace=True)
df_2023.drop(['LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME'], axis=1, inplace=True)

#Drop repetitive injury/death columns
df.drop(['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED', 
         'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED'], 
        axis=1, inplace=True)

df_2023.drop(['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED', 
         'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED'], 
        axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',


The focus will be on the Vehicle 1 as it is to be considered the primary vehicle of the collision. The columns for the contributing factor and vehicle type code for vehicles 2-5 have a significant amount of missing values and therefore will be dropped from the dataset.

In [26]:
#Drop additional contributing factor vehicle columns
df.drop(['CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 
         'CONTRIBUTING FACTOR VEHICLE 5'], axis=1, inplace=True)

df_2023.drop(['CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 
         'CONTRIBUTING FACTOR VEHICLE 5'], axis=1, inplace=True)

#Drop additional vehicle type columns
df.drop(['VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'], 
        axis=1, inplace=True)

df_2023.drop(['VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'], 
        axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],


In [27]:
#Drop NaN values from columns of contributing factors
df = df.dropna(subset=['CONTRIBUTING FACTOR VEHICLE 1'])
df_2023 = df_2023.dropna(subset=['CONTRIBUTING FACTOR VEHICLE 1'])

In [28]:
#Filter out the collisions with Unspecified contributing factor
df = df[df['CONTRIBUTING FACTOR VEHICLE 1'] != 'Unspecified']
df_2023 = df_2023[df_2023['CONTRIBUTING FACTOR VEHICLE 1'] != 'Unspecified']

df

Unnamed: 0_level_0,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01 00:00:00,BROOKLYN,11239,40.653060,-73.882020,0.0,0.0,Driver Inattention/Distraction,Sedan
2018-01-01 00:00:00,BROOKLYN,11234,40.628770,-73.918250,1.0,0.0,Failure to Yield Right-of-Way,Sedan
2018-01-01 00:00:00,QUEENS,11419,40.682970,-73.828240,0.0,0.0,Passing Too Closely,Sedan
2018-01-01 00:00:00,BROOKLYN,11230,40.624180,-73.970480,0.0,0.0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle
2018-01-01 00:00:00,BROOKLYN,11230,40.623220,-73.961020,0.0,0.0,Driver Inattention/Distraction,Taxi
...,...,...,...,...,...,...,...,...
2022-12-31 22:02:00,QUEENS,11102,40.766150,-73.919785,2.0,0.0,Following Too Closely,Sedan
2022-12-31 22:10:00,BROOKLYN,11221,40.697582,-73.929830,0.0,0.0,Passing or Lane Usage Improper,Station Wagon/Sport Utility Vehicle
2022-12-31 23:20:00,QUEENS,11369,40.760098,-73.859300,2.0,0.0,Alcohol Involvement,Station Wagon/Sport Utility Vehicle
2022-12-31 23:40:00,BROOKLYN,11249,40.699482,-73.961040,1.0,0.0,Driver Inattention/Distraction,Sedan


#### Number of Persons Injured & Number of Persons Killed

In [29]:
#Check for NaN values in injury/death columns
print(df['NUMBER OF PERSONS KILLED'].isnull().sum())
print(df['NUMBER OF PERSONS INJURED'].isnull().sum())
print()
print(df_2023['NUMBER OF PERSONS KILLED'].isnull().sum())
print(df_2023['NUMBER OF PERSONS INJURED'].isnull().sum())

1
0

0
0


Since there were only 2 collisions with missing values from the 'NUMBER OF PERSONS KILLED' column and only 3 from the 'NUMBER OF PERSONS INJURED', these rows will be dropped.  

In [30]:
#Drop NaN values from injury/death columns
df = df.dropna(subset=['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED'])

#Sanity check to verify no more NaN values
print(df['NUMBER OF PERSONS KILLED'].isnull().sum())
print(df['NUMBER OF PERSONS INJURED'].isnull().sum())

0
0


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 221593 entries, 2018-01-01 00:00:00 to 2022-12-31 23:50:00
Data columns (total 8 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   BOROUGH                        221593 non-null  object 
 1   ZIP CODE                       221519 non-null  object 
 2   LATITUDE                       216600 non-null  float64
 3   LONGITUDE                      216600 non-null  float64
 4   NUMBER OF PERSONS INJURED      221593 non-null  float64
 5   NUMBER OF PERSONS KILLED       221593 non-null  float64
 6   CONTRIBUTING FACTOR VEHICLE 1  221593 non-null  object 
 7   VEHICLE TYPE CODE 1            220099 non-null  object 
dtypes: float64(4), object(4)
memory usage: 15.2+ MB


In [32]:
df_2023.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6342 entries, 2023-01-01 00:00:00 to 2023-03-27 23:46:00
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   BOROUGH                        6342 non-null   object 
 1   ZIP CODE                       6340 non-null   object 
 2   LATITUDE                       6273 non-null   float64
 3   LONGITUDE                      6273 non-null   float64
 4   NUMBER OF PERSONS INJURED      6342 non-null   float64
 5   NUMBER OF PERSONS KILLED       6342 non-null   float64
 6   CONTRIBUTING FACTOR VEHICLE 1  6342 non-null   object 
 7   VEHICLE TYPE CODE 1            6253 non-null   object 
dtypes: float64(4), object(4)
memory usage: 445.9+ KB


#### Latitude and Longitude
The Latitude and Longitude columns will be kept to ensure the location of the collisions can be mapped using Folium. First, we must verify if there are any missing values.

In [22]:
#Check for NaN values in Latitude and Longitude columns
print(df['LATITUDE'].isnull().sum())
print(df['LONGITUDE'].isnull().sum())

4993
4993


Since there are less than 5,000 rows missing values for Latitude and Longitude, these rows will be dropped.

In [23]:
#Drop missing values from Latitude and Longitude columns
df = df.dropna(subset=['LATITUDE', 'LONGITUDE'])

Now that we know there are no duplicate 

In [24]:
#Check the minimum/maximum values of Latitude
df['LATITUDE'].describe()

count    216600.000000
mean         40.466047
std           2.997899
min           0.000000
25%          40.657055
50%          40.688150
75%          40.723910
max          40.896500
Name: LATITUDE, dtype: float64

In [25]:
#Check the minimum/maximum values of Longitude
df['LONGITUDE'].describe()

count    216600.000000
mean        -73.489859
std           5.444236
min         -74.194600
25%         -73.951164
50%         -73.907300
75%         -73.833290
max           0.000000
Name: LONGITUDE, dtype: float64

In [26]:
#Check how many collisions have a Latitude of 0
df[df['LATITUDE'] == 0]

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-02-14 14:23:00,4025794,BROOKLYN,11219,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
2018-02-23 11:00:00,3851213,QUEENS,11374,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
2018-02-28 12:00:00,3853952,BROOKLYN,11217,0.0,0.0,0.0,0.0,Alcohol Involvement,Van
2018-02-28 12:30:00,3855414,QUEENS,11385,0.0,0.0,0.0,0.0,Oversized Vehicle,TRAIL
2018-02-28 12:44:00,3853954,BROOKLYN,11201,0.0,0.0,0.0,0.0,Oversized Vehicle,Box Truck
...,...,...,...,...,...,...,...,...,...
2022-12-29 19:36:00,4594448,BROOKLYN,11234,0.0,0.0,0.0,0.0,Driver Inattention/Distraction,Sedan
2022-12-29 20:16:00,4594530,BROOKLYN,11221,0.0,0.0,1.0,0.0,Driver Inexperience,Sedan
2022-12-29 21:01:00,4594607,BROOKLYN,11226,0.0,0.0,1.0,0.0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Sedan
2022-12-29 21:15:00,4594184,BROOKLYN,11201,0.0,0.0,0.0,0.0,Unsafe Speed,Sedan


In [27]:
#Check how many collisions have a Longitude of 0
df[df['LONGITUDE'] == 0]

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-02-14 14:23:00,4025794,BROOKLYN,11219,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
2018-02-23 11:00:00,3851213,QUEENS,11374,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
2018-02-28 12:00:00,3853952,BROOKLYN,11217,0.0,0.0,0.0,0.0,Alcohol Involvement,Van
2018-02-28 12:30:00,3855414,QUEENS,11385,0.0,0.0,0.0,0.0,Oversized Vehicle,TRAIL
2018-02-28 12:44:00,3853954,BROOKLYN,11201,0.0,0.0,0.0,0.0,Oversized Vehicle,Box Truck
...,...,...,...,...,...,...,...,...,...
2022-12-29 19:36:00,4594448,BROOKLYN,11234,0.0,0.0,0.0,0.0,Driver Inattention/Distraction,Sedan
2022-12-29 20:16:00,4594530,BROOKLYN,11221,0.0,0.0,1.0,0.0,Driver Inexperience,Sedan
2022-12-29 21:01:00,4594607,BROOKLYN,11226,0.0,0.0,1.0,0.0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Sedan
2022-12-29 21:15:00,4594184,BROOKLYN,11201,0.0,0.0,0.0,0.0,Unsafe Speed,Sedan


There are 161 rows with both Longitude and Latitude of 0. Looking at the head and tail, we can assume these's are all the same 161 rows. To ensure our data is well-rounded with complete and accurate values, let's drop the collisions with a Longitude and Latitude of 0.

In [28]:
#Filter for collisions that do NOT have Longitude and Latitude of 0
df = df[df['LONGITUDE'] != 0]
df

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01 00:00:00,3820851,BROOKLYN,11239,40.653060,-73.882020,0.0,0.0,Driver Inattention/Distraction,Sedan
2018-01-01 00:00:00,3820945,BROOKLYN,11234,40.628770,-73.918250,1.0,0.0,Failure to Yield Right-of-Way,Sedan
2018-01-01 00:00:00,3819067,QUEENS,11419,40.682970,-73.828240,0.0,0.0,Passing Too Closely,Sedan
2018-01-01 00:00:00,3822296,BROOKLYN,11230,40.624180,-73.970480,0.0,0.0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle
2018-01-01 00:00:00,3821055,BROOKLYN,11230,40.623220,-73.961020,0.0,0.0,Driver Inattention/Distraction,Taxi
...,...,...,...,...,...,...,...,...,...
2022-12-31 22:02:00,4594338,QUEENS,11102,40.766150,-73.919785,2.0,0.0,Following Too Closely,Sedan
2022-12-31 22:10:00,4594636,BROOKLYN,11221,40.697582,-73.929830,0.0,0.0,Passing or Lane Usage Improper,Station Wagon/Sport Utility Vehicle
2022-12-31 23:20:00,4594416,QUEENS,11369,40.760098,-73.859300,2.0,0.0,Alcohol Involvement,Station Wagon/Sport Utility Vehicle
2022-12-31 23:40:00,4594910,BROOKLYN,11249,40.699482,-73.961040,1.0,0.0,Driver Inattention/Distraction,Sedan


In [29]:
#Sanity check how many collisions have a Longitude of 0
df[df['LONGITUDE'] == 0]

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [30]:
#Sanity check how many collisions have a Latitude of 0
df[df['LATITUDE'] == 0]

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


### Imputation

In [31]:
#Check rows with missing zip code
df[df['ZIP CODE'].isna()]

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-17 20:49:00,3830150,QUEENS,,40.72010,-73.79038,0.0,0.0,Passing or Lane Usage Improper,Sedan
2018-01-18 17:20:00,3831700,QUEENS,,40.72010,-73.79038,0.0,0.0,Driver Inattention/Distraction,Sedan
2018-02-01 09:00:00,3839658,QUEENS,,40.72010,-73.79038,0.0,0.0,Following Too Closely,Sedan
2018-02-09 17:22:00,3843234,QUEENS,,40.72010,-73.79038,0.0,0.0,Following Too Closely,Bus
2018-03-19 21:00:00,3865703,QUEENS,,40.76092,-73.82680,0.0,0.0,Following Too Closely,Sedan
...,...,...,...,...,...,...,...,...,...
2022-02-22 18:52:00,4507585,QUEENS,,40.72013,-73.79038,0.0,0.0,Driver Inattention/Distraction,Sedan
2022-06-26 22:45:00,4540962,QUEENS,,40.72013,-73.79038,0.0,0.0,Failure to Yield Right-of-Way,Sedan
2022-06-27 09:27:00,4541523,QUEENS,,40.72013,-73.79038,1.0,0.0,Unsafe Lane Changing,Station Wagon/Sport Utility Vehicle
2022-08-09 12:30:00,4554698,QUEENS,,40.75089,-73.93663,0.0,0.0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle


There are 71 missing values from the Zip Code column, however, we see that numerous rows have the same latitude and longitude. Let's impute those rows instead of dropping them.

In [32]:
#Create a dictionary that corresponds the Latitude, Longitude with the correct Zip Code
zipcode_dict = {(40.72010, -73.79038): 11432,
                (40.76092, -73.82680): 11354,
                (40.72013, -73.79038): 11433,
                (40.75089, -73.93663): 11101,
                (40.724792, -73.722916): 11426,
                (40.713050, -73.916990): 11385,
                (40.711930, -73.919365): 11385,
                (40.724792, -73.722916): 11426,
                (40.707447, -73.903870): 11385,
                (40.707485, -73.918365): 11385,
                (40.733120, -73.727900): 11426,
                (40.719124, -73.791405): 11432, 
                (40.606260, -73.744170): 11691,
                (40.707317, -73.903595): 11385,
                (40.695072, -73.990100): 11201,}

# use the map function to update the 'zipcode' column based on the dictionary
df['ZIP CODE'] = df.apply(lambda row: zipcode_dict.get((row['LATITUDE'], row['LONGITUDE']), row['ZIP CODE']), axis=1)

In [33]:
#Sanity check to confirm there's no more NaN values in Zip Code column
df[df['ZIP CODE'].isna()]

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [37]:
#Sanity check to confirm zipcode was imputed for indexed row
df[df.index.isin(['2018-01-17 20:49:00'])]

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-17 20:49:00,3830150,QUEENS,11432,40.7201,-73.79038,0.0,0.0,Passing or Lane Usage Improper,Sedan


### Feature Engineering

#### Contributing Categories

In [38]:
#Review the contributing factors
df['CONTRIBUTING FACTOR VEHICLE 1'].value_counts(sort=True).head()

Driver Inattention/Distraction    74498
Failure to Yield Right-of-Way     25743
Following Too Closely             15932
Backing Unsafely                  15242
Passing Too Closely               12787
Name: CONTRIBUTING FACTOR VEHICLE 1, dtype: int64

In [39]:
#Check the unique values of contributing factors for vehicle 1
df['CONTRIBUTING FACTOR VEHICLE 1'].unique()

array(['Driver Inattention/Distraction', 'Failure to Yield Right-of-Way',
       'Passing Too Closely', 'Unsafe Speed',
       'Traffic Control Device Improper/Non-Working',
       'Traffic Control Disregarded', 'Pavement Slippery',
       'Following Too Closely', 'Backing Unsafely',
       'Passenger Distraction', 'Passing or Lane Usage Improper',
       'Other Vehicular', 'Alcohol Involvement', 'Driver Inexperience',
       'Fell Asleep', 'Brakes Defective', 'View Obstructed/Limited',
       'Unsafe Lane Changing', 'Obstruction/Debris', 'Turning Improperly',
       'Pavement Defective', 'Reaction to Uninvolved Vehicle',
       'Aggressive Driving/Road Rage', 'Tire Failure/Inadequate',
       'Steering Failure',
       'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion',
       'Animals Action', 'Glare', 'Oversized Vehicle',
       'Outside Car Distraction', 'Failure to Keep Right',
       'Other Electronic Device', 'Lost Consciousness',
       'Driverless/Runaway Vehicle', 'Other

In [40]:
#Create duplicate columns of contributing factor columns for categorizing
df['CONTRIBUTING CATEGORY V1'] = df['CONTRIBUTING FACTOR VEHICLE 1']

#Replace value with correct spelling of 'Illness'
df['CONTRIBUTING FACTOR VEHICLE 1'].replace('Illnes', 'Illness', inplace=True)

In [41]:
#Replace specific contributing factor to a more generalized category in Contributing Category Vehicle 1
df['CONTRIBUTING CATEGORY V1'].replace(('Driver Inattention/Distraction', 'Driver Inexperience', 
                                        'Reaction to Uninvolved Vehicle', 'Aggressive Driving/Road Rage', 
                                        'Eating or Drinking'), ('Driver Error'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Tire Failure/Inadequate', 'Headlights Defective', 'Steering Failure', 
                                        'Brakes Defective', 'Accelerator Defective', 'Tow Hitch Defective', 
                                        'Other Lighting Defects', 'Tinted Windows', 'Vehicle Vandalism', 
                                        'Windshield Inadequate'), ('Vehicle Defects'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Failure to Yield Right-of-Way','Passing or Lane Usage Improper', 
                                        'Unsafe Lane Changing','Failure to Keep Right', 
                                        'Traffic Control Disregarded','Passing Too Closely', 
                                        'Backing Unsafely', 'Unsafe Speed', 'Following Too Closely', 
                                        'Turning Improperly'), ('Moving Violation'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Glare', 'Obstruction/Debris', 'View Obstructed/Limited'), 
                                       ('Environmental Factors'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Cell Phone (hand-Held)', 'Texting', 'Using On Board Navigation Device', 
                                        'Other Electronic Device', 'Listening/Using Headphones', 
                                        'Cell Phone (hands-free)'), ('Internal Electronics Usage'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Illnes', 'Illness', 'Drugs (illegal)', 'Fell Asleep', 'Fatigued/Drowsy', 
                                        'Lost Consciousness', 'Physical Disability', 'Alcohol Involvement', 
                                        'Prescription Medication'), ('Bodily Impairment'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Traffic Control Device Improper/Non-Working', 'Pavement Slippery', 
                                        'Pavement Defective', 'Shoulders Defective/Improper', 
                                        'Lane Marking Improper/Inadequate'), ('Road Conditions'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Driverless/Runaway Vehicle', 'Other Vehicular', 'Oversized Vehicle'), 
                                       ('Third-Party (Vehicular)'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Animals Action', 
                                        'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion'), 
                                       ('Third-Party (Non-Vehicular)'), inplace=True)

df['CONTRIBUTING CATEGORY V1'].replace(('Passenger Distraction', 'Outside Car Distraction'), 
                                       ('Other Distractions'), inplace=True)

In [42]:
#Sanity check to confirm only 10 categories
df['CONTRIBUTING CATEGORY V1'].value_counts()

Moving Violation               105285
Driver Error                    83986
Third-Party (Vehicular)          8147
Bodily Impairment                6332
Environmental Factors            3483
Road Conditions                  2460
Vehicle Defects                  2325
Third-Party (Non-Vehicular)      1993
Other Distractions               1242
Internal Electronics Usage        165
Name: CONTRIBUTING CATEGORY V1, dtype: int64

In [43]:
#Convert columns from float to integers
df = df.astype({'ZIP CODE':'int64', 'NUMBER OF PERSONS INJURED':'int64', 'NUMBER OF PERSONS KILLED':'int64'})

In [45]:
#Drop remaining NaN values
df = df.dropna()

#### Season

In [48]:
#Create Season column
## Months 3-5 = Spring
## Months 6-8 = Summer
## Months 9-11 = Autumn
## Months 12-2 = Winter

df['SEASON'] = np.where(((df.index.month >= 3) & (df.index.month <= 5)), 'Spring',
                          np.where(((df.index.month >= 6) & (df.index.month <= 8)), 'Summer',
                                  np.where(((df.index.month >= 9) & (df.index.month <= 11)), 'Autumn', 'Winter')))

df['SEASON'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SEASON'] = np.where(((df.index.month >= 3) & (df.index.month <= 5)), 'Spring',


Summer    55723
Autumn    54181
Winter    52489
Spring    51569
Name: SEASON, dtype: int64

#### Time of Day

In [49]:
#Define the bins
bins=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 24]

#Custom labels for time of day/hour intervals
labels=['Late Night', 'Early Morning', 'Dawn', 'Early AM', 'Morning', 'Late Morning', 'Early Afternoon', 
        'Afternoon', 'Evening', 'Night']

#Add the bins to the dataframe
df['TIME OF DAY'] = pd.cut(df.index.hour, bins, labels=labels, right=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TIME OF DAY'] = pd.cut(df.index.hour, bins, labels=labels, right=False)


#### Is Rush Hour

In [50]:
#Create and join dataframes indicating rush hour traffic
rush_hour = pd.concat([df.between_time('6:00', '9:00'), df.between_time('16:00', '19:00')])

#Create new column indicating if rush hour -- return True if index is in rush_hour, False if not
df['IS RUSH HOUR'] = df.index.isin(rush_hour.index)

#Sanity check to confirm there are True/False values
df['IS RUSH HOUR'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['IS RUSH HOUR'] = df.index.isin(rush_hour.index)


False    143320
True      70642
Name: IS RUSH HOUR, dtype: int64

In [51]:
df

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1,CONTRIBUTING CATEGORY V1,SEASON,TIME OF DAY,IS RUSH HOUR
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-01 00:00:00,3820851,BROOKLYN,11239,40.653060,-73.882020,0,0,Driver Inattention/Distraction,Sedan,Driver Error,Winter,Late Night,False
2018-01-01 00:00:00,3820945,BROOKLYN,11234,40.628770,-73.918250,1,0,Failure to Yield Right-of-Way,Sedan,Moving Violation,Winter,Late Night,False
2018-01-01 00:00:00,3819067,QUEENS,11419,40.682970,-73.828240,0,0,Passing Too Closely,Sedan,Moving Violation,Winter,Late Night,False
2018-01-01 00:00:00,3822296,BROOKLYN,11230,40.624180,-73.970480,0,0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle,Driver Error,Winter,Late Night,False
2018-01-01 00:00:00,3821055,BROOKLYN,11230,40.623220,-73.961020,0,0,Driver Inattention/Distraction,Taxi,Driver Error,Winter,Late Night,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 22:02:00,4594338,QUEENS,11102,40.766150,-73.919785,2,0,Following Too Closely,Sedan,Moving Violation,Winter,Night,False
2022-12-31 22:10:00,4594636,BROOKLYN,11221,40.697582,-73.929830,0,0,Passing or Lane Usage Improper,Station Wagon/Sport Utility Vehicle,Moving Violation,Winter,Night,False
2022-12-31 23:20:00,4594416,QUEENS,11369,40.760098,-73.859300,2,0,Alcohol Involvement,Station Wagon/Sport Utility Vehicle,Bodily Impairment,Winter,Night,False
2022-12-31 23:40:00,4594910,BROOKLYN,11249,40.699482,-73.961040,1,0,Driver Inattention/Distraction,Sedan,Driver Error,Winter,Night,False


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 213962 entries, 2018-01-01 00:00:00 to 2022-12-31 23:50:00
Data columns (total 13 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   COLLISION_ID                   213962 non-null  int64   
 1   BOROUGH                        213962 non-null  object  
 2   ZIP CODE                       213962 non-null  int64   
 3   LATITUDE                       213962 non-null  float64 
 4   LONGITUDE                      213962 non-null  float64 
 5   NUMBER OF PERSONS INJURED      213962 non-null  int64   
 6   NUMBER OF PERSONS KILLED       213962 non-null  int64   
 7   CONTRIBUTING FACTOR VEHICLE 1  213962 non-null  object  
 8   VEHICLE TYPE CODE 1            213962 non-null  object  
 9   CONTRIBUTING CATEGORY V1       213962 non-null  object  
 10  SEASON                         213962 non-null  object  
 11  TIME OF DAY                    213962 non-nu

Export smaller dataset that can be pushed to GitHub.

In [None]:
#Export 2018-2023 Queens and Brooklyn dataset as a .csv as main dataset
#df.to_csv('data/Motor_Vehicle_Collisions_QuBr_2018-2023.csv')

In [None]:
#Export as a .csv as final cleaned dataset
#df.to_csv('data/Final_Motor_Vehicle_Collisions_QuBr_2018-2023_V1.csv')