In [1]:
#Import necessary functions
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import datetime


from datetime import datetime as dt

%matplotlib inline

In [2]:
#Import the dataset
#The path is relative, so the notebook must be run from the folder that contains data/
df = pd.read_csv('data/Motor_Vehicle_Collisions_QuBr_2018_2023.csv')

#Initial look into dataset
#Show only the first five rows rather than the whole frame
df.head()

Unnamed: 0,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2018-01-01 00:00:00,BROOKLYN,11239.0,40.65306,-73.88202,0.0,0.0,Driver Inattention/Distraction,,,,,3820851,Sedan,,,,
1,2018-01-01 00:00:00,BROOKLYN,11234.0,40.62877,-73.91825,1.0,0.0,Failure to Yield Right-of-Way,Passing or Lane Usage Improper,,,,3820945,Sedan,Bike,,,
2,2018-01-01 00:00:00,QUEENS,11419.0,40.68297,-73.82824,0.0,0.0,Passing Too Closely,Unspecified,,,,3819067,Sedan,,,,
3,2018-01-01 00:00:00,BROOKLYN,11230.0,40.62418,-73.97048,0.0,0.0,Driver Inattention/Distraction,Unspecified,,,,3822296,Station Wagon/Sport Utility Vehicle,Sedan,,,
4,2018-01-01 00:00:00,BROOKLYN,11230.0,40.62322,-73.96102,0.0,0.0,Driver Inattention/Distraction,Unspecified,,,,3821055,Taxi,,,,


In [3]:
#Summarize the columns, their dtypes and their non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305561 entries, 0 to 305560
Data columns (total 18 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH DATE TIME                305561 non-null  object 
 1   BOROUGH                        305561 non-null  object 
 2   ZIP CODE                       305458 non-null  float64
 3   LATITUDE                       298626 non-null  float64
 4   LONGITUDE                      298626 non-null  float64
 5   NUMBER OF PERSONS INJURED      305558 non-null  float64
 6   NUMBER OF PERSONS KILLED       305559 non-null  float64
 7   CONTRIBUTING FACTOR VEHICLE 1  304182 non-null  object 
 8   CONTRIBUTING FACTOR VEHICLE 2  246135 non-null  object 
 9   CONTRIBUTING FACTOR VEHICLE 3  23826 non-null   object 
 10  CONTRIBUTING FACTOR VEHICLE 4  6181 non-null    object 
 11  CONTRIBUTING FACTOR VEHICLE 5  1913 non-null    object 
 12  COLLISION_ID                  

## EDA

In [4]:
#Take the 'COLLISION_ID' column out of the frame (pop removes it from df in place)
first_column = df.pop('COLLISION_ID')

#Put it back as the very first column
df.insert(loc=0, column='COLLISION_ID', value=first_column)

In [5]:
#Flag every row that is an exact repeat of an earlier row, then count the flags
duplicate_flags = df.duplicated()
duplicate_flags.value_counts()

False    305561
dtype: int64

### Dropping Columns/Values

#### Contributing Factor & Vehicle Type

The focus will be on Vehicle 1, as it is considered the primary vehicle of the collision. The contributing factor and vehicle type code columns for vehicles 2-5 have a significant number of missing values and will therefore be dropped from the dataset.

In [6]:
#Contributing factor columns for vehicles 2-5 (mostly missing)
extra_factor_columns = ['CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
                        'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5']

#Vehicle type columns for vehicles 2-5 (mostly missing)
extra_type_columns = ['VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3',
                      'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5']

#Drop both groups of columns in a single pass
df = df.drop(columns=extra_factor_columns + extra_type_columns)

In [7]:
#Keep only the collisions that have a contributing factor recorded for vehicle 1
has_factor = df['CONTRIBUTING FACTOR VEHICLE 1'].notna()
df = df[has_factor]

In [8]:
#Keep only the collisions whose contributing factor is something other than 'Unspecified'
is_specified = df['CONTRIBUTING FACTOR VEHICLE 1'].ne('Unspecified')
df = df.loc[is_specified]

df

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
0,3820851,2018-01-01 00:00:00,BROOKLYN,11239.0,40.653060,-73.882020,0.0,0.0,Driver Inattention/Distraction,Sedan
1,3820945,2018-01-01 00:00:00,BROOKLYN,11234.0,40.628770,-73.918250,1.0,0.0,Failure to Yield Right-of-Way,Sedan
2,3819067,2018-01-01 00:00:00,QUEENS,11419.0,40.682970,-73.828240,0.0,0.0,Passing Too Closely,Sedan
3,3822296,2018-01-01 00:00:00,BROOKLYN,11230.0,40.624180,-73.970480,0.0,0.0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle
4,3821055,2018-01-01 00:00:00,BROOKLYN,11230.0,40.623220,-73.961020,0.0,0.0,Driver Inattention/Distraction,Taxi
...,...,...,...,...,...,...,...,...,...,...
305550,4594338,2022-12-31 22:02:00,QUEENS,11102.0,40.766150,-73.919785,2.0,0.0,Following Too Closely,Sedan
305551,4594636,2022-12-31 22:10:00,BROOKLYN,11221.0,40.697582,-73.929830,0.0,0.0,Passing or Lane Usage Improper,Station Wagon/Sport Utility Vehicle
305557,4594416,2022-12-31 23:20:00,QUEENS,11369.0,40.760098,-73.859300,2.0,0.0,Alcohol Involvement,Station Wagon/Sport Utility Vehicle
305558,4594910,2022-12-31 23:40:00,BROOKLYN,11249.0,40.699482,-73.961040,1.0,0.0,Driver Inattention/Distraction,Sedan


#### Number of Persons Injured & Number of Persons Killed

In [9]:
#Count the NaN values in the death column, then in the injury column
killed_missing = df['NUMBER OF PERSONS KILLED'].isna().sum()
injured_missing = df['NUMBER OF PERSONS INJURED'].isna().sum()
print(killed_missing)
print(injured_missing)

1
0


The original dataset had only 2 collisions with missing values in the 'NUMBER OF PERSONS KILLED' column and only 3 in the 'NUMBER OF PERSONS INJURED' column. After the filtering above, a single such row remains (see the counts printed above), and it will be dropped.

In [10]:
#Keep only the rows where both the injury and the death count are present
casualties_present = df[['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']].notna().all(axis=1)
df = df[casualties_present]

#Sanity check: both counts printed below should now be 0
killed_missing = df['NUMBER OF PERSONS KILLED'].isna().sum()
injured_missing = df['NUMBER OF PERSONS INJURED'].isna().sum()
print(killed_missing)
print(injured_missing)

0
0


In [11]:
#Re-check dtypes and non-null counts after the drops above
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221593 entries, 0 to 305560
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   COLLISION_ID                   221593 non-null  int64  
 1   CRASH DATE TIME                221593 non-null  object 
 2   BOROUGH                        221593 non-null  object 
 3   ZIP CODE                       221519 non-null  float64
 4   LATITUDE                       216600 non-null  float64
 5   LONGITUDE                      216600 non-null  float64
 6   NUMBER OF PERSONS INJURED      221593 non-null  float64
 7   NUMBER OF PERSONS KILLED       221593 non-null  float64
 8   CONTRIBUTING FACTOR VEHICLE 1  221593 non-null  object 
 9   VEHICLE TYPE CODE 1            220099 non-null  object 
dtypes: float64(5), int64(1), object(4)
memory usage: 18.6+ MB


#### Latitude and Longitude
The Latitude and Longitude columns will be kept to ensure the location of the collisions can be mapped using Folium. First, we must verify if there are any missing values.

In [12]:
#Count the NaN values in the Latitude column, then in the Longitude column
latitude_missing = df['LATITUDE'].isna().sum()
longitude_missing = df['LONGITUDE'].isna().sum()
print(latitude_missing)
print(longitude_missing)

4993
4993


Since fewer than 5,000 rows are missing values for Latitude and Longitude, these rows will be dropped.

In [13]:
#Keep only the rows that have both a Latitude and a Longitude
has_coordinates = df[['LATITUDE', 'LONGITUDE']].notna().all(axis=1)
df = df[has_coordinates]

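Now that we know there are no duplicate rows and the rows with missing coordinates have been dropped, let's check the range of the Latitude and Longitude values.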

In [14]:
#Check the minimum/maximum values of Latitude
#describe() also reports count, mean, std and quartiles, which helps reveal out-of-range values
df['LATITUDE'].describe()

count    216600.000000
mean         40.466047
std           2.997899
min           0.000000
25%          40.657055
50%          40.688150
75%          40.723910
max          40.896500
Name: LATITUDE, dtype: float64

In [15]:
#Check the minimum/maximum values of Longitude
#describe() also reports count, mean, std and quartiles, which helps reveal out-of-range values
df['LONGITUDE'].describe()

count    216600.000000
mean        -73.489859
std           5.444236
min         -74.194600
25%         -73.951164
50%         -73.907300
75%         -73.833290
max           0.000000
Name: LONGITUDE, dtype: float64

In [16]:
#Display the collisions recorded with a Latitude of exactly 0
latitude_is_zero = df['LATITUDE'].eq(0)
df.loc[latitude_is_zero]

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
10047,4025794,2018-02-14 14:23:00,BROOKLYN,11219.0,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
11899,3851213,2018-02-23 11:00:00,QUEENS,11374.0,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
12975,3853952,2018-02-28 12:00:00,BROOKLYN,11217.0,0.0,0.0,0.0,0.0,Alcohol Involvement,Van
12982,3855414,2018-02-28 12:30:00,QUEENS,11385.0,0.0,0.0,0.0,0.0,Oversized Vehicle,TRAIL
12983,3853954,2018-02-28 12:44:00,BROOKLYN,11201.0,0.0,0.0,0.0,0.0,Oversized Vehicle,Box Truck
...,...,...,...,...,...,...,...,...,...,...
305311,4594448,2022-12-29 19:36:00,BROOKLYN,11234.0,0.0,0.0,0.0,0.0,Driver Inattention/Distraction,Sedan
305316,4594530,2022-12-29 20:16:00,BROOKLYN,11221.0,0.0,0.0,1.0,0.0,Driver Inexperience,Sedan
305317,4594607,2022-12-29 21:01:00,BROOKLYN,11226.0,0.0,0.0,1.0,0.0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Sedan
305318,4594184,2022-12-29 21:15:00,BROOKLYN,11201.0,0.0,0.0,0.0,0.0,Unsafe Speed,Sedan


In [17]:
#Display the collisions recorded with a Longitude of exactly 0
longitude_is_zero = df['LONGITUDE'].eq(0)
df.loc[longitude_is_zero]

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
10047,4025794,2018-02-14 14:23:00,BROOKLYN,11219.0,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
11899,3851213,2018-02-23 11:00:00,QUEENS,11374.0,0.0,0.0,0.0,0.0,Failure to Yield Right-of-Way,Station Wagon/Sport Utility Vehicle
12975,3853952,2018-02-28 12:00:00,BROOKLYN,11217.0,0.0,0.0,0.0,0.0,Alcohol Involvement,Van
12982,3855414,2018-02-28 12:30:00,QUEENS,11385.0,0.0,0.0,0.0,0.0,Oversized Vehicle,TRAIL
12983,3853954,2018-02-28 12:44:00,BROOKLYN,11201.0,0.0,0.0,0.0,0.0,Oversized Vehicle,Box Truck
...,...,...,...,...,...,...,...,...,...,...
305311,4594448,2022-12-29 19:36:00,BROOKLYN,11234.0,0.0,0.0,0.0,0.0,Driver Inattention/Distraction,Sedan
305316,4594530,2022-12-29 20:16:00,BROOKLYN,11221.0,0.0,0.0,1.0,0.0,Driver Inexperience,Sedan
305317,4594607,2022-12-29 21:01:00,BROOKLYN,11226.0,0.0,0.0,1.0,0.0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Sedan
305318,4594184,2022-12-29 21:15:00,BROOKLYN,11201.0,0.0,0.0,0.0,0.0,Unsafe Speed,Sedan


There are 161 rows with both Longitude and Latitude of 0. Looking at the head and tail, we can assume these are all the same 161 rows. To ensure our data is well-rounded with complete and accurate values, let's drop the collisions with a Longitude and Latitude of 0.

In [18]:
#Filter for collisions that do NOT have a Latitude or Longitude of 0
#Both columns are tested explicitly: filtering on Longitude alone would only be enough
#if every zero Latitude shared its row with a zero Longitude, which has not been verified
valid_coordinates = (df['LATITUDE'] != 0) & (df['LONGITUDE'] != 0)
df = df[valid_coordinates]
df

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
0,3820851,2018-01-01 00:00:00,BROOKLYN,11239.0,40.653060,-73.882020,0.0,0.0,Driver Inattention/Distraction,Sedan
1,3820945,2018-01-01 00:00:00,BROOKLYN,11234.0,40.628770,-73.918250,1.0,0.0,Failure to Yield Right-of-Way,Sedan
2,3819067,2018-01-01 00:00:00,QUEENS,11419.0,40.682970,-73.828240,0.0,0.0,Passing Too Closely,Sedan
3,3822296,2018-01-01 00:00:00,BROOKLYN,11230.0,40.624180,-73.970480,0.0,0.0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle
4,3821055,2018-01-01 00:00:00,BROOKLYN,11230.0,40.623220,-73.961020,0.0,0.0,Driver Inattention/Distraction,Taxi
...,...,...,...,...,...,...,...,...,...,...
305550,4594338,2022-12-31 22:02:00,QUEENS,11102.0,40.766150,-73.919785,2.0,0.0,Following Too Closely,Sedan
305551,4594636,2022-12-31 22:10:00,BROOKLYN,11221.0,40.697582,-73.929830,0.0,0.0,Passing or Lane Usage Improper,Station Wagon/Sport Utility Vehicle
305557,4594416,2022-12-31 23:20:00,QUEENS,11369.0,40.760098,-73.859300,2.0,0.0,Alcohol Involvement,Station Wagon/Sport Utility Vehicle
305558,4594910,2022-12-31 23:40:00,BROOKLYN,11249.0,40.699482,-73.961040,1.0,0.0,Driver Inattention/Distraction,Sedan


In [19]:
#Sanity check: no collision should remain with a Longitude of 0 (expect an empty frame)
df.loc[df['LONGITUDE'].eq(0)]

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1


In [20]:
#Sanity check: no collision should remain with a Latitude of 0 (expect an empty frame)
df.loc[df['LATITUDE'].eq(0)]

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1


### Imputation

In [21]:
#Display the collisions that have no Zip Code recorded
zip_is_missing = df['ZIP CODE'].isnull()
df.loc[zip_is_missing]

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
4050,3830150,2018-01-17 20:49:00,QUEENS,,40.72010,-73.79038,0.0,0.0,Passing or Lane Usage Improper,Sedan
4230,3831700,2018-01-18 17:20:00,QUEENS,,40.72010,-73.79038,0.0,0.0,Driver Inattention/Distraction,Sedan
7082,3839658,2018-02-01 09:00:00,QUEENS,,40.72010,-73.79038,0.0,0.0,Following Too Closely,Sedan
9031,3843234,2018-02-09 17:22:00,QUEENS,,40.72010,-73.79038,0.0,0.0,Following Too Closely,Bus
17666,3865703,2018-03-19 21:00:00,QUEENS,,40.76092,-73.82680,0.0,0.0,Following Too Closely,Sedan
...,...,...,...,...,...,...,...,...,...,...
269385,4507585,2022-02-22 18:52:00,QUEENS,,40.72013,-73.79038,0.0,0.0,Driver Inattention/Distraction,Sedan
284087,4540962,2022-06-26 22:45:00,QUEENS,,40.72013,-73.79038,0.0,0.0,Failure to Yield Right-of-Way,Sedan
284136,4541523,2022-06-27 09:27:00,QUEENS,,40.72013,-73.79038,1.0,0.0,Unsafe Lane Changing,Station Wagon/Sport Utility Vehicle
289173,4554698,2022-08-09 12:30:00,QUEENS,,40.75089,-73.93663,0.0,0.0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle


There are 71 missing values in the Zip Code column. However, numerous rows share the same latitude and longitude, so let's impute those rows instead of dropping them.

In [22]:
#Create a dictionary that corresponds the Latitude, Longitude with the correct Zip Code
#(the repeated (40.724792, -73.722916) entry was removed; a dict keeps one value per key anyway)
zipcode_dict = {(40.72010, -73.79038): 11432,
                (40.76092, -73.82680): 11354,
                (40.72013, -73.79038): 11433,
                (40.75089, -73.93663): 11101,
                (40.724792, -73.722916): 11426,
                (40.713050, -73.916990): 11385,
                (40.711930, -73.919365): 11385,
                (40.707447, -73.903870): 11385,
                (40.707485, -73.918365): 11385,
                (40.733120, -73.727900): 11426,
                (40.719124, -73.791405): 11432,
                (40.606260, -73.744170): 11691,
                (40.707317, -73.903595): 11385,
                (40.695072, -73.990100): 11201,}

#Impute ONLY the rows whose Zip Code is missing, so Zip Codes already recorded
#at the same coordinates are never overwritten
#NOTE(review): the lookup relies on exact float equality of the coordinates -- confirm the keys match the raw values
missing_zip = df['ZIP CODE'].isna()
coordinate_pairs = pd.Series(list(zip(df.loc[missing_zip, 'LATITUDE'], df.loc[missing_zip, 'LONGITUDE'])),
                             index=df.index[missing_zip], dtype=object)

#Work on an explicit copy so the assignment below does not write into a filtered slice of an earlier frame
df = df.copy()

#Coordinates that are not in the dictionary map to NaN, i.e. the Zip Code simply stays missing
df.loc[missing_zip, 'ZIP CODE'] = coordinate_pairs.map(zipcode_dict.get).astype('float64')

In [23]:
#Sanity check: list any rows still missing a Zip Code (expect an empty frame)
still_missing_zip = df['ZIP CODE'].isnull()
df.loc[still_missing_zip]

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1


In [24]:
#Sanity check to confirm zipcode was imputed for indexed row
#df[df.index.isin(['2018-01-18 17:20:00'])]

In [25]:
#Sanity check to confirm zipcode was imputed for indexed row
#Row label 4050 was one of the rows listed above with a missing Zip Code; the list inside .loc returns a one-row DataFrame
df.loc[[4050]]

Unnamed: 0,COLLISION_ID,CRASH DATE TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1
4050,3830150,2018-01-17 20:49:00,QUEENS,11432.0,40.7201,-73.79038,0.0,0.0,Passing or Lane Usage Improper,Sedan


### Feature Engineering

#### Contributing Categories

In [26]:
#Count the collisions per contributing factor (most frequent first) and show the top five
factor_counts = df['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()
factor_counts.head(5)

Driver Inattention/Distraction    74498
Failure to Yield Right-of-Way     25743
Following Too Closely             15932
Backing Unsafely                  15242
Passing Too Closely               12787
Name: CONTRIBUTING FACTOR VEHICLE 1, dtype: int64

In [27]:
#Check the unique values of contributing factors for vehicle 1
#The full list is needed to decide how the factors are grouped into broader categories below
df['CONTRIBUTING FACTOR VEHICLE 1'].unique()

array(['Driver Inattention/Distraction', 'Failure to Yield Right-of-Way',
       'Passing Too Closely', 'Unsafe Speed',
       'Traffic Control Device Improper/Non-Working',
       'Traffic Control Disregarded', 'Pavement Slippery',
       'Following Too Closely', 'Backing Unsafely',
       'Passenger Distraction', 'Passing or Lane Usage Improper',
       'Other Vehicular', 'Alcohol Involvement', 'Driver Inexperience',
       'Fell Asleep', 'Brakes Defective', 'View Obstructed/Limited',
       'Unsafe Lane Changing', 'Obstruction/Debris', 'Turning Improperly',
       'Pavement Defective', 'Reaction to Uninvolved Vehicle',
       'Aggressive Driving/Road Rage', 'Tire Failure/Inadequate',
       'Steering Failure',
       'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion',
       'Animals Action', 'Glare', 'Oversized Vehicle',
       'Outside Car Distraction', 'Failure to Keep Right',
       'Other Electronic Device', 'Lost Consciousness',
       'Driverless/Runaway Vehicle', 'Other

In [28]:
#Replace value with correct spelling of 'Illness'
#The result is assigned back to the column: calling replace(inplace=True) on a selected column
#is chained assignment and does not reliably update df under pandas Copy-on-Write
df['CONTRIBUTING FACTOR VEHICLE 1'] = df['CONTRIBUTING FACTOR VEHICLE 1'].replace('Illnes', 'Illness')

#Create duplicate column of the contributing factor column for categorizing
#(done after the spelling fix so the duplicate carries the corrected value too)
df['CONTRIBUTING CATEGORY V1'] = df['CONTRIBUTING FACTOR VEHICLE 1']

In [29]:
#Replace specific contributing factor to a more generalized category in Contributing Category Vehicle 1
#Each general category is listed once with the specific factors that belong to it
category_factors = {
    'Driver Error': ('Driver Inattention/Distraction', 'Driver Inexperience',
                     'Reaction to Uninvolved Vehicle', 'Aggressive Driving/Road Rage',
                     'Eating or Drinking'),
    'Vehicle Defects': ('Tire Failure/Inadequate', 'Headlights Defective', 'Steering Failure',
                        'Brakes Defective', 'Accelerator Defective', 'Tow Hitch Defective',
                        'Other Lighting Defects', 'Tinted Windows', 'Vehicle Vandalism',
                        'Windshield Inadequate'),
    'Moving Violation': ('Failure to Yield Right-of-Way', 'Passing or Lane Usage Improper',
                         'Unsafe Lane Changing', 'Failure to Keep Right',
                         'Traffic Control Disregarded', 'Passing Too Closely',
                         'Backing Unsafely', 'Unsafe Speed', 'Following Too Closely',
                         'Turning Improperly'),
    'Environmental Factors': ('Glare', 'Obstruction/Debris', 'View Obstructed/Limited'),
    'Internal Electronics Usage': ('Cell Phone (hand-Held)', 'Texting', 'Using On Board Navigation Device',
                                   'Other Electronic Device', 'Listening/Using Headphones',
                                   'Cell Phone (hands-free)'),
    'Bodily Impairment': ('Illnes', 'Illness', 'Drugs (illegal)', 'Fell Asleep', 'Fatigued/Drowsy',
                          'Lost Consciousness', 'Physical Disability', 'Alcohol Involvement',
                          'Prescription Medication'),
    'Road Conditions': ('Traffic Control Device Improper/Non-Working', 'Pavement Slippery',
                        'Pavement Defective', 'Shoulders Defective/Improper',
                        'Lane Marking Improper/Inadequate'),
    'Third-Party (Vehicular)': ('Driverless/Runaway Vehicle', 'Other Vehicular', 'Oversized Vehicle'),
    'Third-Party (Non-Vehicular)': ('Animals Action',
                                    'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion'),
    'Other Distractions': ('Passenger Distraction', 'Outside Car Distraction'),
}

#Invert the grouping into a flat lookup: specific factor -> general category
factor_to_category = {factor: category
                      for category, factors in category_factors.items()
                      for factor in factors}

#Apply the whole lookup in one pass and assign the result back to the column
#(replace(inplace=True) on a selected column is chained assignment and is unreliable under pandas Copy-on-Write)
#Values that are not in the lookup are left unchanged
df['CONTRIBUTING CATEGORY V1'] = df['CONTRIBUTING CATEGORY V1'].replace(factor_to_category)

In [30]:
#Sanity check to confirm only 10 categories
#Any specific factor that was not mapped above would show up here as an extra row
df['CONTRIBUTING CATEGORY V1'].value_counts()

Moving Violation               105285
Driver Error                    83986
Third-Party (Vehicular)          8147
Bodily Impairment                6332
Environmental Factors            3483
Road Conditions                  2460
Vehicle Defects                  2325
Third-Party (Non-Vehicular)      1993
Other Distractions               1242
Internal Electronics Usage        165
Name: CONTRIBUTING CATEGORY V1, dtype: int64

In [31]:
#Cast the Zip Code and the two casualty counts from float to 64-bit integers
df = df.astype(dict.fromkeys(['ZIP CODE', 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED'], 'int64'))

In [32]:
#Parse the crash timestamp strings into the pandas datetime dtype
crash_timestamps = pd.to_datetime(df['CRASH DATE TIME'])
df['CRASH DATE TIME'] = crash_timestamps

In [33]:
#Keep only the rows that have no NaN in any column
row_is_complete = df.notna().all(axis=1)
df = df[row_is_complete]

#### Season

In [34]:
#Create Season column
## Months 3-5 = Spring
## Months 6-8 = Summer
## Months 9-11 = Autumn
## Months 12-2 = Winter

#Month number (1-12) of every collision
crash_month = df["CRASH DATE TIME"].dt.month

#One boolean condition per season; any month matching none of them falls through to 'Winter'
season_conditions = [crash_month.between(3, 5),
                     crash_month.between(6, 8),
                     crash_month.between(9, 11)]
season_names = ['Spring', 'Summer', 'Autumn']

df['SEASON'] = np.select(season_conditions, season_names, default='Winter')

df['SEASON'].value_counts()

Summer    55723
Autumn    54181
Winter    52489
Spring    51569
Name: SEASON, dtype: int64

#### Time of Day

In [35]:
#Hour boundaries: two-hour steps from 0 up to 18, then one final bin from 18 to 24
bins = list(range(0, 20, 2)) + [24]

#One label per interval, in the same order as the bins
labels = ['Late Night', 'Early Morning', 'Dawn', 'Early AM', 'Morning',
          'Late Morning', 'Early Afternoon', 'Afternoon', 'Evening', 'Night']

#Bucket the hour of each collision; right=False makes every interval [start, end)
crash_hour = df['CRASH DATE TIME'].dt.hour
df['TIME OF DAY'] = pd.cut(x=crash_hour, bins=bins, labels=labels, right=False)

#### Rush Hour

In [36]:
#Make the crash timestamp the index (needed for between_time below)
df = df.set_index('CRASH DATE TIME')

In [37]:
#Collisions in the morning (6:00-9:00) and evening (16:00-19:00) rush windows; between_time includes both endpoints
morning_rush = df.between_time('6:00', '9:00')
evening_rush = df.between_time('16:00', '19:00')

#Stack the two windows into a single frame of rush hour collisions
rush_hour = pd.concat([morning_rush, evening_rush])

#True where a collision's timestamp appears among the rush hour timestamps, False otherwise
in_rush_hour = df.index.isin(rush_hour.index)
df['IS RUSH HOUR'] = in_rush_hour

#Sanity check to confirm there are True/False values
df['IS RUSH HOUR'].value_counts()

False    143320
True      70642
Name: IS RUSH HOUR, dtype: int64

In [38]:
#Display the frame (pandas truncates the middle rows) to review the engineered columns
df

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1,CONTRIBUTING CATEGORY V1,SEASON,TIME OF DAY,IS RUSH HOUR
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-01 00:00:00,3820851,BROOKLYN,11239,40.653060,-73.882020,0,0,Driver Inattention/Distraction,Sedan,Driver Error,Winter,Late Night,False
2018-01-01 00:00:00,3820945,BROOKLYN,11234,40.628770,-73.918250,1,0,Failure to Yield Right-of-Way,Sedan,Moving Violation,Winter,Late Night,False
2018-01-01 00:00:00,3819067,QUEENS,11419,40.682970,-73.828240,0,0,Passing Too Closely,Sedan,Moving Violation,Winter,Late Night,False
2018-01-01 00:00:00,3822296,BROOKLYN,11230,40.624180,-73.970480,0,0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle,Driver Error,Winter,Late Night,False
2018-01-01 00:00:00,3821055,BROOKLYN,11230,40.623220,-73.961020,0,0,Driver Inattention/Distraction,Taxi,Driver Error,Winter,Late Night,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 22:02:00,4594338,QUEENS,11102,40.766150,-73.919785,2,0,Following Too Closely,Sedan,Moving Violation,Winter,Night,False
2022-12-31 22:10:00,4594636,BROOKLYN,11221,40.697582,-73.929830,0,0,Passing or Lane Usage Improper,Station Wagon/Sport Utility Vehicle,Moving Violation,Winter,Night,False
2022-12-31 23:20:00,4594416,QUEENS,11369,40.760098,-73.859300,2,0,Alcohol Involvement,Station Wagon/Sport Utility Vehicle,Bodily Impairment,Winter,Night,False
2022-12-31 23:40:00,4594910,BROOKLYN,11249,40.699482,-73.961040,1,0,Driver Inattention/Distraction,Sedan,Driver Error,Winter,Night,False


In [39]:
#Final check of dtypes and non-null counts before exporting
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 213962 entries, 2018-01-01 00:00:00 to 2022-12-31 23:50:00
Data columns (total 13 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   COLLISION_ID                   213962 non-null  int64   
 1   BOROUGH                        213962 non-null  object  
 2   ZIP CODE                       213962 non-null  int64   
 3   LATITUDE                       213962 non-null  float64 
 4   LONGITUDE                      213962 non-null  float64 
 5   NUMBER OF PERSONS INJURED      213962 non-null  int64   
 6   NUMBER OF PERSONS KILLED       213962 non-null  int64   
 7   CONTRIBUTING FACTOR VEHICLE 1  213962 non-null  object  
 8   VEHICLE TYPE CODE 1            213962 non-null  object  
 9   CONTRIBUTING CATEGORY V1       213962 non-null  object  
 10  SEASON                         213962 non-null  object  
 11  TIME OF DAY                    213962 non-nu

In [40]:
#Display the cleaned frame one last time before exporting
df

Unnamed: 0_level_0,COLLISION_ID,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,VEHICLE TYPE CODE 1,CONTRIBUTING CATEGORY V1,SEASON,TIME OF DAY,IS RUSH HOUR
CRASH DATE TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-01 00:00:00,3820851,BROOKLYN,11239,40.653060,-73.882020,0,0,Driver Inattention/Distraction,Sedan,Driver Error,Winter,Late Night,False
2018-01-01 00:00:00,3820945,BROOKLYN,11234,40.628770,-73.918250,1,0,Failure to Yield Right-of-Way,Sedan,Moving Violation,Winter,Late Night,False
2018-01-01 00:00:00,3819067,QUEENS,11419,40.682970,-73.828240,0,0,Passing Too Closely,Sedan,Moving Violation,Winter,Late Night,False
2018-01-01 00:00:00,3822296,BROOKLYN,11230,40.624180,-73.970480,0,0,Driver Inattention/Distraction,Station Wagon/Sport Utility Vehicle,Driver Error,Winter,Late Night,False
2018-01-01 00:00:00,3821055,BROOKLYN,11230,40.623220,-73.961020,0,0,Driver Inattention/Distraction,Taxi,Driver Error,Winter,Late Night,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 22:02:00,4594338,QUEENS,11102,40.766150,-73.919785,2,0,Following Too Closely,Sedan,Moving Violation,Winter,Night,False
2022-12-31 22:10:00,4594636,BROOKLYN,11221,40.697582,-73.929830,0,0,Passing or Lane Usage Improper,Station Wagon/Sport Utility Vehicle,Moving Violation,Winter,Night,False
2022-12-31 23:20:00,4594416,QUEENS,11369,40.760098,-73.859300,2,0,Alcohol Involvement,Station Wagon/Sport Utility Vehicle,Bodily Impairment,Winter,Night,False
2022-12-31 23:40:00,4594910,BROOKLYN,11249,40.699482,-73.961040,1,0,Driver Inattention/Distraction,Sedan,Driver Error,Winter,Night,False


In [41]:
#Export as a .csv as final cleaned dataset
#to_csv writes the index by default, so CRASH DATE TIME (set as the index above) becomes the first column of the file
df.to_csv('data/Final_Motor_Vehicle_Collisions_QuBr_2018-2023_V1.csv')