In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from datetime import datetime as dt

%matplotlib inline

In [2]:
#Import the raw collisions dataset
df = pd.read_csv('data/Motor_Vehicle_Collisions_-_Crashes.csv', low_memory=False)

#Merge the Crash Date and Crash Time columns into a single datetime column.
#This is done with pd.to_datetime after the read because combining columns via a
#nested-list parse_dates argument to read_csv is deprecated since pandas 2.0.
#The merged column keeps the name read_csv used to generate for it
#('CRASH DATE_CRASH TIME') and is inserted as the first column, so later cells that
#refer to that name keep working. pop() removes the two source columns, as before.
crash_datetime = pd.to_datetime(df.pop('CRASH DATE') + ' ' + df.pop('CRASH TIME'))
df.insert(0, 'CRASH DATE_CRASH TIME', crash_datetime)

#Initial look into dataset
df.head()

Unnamed: 0,CRASH DATE_CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,2021-09-11 02:39:00,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,2.0,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,2022-03-26 11:45:00,,,,,,QUEENSBORO BRIDGE UPPER,,,1.0,...,,,,,4513547,Sedan,,,,
2,2022-06-29 06:55:00,,,,,,THROGS NECK BRIDGE,,,0.0,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,2021-09-11 09:35:00,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,0.0,...,,,,,4456314,Sedan,,,,
4,2021-12-14 08:13:00,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,0.0,...,,,,,4486609,,,,,


In [3]:
#Print a summary of the raw frame (row count, column names and dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979921 entries, 0 to 1979920
Data columns (total 28 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   CRASH DATE_CRASH TIME          datetime64[ns]
 1   BOROUGH                        object        
 2   ZIP CODE                       object        
 3   LATITUDE                       float64       
 4   LONGITUDE                      float64       
 5   LOCATION                       object        
 6   ON STREET NAME                 object        
 7   CROSS STREET NAME              object        
 8   OFF STREET NAME                object        
 9   NUMBER OF PERSONS INJURED      float64       
 10  NUMBER OF PERSONS KILLED       float64       
 11  NUMBER OF PEDESTRIANS INJURED  int64         
 12  NUMBER OF PEDESTRIANS KILLED   int64         
 13  NUMBER OF CYCLIST INJURED      int64         
 14  NUMBER OF CYCLIST KILLED       int64         
 15  NUMBER OF MOTOR

#### Setting the DateTime Index

In [5]:
#Give the merged crash timestamp column a cleaner name, then promote it to the row index
df = (
    df.rename(columns={'CRASH DATE_CRASH TIME': 'CRASH DATE TIME'})
      .set_index('CRASH DATE TIME')
)

In [6]:
#Filter to crashes from 2018-01-01 up to and including all of 2023-01-01.
#A boolean mask is used instead of the label slice df['2018-01-01':'2023-01-01']
#because the index is not sorted at this point (sorting happens in a later cell):
#string slicing on a non-monotonic DatetimeIndex is deprecated in pandas 1.5 and
#raises KeyError in pandas 2.x when a bound is not present in the index.
#The exclusive upper bound of 2023-01-02 keeps every timestamp on 2023-01-01, so the
#same rows are selected, in the same order, as the original slice.
window_start = pd.Timestamp('2018-01-01')
window_end = pd.Timestamp('2023-01-02')
df = df[(df.index >= window_start) & (df.index < window_end)]

In [7]:
#Sanity check: show the earliest and latest crash timestamps left after filtering
print(df.index.min(), df.index.max(), sep='\n')

2018-01-01 00:00:00
2023-01-01 23:45:00


In [8]:
#Order the rows chronologically by the datetime index (oldest crash first)
df = df.sort_index(axis=0, ascending=True)

#### Borough Selection

In [9]:
#Keep only collisions recorded in the Brooklyn or Queens borough
in_target_borough = df['BOROUGH'].isin(['BROOKLYN', 'QUEENS'])
df = df.loc[in_target_borough]

In [10]:
#Sanity check: list the distinct borough values that remain (expect only the two selected)
boroughs_present = df['BOROUGH'].unique()
boroughs_present

array(['BROOKLYN', 'QUEENS'], dtype=object)

#### Dropping Columns

In [11]:
#Non-relevant and repetitive location columns
location_cols = ['LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME']

#Repetitive injury/death breakdown columns
injury_detail_cols = [
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]

#Drop both groups in a single pass and rebind df instead of using inplace=True:
#df was produced by a boolean .loc filter, and mutating such a frame in place can
#emit SettingWithCopyWarning in pandas versions without copy-on-write.
df = df.drop(columns=location_cols + injury_detail_cols)

In [12]:
#Print a summary of the filtered frame (remaining columns, non-null counts and dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 305681 entries, 2018-01-01 00:00:00 to 2023-01-01 23:30:00
Data columns (total 17 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   BOROUGH                        305681 non-null  object 
 1   ZIP CODE                       305578 non-null  object 
 2   LATITUDE                       298746 non-null  float64
 3   LONGITUDE                      298746 non-null  float64
 4   NUMBER OF PERSONS INJURED      305678 non-null  float64
 5   NUMBER OF PERSONS KILLED       305679 non-null  float64
 6   CONTRIBUTING FACTOR VEHICLE 1  304302 non-null  object 
 7   CONTRIBUTING FACTOR VEHICLE 2  246228 non-null  object 
 8   CONTRIBUTING FACTOR VEHICLE 3  23851 non-null   object 
 9   CONTRIBUTING FACTOR VEHICLE 4  6190 non-null    object 
 10  CONTRIBUTING FACTOR VEHICLE 5  1919 non-null    object 
 11  COLLISION_ID                   305681 non-null  int64  
 

In [13]:
#Write the filtered 2018-2023 Queens and Brooklyn data to .csv for use as the main dataset
export_path = 'data/Motor_Vehicle_Collisions_QuBr_2018-2023.csv'
df.to_csv(export_path)