# NYC Motor Vehicle Collisions - Crashes

The purpose of this notebook is to clean and prepare the NYC collision data so that it can eventually be merged with the CitiBike data.

## 1. Imports and data loading

In [1]:
import numpy as np
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from functools import lru_cache

# Make variables from a local .env file visible to os.getenv
# (the Geoclient API key is read from the environment further down)
load_dotenv()

# low_memory=False parses the file in a single pass, so every column gets one
# consistent dtype instead of chunk-by-chunk guesses
collisions_csv = "../data/Motor_Vehicle_Collisions_-_Crashes_20251117.csv"
df = pd.read_csv(collisions_csv, low_memory=False)

## 2. Overview

In [2]:
# Column dtypes and non-null counts. DataFrame.info() prints its report itself
# and returns None, so wrapping it in print() would emit a stray "None".
df.info()
# Share of missing values in each column
print(df.isnull().mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2220334 entries, 0 to 2220333
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH DATE                     object 
 1   CRASH TIME                     object 
 2   BOROUGH                        object 
 3   ZIP CODE                       object 
 4   LATITUDE                       float64
 5   LONGITUDE                      float64
 6   LOCATION                       object 
 7   ON STREET NAME                 object 
 8   CROSS STREET NAME              object 
 9   OFF STREET NAME                object 
 10  NUMBER OF PERSONS INJURED      float64
 11  NUMBER OF PERSONS KILLED       float64
 12  NUMBER OF PEDESTRIANS INJURED  int64  
 13  NUMBER OF PEDESTRIANS KILLED   int64  
 14  NUMBER OF CYCLIST INJURED      int64  
 15  NUMBER OF CYCLIST KILLED       int64  
 16  NUMBER OF MOTORIST INJURED     int64  
 17  NUMBER OF MOTORIST KILLED      int64  
 18  CO

In [3]:
# Peek at the first five records
df.head(n=5)

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,11/01/2023,1:29,BROOKLYN,11230.0,40.62179,-73.970024,"(40.62179, -73.970024)",OCEAN PARKWAY,AVENUE K,,...,Unspecified,Unspecified,,,4675373,Moped,Sedan,Sedan,,
3,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
4,09/21/2022,13:21,,,,,,BROOKLYN BRIDGE,,,...,Unspecified,,,,4566131,Station Wagon/Sport Utility Vehicle,,,,


In [4]:
# In line with our CitiBike data, we focus on crashes from 2023 onwards.
# The export stores dates as MM/DD/YYYY; an explicit format keeps parsing
# unambiguous and fast, and anything that does not match becomes NaT.
df["CRASH DATE"] = pd.to_datetime(df["CRASH DATE"], format="%m/%d/%Y", errors="coerce")
# NaT never satisfies the comparison, so unparseable dates are dropped here too.
# .copy() detaches the subset from the full frame so that columns can be added
# to it later without triggering SettingWithCopyWarning.
df = df[df["CRASH DATE"] >= "2023-01-01"].copy()

## Subsetting the data to accidents involving cyclists

We have direct information on (i) whether a cyclist was killed, and (ii) whether a cyclist was injured. However, we also care about accidents involving cyclists in which they were neither killed nor injured. To identify these, we need to inspect the columns VEHICLE TYPE CODE 1 through VEHICLE TYPE CODE 5.

In [5]:
# Number of distinct labels for the first vehicle, ignoring letter case
first_vehicle_labels = df["VEHICLE TYPE CODE 1"].astype("string")
first_vehicle_labels.str.lower().nunique()

520

In [6]:
# Which first-vehicle labels contain the fragment "bik" (case-insensitive)?
code1_lower = df["VEHICLE TYPE CODE 1"].astype("string").str.lower()
mask = code1_lower.str.contains("bik", na=False)

# Show the labels as originally spelled so the variants are visible
unique_bike_types = df.loc[mask, "VEHICLE TYPE CODE 1"].unique()
print(unique_bike_types)


['Bike' 'E-Bike' 'Motorbike' 'E-bike' 'Minibike' 'PEDAL BIKE' 'Ebike'
 'Citi bike' 'E-BIKE' 'Dirt Bike' 'E bike']


In [7]:
# Which first-vehicle labels contain the fragment "cyc" (case-insensitive)?
code1_lower = df["VEHICLE TYPE CODE 1"].astype("string").str.lower()
mask = code1_lower.str.contains("cyc", na=False)

# Show the labels as originally spelled so the variants are visible
unique_bike_types = df.loc[mask, "VEHICLE TYPE CODE 1"].unique()
print(unique_bike_types)


['Motorcycle' 'Minicycle' 'E MOTORCYC' 'Bicycle' 'UNICYCLE' 'Quadricycl']


In [8]:
# The five columns that can name a vehicle taking part in a crash
veh_cols = [f"VEHICLE TYPE CODE {n}" for n in range(1, 6)]

# Accumulator: nothing is flagged until one of the columns matches
cyclist_mask = pd.Series(False, index=df.index)

for col in veh_cols:
    labels = df[col].astype("string").str.lower()

    # Raw fragment hits; missing labels count as "no match"
    has_bik = labels.str.contains("bik", na=False)
    has_cyc = labels.str.contains("cyc", na=False)

    # Labels that also carry these words (e.g. "Motorbike", "Dirt Bike",
    # "Motorcycle", "Quadricycl") are excluded from the respective fragment
    excluded_bik = labels.str.contains("motor|dirt", na=False)
    excluded_cyc = labels.str.contains("motor|quad", na=False)

    cyclist_mask |= (has_bik & ~excluded_bik) | (has_cyc & ~excluded_cyc)

# 1 when at least one of the five vehicle codes matched, otherwise 0
df["cyclist_involved"] = cyclist_mask.astype(int)


In [9]:
# Spot-check the vehicle labels of the first few flagged crashes
flagged = df["cyclist_involved"].eq(1)
df.loc[flagged, veh_cols].head()


Unnamed: 0,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1925,Bike,E-Bike,,,
2592,Bike,Sedan,,,
2661,Bike,,,,
2689,Bike,Pick-up Truck,,,
2708,Bike,,,,


In [10]:
# Keep only when/where fields, the cyclist flag and the casualty counts
keep_cols = [
    "CRASH DATE", "CRASH TIME", "BOROUGH", "ZIP CODE",
    "ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME",
    "LATITUDE", "LONGITUDE", "cyclist_involved",
    "NUMBER OF PERSONS INJURED", "NUMBER OF PERSONS KILLED",
    "NUMBER OF CYCLIST INJURED", "NUMBER OF CYCLIST KILLED",
]
df = df[keep_cols]

In [11]:
# Crashes flagged via the vehicle codes in which no cyclist was injured or killed
cyclist_unharmed = (
    df["NUMBER OF CYCLIST INJURED"].eq(0) & df["NUMBER OF CYCLIST KILLED"].eq(0)
)
len(df[df["cyclist_involved"].eq(1) & cyclist_unharmed])

6507

In [12]:
# Fraction of missing values per remaining column
df.isna().mean()

CRASH DATE                   0.000000
CRASH TIME                   0.000000
BOROUGH                      0.274231
ZIP CODE                     0.274381
ON STREET NAME               0.287961
CROSS STREET NAME            0.473962
OFF STREET NAME              0.712047
LATITUDE                     0.058185
LONGITUDE                    0.058185
cyclist_involved             0.000000
NUMBER OF PERSONS INJURED    0.000000
NUMBER OF PERSONS KILLED     0.000000
NUMBER OF CYCLIST INJURED    0.000000
NUMBER OF CYCLIST KILLED     0.000000
dtype: float64

In [None]:
# NOTE(review): this only references the bound method; without parentheses
# nothing is called and no rows are dropped. If rows should be removed here,
# call it explicitly and name the columns to check — TODO confirm intent.
df.dropna

In [63]:
GEOCLIENT_KEY = os.getenv("GEOCLIENT_KEY")
URL = "https://api.nyc.gov/geoclient/v2/search"


@lru_cache(maxsize=50_000)
def _geocode_normalized(query_norm: str):
    """Look up one already-normalized query via the Geoclient search endpoint.

    Returns ``(lat, lon)`` as floats, or ``(None, None)`` when the service
    answered with HTTP 200 but offered no usable coordinates. Raises on
    transport errors and non-200 responses so that ``lru_cache`` does not
    memoize failures that may succeed on a later attempt.
    """
    headers = {"Ocp-Apim-Subscription-Key": GEOCLIENT_KEY}
    params = {"input": query_norm}

    r = requests.get(URL, headers=headers, params=params, timeout=8)
    if r.status_code != 200:
        raise requests.HTTPError(f"Geoclient returned HTTP {r.status_code}", response=r)

    data = r.json()
    results = data.get("results") or []
    if not results:
        return None, None

    # Only the first candidate returned by the service is considered
    resp = results[0].get("response", {})
    lat = resp.get("latitude")
    lon = resp.get("longitude")

    if lat is None or lon is None:
        return None, None
    return float(lat), float(lon)


def geocode(query: str):
    """Geocode a free-text address query.

    Parameters
    ----------
    query : str
        Address text. It is stripped and lower-cased before the lookup, so
        queries differing only in case or surrounding whitespace share one
        cache entry.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` as floats, or ``(None, None)`` when the
        query is missing/blank, the request fails, or no coordinates are found.
    """
    # Callers may hand over None (no query could be built) or NaN
    if not isinstance(query, str):
        return None, None

    query_norm = query.strip().lower()
    if not query_norm:
        return None, None

    try:
        return _geocode_normalized(query_norm)
    except (requests.RequestException, ValueError):
        # Timeouts, connection problems, HTTP errors and unparseable payloads
        # are reported as "not found" instead of aborting a long batch run
        return None, None


# Keep the cache controls reachable under the public name
geocode.cache_info = _geocode_normalized.cache_info
geocode.cache_clear = _geocode_normalized.cache_clear


In [64]:
def build_query(row):
    """Build a geocoding query string from a crash record.

    Parameters
    ----------
    row : mapping with ``.get``
        Expected to offer "ON STREET NAME", "CROSS STREET NAME" and "BOROUGH".

    Returns
    -------
    str or None
        "<on> & <cross>, <borough>, NY" when all three parts are present,
        "<on>, <borough>, NY" when only the cross street is missing,
        otherwise None.
    """
    def _clean(value):
        # NaN is truthy, so `value or ""` would let it through and str() would
        # turn it into the literal text "nan"; treat it as missing instead
        if pd.isna(value) or not value:
            return ""
        return str(value).strip()

    on = _clean(row.get("ON STREET NAME"))
    cross = _clean(row.get("CROSS STREET NAME"))
    bor = _clean(row.get("BOROUGH"))

    if on and cross and bor:
        return f"{on} & {cross}, {bor}, NY"
    if on and bor:
        return f"{on}, {bor}, NY"

    return None

In [None]:
# Rows lacking either coordinate are candidates for geocoding. The mask is
# recomputed on every run, so re-running after an interruption only revisits
# rows that are still incomplete.
mask = df["LATITUDE"].isna() | df["LONGITUDE"].isna()

for idx, row in df[mask].iterrows():
    q = build_query(row)
    # Not enough address information to form a query: leave the row untouched
    if q is None:
        continue

    lat, lon = geocode(q)
    # A failed lookup must not wipe a coordinate that is already present
    if lat is None or lon is None:
        continue

    df.at[idx, "LATITUDE"] = lat
    df.at[idx, "LONGITUDE"] = lon

KeyboardInterrupt: 

In [66]:
# Absolute number of missing values per column after the geocoding pass
df.isna().sum()

CRASH DATE                        0
CRASH TIME                        0
BOROUGH                       71587
ZIP CODE                      71626
ON STREET NAME                75171
CROSS STREET NAME            123726
OFF STREET NAME              185877
LATITUDE                      14705
LONGITUDE                     15189
cyclist_involved                  0
NUMBER OF PERSONS INJURED         0
NUMBER OF PERSONS KILLED          0
NUMBER OF CYCLIST INJURED         0
NUMBER OF CYCLIST KILLED          0
LONGTITUDE                   260562
dtype: int64