# NYC Motor Vehicle Collision Injury Prediction Pipeline

This notebook implements a reproducible data pipeline that predicts whether a NYC collision results in any injuries or fatalities.

**Research Question:** Given collision context (time, location, vehicle types, contributing factors), can we predict if a crash will cause at least one injury or fatality?

## Section 1: Setup and Configuration


In [8]:
# Core libraries
import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn - preprocessing and pipelines
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# Sklearn - models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Sklearn - evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
sns.set_style('whitegrid')

print("All imports successful!")


All imports successful!


In [9]:
# =============================================================================
# CONFIGURATION
# =============================================================================

# File paths
RAW_DATA_PATH = "NYPD Motor Vehicle Collisions Dec 3 2025.csv"
DB_PATH = "collisions.db"

# Train/test split configuration (time-based)
TRAIN_YEARS = list(range(2012, 2025))  # 2012-2024
TEST_YEAR = 2025

# Feature configuration
CATEGORICAL_FEATURES = [
    'borough',
    'hour',
    'day_of_week', 
    'is_weekend',
    'is_rush_hour',
    'vehicle_type_1',
    'contributing_factor_1'
]

NUMERIC_FEATURES = [
    'num_vehicles'
]

TARGET = 'severe'

# Vehicle type consolidation mapping (raw values → standardized categories)
VEHICLE_TYPE_MAP = {
    # Sedan/Passenger
    'Sedan': 'Sedan', '4 dr sedan': 'Sedan', '2 dr sedan': 'Sedan',
    'PASSENGER VEHICLE': 'Sedan', '3-Door': 'Sedan',
    # SUV/Station Wagon
    'Station Wagon/Sport Utility Vehicle': 'SUV', 
    'SPORT UTILITY / STATION WAGON': 'SUV',
    # Taxi
    'Taxi': 'Taxi', 'TAXI': 'Taxi', 'Livery Vehicle': 'Taxi',
    # Truck
    'Pick-up Truck': 'Truck', 'Box Truck': 'Truck', 
    'LARGE COM VEH(6 OR MORE TIRES)': 'Truck', 'Tractor Truck Diesel': 'Truck',
    'Flat Bed': 'Truck', 'Dump': 'Truck', 'Tow Truck / Wrecker': 'Truck',
    # Van
    'VAN': 'Van', 'Van': 'Van', 'AMBULANCE': 'Van',
    # Bus
    'Bus': 'Bus', 'BUS': 'Bus',
    # Motorcycle
    'Motorcycle': 'Motorcycle', 'MOTORCYCLE': 'Motorcycle', 'Motorbike': 'Motorcycle',
    # Bike
    'Bike': 'Bike', 'E-Bike': 'Bike', 'E-Scooter': 'Bike',
    # Other/Unknown
    'OTHER': 'Other', 'UNKNOWN': 'Unknown', 'Unknown': 'Unknown'
}

# Contributing factor consolidation mapping
CONTRIBUTING_FACTOR_MAP = {
    'Driver Inattention/Distraction': 'Distraction',
    'Failure to Yield Right-of-Way': 'Failure to Yield',
    'Following Too Closely': 'Following Too Closely',
    'Backing Unsafely': 'Improper Maneuver',
    'Passing or Lane Usage Improper': 'Improper Maneuver',
    'Passing Too Closely': 'Improper Maneuver',
    'Turning Improperly': 'Improper Maneuver',
    'Unsafe Lane Changing': 'Improper Maneuver',
    'Fatigued/Drowsy': 'Fatigue',
    'Traffic Control Disregarded': 'Traffic Violation',
    'Unsafe Speed': 'Speeding',
    'Alcohol Involvement': 'Alcohol/Drugs',
    'Drugs (illegal)': 'Alcohol/Drugs',
    'Driver Inexperience': 'Inexperience',
    'Unspecified': 'Unspecified',
    'Other Vehicular': 'Other'
}

print("Configuration loaded:")
print(f"  Raw data: {RAW_DATA_PATH}")
print(f"  Database: {DB_PATH}")
print(f"  Train years: {TRAIN_YEARS[0]}-{TRAIN_YEARS[-1]}")
print(f"  Test year: {TEST_YEAR}")


Configuration loaded:
  Raw data: NYPD Motor Vehicle Collisions Dec 3 2025.csv
  Database: collisions.db
  Train years: 2012-2024
  Test year: 2025


## Section 2: Data Loading and Cleaning

Load the raw CSV, parse dates, handle missing values, and create the target variable.


In [10]:
# Load raw data
df_raw = pd.read_csv(RAW_DATA_PATH, low_memory=False)

print(f"Loaded {len(df_raw):,} rows × {len(df_raw.columns)} columns")
print(f"\nColumns: {list(df_raw.columns)}")


Loaded 2,224,642 rows × 29 columns

Columns: ['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME', 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5', 'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5']


In [11]:
# =============================================================================
# DATA CLEANING
# =============================================================================

df = df_raw.copy()

# 1. Parse date and time
df['crash_datetime'] = pd.to_datetime(
    df['CRASH DATE'] + ' ' + df['CRASH TIME'], 
    format='%m/%d/%Y %H:%M'
)
df['year'] = df['crash_datetime'].dt.year

# 2. Handle missing values
# Borough: fill with "Unknown"
df['borough'] = df['BOROUGH'].fillna('Unknown')

# ZIP Code: fill with "Unknown"  
df['zip_code'] = df['ZIP CODE'].fillna('Unknown').astype(str)

# Coordinates: create flag for missing
df['coords_missing'] = df['LATITUDE'].isna() | df['LONGITUDE'].isna()
df['latitude'] = df['LATITUDE']
df['longitude'] = df['LONGITUDE']

# Contributing factors: fill with "Unknown"
for i in range(1, 6):
    col = f'CONTRIBUTING FACTOR VEHICLE {i}'
    df[f'contributing_factor_{i}'] = df[col].fillna('Unknown') if col in df.columns else 'Unknown'

# Vehicle types: fill with "Unknown"
for i in range(1, 6):
    col = f'VEHICLE TYPE CODE {i}'
    df[f'vehicle_type_{i}'] = df[col].fillna('Unknown') if col in df.columns else 'Unknown'

# 3. Create target variable: SEVERE = 1 if any injury or fatality
df['num_injured'] = df['NUMBER OF PERSONS INJURED'].fillna(0)
df['num_killed'] = df['NUMBER OF PERSONS KILLED'].fillna(0)
df['severe'] = ((df['num_injured'] + df['num_killed']) > 0).astype(int)

# 4. Keep collision ID
df['collision_id'] = df['COLLISION_ID']

print("Data cleaning complete!")
print(f"  Rows: {len(df):,}")
print(f"  Date range: {df['crash_datetime'].min().strftime('%Y-%m-%d')} to {df['crash_datetime'].max().strftime('%Y-%m-%d')}")


Data cleaning complete!
  Rows: 2,224,642
  Date range: 2012-07-01 to 2025-11-29


In [12]:
# =============================================================================
# VALIDATION CHECKS
# =============================================================================

print("=" * 60)
print("VALIDATION SUMMARY")
print("=" * 60)

# Target distribution
severe_counts = df['severe'].value_counts()
print(f"\n1. TARGET DISTRIBUTION:")
print(f"   No injury (0): {severe_counts[0]:,} ({severe_counts[0]/len(df)*100:.1f}%)")
print(f"   Injury/fatal (1): {severe_counts[1]:,} ({severe_counts[1]/len(df)*100:.1f}%)")

# Borough distribution
print(f"\n2. BOROUGH DISTRIBUTION:")
for borough, count in df['borough'].value_counts().items():
    print(f"   {borough}: {count:,} ({count/len(df)*100:.1f}%)")

# Year distribution  
print(f"\n3. YEAR DISTRIBUTION:")
year_counts = df['year'].value_counts().sort_index()
for year, count in year_counts.items():
    print(f"   {year}: {count:,}")

# Missing coordinates
missing_coords = df['coords_missing'].sum()
print(f"\n4. COORDINATES:")
print(f"   Missing: {missing_coords:,} ({missing_coords/len(df)*100:.1f}%)")

# Check for any remaining nulls in key columns
key_cols = ['collision_id', 'crash_datetime', 'borough', 'severe']
print(f"\n5. NULL CHECK (key columns):")
for col in key_cols:
    nulls = df[col].isna().sum()
    print(f"   {col}: {nulls} nulls")


VALIDATION SUMMARY

1. TARGET DISTRIBUTION:
   No injury (0): 1,682,159 (75.6%)
   Injury/fatal (1): 542,483 (24.4%)

2. BOROUGH DISTRIBUTION:
   Unknown: 681,099 (30.6%)
   BROOKLYN: 494,784 (22.2%)
   QUEENS: 413,609 (18.6%)
   MANHATTAN: 341,956 (15.4%)
   BRONX: 228,562 (10.3%)
   STATEN ISLAND: 64,632 (2.9%)

3. YEAR DISTRIBUTION:
   2012: 100,545
   2013: 203,742
   2014: 206,046
   2015: 217,708
   2016: 229,831
   2017: 231,007
   2018: 231,564
   2019: 211,486
   2020: 112,917
   2021: 110,557
   2022: 103,887
   2023: 96,607
   2024: 91,314
   2025: 77,431

4. COORDINATES:
   Missing: 240,389 (10.8%)

5. NULL CHECK (key columns):
   collision_id: 0 nulls
   crash_datetime: 0 nulls
   borough: 0 nulls
   severe: 0 nulls


In [13]:
# =============================================================================
# COORDINATE IMPUTATION (K-MEANS)
# =============================================================================

def impute_coordinates_kmeans(df, min_cluster_size=1500):
    """
    Impute missing coordinates using K-means clustering per borough.
    
    For each borough:
    - Fit K-means on rows with valid coordinates
    - Assign borough centroid to rows with missing coordinates
    
    Args:
        df: DataFrame with 'borough', 'latitude', 'longitude', 'coords_missing' columns
        min_cluster_size: Approximate rows per cluster (k = n_rows / min_cluster_size)
    
    Returns:
        DataFrame with imputed coordinates and 'coords_imputed' flag
    """
    df = df.copy()
    df['coords_imputed'] = False
    
    # Get boroughs with valid coordinates (exclude "Unknown")
    valid_boroughs = df[(df['coords_missing'] == False) & (df['borough'] != 'Unknown')]['borough'].unique()
    
    print("Coordinate Imputation by Borough:")
    print("-" * 50)
    
    for borough in valid_boroughs:
        # Masks for this borough
        mask_valid = (df['borough'] == borough) & (df['coords_missing'] == False)
        mask_missing = (df['borough'] == borough) & (df['coords_missing'] == True)
        
        n_valid = mask_valid.sum()
        n_missing = mask_missing.sum()
        
        if n_valid < 10 or n_missing == 0:
            continue
        
        # Determine number of clusters
        n_clusters = max(5, n_valid // min_cluster_size)
        
        # Fit K-means on valid coordinates
        valid_coords = df.loc[mask_valid, ['latitude', 'longitude']].values
        km = KMeans(n_clusters=n_clusters, random_state=42, n_init=5)
        km.fit(valid_coords)
        
        # Compute borough centroid (mean of all valid coords)
        borough_centroid = valid_coords.mean(axis=0)
        
        # Impute missing coords with borough centroid
        df.loc[mask_missing, 'latitude'] = borough_centroid[0]
        df.loc[mask_missing, 'longitude'] = borough_centroid[1]
        df.loc[mask_missing, 'coords_imputed'] = True
        df.loc[mask_missing, 'coords_missing'] = False
        
        print(f"  {borough:15} | Valid: {n_valid:>7,} | Imputed: {n_missing:>6,} | k={n_clusters}")
    
    return df

# Apply imputation
df = impute_coordinates_kmeans(df)

# Summary
print("\n" + "=" * 50)
print("IMPUTATION SUMMARY")
print("=" * 50)
total_imputed = df['coords_imputed'].sum()
still_missing = df['coords_missing'].sum()
print(f"  Coordinates imputed: {total_imputed:,}")
print(f"  Still missing (Unknown borough): {still_missing:,}")


Coordinate Imputation by Borough:
--------------------------------------------------


python(80246) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


  BROOKLYN        | Valid: 484,389 | Imputed: 10,395 | k=322
  BRONX           | Valid: 221,605 | Imputed:  6,957 | k=147
  MANHATTAN       | Valid: 331,875 | Imputed: 10,081 | k=221
  QUEENS          | Valid: 404,914 | Imputed:  8,695 | k=269
  STATEN ISLAND   | Valid:  62,987 | Imputed:  1,645 | k=41

IMPUTATION SUMMARY
  Coordinates imputed: 37,773
  Still missing (Unknown borough): 202,616


In [15]:
# =============================================================================
# SELECT COLUMNS FOR CLEANED DATASET
# =============================================================================

# Define columns to keep
columns_to_keep = [
    # Identifiers
    'collision_id',
    'crash_datetime',
    'year',
    
    # Location
    'borough',
    'zip_code',
    'latitude',
    'longitude',
    'coords_missing',
    'coords_imputed',
    
    # Injury counts (for reference)
    'num_injured',
    'num_killed',
    
    # Target
    'severe',
    
    # Vehicle types (1-5)
    'vehicle_type_1', 'vehicle_type_2', 'vehicle_type_3', 
    'vehicle_type_4', 'vehicle_type_5',
    
    # Contributing factors (1-5)
    'contributing_factor_1', 'contributing_factor_2', 'contributing_factor_3',
    'contributing_factor_4', 'contributing_factor_5'
]

df_clean = df[columns_to_keep].copy()

print(f"Cleaned dataset: {len(df_clean):,} rows × {len(df_clean.columns)} columns")
print(f"\nColumns: {list(df_clean.columns)}")
df_clean.head()


Cleaned dataset: 2,224,642 rows × 22 columns

Columns: ['collision_id', 'crash_datetime', 'year', 'borough', 'zip_code', 'latitude', 'longitude', 'coords_missing', 'coords_imputed', 'num_injured', 'num_killed', 'severe', 'vehicle_type_1', 'vehicle_type_2', 'vehicle_type_3', 'vehicle_type_4', 'vehicle_type_5', 'contributing_factor_1', 'contributing_factor_2', 'contributing_factor_3', 'contributing_factor_4', 'contributing_factor_5']


Unnamed: 0,collision_id,crash_datetime,year,borough,zip_code,latitude,longitude,coords_missing,coords_imputed,num_injured,num_killed,severe,vehicle_type_1,vehicle_type_2,vehicle_type_3,vehicle_type_4,vehicle_type_5,contributing_factor_1,contributing_factor_2,contributing_factor_3,contributing_factor_4,contributing_factor_5
0,4455765,2021-09-11 02:39:00,2021,Unknown,Unknown,,,True,False,2.0,0.0,1,Sedan,Sedan,Unknown,Unknown,Unknown,Aggressive Driving/Road Rage,Unspecified,Unknown,Unknown,Unknown
1,4513547,2022-03-26 11:45:00,2022,Unknown,Unknown,,,True,False,1.0,0.0,1,Sedan,Unknown,Unknown,Unknown,Unknown,Pavement Slippery,Unknown,Unknown,Unknown,Unknown
2,4675373,2023-11-01 01:29:00,2023,BROOKLYN,11230,40.62179,-73.970024,False,False,1.0,0.0,1,Moped,Sedan,Sedan,Unknown,Unknown,Unspecified,Unspecified,Unspecified,Unknown,Unknown
3,4541903,2022-06-29 06:55:00,2022,Unknown,Unknown,,,True,False,0.0,0.0,0,Sedan,Pick-up Truck,Unknown,Unknown,Unknown,Following Too Closely,Unspecified,Unknown,Unknown,Unknown
4,4566131,2022-09-21 13:21:00,2022,Unknown,Unknown,,,True,False,0.0,0.0,0,Station Wagon/Sport Utility Vehicle,Unknown,Unknown,Unknown,Unknown,Passing Too Closely,Unspecified,Unknown,Unknown,Unknown
