# NYC Motor Vehicle Collision Injury Prediction Pipeline

This notebook implements a reproducible data pipeline that predicts whether a NYC collision results in any injuries or fatalities.

**Research Question:** Given collision context (time, location, vehicle types, contributing factors), can we predict if a crash will cause at least one injury or fatality?

## Section 1: Setup and Configuration


In [1]:
# Core libraries
import pandas as pd
import numpy as np
import sqlite3
from pathlib import Path

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn - preprocessing and pipelines
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Sklearn - models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Sklearn - evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)

# Display settings: lift pandas' column-count and width limits (None = no
# limit) so wide frames are shown in full, then pick the seaborn grid style.
for _display_option in ('display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
del _display_option  # keep the notebook namespace free of loop leftovers

sns.set_style('whitegrid')

# Marker that the whole import/setup cell ran to completion.
print("All imports successful!")


All imports successful!


In [2]:
# -----------------------------------------------------------------------------
# CONFIGURATION
# Every tunable constant of the notebook is defined in this cell.
# -----------------------------------------------------------------------------

# --- File paths ---------------------------------------------------------------
RAW_DATA_PATH = "NYPD Motor Vehicle Collisions Dec 3 2025.csv"
DB_PATH = "collisions.db"

# --- Time-based train/test split ----------------------------------------------
TRAIN_YEARS = [*range(2012, 2025)]  # 2012 through 2024 inclusive
TEST_YEAR = 2025

# --- Feature configuration ----------------------------------------------------
# Features listed as categorical (note that 'hour' is listed here, not as numeric).
CATEGORICAL_FEATURES = [
    'borough', 'hour', 'day_of_week', 'is_weekend', 'is_rush_hour',
    'vehicle_type_1', 'contributing_factor_1',
]

# Features listed as numeric.
NUMERIC_FEATURES = ['num_vehicles']

# Name of the target.
TARGET = 'severe'

# --- Vehicle type consolidation (raw values → standardized categories) --------
# Written as category -> raw labels and inverted into the raw -> category
# lookup. Keys are case-sensitive; both legacy upper-case and newer mixed-case
# spellings are listed explicitly. Insertion order matches the grouping below.
VEHICLE_TYPE_MAP = {
    raw_label: category
    for category, raw_labels in {
        'Sedan': (
            'Sedan', '4 dr sedan', '2 dr sedan', 'PASSENGER VEHICLE', '3-Door',
        ),
        'SUV': (
            'Station Wagon/Sport Utility Vehicle',
            'SPORT UTILITY / STATION WAGON',
        ),
        'Taxi': ('Taxi', 'TAXI', 'Livery Vehicle'),
        'Truck': (
            'Pick-up Truck', 'Box Truck', 'LARGE COM VEH(6 OR MORE TIRES)',
            'Tractor Truck Diesel', 'Flat Bed', 'Dump', 'Tow Truck / Wrecker',
        ),
        'Van': ('VAN', 'Van', 'AMBULANCE'),
        'Bus': ('Bus', 'BUS'),
        'Motorcycle': ('Motorcycle', 'MOTORCYCLE', 'Motorbike'),
        'Bike': ('Bike', 'E-Bike', 'E-Scooter'),
        'Other': ('OTHER',),
        'Unknown': ('UNKNOWN', 'Unknown'),
    }.items()
    for raw_label in raw_labels
}

# --- Contributing factor consolidation (raw values → standardized categories) -
# Same construction as the vehicle map: grouped by target category, then
# flattened into a raw -> category lookup.
CONTRIBUTING_FACTOR_MAP = {
    raw_label: category
    for category, raw_labels in {
        'Distraction': ('Driver Inattention/Distraction',),
        'Failure to Yield': ('Failure to Yield Right-of-Way',),
        'Following Too Closely': ('Following Too Closely',),
        'Improper Maneuver': (
            'Backing Unsafely', 'Passing or Lane Usage Improper',
            'Passing Too Closely', 'Turning Improperly', 'Unsafe Lane Changing',
        ),
        'Fatigue': ('Fatigued/Drowsy',),
        'Traffic Violation': ('Traffic Control Disregarded',),
        'Speeding': ('Unsafe Speed',),
        'Alcohol/Drugs': ('Alcohol Involvement', 'Drugs (illegal)'),
        'Inexperience': ('Driver Inexperience',),
        'Unspecified': ('Unspecified',),
        'Other': ('Other Vehicular',),
    }.items()
    for raw_label in raw_labels
}

# --- Summary of the active configuration --------------------------------------
print(
    "Configuration loaded:",
    f"  Raw data: {RAW_DATA_PATH}",
    f"  Database: {DB_PATH}",
    f"  Train years: {TRAIN_YEARS[0]}-{TRAIN_YEARS[-1]}",
    f"  Test year: {TEST_YEAR}",
    sep="\n",
)


Configuration loaded:
  Raw data: NYPD Motor Vehicle Collisions Dec 3 2025.csv
  Database: collisions.db
  Train years: 2012-2024
  Test year: 2025
