In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
sns.set_style('whitegrid')

In [4]:
# Load the dataset
df = pd.read_csv('NYPD Motor Vehicle Collisions Dec 3 2025.csv')

# Basic shape of the dataset
print(f"Dataset Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")


  df = pd.read_csv('NYPD Motor Vehicle Collisions Dec 3 2025.csv')


Dataset Shape: 2,224,642 rows × 29 columns


In [5]:
# Preview the first few rows
df.head()


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,2.0,0.0,0,0,0,0,2,0,Aggressive Driving/Road Rage,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,1.0,0.0,0,0,0,0,1,0,Pavement Slippery,,,,,4513547,Sedan,,,,
2,11/01/2023,1:29,BROOKLYN,11230.0,40.62179,-73.970024,"(40.62179, -73.970024)",OCEAN PARKWAY,AVENUE K,,1.0,0.0,0,0,0,0,1,0,Unspecified,Unspecified,Unspecified,,,4675373,Moped,Sedan,Sedan,,
3,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,0.0,0.0,0,0,0,0,0,0,Following Too Closely,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
4,09/21/2022,13:21,,,,,,BROOKLYN BRIDGE,,,0.0,0.0,0,0,0,0,0,0,Passing Too Closely,Unspecified,,,,4566131,Station Wagon/Sport Utility Vehicle,,,,


In [6]:
# Data types and non-null counts
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2224642 entries, 0 to 2224641
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH DATE                     object 
 1   CRASH TIME                     object 
 2   BOROUGH                        object 
 3   ZIP CODE                       object 
 4   LATITUDE                       float64
 5   LONGITUDE                      float64
 6   LOCATION                       object 
 7   ON STREET NAME                 object 
 8   CROSS STREET NAME              object 
 9   OFF STREET NAME                object 
 10  NUMBER OF PERSONS INJURED      float64
 11  NUMBER OF PERSONS KILLED       float64
 12  NUMBER OF PEDESTRIANS INJURED  int64  
 13  NUMBER OF PEDESTRIANS KILLED   int64  
 14  NUMBER OF CYCLIST INJURED      int64  
 15  NUMBER OF CYCLIST KILLED       int64  
 16  NUMBER OF MOTORIST INJURED     int64  
 17  NUMBER OF MOTORIST KILLED      int64  
 18  CO

In [7]:
# Statistical summary of numerical columns
df.describe()


Unnamed: 0,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,COLLISION_ID
count,1984253.0,1984253.0,2224624.0,2224611.0,2224642.0,2224642.0,2224642.0,2224642.0,2224642.0,2224642.0,2224642.0
mean,40.58819,-73.68026,0.3287207,0.001584097,0.05973096,0.0007848454,0.0292501,0.0001258629,0.2350738,0.0006436991,3268755.0
std,2.34806,4.363026,0.7152012,0.04204389,0.2503188,0.02863897,0.1707882,0.01125816,0.6763199,0.02774972,1509731.0
min,0.0,-201.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0
25%,40.66733,-73.97453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3192281.0
50%,40.72031,-73.92672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3748618.0
75%,40.76956,-73.86656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4304991.0
max,43.34444,0.0,43.0,8.0,27.0,6.0,4.0,2.0,43.0,5.0,4861470.0


In [8]:
# Missing values analysis
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
}).sort_values('Missing %', ascending=False)

print("Missing Values Summary:")
missing_df[missing_df['Missing Count'] > 0]


Missing Values Summary:


Unnamed: 0,Missing Count,Missing %
VEHICLE TYPE CODE 5,2214884,99.56
CONTRIBUTING FACTOR VEHICLE 5,2214569,99.55
VEHICLE TYPE CODE 4,2189211,98.41
CONTRIBUTING FACTOR VEHICLE 4,2187879,98.35
VEHICLE TYPE CODE 3,2069912,93.04
CONTRIBUTING FACTOR VEHICLE 3,2063645,92.76
OFF STREET NAME,1832041,82.35
CROSS STREET NAME,850175,38.22
ZIP CODE,681376,30.63
BOROUGH,681099,30.62


In [10]:
# Unique values per column
print("Unique values per column:")
for col in df.columns:
    print(f"  {col}: {df[col].nunique():,} unique values")


Unique values per column:
  CRASH DATE: 4,900 unique values
  CRASH TIME: 1,440 unique values
  BOROUGH: 5 unique values
  ZIP CODE: 435 unique values
  LATITUDE: 129,787 unique values
  LONGITUDE: 100,724 unique values
  LOCATION: 384,029 unique values
  ON STREET NAME: 22,779 unique values
  CROSS STREET NAME: 24,890 unique values
  OFF STREET NAME: 260,425 unique values
  NUMBER OF PERSONS INJURED: 32 unique values
  NUMBER OF PERSONS KILLED: 7 unique values
  NUMBER OF PEDESTRIANS INJURED: 14 unique values
  NUMBER OF PEDESTRIANS KILLED: 6 unique values
  NUMBER OF CYCLIST INJURED: 5 unique values
  NUMBER OF CYCLIST KILLED: 3 unique values
  NUMBER OF MOTORIST INJURED: 31 unique values
  NUMBER OF MOTORIST KILLED: 6 unique values
  CONTRIBUTING FACTOR VEHICLE 1: 61 unique values
  CONTRIBUTING FACTOR VEHICLE 2: 61 unique values
  CONTRIBUTING FACTOR VEHICLE 3: 53 unique values
  CONTRIBUTING FACTOR VEHICLE 4: 43 unique values
  CONTRIBUTING FACTOR VEHICLE 5: 34 unique values
  COL

## Quick Checks for Implementation Planning


In [11]:
# 1. Target Variable Analysis: Prevalence of injuries/fatalities
df['SEVERE'] = ((df['NUMBER OF PERSONS INJURED'].fillna(0) + df['NUMBER OF PERSONS KILLED'].fillna(0)) > 0).astype(int)

severe_counts = df['SEVERE'].value_counts()
severe_pct = df['SEVERE'].value_counts(normalize=True) * 100

print("Target Variable Distribution (SEVERE = injury or fatality):")
print(f"  No injury/fatality (0): {severe_counts[0]:,} ({severe_pct[0]:.2f}%)")
print(f"  Injury or fatality (1): {severe_counts[1]:,} ({severe_pct[1]:.2f}%)")
print(f"\nClass imbalance ratio: {severe_counts[0] / severe_counts[1]:.2f}:1")


Target Variable Distribution (SEVERE = injury or fatality):
  No injury/fatality (0): 1,682,159 (75.61%)
  Injury or fatality (1): 542,483 (24.39%)

Class imbalance ratio: 3.10:1


In [12]:
# 2. Date Range Analysis
df['CRASH_DATE_PARSED'] = pd.to_datetime(df['CRASH DATE'], format='%m/%d/%Y')
df['YEAR'] = df['CRASH_DATE_PARSED'].dt.year

print("Date Range:")
print(f"  Earliest: {df['CRASH_DATE_PARSED'].min().strftime('%Y-%m-%d')}")
print(f"  Latest:   {df['CRASH_DATE_PARSED'].max().strftime('%Y-%m-%d')}")

print("\nCollisions by Year:")
year_counts = df['YEAR'].value_counts().sort_index()
for year, count in year_counts.items():
    print(f"  {year}: {count:,}")


Date Range:
  Earliest: 2012-07-01
  Latest:   2025-11-29

Collisions by Year:
  2012: 100,545
  2013: 203,742
  2014: 206,046
  2015: 217,708
  2016: 229,831
  2017: 231,007
  2018: 231,564
  2019: 211,486
  2020: 112,917
  2021: 110,557
  2022: 103,887
  2023: 96,607
  2024: 91,314
  2025: 77,431


In [13]:
# 3. Top Contributing Factors (Vehicle 1 - most populated)
print("Top 15 Contributing Factors (Vehicle 1):")
factor_counts = df['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()
for i, (factor, count) in enumerate(factor_counts.head(15).items(), 1):
    pct = count / len(df) * 100
    print(f"  {i:2}. {factor}: {count:,} ({pct:.2f}%)")


Top 15 Contributing Factors (Vehicle 1):
   1. Unspecified: 744,354 (33.46%)
   2. Driver Inattention/Distraction: 451,764 (20.31%)
   3. Failure to Yield Right-of-Way: 133,292 (5.99%)
   4. Following Too Closely: 119,471 (5.37%)
   5. Backing Unsafely: 81,174 (3.65%)
   6. Other Vehicular: 69,554 (3.13%)
   7. Passing or Lane Usage Improper: 63,878 (2.87%)
   8. Passing Too Closely: 56,537 (2.54%)
   9. Turning Improperly: 54,731 (2.46%)
  10. Fatigued/Drowsy: 47,559 (2.14%)
  11. Unsafe Lane Changing: 43,859 (1.97%)
  12. Traffic Control Disregarded: 40,914 (1.84%)
  13. Driver Inexperience: 35,852 (1.61%)
  14. Unsafe Speed: 34,323 (1.54%)
  15. Alcohol Involvement: 25,592 (1.15%)


In [14]:
# 4. Top Vehicle Types (Vehicle 1)
print("Top 15 Vehicle Types (Vehicle 1):")
vehicle_counts = df['VEHICLE TYPE CODE 1'].value_counts()
for i, (vtype, count) in enumerate(vehicle_counts.head(15).items(), 1):
    pct = count / len(df) * 100
    print(f"  {i:2}. {vtype}: {count:,} ({pct:.2f}%)")

print(f"\nNote: {len(vehicle_counts)} unique vehicle types - will need consolidation")


Top 15 Vehicle Types (Vehicle 1):
   1. Sedan: 643,745 (28.94%)
   2. Station Wagon/Sport Utility Vehicle: 504,073 (22.66%)
   3. PASSENGER VEHICLE: 416,206 (18.71%)
   4. SPORT UTILITY / STATION WAGON: 180,291 (8.10%)
   5. Taxi: 55,968 (2.52%)
   6. 4 dr sedan: 40,187 (1.81%)
   7. Pick-up Truck: 38,341 (1.72%)
   8. TAXI: 31,911 (1.43%)
   9. Box Truck: 26,608 (1.20%)
  10. VAN: 25,266 (1.14%)
  11. Bus: 24,374 (1.10%)
  12. OTHER: 22,968 (1.03%)
  13. UNKNOWN: 19,940 (0.90%)
  14. Bike: 18,172 (0.82%)
  15. LARGE COM VEH(6 OR MORE TIRES): 14,397 (0.65%)

Note: 1842 unique vehicle types - will need consolidation


In [15]:
# 5. Borough Distribution
print("Borough Distribution:")
borough_counts = df['BOROUGH'].value_counts(dropna=False)
for borough, count in borough_counts.items():
    pct = count / len(df) * 100
    borough_name = borough if pd.notna(borough) else "Missing/Unknown"
    print(f"  {borough_name}: {count:,} ({pct:.2f}%)")


Borough Distribution:
  Missing/Unknown: 681,099 (30.62%)
  BROOKLYN: 494,784 (22.24%)
  QUEENS: 413,609 (18.59%)
  MANHATTAN: 341,956 (15.37%)
  BRONX: 228,562 (10.27%)
  STATEN ISLAND: 64,632 (2.91%)


In [16]:
# 6. Summary Statistics for Implementation Planning
print("=" * 60)
print("SUMMARY FOR IMPLEMENTATION")
print("=" * 60)
print(f"\n1. TARGET: ~{severe_pct[1]:.0f}% of crashes result in injury/fatality")
print(f"   → Class imbalance is {'moderate' if severe_pct[1] > 20 else 'significant'}, consider class weights")

print(f"\n2. DATE RANGE: {df['YEAR'].min()} - {df['YEAR'].max()}")
print(f"   → Suggested train/test split: Train on {df['YEAR'].min()}-{df['YEAR'].max()-1}, Test on {df['YEAR'].max()}")

print(f"\n3. VEHICLE TYPES: {len(vehicle_counts)} unique values")
print(f"   → Top 10 cover {vehicle_counts.head(10).sum() / len(df) * 100:.1f}% - consolidate rest to 'Other'")

print(f"\n4. CONTRIBUTING FACTORS: {len(factor_counts)} unique values")
print(f"   → 'Unspecified' is dominant ({factor_counts.iloc[0] / len(df) * 100:.1f}%) - may have limited predictive value")

print(f"\n5. BOROUGH: ~{borough_counts.isna().sum() / len(df) * 100 if borough_counts.index.isna().any() else (df['BOROUGH'].isna().sum() / len(df) * 100):.0f}% missing")
print(f"   → Create 'Unknown' category for missing values")


SUMMARY FOR IMPLEMENTATION

1. TARGET: ~24% of crashes result in injury/fatality
   → Class imbalance is moderate, consider class weights

2. DATE RANGE: 2012 - 2025
   → Suggested train/test split: Train on 2012-2024, Test on 2025

3. VEHICLE TYPES: 1842 unique values
   → Top 10 cover 88.2% - consolidate rest to 'Other'

4. CONTRIBUTING FACTORS: 61 unique values
   → 'Unspecified' is dominant (33.5%) - may have limited predictive value

5. BOROUGH: ~0% missing
   → Create 'Unknown' category for missing values
