# NYC Traffic Crashes Analysis and Data Preparation

This notebook performs a comprehensive analysis of NYC traffic crash data and prepares it for visualization in a web application. The analysis includes data cleaning, integration with the Person dataset, and exploratory analysis of crash patterns across New York City.

# 1. Data Loading and Initial Inspection

First, we'll import the required libraries and load the raw Crashes dataset. The Person dataset is loaded later, in the integration step.

In [None]:
# %% [markdown]
# # 1. Exploratory Data Analysis (EDA)
# This step involves loading the raw Crashes dataset and using descriptive statistics and initial analysis to understand its structure, issues (like missing data and inconsistencies), and initial patterns, fulfilling the thorough EDA requirement.

# %%
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Input location (relative to the notebook's working directory) ---
crashes_file = "Motor_Vehicle_Collisions_-_Crashes_20251111.csv"

# --- Read the untouched Crashes export ---
df_crashes_raw = pd.read_csv(crashes_file, low_memory=False)
print("Initial Crashes Data Shape:", df_crashes_raw.shape)

# --- Structural overview: short summary with deep memory accounting ---
print("\n--- Initial Data Info ---")
df_crashes_raw.info(verbose=False, memory_usage='deep')

# --- Missing-value profile ---
# Rank columns by null count, keep the ten worst, then hide any that are fully populated.
# This ranking is what later justifies which columns get dropped or imputed.
null_counts = df_crashes_raw.isnull().sum()
missing_summary = null_counts.sort_values(ascending=False).head(10)
has_gaps = missing_summary > 0
print("\n--- Top 10 Missing Values ---")
print(missing_summary[has_gaps])

# --- Summary statistics for the two casualty counters ---
casualty_cols = ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']
print("\n--- Descriptive Statistics for Injuries/Fatalities ---")
casualty_stats = df_crashes_raw[casualty_cols].describe()
print(casualty_stats)

# --- First look at the data: crash volume per borough ---
# Rows without a borough are counted under an explicit 'UNKNOWN' label instead of being ignored.
borough_counts = df_crashes_raw['BOROUGH'].fillna('UNKNOWN').value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=borough_counts.index, y=borough_counts.values, ax=ax)
ax.set_title('Initial Crash Counts by Borough')
ax.set_ylabel('Number of Crashes')
ax.set_xlabel('Borough')
plt.xticks(rotation=45)
plt.show()

# Hand a separate copy to the cleaning step so the raw frame stays untouched
df_crashes = df_crashes_raw.copy()

In [None]:
# %% [markdown]
# # 2. Pre-Integration Cleaning
# This step involves cleaning the Crashes dataset before integration. We handle missing values, outliers, remove duplicates, and standardize formats. The resulting pre-cleaned data is saved to a temporary file (`pre_cleaned_crashes.csv`) to ensure data continuity for the next cell.

# %%
# Start from a private copy so the frame handed over by the EDA cell (df_crashes) is never mutated
df = df_crashes.copy()
initial_shape = df.shape

# -------------------- Handle Missing Values (Pre-Integration) --------------------
# 1. Drop columns that are more than 50% missing.
#    `thresh` is the minimum number of non-null values a column needs in order to be kept and
#    is documented by pandas as an int. (n + 1) // 2 is half the row count rounded up, i.e.
#    exactly the cut-off the float expression len(df) * 0.5 produced.
min_non_null = (len(df) + 1) // 2
df = df.dropna(axis=1, thresh=min_non_null)

# 2. Impute remaining missing values
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# COLLISION_ID is the key used to join the Person dataset, not a measurement: filling a gap
# with the median would fabricate an ID that could collide with a real collision, so it is
# excluded from imputation.
key_cols = ['COLLISION_ID']

fill_values = {}

# Numeric columns: median (less sensitive to outliers than the mean)
for col in numeric_cols:
    if col in key_cols:
        continue
    median = df[col].median()
    if pd.notna(median):
        fill_values[col] = median

# Categorical columns: mode (most frequent value), computed once per column.
# Falls back to "unknown" when a column has no mode at all.
for col in categorical_cols:
    modes = df[col].mode()
    fill_values[col] = modes.iloc[0] if not modes.empty else "unknown"

# NOTE(review): median/mode imputation also applies to fields such as LATITUDE/LONGITUDE and
# BOROUGH, so every crash with a missing location receives the same synthetic value.
# Confirm this is acceptable for the downstream map and per-borough aggregations.
df = df.fillna(value=fill_values)

# -------------------- Remove Duplicates --------------------
# Only rows that are identical in every column are treated as duplicates.
df = df.drop_duplicates()

# -------------------- Standardize Formats --------------------
# Convert date/time columns (selected by name) to datetime; unparseable values become NaT.
# NOTE(review): if a matched column holds time-of-day strings without a date, to_datetime has
# to invent the date part. Confirm the resulting values are what downstream code expects.
datetime_cols = [col for col in df.columns if "date" in col.lower() or "time" in col.lower()]
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Standardize categorical strings (strip whitespace and lowercase).
# `categorical_cols` was captured before the conversion above, so it still lists any date/time
# columns that were stored as text. They must be skipped here: running astype(str) on them
# would turn the freshly parsed datetimes back into plain strings (and NaT into "nat").
text_cols = [
    col for col in categorical_cols
    if col in df.columns and col not in datetime_cols
]
for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()

# -------------------- Handle Outliers (IQR) --------------------
# Drop rows whose injury/fatality count lies above the upper Tukey fence (Q3 + 1.5 * IQR).
for col in ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']:
    if col not in df.columns:
        continue

    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1

    # Degenerate case: when both quartiles coincide (typical for counts that are mostly zero)
    # the fence collapses onto Q3 and the filter would delete every row above it, i.e. every
    # crash that has any casualty at all. Skip the filter instead of wiping out that signal.
    # `not iqr > 0` also covers a NaN IQR (no data in the column).
    if not iqr > 0:
        print(f"Skipping IQR outlier filter for '{col}': IQR is {iqr}, so the fence would sit at Q3 ({q3}).")
        continue

    upper = q3 + 1.5 * iqr
    # Keep only values at or below the upper fence
    df = df[df[col] <= upper]

# -------------------- Clean Location Data --------------------
# Keep only rows whose coordinates are geographically possible:
# latitude within [-90, 90] and longitude within [-180, 180].
has_coordinates = 'LATITUDE' in df.columns and 'LONGITUDE' in df.columns
if has_coordinates:
    lat_ok = df['LATITUDE'].between(-90, 90)
    lon_ok = df['LONGITUDE'].between(-180, 180)
    df = df[lat_ok & lon_ok]

# -------------------- Report --------------------
rows_removed = initial_shape[0] - df.shape[0]
print("Pre-Integration Cleaning Complete.")
print(f"Rows Removed: {rows_removed}")
print(f"Final Pre-Cleaned Crashes Shape: {df.shape}")

# -------------------- Persist --------------------
# Written without the index so the file round-trips as plain columns.
pre_cleaned_path = "pre_cleaned_crashes.csv"
df.to_csv(pre_cleaned_path, index=False)
print("✅ Pre-cleaned Crashes data saved to pre_cleaned_crashes.csv")

In [None]:
import pandas as pd

# -------------------- Load Person dataset in chunks --------------------
person_file = "Motor_Vehicle_Collisions_-_Person_20251111.csv"
chunksize = 100000  # rows per chunk; tune to the available RAM

# Malformed lines are skipped (on_bad_lines='skip') instead of aborting the whole load.
person_reader = pd.read_csv(
    person_file,
    chunksize=chunksize,
    low_memory=False,
    on_bad_lines='skip',
)
chunks = list(person_reader)

# Stitch the pieces into one frame with a fresh 0..n-1 index
df_person = pd.concat(chunks, ignore_index=True)

# -------------------- Load pre-cleaned Crashes --------------------
# Integrate the cleaned crashes written by the pre-integration cleaning step, not the raw
# `df_crashes` copy: merging the raw frame would silently discard all of that cleaning.
# `df_crashes` itself is left untouched so earlier cells stay re-runnable.
crashes_clean = pd.read_csv("pre_cleaned_crashes.csv", low_memory=False)


def _normalize_collision_id(ids):
    """Return collision IDs as canonical strings so both sides of the join compare equal.

    A key column that was read as float (or as mixed types across chunks) stringifies as
    "123.0", which never matches "123". Surrounding whitespace and one trailing ".0" are
    therefore removed after the string conversion.

    Args:
        ids: pandas Series holding the raw COLLISION_ID values.

    Returns:
        pandas Series of str with the same index as ``ids``.
    """
    return ids.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)


# -------------------- Preprocess join key (Person) --------------------
# Rows without a key can never match and are dropped before normalising.
# NOTE(review): drop_duplicates keeps only the first Person row per collision, so further
# people involved in the same crash are discarded. This keeps the join one row per crash;
# confirm that is the intended granularity.
df_person = df_person.dropna(subset=["COLLISION_ID"])
df_person = df_person.assign(COLLISION_ID=_normalize_collision_id(df_person["COLLISION_ID"]))
df_person = df_person.drop_duplicates(subset=["COLLISION_ID"])

crashes_clean = crashes_clean.assign(
    COLLISION_ID=_normalize_collision_id(crashes_clean["COLLISION_ID"])
)

# -------------------- Merge datasets (Integration) --------------------
# Left join: every cleaned crash is kept, with Person attributes attached where available.
# validate="many_to_one" asserts that the Person side really has unique keys.
merged = pd.merge(
    crashes_clean,
    df_person,
    on="COLLISION_ID",
    how="left",
    suffixes=("_CRASH", "_PERSON"),
    validate="many_to_one",
)

# -------------------- Post-Integration Cleaning --------------------
# Columns present in both inputs come out of the merge as <name>_CRASH / <name>_PERSON pairs.
CRASH_SUFFIX, PERSON_SUFFIX = "_CRASH", "_PERSON"
LOCATION_PREFIXES = ('LATITUDE', 'LONGITUDE', 'BOROUGH')

merged_cols = set(merged.columns)


def _person_twin(crash_col):
    """Return the Person-side name for a crash-suffixed column (only the suffix is swapped)."""
    return crash_col[: -len(CRASH_SUFFIX)] + PERSON_SUFFIX


# Location fields: the Person-side copy is the redundant one, the crash-side value is kept.
person_location_cols = [
    c for c in merged.columns
    if c.startswith(LOCATION_PREFIXES) and c.endswith(PERSON_SUFFIX)
]

# Every other overlapping field: drop the crash-side copy and keep the Person-side one.
# Pairs whose Person copy is already being dropped above are excluded, otherwise both copies
# of a location field would be removed and the field lost entirely.
redundant_cols = [
    c for c in merged.columns
    if c.endswith(CRASH_SUFFIX)
    and _person_twin(c) in merged_cols
    and _person_twin(c) not in person_location_cols
]
redundant_cols.extend(person_location_cols)

merged = merged.drop(columns=redundant_cols, errors='ignore')

# Crashes without a matching Person row have no person attributes; label them explicitly.
cols_to_fill_unknown = ['PERSON_SEX', 'PERSON_INJURY']
for col in cols_to_fill_unknown:
    if col in merged.columns:
        merged[col] = merged[col].fillna('UNKNOWN')

# -------------------- Report --------------------
# Five columns with the most remaining nulls, worst first.
remaining_nulls = merged.isna().sum()
top_missing = remaining_nulls.sort_values(ascending=False).head(5)

print("Post-Integration Cleaning Complete.")
print(f"Final Integrated Shape: {merged.shape}")
print("Top missing values after post-cleaning:\n", top_missing)

# -------------------- Save Final Output --------------------
# Written without the index so the file contains data columns only.
output_file_name = "merged_crashes_person.csv"
merged.to_csv(output_file_name, index=False)
print(f"\n✅ Final cleaned and integrated dataset saved to {output_file_name}")