In [3]:
import pandas as pd
import numpy as np

# Load the raw accidents dataset from the working directory.
file_path = "US_Accidents_March23.csv"
df = pd.read_csv(file_path)

# Standardize column names: lowercase, with literal spaces replaced by
# underscores. regex=False makes the literal match explicit instead of
# relying on the version-dependent default of Series.str.replace.
df.columns = df.columns.str.lower().str.replace(" ", "_", regex=False)

# Parse timestamp columns. errors="coerce" turns unparseable values into NaT
# instead of raising; columns absent from the file are skipped.
date_columns = ["start_time", "end_time", "weather_timestamp"]
for col in date_columns:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

# Coerce numeric columns. Non-numeric values become NaN; columns absent from
# the file are skipped.
numeric_columns = ["start_lat", "start_lng", "temperature(f)", "wind_speed(mph)", "humidity(%)"]
for col in numeric_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Fill missing values: weather measurements get their column median, free-text
# descriptions get a placeholder. The fill mapping is built only from columns
# that actually exist, matching the guards used above, so a missing column is
# skipped rather than raising KeyError when its median is computed.
median_filled_columns = ["temperature(f)", "wind_speed(mph)", "humidity(%)"]
fill_values = {
    name: df[name].median()
    for name in median_filled_columns
    if name in df.columns
}
if "description" in df.columns:
    fill_values["description"] = "Unknown"
if fill_values:
    df.fillna(fill_values, inplace=True)

# Convert severity into a categorical for easier in-memory analysis.
# NOTE: CSV stores no dtype information, so this applies to `df` only, not to
# the file written below.
if "severity" in df.columns:
    df["severity"] = df["severity"].astype("category")

# Save cleaned dataset
df.to_csv("cleaned_crash_data.csv", index=False)
print("Cleaned data saved as 'cleaned_crash_data.csv'")


Cleaned data saved as 'cleaned_crash_data.csv'
