In [None]:
!pip install imbalanced-learn xgboost lightgbm catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import re
import numpy as np
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [4]:
# Load the raw crash records. low_memory=False makes pandas parse the file in a
# single pass rather than in internal chunks, avoiding mixed-type column inference.
df = pd.read_csv("crash_data.csv", low_memory=False)

In [5]:
# Columns carried forward through the rest of the notebook; everything else in
# the CSV is discarded here.
keep_cols = [
    'Crash_Date',
    'Crash_Military_Time',
    'Intersection_Type',
    'Area_Type',
    'Roadway_Alignment',
    'Light_Condition',
    'Weather_Condition',
    'Roadway_Surface_Condition',
    'Roadway_Surface_Type',
    'Traffic_Control_Type',
    'Traffic_Control_Status',
    'Collision_Type',
    'Speed_Posted',
    'Vehicle_Count',
    'First_Harmful_Event',
    'Crash_Severity',
]

# Take an independent copy so later column assignments do not touch the
# originally loaded frame.
df = df.loc[:, keep_cols].copy()

In [None]:
# Parse the crash date into datetimes, then derive the weekday name
# (e.g. "Monday") as a feature.
df['Crash_Date'] = pd.to_datetime(df['Crash_Date'])
# Fixed typo: the accessor method is `day_name`, not `dcay_name`, which raised
# AttributeError.
df['day_of_week'] = df['Crash_Date'].dt.day_name()


In [7]:
def extract_hour(t):
    """Return the hour part of an HHMM military-time value.

    Parameters
    ----------
    t : int or float
        Time of day written as HHMM, e.g. ``1430`` or ``5`` (i.e. 00:05).
        May be missing (``None`` / ``NaN``).

    Returns
    -------
    int or float
        The first two digits of the value after zero-padding it to four
        characters, as an ``int``; ``np.nan`` when ``t`` is missing.
    """
    if pd.isna(t):
        # int(NaN) raises ValueError, so pass missing values through as NaN.
        return np.nan
    # Left-pad so that e.g. 5 -> "0005" and 930 -> "0930" before slicing.
    s = str(int(t)).zfill(4)
    return int(s[:2])
# Hour of the crash, taken from the HHMM timestamp column.
df['hour_of_day'] = df['Crash_Military_Time'].apply(extract_hour)

# Bucket the hour into four left-closed, six-hour periods:
#   [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening.
# Hours outside [0, 24) (for instance 24 from a 2400 timestamp) come out as NaN.
bins = list(range(0, 25, 6))
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
df['time_of_day'] = pd.cut(df['hour_of_day'], bins, right=False, labels=labels)


In [8]:
def parse_max_speed(s):
    """Return the largest integer found in a posted-speed entry.

    Parameters
    ----------
    s : str, int, float or None
        Raw posted-speed value, e.g. ``"30-45"`` or ``35``.

    Returns
    -------
    int or float
        The maximum run of digits in ``s`` as an ``int``; ``np.nan`` when
        ``s`` is missing or contains no digits.
    """
    if pd.isna(s):
        return np.nan
    # Coerce to text first: re.findall raises TypeError on non-string input,
    # which happens when the column holds plain numbers.
    nums = [int(x) for x in re.findall(r'\d+', str(s))]
    return max(nums) if nums else np.nan

# Reduce each posted-speed entry to its largest number; missing or digit-free
# entries become NaN.
df['Speed_Posted_max'] = df['Speed_Posted'].apply(parse_max_speed)


In [9]:
# Drop the raw columns now that day_of_week, time_of_day and Speed_Posted_max
# have been derived from them, along with the intermediate hour_of_day column.
# NOTE: not idempotent: re-running this cell on the already-trimmed frame
# raises KeyError because the columns are gone.
df = df.drop(columns=['Crash_Date','Crash_Military_Time','Speed_Posted','hour_of_day'])

In [10]:
# Target: the crash-severity label. Features: every other remaining column.
y = df['Crash_Severity']
X = df.drop(columns='Crash_Severity')

In [25]:
# Learn the set of severity labels, then map each label to its integer code.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# Quick look at the encoded target.
print(y_enc)

[4 4 1 ... 4 4 5]


In [26]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): stratification (stratify=y_enc) is currently switched off, so the
# class proportions may differ between the two partitions. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.30, random_state=42)

In [21]:
# Column groups fed to the preprocessing branches below.
numeric_feats = ['Vehicle_Count', 'Speed_Posted_max']
categorical_feats = [
    'Intersection_Type',
    'Area_Type',
    'Roadway_Alignment',
    'Light_Condition',
    'Weather_Condition',
    'Roadway_Surface_Condition',
    'Roadway_Surface_Type',
    'Traffic_Control_Type',
    'Traffic_Control_Status',
    'Collision_Type',
    'First_Harmful_Event',
    'day_of_week',
    'time_of_day',
]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: replace gaps with the literal 'missing', then one-hot
# encode. Categories not seen during fit are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, numeric_feats),
    ('cat', categorical_pipe, categorical_feats),
])
