In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler

# resample the dataset
from sklearn.utils import resample
from sklearn.utils import shuffle

# import various functions from sklearn
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier


# import the XGBoost function for classification
from xgboost import XGBClassifier

import random

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [19]:
data = pd.read_csv('/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/0_dataset/3.FEATURED_ENGINEERING DATASET/Encoded_Data.csv')

In [20]:
df=data.copy()

## Target Encoding Mistake (Vehicle Damage Extent)

The target variable **Vehicle Damage Extent** was mistakenly one-hot encoded using `get_dummies(drop_first=True)`, which caused:

- The target to split into **multiple binary columns** instead of one multi-class label.
- Loss of the first category (e.g., **“Destroyed”**) due to `drop_first=True`.
- Models failing to train properly because they require **one target column**, not several.
- Incorrect mapping of labels and unreliable predictions.

### Fix Applied
The one-hot encoded target columns were removed, and the original **single categorical target column** was restored to enable correct multi-class classification.

In [21]:
# ---- Damage Extent Reverse Encoding ----

damage_cols = [
    'Vehicle Damage Extent_Disabling',
    'Vehicle Damage Extent_Functional',
    'Vehicle Damage Extent_No Damage',
    'Vehicle Damage Extent_Other',
    'Vehicle Damage Extent_Superficial',
    'Vehicle Damage Extent_Vehicle Not at Scene'
]

# 1) Get category from highest one-hot value
df['Damage_Class'] = df[damage_cols].idxmax(axis=1)
df['Damage_Class'] = df['Damage_Class'].str.replace('Vehicle Damage Extent_', '', regex=False)

# 2) Recover dropped class "Destroyed"
df.loc[df[damage_cols].sum(axis=1) == 0, 'Damage_Class'] = 'Destroyed'

# 3) Optional → Convert to numeric for ML
df['Damage_Class'] = df['Damage_Class'].astype('category').cat.codes

# 4) Drop one-hot encoded target columns
df = df.drop(columns=damage_cols)

In [22]:
df['Damage_Class'].value_counts()

Unnamed: 0_level_0,count
Damage_Class,Unnamed: 1_level_1
1,80179
2,55586
5,53026
0,7630
3,6631
6,2383
4,104


In [16]:
# df['Damage_Class'] = df['Damage_Class'].astype('category').cat.codes


In [23]:
mapping_damage_extent = (
    pd.DataFrame({
        "encoded_class": df['Damage_Class'],
        "original_class": df['Damage_Class_raw']
    })
    .drop_duplicates()
    .sort_values("encoded_class")
    .reset_index(drop=True)
)

mapping_damage_extent


KeyError: 'Damage_Class_raw'

In [26]:
df_orig=pd.read_excel('/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/0_dataset/2_CLEANED DATASET/FULLY_CLEANED DATASET.xlsx')

In [28]:
damage_cols = [
    'Vehicle Damage Extent_Disabling',
    'Vehicle Damage Extent_Functional',
    'Vehicle Damage Extent_No Damage',
    'Vehicle Damage Extent_Other',
    'Vehicle Damage Extent_Superficial',
    'Vehicle Damage Extent_Vehicle Not at Scene'
]

# category order used during encoding
cat = df['Damage_Class'].astype('category')

mapping_damage_extent = pd.DataFrame({
    "encoded_class": range(len(cat.cat.categories)),
    "original_class": [
        col.replace("Vehicle Damage Extent_", "")
        for col in damage_cols
    ] + ["Destroyed"]  # for zero-sum rows you handled manually
})

mapping_damage_extent


Unnamed: 0,encoded_class,original_class
0,0,Disabling
1,1,Functional
2,2,No Damage
3,3,Other
4,4,Superficial
5,5,Vehicle Not at Scene
6,6,Destroyed


In [None]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Road Name,Cross-Street Name,Driver Substance Abuse,Driver At Fault,Injury Severity,Driver Distracted By,Drivers License State,Vehicle Body Type,Speed Limit,Driverless Vehicle,Parked Vehicle,Vehicle Year,Vehicle Make,Latitude,Longitude,Vehicle Model,hour,Crash_year,Crash_month,Crash_day,Crash_hour,Crash_week,Agency Name_Maryland_National_Capital,Agency Name_Montgomery_County_Police,Agency Name_Rockville_Police_Department,Agency Name_Takoma_Park_Police_Department,ACRS Report Type_Injury Crash,ACRS Report Type_Property Damage Crash,Route Type_County_Route,Route Type_Crossover,Route Type_Government_Route,Route Type_Interstate_Route,Route Type_Local_Route,Route Type_Maryland_State_Route,Route Type_Municipality_Route,Route Type_Other_Public_Roadway,Route Type_Private_Route,Route Type_Ramp_Route,Route Type_Service_Road,Route Type_Spur_Route,Route Type_US_State_Route,Collision Type_Angle + Left Turn,Collision Type_Angle + Right Turn,Collision Type_Angle Collision,Collision Type_Back-to-Back,Collision Type_Both Left Turns (Opposite Direction),Collision Type_Both Left Turns (Same Direction),Collision Type_Head-On,Collision Type_Head-On + Left Turn,Collision Type_Left Turn (Same Direction),Collision Type_Other,Collision Type_Rear vs Side,Collision Type_Rear-End (Same Direction),Collision Type_Right Turn (Same Direction),Collision Type_Sideswipe (Opposite Direction),Collision Type_Sideswipe (Same Direction),Collision Type_Single Vehicle,Circumstance_Category_Congestion / Backup,Circumstance_Category_Driver Inattention,Circumstance_Category_Lane Violation,Circumstance_Category_Mechanical / Environmental,Circumstance_Category_Passing / Turning,Circumstance_Category_Right-of-Way Violation,Circumstance_Category_Road Obstruction,Circumstance_Category_Speed Related,Circumstance_Category_Surface Condition,Circumstance_Category_Tailgating,Circumstance_Category_Traffic Control Violation,Circumstance_Category_Weather Related,Weather_Blowing Snow,Weather_Clear,Weather_Cloudy,Weather_Fog,Weather_Freezing Rain,Weather_Other,Weather_Rain,Weather_Severe Crosswinds,Weather_Severe Winds,Weather_Sleet,Weather_Snow,Weather_Wintry Mix,Surface Condition_Ice,Surface Condition_Mud_Dirt_Gravel,Surface Condition_Oil,Surface Condition_Other,Surface Condition_Sand,Surface Condition_Slush,Surface Condition_Snow,Surface Condition_Wet,Light_Dark - Not Lighted,Light_Dark - Unknown Lighting,Light_Dawn,Light_Daylight,Light_Dusk,Light_Other,Traffic Control_Flashing Traffic Signal,Traffic Control_Lane Use Control,Traffic Control_No Control,Traffic Control_Other,Traffic Control_Other Pavement Marking,Traffic Control_Other Signal,Traffic Control_Pedestrian Crossing,Traffic Control_Person Control,Traffic Control_Railroad Crossing Device,Traffic Control_Railroad Crossing Signal,Traffic Control_Ramp Meter Signal,Traffic Control_School Zone Sign,Traffic Control_Stop Sign,Traffic Control_Traffic Signal,Traffic Control_Warning Sign,Traffic Control_Yield Sign,Vehicle First Impact Location_Eight O Clock,Vehicle First Impact Location_Eleven O Clock,Vehicle First Impact Location_Five O Clock,Vehicle First Impact Location_Four O Clock,Vehicle First Impact Location_Nine O Clock,Vehicle First Impact Location_NonCollision,Vehicle First Impact Location_One O Clock,Vehicle First Impact Location_RoofTop,Vehicle First Impact Location_Seven O Clock,Vehicle First Impact Location_Six O Clock,Vehicle First Impact Location_Ten O Clock,Vehicle First Impact Location_Three O Clock,Vehicle First Impact Location_Twelve O Clock,Vehicle First Impact Location_Two O Clock,Vehicle First Impact Location_Underside,Vehicle First Impact Location_VehicleNotAtScene,Vehicle Movement_Backing,Vehicle Movement_ChangingLanes,Vehicle Movement_DriverlessMovingVehicle,Vehicle Movement_EnteringTrafficLane,Vehicle Movement_LeavingTrafficLane,Vehicle Movement_MakingLeftTurn,Vehicle Movement_MakingRightTurn,Vehicle Movement_MakingUTurn,Vehicle Movement_MovingConstantSpeed,Vehicle Movement_NegotiatingCurve,Vehicle Movement_Other,Vehicle Movement_Parked,Vehicle Movement_Parking,Vehicle Movement_Passing,Vehicle Movement_RightTurnOnRed,Vehicle Movement_Skidding,Vehicle Movement_SlowingOrStopping,Vehicle Movement_StartingFromLane,Vehicle Movement_StartingFromParked,Vehicle Movement_StoppedInTrafficLane,Vehicle Going Dir_Eastbound,Vehicle Going Dir_North,Vehicle Going Dir_Northbound,Vehicle Going Dir_Not On Roadway,Vehicle Going Dir_South,Vehicle Going Dir_Southbound,Vehicle Going Dir_Unknown,Vehicle Going Dir_West,Vehicle Going Dir_Westbound,Crash_day_name_Monday,Crash_day_name_Saturday,Crash_day_name_Sunday,Crash_day_name_Thursday,Crash_day_name_Tuesday,Crash_day_name_Wednesday,Damage_Class
0,0,0.000978,0.001309,9,2,0,13,0.883827,24,40.0,0,0,2013,16,39.219796,-77.257416,0.002146,17,2025,8,21,17,34,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,5
1,1,0.009186,0.005634,9,0,2,8,0.883827,25,55.0,0,0,2011,28,39.180181,-77.250657,0.13541,10,2025,8,22,10,34,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,6
2,2,5.8e-05,3.4e-05,9,0,0,12,0.000462,24,40.0,0,0,2023,18,39.121219,-76.988905,0.001951,11,2025,7,25,11,30,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,5


In [None]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [None]:
df.shape

(205539, 163)

## Model-1: Master + Validation Split and Train–Test Split
### Step 1: Load Encoded Dataset
We load the complete encoded dataset which will be used to create splits.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

target = "Damage_Class"
X = df.drop(columns=[target])
y = df[target]

print("Dataset loaded:", df.shape)


Dataset loaded: (205539, 163)


## Step 2: Create Master (90%) and Validation (10%) Split
Master dataset is used for model training and testing.  
Validation dataset is for final unbiased evaluation.


In [None]:
X_major, X_val, y_major, y_val = train_test_split(
    X, y,
    test_size=0.10,
    stratify=y,
    random_state=42
)

major_df = pd.concat([X_major, y_major], axis=1)
val_df   = pd.concat([X_val, y_val], axis=1)

print("Master dataset:", major_df.shape)
print("Validation dataset:", val_df.shape)


Master dataset: (184985, 163)
Validation dataset: (20554, 163)


## Step 3: Save Master and Validation Files
Both files are saved in the Model-1 folder for later use.


In [None]:
save_path = "/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/08_MODELS/Model6_DamageExtent/"

major_df.to_csv(save_path + "model_6_master_data.csv", index=False)
val_df.to_csv(save_path + "model_6_validation_data.csv", index=False)

print("Saved model_6_master_data.csv and model_6_validation_data.csv")


Saved model_6_master_data.csv and model_6_validation_data.csv


In [None]:
df_master_data = pd.read_csv(save_path + "model_6_master_data.csv")


# IMPORTANT  USER DEFINED FUNCTION

### UNIVERSAL MODEL FUNCTION

In [None]:
# ================================================================
# UNIVERSAL MODEL FUNCTION (simple + clear scaling logic)
# ================================================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

def run_model(model, X, y, test_size=0.20, scaled=False, threshold=0.5):
    """
    scaled = True  → apply scaling on numeric columns
    scaled = False → no scaling
    model  = sklearn model OR "stats" for statsmodels logistic
    """

    # 1) Train–Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    # 2) Scaling if selected
    if scaled:
        scaler = StandardScaler()
        num_cols = X.select_dtypes(include='number').columns

        X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
        X_test[num_cols]  = scaler.transform(X_test[num_cols])

    # 3) Statsmodels Logit
    if model == "stats":
        X_train_c = sm.add_constant(X_train)
        X_test_c  = sm.add_constant(X_test)

        logit = sm.Logit(y_train, X_train_c).fit(disp=False)

        yproba_train = logit.predict(X_train_c)
        yproba_test  = logit.predict(X_test_c)

        ypred_train = (yproba_train >= threshold).astype(int)
        ypred_test  = (yproba_test >= threshold).astype(int)

        return logit, X_train, X_test, y_train, y_test, ypred_train, ypred_test, yproba_train, yproba_test

    # 4) Normal sklearn model
    model.fit(X_train, y_train)

    ypred_train = model.predict(X_train)
    ypred_test  = model.predict(X_test)

    yproba_train = model.predict_proba(X_train)[:,1]
    yproba_test  = model.predict_proba(X_test)[:,1]

    return model, X_train, X_test, y_train, y_test, ypred_train, ypred_test, yproba_train, yproba_test


### METRICS FUNCTION

In [None]:
# ================================================================
# METRICS FUNCTION (Train + Test separate + Binary/Multiclass safe)
# ================================================================

import pandas as pd
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score,
    f1_score, roc_auc_score, cohen_kappa_score,
    classification_report, confusion_matrix
)

d = pd.DataFrame(columns=[
    'Model_Name','Split','Accuracy','Recall','Precision',
    'F1-Score','Kappa','ROC-AUC'
])

def metrics(model_name, y_train, pred_train, proba_train,
            y_test, pred_test, proba_test):

    global d

    # Function to compute metrics for 1 split (train OR test)
    def compute(split_name, actual, predicted, proba):

        unique_classes = len(pd.Series(actual).unique())
        is_binary = (unique_classes == 2)
        avg = "binary" if is_binary else "weighted"

        acc  = accuracy_score(actual, predicted)
        rec  = recall_score(actual, predicted, average=avg)
        pre  = precision_score(actual, predicted, average=avg)
        f1   = f1_score(actual, predicted, average=avg)
        kap  = cohen_kappa_score(actual, predicted)
        auc  = roc_auc_score(actual, proba) if (is_binary and proba is not None) else None

        # append to global dataframe
        d.loc[len(d)] = [model_name, split_name, acc, rec, pre, f1, kap, auc]

        # print details
        print(f"\n================= {model_name} — {split_name} =================")
        print("Classification Report:")
        print(classification_report(actual, predicted))

        print("Confusion Matrix:")
        print(confusion_matrix(actual, predicted))

        if auc is not None:
            print("ROC-AUC:", auc)

    # ---- TRAIN METRICS ----
    compute("Train", y_train, pred_train, proba_train)

    # ---- TEST METRICS ----
    compute("Test", y_test, pred_test, proba_test)

    return d

### Feature Importance Function

In [None]:
def fi(model, x, n_features=10):
    df_fi = pd.DataFrame({
        "Feature": x.columns,
        "Importance": model.feature_importances_
    })
    return df_fi.sort_values(by="Importance", ascending=False).head(n_features)


### ROC Curve

In [None]:
def plot_roc_plain(y_test, yproba_test):
    fpr, tpr, _ = roc_curve(y_test, yproba_test)
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1])
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("ROC Curve")
    plt.show()


# MODEL BUILDING AND TRAINING


### Logit Model (Statsmodels Logistic Regression)

###  Why Statsmodels Logit Failed

- Statsmodels **Logit requires a binary target** (only 0 and 1).
- Our Damage Extent column has **7 classes** (0, 1, 2, 3, 4, 5, 6).
- Because the target is **multi-class**, Logit cannot estimate probabilities in the 0–1 range.
- Therefore, Logit throws the error: *“endog must be in the unit interval.”*


## Logistic Regression (Sklearn) — Model Call
This model supports multi-class Injury Severity and works correctly without converting to binary.


In [None]:
X = df_master_data.drop(columns=target)
y = df_master_data[target]

In [None]:
from sklearn.linear_model import LogisticRegression

# Define model
log_reg = LogisticRegression(multi_class="multinomial")

# Run model using universal function
log_reg, X_train_lr, X_test_lr, y_train_lr, y_test_lr, ypred_train_lr, ypred_test_lr, yproba_train_lr, yproba_test_lr = run_model(
    model=log_reg,
    X=X,
    y=y,
    test_size=0.20,
    scaled=True
)

# Correct metrics call
metrics(
    model_name="LogisticRegression",
    y_train=y_train_lr,
    pred_train=ypred_train_lr,
    proba_train=yproba_train_lr,
    y_test=y_test_lr,
    pred_test=ypred_test_lr,
    proba_test=yproba_test_lr
)


Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.06      0.11      5494
           1       0.57      0.81      0.67     57729
           2       0.43      0.25      0.32     40022
           3       0.62      0.18      0.28      4774
           4       0.00      0.00      0.00        75
           5       0.50      0.50      0.50     38178
           6       1.00      1.00      1.00      1716

    accuracy                           0.53    147988
   macro avg       0.53      0.40      0.41    147988
weighted avg       0.52      0.53      0.50    147988

Confusion Matrix:
[[  332  4740   262     9     0   151     0]
 [  182 46888  4918   128     0  5613     0]
 [   27 18123 10112   116     0 11642     2]
 [    3  1283   688   847     0  1952     1]
 [    1    35    11     4     0    24     0]
 [    7 11065  7684   266     0 19155     1]
 [    0     0     0     0     0     5  1711]]

Classification Report:
              precisi

Unnamed: 0,Model_Name,Split,Accuracy,Recall,Precision,F1-Score,Kappa,ROC-AUC
0,LogisticRegression,Train,0.534131,0.534131,0.520258,0.500776,0.30723,
1,LogisticRegression,Test,0.527286,0.527286,0.509701,0.493378,0.297085,


## Decision Tree Classifier — Model Call
Simple non-linear classifier, works well without scaling.


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Define model
dt = DecisionTreeClassifier(random_state=42)

# Run model using universal function
dt, X_train_dt, X_test_dt, y_train_dt, y_test_dt, ypred_train_dt, ypred_test_dt, yproba_train_dt, yproba_test_dt = run_model(
    model=dt,
    X=X,
    y=y,
    test_size=0.20,
    scaled=False    # Trees do NOT need scaling
)

# Correct metrics function call
metrics(
    model_name="DecisionTree",
    y_train=y_train_dt,
    pred_train=ypred_train_dt,
    proba_train=yproba_train_dt,
    y_test=y_test_dt,
    pred_test=ypred_test_dt,
    proba_test=yproba_test_dt
)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5494
           1       1.00      1.00      1.00     57729
           2       1.00      1.00      1.00     40022
           3       1.00      1.00      1.00      4774
           4       1.00      1.00      1.00        75
           5       1.00      1.00      1.00     38178
           6       1.00      1.00      1.00      1716

    accuracy                           1.00    147988
   macro avg       1.00      1.00      1.00    147988
weighted avg       1.00      1.00      1.00    147988

Confusion Matrix:
[[ 5494     0     0     0     0     0     0]
 [    0 57729     0     0     0     0     0]
 [    0     0 40022     0     0     0     0]
 [    0     0     0  4774     0     0     0]
 [    0     0     0     0    75     0     0]
 [    0     1     1     0     0 38176     0]
 [    0     0     0     0     0     0  1716]]

Classification Report:
              precisi

Unnamed: 0,Model_Name,Split,Accuracy,Recall,Precision,F1-Score,Kappa,ROC-AUC
0,LogisticRegression,Train,0.534131,0.534131,0.520258,0.500776,0.30723,
1,LogisticRegression,Test,0.527286,0.527286,0.509701,0.493378,0.297085,
2,DecisionTree,Train,0.999986,0.999986,0.999986,0.999986,0.999981,
3,DecisionTree,Test,0.437279,0.437279,0.438417,0.437834,0.203502,


## Random Forest Classifier — Model Call
Ensemble of decision trees, robust to imbalance and noise.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define model
rf = RandomForestClassifier(random_state=42)

# Run model using universal function
rf, X_train_rf, X_test_rf, y_train_rf, y_test_rf, ypred_train_rf, ypred_test_rf, yproba_train_rf, yproba_test_rf = run_model(
    model=rf,
    X=X,
    y=y,
    test_size=0.20,
    scaled=False     # Trees & ensemble trees do NOT need scaling
)

# Correct metrics function call
metrics(
    model_name="RandomForest",
    y_train=y_train_rf,
    pred_train=ypred_train_rf,
    proba_train=yproba_train_rf,
    y_test=y_test_rf,
    pred_test=ypred_test_rf,
    proba_test=yproba_test_rf
)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5494
           1       1.00      1.00      1.00     57729
           2       1.00      1.00      1.00     40022
           3       1.00      1.00      1.00      4774
           4       1.00      1.00      1.00        75
           5       1.00      1.00      1.00     38178
           6       1.00      1.00      1.00      1716

    accuracy                           1.00    147988
   macro avg       1.00      1.00      1.00    147988
weighted avg       1.00      1.00      1.00    147988

Confusion Matrix:
[[ 5494     0     0     0     0     0     0]
 [    0 57728     0     0     0     1     0]
 [    0     0 40021     0     0     1     0]
 [    0     0     0  4774     0     0     0]
 [    0     0     0     0    75     0     0]
 [    0     0     0     0     0 38178     0]
 [    0     0     0     0     0     0  1716]]

Classification Report:
              precisi

Unnamed: 0,Model_Name,Split,Accuracy,Recall,Precision,F1-Score,Kappa,ROC-AUC
0,LogisticRegression,Train,0.534131,0.534131,0.520258,0.500776,0.30723,
1,LogisticRegression,Test,0.527286,0.527286,0.509701,0.493378,0.297085,
2,DecisionTree,Train,0.999986,0.999986,0.999986,0.999986,0.999981,
3,DecisionTree,Test,0.437279,0.437279,0.438417,0.437834,0.203502,
4,RandomForest,Train,0.999986,0.999986,0.999986,0.999986,0.999981,
5,RandomForest,Test,0.552207,0.552207,0.539291,0.519198,0.333865,


## AdaBoost Classifier — Model Call
Boosting model good for imbalanced classes.


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Define model
ada = AdaBoostClassifier(random_state=42)

# Run universal model function
ada, X_train_ada, X_test_ada, y_train_ada, y_test_ada, ypred_train_ada, ypred_test_ada, yproba_train_ada, yproba_test_ada = run_model(
    model=ada,
    X=X,
    y=y,
    test_size=0.20,
    scaled=False   # Tree-based → NO scaling needed
)

# Correct metrics call (train + test)
metrics(
    model_name="AdaBoost",
    y_train=y_train_ada,
    pred_train=ypred_train_ada,
    proba_train=yproba_train_ada,
    y_test=y_test_ada,
    pred_test=ypred_test_ada,
    proba_test=yproba_test_ada
)


Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.03      0.06      5494
           1       0.52      0.80      0.63     57729
           2       0.39      0.30      0.34     40022
           3       0.63      0.01      0.02      4774
           4       0.00      0.00      0.00        75
           5       0.46      0.30      0.36     38178
           6       1.00      0.81      0.89      1716

    accuracy                           0.48    147988
   macro avg       0.47      0.32      0.33    147988
weighted avg       0.47      0.48      0.44    147988

Confusion Matrix:
[[  167  4634   493     0     0   200     0]
 [  318 46243  6314     9     0  4845     0]
 [   39 20830 12156    14     0  6981     2]
 [    2  1948  1286    59     0  1478     1]
 [    1    36    15     2     0    21     0]
 [   21 15673 11068    10     0 11405     1]
 [    0   319     1     0     0     4  1392]]

Classification Report:
              precisi

Unnamed: 0,Model_Name,Split,Accuracy,Recall,Precision,F1-Score,Kappa,ROC-AUC
0,LogisticRegression,Train,0.534131,0.534131,0.520258,0.500776,0.30723,
1,LogisticRegression,Test,0.527286,0.527286,0.509701,0.493378,0.297085,
2,DecisionTree,Train,0.999986,0.999986,0.999986,0.999986,0.999981,
3,DecisionTree,Test,0.437279,0.437279,0.438417,0.437834,0.203502,
4,RandomForest,Train,0.999986,0.999986,0.999986,0.999986,0.999981,
5,RandomForest,Test,0.552207,0.552207,0.539291,0.519198,0.333865,
6,AdaBoost,Train,0.48262,0.48262,0.467189,0.443335,0.219173,
7,AdaBoost,Test,0.48258,0.48258,0.468795,0.443169,0.21877,


## XGBoost Classifier — Model Call
High-performance boosting model; does not require scaling.


In [None]:
from xgboost import XGBClassifier

# Define model
xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric="mlogloss"
)

# Run model
xgb, X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb, ypred_train_xgb, ypred_test_xgb, yproba_train_xgb, yproba_test_xgb = run_model(
    model=xgb,
    X=X,
    y=y,
    test_size=0.20,
    scaled=False   # XGBoost does NOT require scaling
)

# CORRECT metrics call
metrics(
    model_name="XGBoost",
    y_train=y_train_xgb,
    pred_train=ypred_train_xgb,
    proba_train=yproba_train_xgb,
    y_test=y_test_xgb,
    pred_test=ypred_test_xgb,
    proba_test=yproba_test_xgb
)


Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.16      0.27      5494
           1       0.64      0.85      0.73     57729
           2       0.57      0.40      0.47     40022
           3       0.68      0.37      0.48      4774
           4       1.00      0.87      0.93        75
           5       0.62      0.61      0.62     38178
           6       1.00      1.00      1.00      1716

    accuracy                           0.63    147988
   macro avg       0.76      0.61      0.64    147988
weighted avg       0.63      0.63      0.61    147988

Confusion Matrix:
[[  877  4194   286     6     0   131     0]
 [  158 48834  4786   206     0  3745     0]
 [   25 14922 16074   206     0  8795     0]
 [    0   627   578  1752     0  1817     0]
 [    0     5     2     0    65     3     0]
 [    6  7709  6592   395     0 23476     0]
 [    0     0     0     0     0     0  1716]]

Classification Report:
              precisi

Unnamed: 0,Model_Name,Split,Accuracy,Recall,Precision,F1-Score,Kappa,ROC-AUC
0,LogisticRegression,Train,0.534131,0.534131,0.520258,0.500776,0.30723,
1,LogisticRegression,Test,0.527286,0.527286,0.509701,0.493378,0.297085,
2,DecisionTree,Train,0.999986,0.999986,0.999986,0.999986,0.999981,
3,DecisionTree,Test,0.437279,0.437279,0.438417,0.437834,0.203502,
4,RandomForest,Train,0.999986,0.999986,0.999986,0.999986,0.999981,
5,RandomForest,Test,0.552207,0.552207,0.539291,0.519198,0.333865,
6,AdaBoost,Train,0.48262,0.48262,0.467189,0.443335,0.219173,
7,AdaBoost,Test,0.48258,0.48258,0.468795,0.443169,0.21877,
8,XGBoost,Train,0.627037,0.627037,0.627403,0.60797,0.451527,
9,XGBoost,Test,0.561262,0.561262,0.543764,0.536816,0.353865,


# Model 6 — Vehicle Damage Extent Classification  
## Full Model Performance Summary & Interpretation

---

## Table 1 — Performance Overview (Train vs Test)

| Model               | Train Acc | Test Acc | Gap | Overfitting Status | Interpretation |
|--------------------|-----------|----------|------|--------------------|----------------|
| Logistic Regression | 0.534     | 0.527    | 0.007 | ❌ No Overfitting | Weak model; linear boundary not sufficient for 6-class damage problem. |
| Decision Tree       | 1.000     | 0.437    | 0.563 | ⚠️ Severe Overfitting | Memorizes training data; fails on unseen data. |
| Random Forest       | 1.000     | 0.552    | 0.448 | ⚠️ Strong Overfitting | Needs parameter tuning; high variance. |
| AdaBoost            | 0.483     | 0.483    | 0.000 | ❌ No Overfitting | Very weak classifier; cannot model complex patterns. |
| XGBoost             | 0.627     | 0.561    | 0.066 | ⚠️ Mild Overfitting | Best performer so far; still needs tuning and class balancing. |

---

## Table 2 — Overfitting / Underfitting Diagnosis

| Model               | Status | Reason |
|--------------------|--------|--------|
| Logistic Regression | Underfitting | Low accuracy for both train & test. |
| Decision Tree       | Severe Overfitting | Train=100%, Test extremely low. |
| Random Forest       | Overfitting | Perfect train accuracy; low generalization. |
| AdaBoost            | Underfitting | Very low scores on train & test. |
| XGBoost             | Mild Overfitting | Better learning but imperfect generalization. |

---

## Table 3 — Business Interpretation

| Model               | Business Impact | Recommendation |
|--------------------|----------------|----------------|
| Logistic Regression | Too simple for multi-class damage prediction | Not suitable |
| Decision Tree       | Highly unstable | Avoid |
| Random Forest       | Good potential but needs tuning | Use after hyperparameter tuning |
| AdaBoost            | Weak patterns learned | Not recommended |
| XGBoost             | Best accuracy so far | Proceed with tuning + SMOTE |

---

# Final Verdict

| Rank | Model | Reason |
|------|--------|--------|
| 1    | XGBoost | Highest test performance; stable after tuning |
| 2    | Random Forest | Needs tuning to reduce overfitting |
| 3    | Logistic Regression | Too weak but consistent |
| 4    | AdaBoost | Very weak predictive power |
| 5    | Decision Tree | Extremely overfit; avoid |

---

# Key Insight (Why accuracy is low?)

Vehicle Damage Extent has **6 categories + heavy imbalance**, making it a **hard multi-class problem**.

To improve performance:

Apply **SMOTE** or class-weight balancing  
Reduce dimensionality  
Tune XGBoost & Random Forest  
Use fewer, more meaningful features  

---

In [None]:
df.columns.to_list()

['Road Name',
 'Cross-Street Name',
 'Driver Substance Abuse',
 'Driver At Fault',
 'Injury Severity',
 'Driver Distracted By',
 'Drivers License State',
 'Vehicle Body Type',
 'Speed Limit',
 'Driverless Vehicle',
 'Parked Vehicle',
 'Vehicle Year',
 'Vehicle Make',
 'Latitude',
 'Longitude',
 'Vehicle Model',
 'hour',
 'Crash_year',
 'Crash_month',
 'Crash_day',
 'Crash_hour',
 'Crash_week',
 'Agency Name_Maryland_National_Capital',
 'Agency Name_Montgomery_County_Police',
 'Agency Name_Rockville_Police_Department',
 'Agency Name_Takoma_Park_Police_Department',
 'ACRS Report Type_Injury Crash',
 'ACRS Report Type_Property Damage Crash',
 'Route Type_County_Route',
 'Route Type_Crossover',
 'Route Type_Government_Route',
 'Route Type_Interstate_Route',
 'Route Type_Local_Route',
 'Route Type_Maryland_State_Route',
 'Route Type_Municipality_Route',
 'Route Type_Other_Public_Roadway',
 'Route Type_Private_Route',
 'Route Type_Ramp_Route',
 'Route Type_Service_Road',
 'Route Type_Spur_Ro