In [1]:
# Mount Google Drive and authenticate the Colab user.
# The drive is mounted exactly once: a second drive.mount() on the same mount
# point only prints "Drive already mounted ..." and does nothing else.
from google.colab import auth, drive

drive.mount('/content/drive', force_remount=True)
auth.authenticate_user()


Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler

# resample the dataset
from sklearn.utils import resample
from sklearn.utils import shuffle

# import various functions from sklearn
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier


# import the XGBoost function for classification
from xgboost import XGBClassifier

import random

# Set the four pandas display options below to None: rows, columns and cell
# contents are then shown without truncation, and the display width is left
# for pandas to determine.
for _display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)
del _display_option  # do not leave the loop variable in the notebook namespace

In [3]:
# Load the encoded dataset CSV from the mounted Google Drive into `data`.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# path — confirm the folder and file name on Drive before re-running.
data = pd.read_csv('/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/0_dataset/3.FEATURED_ENGINEERING DATASET/Encoded_Data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/0_dataset/3.FEATURED_ENGINEERING DATASET/Encoded_Data.csv'

In [None]:
# Work on a copy so the freshly loaded `data` frame itself is never modified.
df=data.copy()

In [None]:
df.head(3)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a stray index column from an
# earlier CSV export — confirm). Rebinding `df` with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df.shape

## Model-1: Master + Validation Split and Train–Test Split
### Step 1: Load Encoded Dataset
We load the complete encoded dataset which will be used to create splits.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Name of the label column; every other column of `df` is treated as a feature.
target = "Injury Severity"

# Separate the label Series from the feature matrix.
y = df[target]
X = df.drop(columns=[target])

print("Dataset loaded:", df.shape)


### Step 2: Create Master (90%) and Validation (10%) Split
Master dataset is used for model training and testing.  
Validation dataset is for final unbiased evaluation.


In [None]:
# Hold out a stratified 10% of the rows as the validation part; the remaining
# 90% becomes the master part. The seed is fixed so the split is repeatable.
(
    X_major, X_val,
    y_major, y_val,
) = train_test_split(X, y, test_size=0.10, random_state=42, stratify=y)

# Put features and label back together, column-wise, for each part.
val_df = pd.concat([X_val, y_val], axis=1)
major_df = pd.concat([X_major, y_major], axis=1)

print("Master dataset:", major_df.shape)
print("Validation dataset:", val_df.shape)


In [None]:
val_df.head(2)

### Step 3: Save Master and Validation Files
Both files are saved in the Model-1 folder for later use.


In [None]:
import os

# Drive folder that receives the Model-1 split files. The trailing slash is
# kept because the file names are appended by plain string concatenation.
save_path = "/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/08_MODELS/Model1_InjurySeverity/"

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(save_path, exist_ok=True)

# Write both splits without the pandas index so they reload with the same columns.
major_df.to_csv(save_path + "model_1_master_data.csv", index=False)
val_df.to_csv(save_path + "model_1_validation_data.csv", index=False)

print("Saved model_1_master_data.csv and model_1_validation_data.csv")


In [None]:
# Reload the master split from Drive; this is the same folder and file name
# that the save cell above writes `major_df` to.
df_master_data = pd.read_csv('/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/08_MODELS/Model1_InjurySeverity/model_1_master_data.csv')


In [None]:
df_master_data.head(2)

# IMPORTANT USER-DEFINED FUNCTIONS

### UNIVERSAL MODEL FUNCTION

In [None]:
# ================================================================
# UNIVERSAL MODEL FUNCTION (simple + clear scaling logic)
# ================================================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

def run_model(model, X, y, test_size=0.20, scaled=False, threshold=0.5):
    """
    Split the data, optionally scale it, fit the model and return predictions.

    Parameters
    ----------
    model : estimator or str
        An object exposing fit / predict / predict_proba (sklearn style), or the
        string "stats" to fit a statsmodels Logit instead.
    X : pandas.DataFrame
        Feature matrix (must be a DataFrame: numeric columns are looked up by dtype).
    y : array-like
        Target values; also used to stratify the split.
    test_size : float, default 0.20
        Fraction of rows placed in the test split.
    scaled : bool, default False
        True  → standardise the numeric columns (scaler fitted on train only).
        False → no scaling.
    threshold : float, default 0.5
        Probability cut-off for the predicted class; used only when model == "stats".

    Returns
    -------
    tuple
        (fitted_model, X_train, X_test, y_train, y_test,
         ypred_train, ypred_test, yproba_train, yproba_test)

        For a two-class target, yproba_* is the 1-D probability of the second
        class. For more than two classes it is the full (n_samples, n_classes)
        matrix from predict_proba, because a single column is not meaningful there.
    """

    # 1) Train–Test Split (fixed seed, stratified on the target)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    # 2) Scaling if selected
    if scaled:
        num_cols = X.select_dtypes(include='number').columns

        # Nothing to scale when there are no numeric columns; the scaler would
        # otherwise fail on an empty selection.
        if len(num_cols) > 0:
            # Explicit copies: the assignments below must never write through
            # to the caller's X, and must not trigger chained-assignment warnings.
            X_train = X_train.copy()
            X_test = X_test.copy()

            scaler = StandardScaler()
            X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
            X_test[num_cols] = scaler.transform(X_test[num_cols])

    # 3) Statsmodels Logit
    # isinstance guard: estimators are never compared against a string.
    if isinstance(model, str) and model == "stats":
        # has_constant="add" forces the intercept column into BOTH matrices.
        # The default ("skip") omits it whenever a split already happens to
        # contain a constant column, which would leave train and test with
        # different columns.
        X_train_c = sm.add_constant(X_train, has_constant="add")
        X_test_c = sm.add_constant(X_test, has_constant="add")

        logit = sm.Logit(y_train, X_train_c).fit(disp=False)

        yproba_train = logit.predict(X_train_c)
        yproba_test = logit.predict(X_test_c)

        ypred_train = (yproba_train >= threshold).astype(int)
        ypred_test = (yproba_test >= threshold).astype(int)

        return logit, X_train, X_test, y_train, y_test, ypred_train, ypred_test, yproba_train, yproba_test

    # 4) Normal sklearn model
    model.fit(X_train, y_train)

    ypred_train = model.predict(X_train)
    ypred_test = model.predict(X_test)

    yproba_train = model.predict_proba(X_train)
    yproba_test = model.predict_proba(X_test)

    # Two classes: keep the historical 1-D "probability of the second class".
    # More classes: return the whole matrix instead of an arbitrary single column.
    if yproba_train.shape[1] == 2:
        yproba_train = yproba_train[:, 1]
        yproba_test = yproba_test[:, 1]

    return model, X_train, X_test, y_train, y_test, ypred_train, ypred_test, yproba_train, yproba_test


### METRICS FUNCTION

In [None]:
# ================================================================
# METRICS FUNCTION (Train + Test separate + Binary/Multiclass safe)
# ================================================================

import pandas as pd
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score,
    f1_score, roc_auc_score, cohen_kappa_score,
    classification_report, confusion_matrix
)

# Running results table shared by all model cells: metrics() appends one row
# per evaluated split to this frame.
d = pd.DataFrame(
    columns=pd.Index([
        'Model_Name', 'Split',
        'Accuracy', 'Recall', 'Precision', 'F1-Score', 'Kappa', 'ROC-AUC',
    ])
)

def metrics(model_name, y_train, pred_train, proba_train,
            y_test, pred_test, proba_test):
    """
    Score one model on its train and test split and log both rows in `d`.

    NOTE: this function is named `metrics`, so once this cell runs it replaces
    the `sklearn.metrics` module that the import cell bound to the same name.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model_Name' column and shown in the printed header.
    y_train, y_test : array-like
        True labels of each split.
    pred_train, pred_test : array-like
        Predicted labels of each split.
    proba_train, proba_test : array-like or None
        Predicted probabilities. For a two-class target: scores of the positive
        class. For a multiclass target: a (n_samples, n_classes) matrix enables a
        one-vs-rest ROC-AUC; any other shape leaves ROC-AUC empty.

    Returns
    -------
    pandas.DataFrame
        The global results table `d`, including the two rows just appended.
    """

    global d

    # Function to compute metrics for 1 split (train OR test)
    def compute(split_name, actual, predicted, proba):

        unique_classes = len(pd.Series(actual).unique())
        is_binary = (unique_classes == 2)
        # Binary targets use the positive-class score; otherwise scores are
        # averaged over classes weighted by their support.
        avg = "binary" if is_binary else "weighted"

        acc  = accuracy_score(actual, predicted)
        rec  = recall_score(actual, predicted, average=avg)
        pre  = precision_score(actual, predicted, average=avg)
        f1   = f1_score(actual, predicted, average=avg)
        kap  = cohen_kappa_score(actual, predicted)

        auc = None
        if proba is not None:
            if is_binary:
                auc = roc_auc_score(actual, proba)
            elif np.ndim(proba) == 2 and np.shape(proba)[1] == unique_classes:
                # Multiclass AUC needs one probability column per class present
                # in `actual`; weighted averaging matches the other metrics.
                auc = roc_auc_score(actual, proba, multi_class="ovr", average="weighted")

        # append to global dataframe
        d.loc[len(d)] = [model_name, split_name, acc, rec, pre, f1, kap, auc]

        # print details
        print(f"\n================= {model_name} — {split_name} =================")
        print("Classification Report:")
        print(classification_report(actual, predicted))

        print("Confusion Matrix:")
        print(confusion_matrix(actual, predicted))

        if auc is not None:
            print("ROC-AUC:", auc)

    # ---- TRAIN METRICS ----
    compute("Train", y_train, pred_train, proba_train)

    # ---- TEST METRICS ----
    compute("Test", y_test, pred_test, proba_test)

    return d

### Feature Importance Function

In [None]:
def fi(model, x, n_features=10):
    """
    Rank features by a fitted model's importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing `feature_importances_`.
    x : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.
    n_features : int, default 10
        Number of top-ranked rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Importance', sorted by importance (highest
        first) and cut to the first `n_features` rows.
    """
    importance_table = pd.DataFrame(
        {"Feature": x.columns, "Importance": model.feature_importances_}
    )
    ranked = importance_table.sort_values("Importance", ascending=False)
    return ranked.head(n_features)


### ROC Curve

In [None]:
def plot_roc_plain(y_test, yproba_test):
    """
    Draw an undecorated ROC curve plus the (0,0)–(1,1) diagonal and show it.

    Parameters
    ----------
    y_test : array-like
        True labels passed to `roc_curve`.
    yproba_test : array-like
        Predicted scores passed to `roc_curve`.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, yproba_test)

    plt.plot(false_pos_rate, true_pos_rate)  # the model's curve
    plt.plot([0, 1], [0, 1])                 # diagonal reference line

    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("ROC Curve")
    plt.show()


# MODEL BUILDING AND TRAINING


### Logit Model (Statsmodels Logistic Regression)

### Why Statsmodels Logit Failed

- Statsmodels **Logit requires a binary target** (only 0 and 1).
- Our Injury Severity column has **5 classes** (0, 1, 2, 3, 4).
- Because the target is **multi-class**, it contains values outside the 0–1 range that Logit accepts for its response variable.
- Therefore, Logit throws the error: *“endog must be in the unit interval.”*
- A multi-class model is needed instead (for example statsmodels `MNLogit`, or the sklearn logistic regression used below).


## Logistic Regression (Sklearn) — Model Call
This model supports multi-class Injury Severity and works correctly without converting to binary.


In [None]:
# Rebuild the label Series and feature matrix from the master split only
# (`target` is the label column name defined earlier in the notebook).
y = df_master_data[target]
X = df_master_data.drop(columns=[target])

In [None]:
from sklearn.linear_model import LogisticRegression

# Define model.
# The `multi_class` argument is deprecated since scikit-learn 1.5 and scheduled
# for removal; with the default lbfgs solver a target with three or more
# classes is already fitted as a multinomial model, so the argument is dropped
# to keep this cell running on newer releases.
log_reg = LogisticRegression()

# Run model using universal function (scaled=True: numeric features are standardised)
log_reg, X_train_lr, X_test_lr, y_train_lr, y_test_lr, ypred_train_lr, ypred_test_lr, yproba_train_lr, yproba_test_lr = run_model(
    model=log_reg,
    X=X,
    y=y,
    test_size=0.20,
    scaled=True
)

# Record train and test scores under the "LogisticRegression" label
metrics(
    model_name="LogisticRegression",
    y_train=y_train_lr,
    pred_train=ypred_train_lr,
    proba_train=yproba_train_lr,
    y_test=y_test_lr,
    pred_test=ypred_test_lr,
    proba_test=yproba_test_lr
)

## Decision Tree Classifier — Model Call
Simple non-linear classifier, works well without scaling.


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=42)

# Fit on the unscaled features and unpack the nine values run_model returns.
(
    dt,
    X_train_dt, X_test_dt,
    y_train_dt, y_test_dt,
    ypred_train_dt, ypred_test_dt,
    yproba_train_dt, yproba_test_dt,
) = run_model(model=dt, X=X, y=y, test_size=0.20, scaled=False)

# Record train and test scores under the "DecisionTree" label.
metrics(
    "DecisionTree",
    y_train_dt, ypred_train_dt, yproba_train_dt,
    y_test_dt, ypred_test_dt, yproba_test_dt,
)

## Random Forest Classifier — Model Call
Ensemble of decision trees, robust to imbalance and noise.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so repeated runs build the same ensemble.
rf = RandomForestClassifier(random_state=42)

# Fit on the unscaled features and unpack the nine values run_model returns.
(
    rf,
    X_train_rf, X_test_rf,
    y_train_rf, y_test_rf,
    ypred_train_rf, ypred_test_rf,
    yproba_train_rf, yproba_test_rf,
) = run_model(model=rf, X=X, y=y, test_size=0.20, scaled=False)

# Record train and test scores under the "RandomForest" label.
metrics(
    "RandomForest",
    y_train_rf, ypred_train_rf, yproba_train_rf,
    y_test_rf, ypred_test_rf, yproba_test_rf,
)

## Gradient Boosting Classifier — Model Call
Boosting method that handles complex patterns; scaling not needed.


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with a fixed seed so repeated runs give the same model.
gb = GradientBoostingClassifier(random_state=42)

# Fit on the unscaled features and unpack the nine values run_model returns.
(
    gb,
    X_train_gb, X_test_gb,
    y_train_gb, y_test_gb,
    ypred_train_gb, ypred_test_gb,
    yproba_train_gb, yproba_test_gb,
) = run_model(model=gb, X=X, y=y, test_size=0.20, scaled=False)

# Record train and test scores under the "GradientBoosting" label.
metrics(
    "GradientBoosting",
    y_train_gb, ypred_train_gb, yproba_train_gb,
    y_test_gb, ypred_test_gb, yproba_test_gb,
)

## AdaBoost Classifier — Model Call
Boosting model good for imbalanced classes.


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with a fixed seed so repeated runs give the same model.
ada = AdaBoostClassifier(random_state=42)

# Fit on the unscaled features and unpack the nine values run_model returns.
(
    ada,
    X_train_ada, X_test_ada,
    y_train_ada, y_test_ada,
    ypred_train_ada, ypred_test_ada,
    yproba_train_ada, yproba_test_ada,
) = run_model(model=ada, X=X, y=y, test_size=0.20, scaled=False)

# Record train and test scores under the "AdaBoost" label.
metrics(
    "AdaBoost",
    y_train_ada, ypred_train_ada, yproba_train_ada,
    y_test_ada, ypred_test_ada, yproba_test_ada,
)

## XGBoost Classifier — Model Call
High-performance boosting model; does not require scaling.


In [None]:
from xgboost import XGBClassifier

# Define model.
# `use_label_encoder` is omitted: it is a deprecated XGBoost parameter that
# current releases no longer use (passing it only produces a warning).
# eval_metric="mlogloss" is the multiclass log-loss.
xgb = XGBClassifier(
    random_state=42,
    eval_metric="mlogloss"
)

# Run model on the unscaled features
xgb, X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb, ypred_train_xgb, ypred_test_xgb, yproba_train_xgb, yproba_test_xgb = run_model(
    model=xgb,
    X=X,
    y=y,
    test_size=0.20,
    scaled=False
)

# Record train and test scores under the "XGBoost" label
metrics(
    model_name="XGBoost",
    y_train=y_train_xgb,
    pred_train=ypred_train_xgb,
    proba_train=yproba_train_xgb,
    y_test=y_test_xgb,
    pred_test=ypred_test_xgb,
    proba_test=yproba_test_xgb
)

# Full Model Comparison & Inference Summary (Train vs Test)

---

## Table 1 — Performance Summary

| Model               | Train Acc | Test Acc | Gap (Train−Test) | Overfitting Status       | Business Interpretation |
|--------------------|-----------|----------|------------------|---------------------------|-------------------------|
| Logistic Regression | 0.832     | 0.829    | 0.003            | ✅ No Overfitting         | Stable, reliable, interpretable baseline model. |
| Decision Tree       | 1.000     | 0.796    | 0.204            | ⚠️ Severe Overfitting     | Memorizes data; poor real-world performance. Avoid without pruning. |
| Random Forest       | 1.000     | 0.830    | 0.170            | ⚠️ Strong Overfitting     | High train accuracy but weak generalization; needs tuning. |
| Gradient Boosting   | 0.839     | 0.835    | 0.004            | ✅ No Overfitting         | Strong generalization; excellent production candidate. |
| AdaBoost            | 0.825     | 0.824    | 0.001            | ✅ No Overfitting         | Stable and reliable; consistent performance. |
| XGBoost             | 0.882     | 0.837    | 0.045            | ⚠️ Mild Overfitting       | High accuracy; needs tuning but very effective. |

---

## Table 2 — Overfitting/Underfitting Check

| Model               | Status              | Reason |
|--------------------|----------------------|--------|
| Logistic Regression | Balanced             | Train ≈ Test, no overfit |
| Decision Tree       | Severe Overfitting   | Train = 100%, huge gap vs test |
| Random Forest       | Overfitting          | Train = 100%, lower test accuracy |
| Gradient Boosting   | Balanced             | Very small gap |
| AdaBoost            | Balanced             | Train and test accuracy nearly identical |
| XGBoost             | Mild Overfitting     | Slight drop in test |

---

## Table 3 — Business Interpretation

| Model               | Business Impact | Use Case |
|--------------------|-----------------|----------|
| Logistic Regression | High interpretability; stable | Policy, audit reporting, explainable AI |
| Decision Tree       | Unstable; risky | Avoid for deployment |
| Random Forest       | Strong model but needs tuning | Post-tuning severity prediction |
| Gradient Boosting   | Best stability + accuracy | Real-time severity prediction |
| AdaBoost            | Very reliable | Safe deployment; low risk |
| XGBoost             | Highest accuracy | Best choice after tuning |

---

# Final Verdict (Short Summary)

| Rank | Model             | Reason |
|------|------------------|--------|
| 1 | Gradient Boosting | Best balance of accuracy + stability |
| 2 | XGBoost           | Highest accuracy; slight overfit |
| 3 | AdaBoost          | Stable and consistent |
| 4    | Logistic Regression | Strong interpretable baseline |
| 5    | Random Forest     | Overfits; needs tuning |
| 6    | Decision Tree     | Too unstable |

---

## Model Selection Summary for Further Tuning & Feature Alteration

| Model              | Why Consider / Not Consider for Further Tuning |
|--------------------|------------------------------------------------|
| **XGBoost**     | **Best choice.** Highest Test F1-score and Kappa → strongest generalization. Handles high-dimensional (167 encoded features) data very well. Learns complex non-linear feature interactions. Highly sensitive to hyperparameter tuning and feature engineering, giving maximum upside. |
| Logistic Regression | Good baseline but **linear model**. Limited ability to capture complex interactions in heavily encoded feature space. Gains from tuning and feature alteration are usually marginal. |
| Decision Tree      | **Severe overfitting** (Train = 1.0). Very unstable with many features. Poor generalization → not suitable for reliable feature experimentation. |
| Random Forest      | Strong but **fully overfitted on train (1.0)**. Less responsive to fine feature engineering. Improvements from tuning are usually incremental compared to XGBoost. |
| Gradient Boosting  | Decent performance and stability, but lower Test F1 and Kappa than XGBoost. Less flexible and slower to scale with large feature sets. |
| AdaBoost           | Works better with weak learners and simpler feature spaces. Lower performance metrics and limited gains from extensive tuning with many encoded features. |

### Final Decision
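**Select XGBoost for further tuning and feature alteration** due to its superior generalization, robustness to high-dimensional data, and highest potential performance gains. Gradient Boosting ranks first in the untuned comparison above for its balance of accuracy and stability, but XGBoost has the highest test accuracy and the most room to improve once its mild overfitting is addressed through tuning.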
