# Injury Severity Prediction using Neural Network

This notebook implements a complete machine learning pipeline
to predict injury severity in road crashes.

The focus is on:
- Clean data separation
- Avoiding data leakage
- Industry-standard evaluation
- Neural networks for structured data

The workflow follows real production and research practices.


In [None]:
from google.colab import auth, drive

# Mount Google Drive exactly once. A second drive.mount() call in the same
# cell is a no-op that only prints "Drive already mounted", so it was removed.
# force_remount=True is kept so that re-running this cell refreshes the mount.
drive.mount('/content/drive', force_remount=True)

# NOTE(review): nothing in the visible cells uses these credentials; kept
# because later cells may rely on them -- confirm and remove if unused.
auth.authenticate_user()


Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Lift every pandas display limit so wide/long frames render untruncated.
for option_name in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

# Model-1: Neural Network using Master and Validation Dataset Split

This notebook builds a neural network model
to predict Injury Severity from raw crash data.

The workflow follows a strict data separation strategy
to avoid data leakage and ensure reliable evaluation.

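The dataset is first divided into:
- Master dataset (90%) for model development, which is further split into train (80%) and test (20%) sets
- Validation dataset (10%) for final unbiased evaluation

Note that in this notebook the test set is used to monitor training (including early stopping), while the validation set is the held-out set used only for final evaluation.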


## Loading the Fully Cleaned Dataset (split into Master and Validation sets below)

In [None]:
# Location of the cleaned crash dataset on the mounted Google Drive.
cleaned_dataset_path = '/content/drive/MyDrive/CAPSTONE/CAPSTONE_PROJECT/0_dataset/2_CLEANED DATASET/FULLY_CLEANED DATASET.xlsx'

# Read the workbook into a DataFrame.
data = pd.read_excel(cleaned_dataset_path)

In [None]:
# Work on an independent deep copy so the loaded frame `data` stays untouched.
df = data.copy(deep=True)

# Preview the first five rows (last expression -> rich display).
df.head(5)

Unnamed: 0.1,Unnamed: 0,Report Number,Local Case Number,Agency Name,ACRS Report Type,Crash Date/Time,Route Type,Road Name,Cross-Street Name,Collision Type,Weather,Surface Condition,Light,Traffic Control,Driver Substance Abuse,Person ID,Driver At Fault,Injury Severity,Driver Distracted By,Drivers License State,Vehicle ID,Vehicle Damage Extent,Vehicle First Impact Location,Vehicle Body Type,Vehicle Movement,Vehicle Going Dir,Speed Limit,Driverless Vehicle,Parked Vehicle,Vehicle Year,Vehicle Make,Latitude,Longitude,Vehicle Model,Circumstance_Category,hour,Crash_year,Crash_month,Crash_day,Crash_hour,Crash_day_name,Crash_date,Crash_week
0,0,MCP3126006X,250037402,Montgomery_County_Police,Injury Crash,2025-08-21 17:21:00,Maryland_State_Route,FREDERICK RD,TILTON DR,Rear-End (Same Direction),Clear,Dry,Daylight,No Control,None Detected,BB3CB0F3-5A89-45FB-9516-48DDDB92B0A9,Yes,No Apparent Injury,Other Distraction,MD,768C98FA-C137-47BC-BE44-EE3BA4B95F66,Superficial,Twelve O Clock,PassengerCar,MovingConstantSpeed,Northbound,40.0,No,No,2013.0,KIA,39.219796,-77.257416,Soul_Kia,Tailgating,17,2025,8,21,17,Thursday,2025-08-21,34
1,1,MCP2349001B,250037516,Montgomery_County_Police,Property Damage Crash,2025-08-22 10:44:00,Interstate_Route,EISENHOWER MEMORIAL HWY,DIAMONDBACK DR,Single Vehicle,Clear,Dry,Daylight,No Control,Alcohol Present,9B84E695-215A-447E-8AA6-D3958187BBCA,No,Suspected Minor Injury,Inattentive / Lost In Thought,XX,BC322ECD-006B-4919-AAF8-3F64D934B789,Vehicle Not at Scene,VehicleNotAtScene,PickupTruck,MovingConstantSpeed,Northbound,55.0,No,No,2018.0,NISSAN,39.180181,-77.250657,Altima_Nissan,Lane Violation,10,2025,8,22,10,Friday,2025-08-22,34
2,2,MCP296500BC,250033157,Montgomery_County_Police,Property Damage Crash,2025-07-25 11:55:00,Bicycle_Route,AIRPARK RD,NEW HAMPSHIRE AVE (SB/L) NORBECK RD (WB/L) SPENCERVILLE RD (WB/L),Sideswipe (Same Direction),Clear,Dry,Daylight,Traffic Signal,None Detected,1D28ADF4-0DB2-4CBC-BDB0-1C1F5E7CF955,No,No Apparent Injury,Not Distracted,CO,1F4EBE18-DB94-4CA7-8D9A-88C30E90400D,Superficial,Seven O Clock,PassengerCar,MovingConstantSpeed,Westbound,40.0,No,No,2023.0,LEXUS,39.121219,-76.988905,RX_Lexus,Lane Violation,11,2025,7,25,11,Friday,2025-07-25,30
3,3,MCP2159003K,250037509,Montgomery_County_Police,Property Damage Crash,2025-08-22 10:36:00,Maryland_State_Route,CLOPPER RD,SPUR TO SHADY GROVE RD,Rear-End (Same Direction),Clear,Dry,Daylight,Flashing Traffic Signal,None Detected,AE9A3389-3486-4199-B8F6-015D7D2E1139,Yes,No Apparent Injury,Cell Phone (Manual Use),MD,AAEB6B5A-30B2-47D3-BF59-7F14D0A5BCAD,Disabling,Twelve O Clock,Van - Passenger (&lt;9 Seats),SlowingOrStopping,Southbound,30.0,No,No,2003.0,TOYOTA,39.207931,-77.14148,Sienna_Toyota,Tailgating,10,2025,8,22,10,Friday,2025-08-22,34
4,4,MCP312900D6,250034573,Montgomery_County_Police,Property Damage Crash,2025-08-03 14:10:00,Maryland_State_Route,E RANDOLPH RD,BRIARDALE RD,Rear vs Side,Clear,Dry,Daylight,Traffic Signal,None Detected,3B4FB53F-9543-48EA-8C28-14AC093FBC36,No,No Apparent Injury,Not Distracted,MD,B683B035-8C9F-45F7-BDB5-F9141CCF160D,Vehicle Not at Scene,VehicleNotAtScene,SportUtilityVehicle,Backing,Not On Roadway,2.5,No,No,2023.0,SUBARU,39.039662,-77.057238,Impreza_Subaru,Backing Error,14,2025,8,3,14,Sunday,2025-08-03,31


In [None]:
# ============================================================
# NEURAL NETWORK PIPELINE: SPLIT -> ENCODE -> PREPROCESS -> TRAIN
# ============================================================
# Produces (module-level): X/y splits, label_encoder, preprocessor,
# X_*_prep / y_*_enc arrays, model, history, val_loss, val_accuracy.

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable across Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# ------------------------------------------------------------
# STEP 1: DEFINE TARGET AND DROP COLUMNS
# ------------------------------------------------------------
target_col = "Injury Severity"

# Row-index artifacts, record identifiers, raw timestamps and coordinates.
# "Unnamed: 0.1" is a second leftover index column (visible in df.head());
# leaving it in would feed the row number to the model as a feature.
drop_cols = [
    "Unnamed: 0.1",
    "Unnamed: 0",
    "Report Number",
    "Local Case Number",
    "Person ID",
    "Vehicle ID",
    "Crash Date/Time",
    "Crash_date",
    "Latitude",
    "Longitude"
]

# NOTE(review): "ACRS Report Type" (e.g. "Injury Crash" / "Property Damage
# Crash" in the preview) looks like it is derived from the crash outcome and
# may leak the target -- confirm with the data dictionary and drop it if so.
X = df.drop(columns=[target_col] + drop_cols)
y = df[target_col]

# ------------------------------------------------------------
# STEP 2: MASTER (90%) + VALIDATION (10%) SPLIT
# ------------------------------------------------------------
X_master, X_val, y_master, y_val = train_test_split(
    X, y,
    test_size=0.10,
    stratify=y,
    random_state=SEED
)

# ------------------------------------------------------------
# STEP 3: TRAIN (80%) + TEST (20%) FROM MASTER
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_master, y_master,
    test_size=0.20,
    stratify=y_master,
    random_state=SEED
)

# ------------------------------------------------------------
# STEP 4: TARGET ENCODING (string labels -> integer class ids)
# ------------------------------------------------------------
# Fit on the training labels only; the other splits are transformed with the
# same mapping. int32 ids are what sparse_categorical_crossentropy expects.
label_encoder = LabelEncoder()

y_train_enc = label_encoder.fit_transform(y_train).astype("int32")
y_test_enc  = label_encoder.transform(y_test).astype("int32")
y_val_enc   = label_encoder.transform(y_val).astype("int32")

# ------------------------------------------------------------
# STEP 5: FEATURE TYPES
# ------------------------------------------------------------
# Numeric columns are scaled; every remaining column is treated as
# categorical. Selecting by "number" (rather than by "object") keeps this
# correct if string columns use a non-object dtype.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.columns.difference(num_cols, sort=False)

# ------------------------------------------------------------
# STEP 6: MEMORY-SAFE PREPROCESSING
# ------------------------------------------------------------
# min_frequency=50 groups rare categories together to limit the one-hot
# width; handle_unknown="ignore" encodes categories unseen in training as
# all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(
            handle_unknown="ignore",
            min_frequency=50,
            sparse_output=True
        ), cat_cols)
    ]
)

# ------------------------------------------------------------
# STEP 7: APPLY PREPROCESSING (fit on train only -> no leakage)
# ------------------------------------------------------------
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep  = preprocessor.transform(X_test)
X_val_prep   = preprocessor.transform(X_val)

# ------------------------------------------------------------
# STEP 8: SPARSE -> DENSE float32
# ------------------------------------------------------------
def _to_dense_float32(matrix):
    """Return `matrix` as a dense float32 ndarray.

    ColumnTransformer returns a sparse matrix or a dense array depending on
    the overall density, so both cases are handled here.
    """
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    return np.asarray(matrix, dtype="float32")


X_train_prep = _to_dense_float32(X_train_prep)
X_test_prep  = _to_dense_float32(X_test_prep)
X_val_prep   = _to_dense_float32(X_val_prep)

# ------------------------------------------------------------
# STEP 9: NEURAL NETWORK
# ------------------------------------------------------------
num_classes = len(label_encoder.classes_)

# Explicit Input layer instead of the deprecated `input_shape=` argument on
# the first Dense layer.
model = Sequential([
    Input(shape=(X_train_prep.shape[1],)),
    Dense(256, activation="relu"),
    Dropout(0.4),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dense(num_classes, activation="softmax")
])

model.compile(
    optimizer=Adam(0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# ------------------------------------------------------------
# STEP 10: TRAIN (test split is used only to monitor training)
# ------------------------------------------------------------
history = model.fit(
    X_train_prep,
    y_train_enc,
    validation_data=(X_test_prep, y_test_enc),
    epochs=25,
    batch_size=64,
    verbose=1
)

# ------------------------------------------------------------
# STEP 11: FINAL VALIDATION EVALUATION
# ------------------------------------------------------------
# NOTE(review): the held-out validation split is evaluated again in later
# cells after further tuning; repeated looks at it weaken the "unbiased"
# claim -- consider scoring it once, at the very end.
val_loss, val_accuracy = model.evaluate(
    X_val_prep,
    y_val_enc,
    verbose=0
)

print("Validation Accuracy:", val_accuracy)
print("Target Classes:", label_encoder.classes_)
print("Target Classes:", label_encoder.classes_)

Epoch 1/25
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - accuracy: 0.8158 - loss: 0.4433 - val_accuracy: 0.8304 - val_loss: 0.3659
Epoch 2/25
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.8310 - loss: 0.3665 - val_accuracy: 0.8305 - val_loss: 0.3613
Epoch 3/25
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8373 - loss: 0.3547 - val_accuracy: 0.8342 - val_loss: 0.3603
Epoch 4/25
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8412 - loss: 0.3454 - val_accuracy: 0.8347 - val_loss: 0.3587
Epoch 5/25
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8448 - loss: 0.3389 - val_accuracy: 0.8338 - val_loss: 0.3599
Epoch 6/25
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8514 - loss: 0.3279 - val_accuracy: 0.8330 - val_loss: 0.3646
Epoch 7/25
[1m

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import Adam

# Start from freshly initialised weights. Calling fit() again on the model
# that was already trained for 25 epochs continues from an overfit state, so
# "restore_best_weights" could only ever restore overfit weights.
# clone_model() rebuilds the same architecture with new weights; a cloned
# model is not compiled, so it is compiled again with the same settings.
model = clone_model(model)
model.compile(
    optimizer=Adam(0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Stop once the monitored loss has not improved for 3 consecutive epochs and
# roll back to the best epoch's weights.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    X_train_prep,
    y_train_enc,
    validation_data=(X_test_prep, y_test_enc),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/50
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9229 - loss: 0.1855 - val_accuracy: 0.8218 - val_loss: 0.5359
Epoch 2/50
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9245 - loss: 0.1806 - val_accuracy: 0.8199 - val_loss: 0.5425
Epoch 3/50
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9275 - loss: 0.1755 - val_accuracy: 0.8217 - val_loss: 0.5515
Epoch 4/50
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9282 - loss: 0.1731 - val_accuracy: 0.8211 - val_loss: 0.5759


In [None]:
# Score the current model on the held-out validation split.
# evaluate() returns [loss, accuracy] because the model was compiled with a
# single "accuracy" metric.
holdout_scores = model.evaluate(X_val_prep, y_val_enc, verbose=0)
val_loss, val_accuracy = holdout_scores

print("Final Validation Accuracy:", val_accuracy)

Final Validation Accuracy: 0.8203756213188171


In [None]:
from sklearn.metrics import classification_report

# Predicted class id = index of the largest softmax probability per row.
y_val_pred = model.predict(X_val_prep).argmax(axis=1)

# Pass `labels` explicitly so the rows always line up with `target_names`,
# even if some class is missing from both the true and the predicted labels.
# zero_division=0 reports 0 for a class that is never predicted.
print(classification_report(
    y_val_enc,
    y_val_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    zero_division=0
))

[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
                          precision    recall  f1-score   support

            Fatal Injury       0.62      0.84      0.71        19
      No Apparent Injury       0.90      0.94      0.92     16781
         Possible Injury       0.38      0.32      0.35      2049
  Suspected Minor Injury       0.31      0.27      0.29      1508
Suspected Serious Injury       0.36      0.10      0.16       197

                accuracy                           0.82     20554
               macro avg       0.51      0.49      0.49     20554
            weighted avg       0.80      0.82      0.81     20554



In [None]:
# ============================================================
# XGBOOST MULTI-CLASS CLASSIFICATION (FAIR COMPARISON)
# ============================================================
# Trained on the same preprocessed train split and monitored on the same
# test split as the neural network, then scored on the same validation split.

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# ------------------------------------------------------------
# STEP 1: DEFINE XGBOOST MODEL
# ------------------------------------------------------------
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=len(label_encoder.classes_),
    n_estimators=300,          # upper bound; early stopping may use fewer
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",        # fast + Colab safe
    eval_metric="mlogloss",
    early_stopping_rounds=20,  # makes eval_set useful and caps training time
    n_jobs=-1,                 # use all available CPU cores
    random_state=42
)

# ------------------------------------------------------------
# STEP 2: TRAIN XGBOOST (FAST, NO GPU NEEDED)
# ------------------------------------------------------------
# The test split drives early stopping, mirroring the neural network setup.
xgb_model.fit(
    X_train_prep,
    y_train_enc,
    eval_set=[(X_test_prep, y_test_enc)],
    verbose=False
)

# ------------------------------------------------------------
# STEP 3: VALIDATION EVALUATION
# ------------------------------------------------------------
y_val_pred_xgb = xgb_model.predict(X_val_prep)

val_acc_xgb = accuracy_score(y_val_enc, y_val_pred_xgb)

print("XGBoost Validation Accuracy:", val_acc_xgb)

# ------------------------------------------------------------
# STEP 4: CLASSIFICATION REPORT
# ------------------------------------------------------------
# Explicit `labels` keeps rows aligned with `target_names` even if a class is
# absent from both the true and predicted labels.
print(
    classification_report(
        y_val_enc,
        y_val_pred_xgb,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
        zero_division=0
    )
)

Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x7ce610e10f20>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 630, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


In [None]:
# ============================================================
# NEURAL NETWORK WITH CLASS WEIGHTS (ALL CLASSES TREATED FAIRLY)
# ============================================================

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import Adam

# ------------------------------------------------------------
# STEP 1: COMPUTE CLASS WEIGHTS
# ------------------------------------------------------------
classes = np.unique(y_train_enc)

# "balanced" weights are inversely proportional to class frequency in the
# training labels, so rare classes contribute more to the loss.
class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train_enc
)

# Plain Python int/float keys and values (instead of NumPy scalars) for Keras.
class_weights = {
    int(class_id): float(weight)
    for class_id, weight in zip(classes, class_weights_array)
}

print("Class Weights:", class_weights)

# ------------------------------------------------------------
# STEP 2: EARLY STOPPING
# ------------------------------------------------------------
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

# ------------------------------------------------------------
# STEP 3: TRAIN NEURAL NETWORK WITH CLASS WEIGHTS
# ------------------------------------------------------------
# Re-initialise the network first: continuing to train the already-fitted
# model would mix the effect of class weighting with the earlier unweighted
# training. clone_model() keeps the architecture but creates new weights, and
# the clone must be compiled again before fit().
model = clone_model(model)
model.compile(
    optimizer=Adam(0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    X_train_prep,
    y_train_enc,
    validation_data=(X_test_prep, y_test_enc),
    epochs=50,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1
)