In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# ---------------------------------------------
# Load Data
# ---------------------------------------------
dftrain = pd.read_csv('../Models/LabelEncoded.csv')

# Remove unnecessary column
dftrain = dftrain.drop(columns=['Unnamed: 0'])

# Tell XGBoost that these columns are categorical
cat_cols = ['Street','City','County','State','Airport_Code',
            'Wind_Direction','Weather_Condition','Sunrise_Sunset']

for col in cat_cols:
    dftrain[col] = dftrain[col].astype('category')

# ---------------------------------------------
# Split X, y
# ---------------------------------------------
X = dftrain.drop(columns=['Severity'])
y = dftrain['Severity'] - 1      # convert 1,2,3,4 â†’ 0,1,2,3

# Correct train-test split (NO shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, shuffle=True
)

# ---------------------------------------------
# K-Fold Training
# ---------------------------------------------
FOLDS = 3
SEED = 1004

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

predictions = np.zeros((y_test.shape[0], 4))
val_scores = []
train_scores = []

counter = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\n########### FOLD {fold+1} ###########")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = XGBClassifier(
        n_estimators=1000,
        max_depth=3,
        eta=0.2,
        colsample_bytree=0.67,
        tree_method='hist',               # CPU friendly
        enable_categorical=True,
        eval_metric='mlogloss',
        n_jobs=-1,
        verbosity=0
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        verbose=False
    )

    # Predict
    train_pred = model.predict_proba(X_tr)
    val_pred = model.predict_proba(X_val)

    # AUC (multiclass)
    t_auc = roc_auc_score(y_tr, train_pred, multi_class='ovr')
    v_auc = roc_auc_score(y_val, val_pred, multi_class='ovr')

    train_scores.append(t_auc)
    val_scores.append(v_auc)

    print(f"Train AUC: {t_auc:.4f} | Val AUC: {v_auc:.4f}")

    # Add to final predictions if validation quality is decent
    if v_auc > 0.70:
        predictions += model.predict_proba(X_test)
        counter += 1

# ---------------------------------------------
# Final average predictions
# ---------------------------------------------
if counter > 0:
    predictions /= counter

print("\n====== FINAL RESULTS ======")
print("Mean Train AUC:", np.mean(train_scores))
print("Mean Val AUC:", np.mean(val_scores))

# # Save booster
# model.get_booster().save_model("xgb_model.json")
# print("\nModel saved as xgb_model.json")



########### FOLD 1 ###########
Train AUC: 0.9859 | Val AUC: 0.8656

########### FOLD 2 ###########
Train AUC: 0.9858 | Val AUC: 0.8820

########### FOLD 3 ###########
Train AUC: 0.9852 | Val AUC: 0.9033

Mean Train AUC: 0.9856287901552641
Mean Val AUC: 0.8836401876656091

Model saved as xgb_model.json


In [2]:
model.save_model("xgb_model_.json")

In [3]:
# import xgboost as xgb

# model = xgb.Booster()
# model.load_model("xgb_model_.json")


In [4]:
dftrain['Severity'].value_counts()

Severity
2    196190
3    103231
4     14771
1        93
Name: count, dtype: int64

In [8]:
X.columns

Index(['Year', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Street', 'City',
       'County', 'State', 'Airport_Code', 'Temperature(F)', 'Wind_Chill(F)',
       'Visibility(mi)', 'Wind_Direction', 'Weather_Condition',
       'Traffic_Signal', 'Sunrise_Sunset', 'TimeDiff'],
      dtype='object')