# Final XGBoost model

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from tqdm.auto import tqdm
import pickle

## Background

In a previous step I trained and tuned 3 different models and found that the best performer was the XGBoost model tuned with the following hyperparameters:

- num_boost_round_final = 43
- eta_final = 0.3
- max_depth_final = 4
- min_child_weight_final=5

I retrain this model here for ease of reproducibility: some of the steps in the training notebook (where I search for the ideal parameters) can take a long time, so this notebook is faster to run.

In [17]:
# The file carries an "Unnamed: 0" column that appears to be a saved row index
# (0, 1, ...). Drop it so a row counter is not passed to the model as a feature.
# errors="ignore" keeps the cell working if the CSV is regenerated without it.
df_collision = pd.read_csv("data/collisions_final.csv").drop(columns=["Unnamed: 0"], errors="ignore")
df_collision.head(2)

Unnamed: 0,police_force,number_of_vehicles,day_of_week,time,first_road_class,road_type,speed_limit,light_conditions,weather_conditions,road_surface_conditions,is_severe,month,day_of_year,is_trunk,is_near_pedestrian_crossing,is_urban,has_special_conditions_at_site,is_carriageway_hazard,is_near_junction
0,metropolitan_police,1,sunday,01:00,c,one_way_street,20,darkness___lights_lit,other_adverse_weather_condition,wet_or_damp,0,january,1,0,1,1,0,0,1
1,metropolitan_police,3,sunday,02:00,unclassified,single_carriageway,30,darkness___lights_lit,fine_no_high_winds,dry,0,january,1,0,1,1,0,0,1


## Prepare data

In [18]:
# 60/20/20 split: hold out 20% for testing, then carve 25% of the remainder for validation.
# df_full_train keeps its target column here because the K-fold validation reads it.
df_full_train, df_test = train_test_split(df_collision, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() removes the target column from each frame and hands it back in one step.
y_train, y_val, y_test = (
    part.pop("is_severe").values for part in (df_train, df_val, df_test)
)

In [19]:
def train(df_training, y_training, eta=0.3, max_depth=4, min_child_weight=5, num_boost_round=43):
    """Fit a DictVectorizer and an XGBoost binary classifier on the given data

    Parameters
    ---------
    df_training: dataframe of features (target column already removed)
    y_training: np.array of target values aligned with df_training
    eta: the learning rate
    max_depth: int, maximum depth of each tree
    min_child_weight: int, minimum sum of instance weights required in a child node
    num_boost_round: number of boosting rounds

    Returns
    -------
    (dv, model): the fitted DictVectorizer and the trained XGBoost model
    """
    # One-hot encode the rows via their dict representation.
    records = df_training.to_dict(orient="records")
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit(records)
    feature_matrix = vectorizer.transform(records)
    feature_names = list(vectorizer.get_feature_names_out())

    booster_params = dict(
        booster='gbtree',
        verbosity=1,
        nthread=6,  # number of parallel threads; adjust to the machine running this
        eta=eta,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        objective='binary:logistic',
        seed=1,  # fixed seed so repeated runs give the same model
        eval_metric='auc',
    )

    train_matrix = xgb.DMatrix(feature_matrix, label=y_training, feature_names=feature_names)
    booster = xgb.train(booster_params, train_matrix, num_boost_round=num_boost_round)

    return vectorizer, booster

In [20]:
def predict(df_v, dv, model, y_test=None):
    """Calculate predictions for given dataset

    Parameters
    ---------
    df_v: the dataset to perform the prediction on (without the target column)
    dv: fitted DictVectorizer to use to transform the dataset
    model: The trained XGBoost model to use to calculate the predictions
    y_test: optional, the actual target values for df_v. Labels are not needed
        to compute predictions, so this may be omitted when scoring unlabeled
        data; when given it is attached to the DMatrix as its label.

    Returns
    -------
    The output of model.predict for each row of df_v (for the binary:logistic
    models produced by train in this notebook, the predicted probability of
    the positive class).
    """
    dicts = df_v.to_dict(orient="records")

    # Reuse the vectorizer fitted during training so columns line up with the model.
    X_test = dv.transform(dicts)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=list(dv.get_feature_names_out()))

    return model.predict(dtest)

## Validation

For the final model, I use K-fold cross-validation (`KFold`) to evaluate the model on different subsets of the full training set and then take the average `roc_auc_score`.

In [21]:
def perform_validation(df, kfold_n_splits=10, eta=0.3, max_depth=4, min_child_weight=5, num_boost_round=43, target="is_severe"):
    """Evaluate the model with K-fold cross-validation

    Parameters
    ---------
    df: The dataset (including the target column) to split into training and validation folds
    kfold_n_splits: the number of folds the df should be split into
    eta: the learning rate
    max_depth: int, the depth of the decision tree i.e. the number of levels from the root node to furthest leaf node
    min_child_weight: int, the minimum “sum of weights” of observations. Higher values are associated with less overfitting.
    num_boost_round: number of boosting rounds
    target: name of the target column in df

    Returns
    -------
    list with one ROC AUC score per fold, in fold order
    """
    kfold = KFold(n_splits=kfold_n_splits, shuffle=True, random_state=1)

    scores = []

    for fold_num, (train_idx, val_idx) in enumerate(kfold.split(df)):
        df_t = df.iloc[train_idx]
        df_v = df.iloc[val_idx]

        y_t = df_t[target].values
        y_v = df_v[target].values

        # drop() returns new frames, so neither the fold slices nor df are modified.
        X_t = df_t.drop(columns=[target])
        X_v = df_v.drop(columns=[target])

        dv, model = train(X_t, y_t, eta=eta, max_depth=max_depth, min_child_weight=min_child_weight, num_boost_round=num_boost_round)
        y_pred = predict(X_v, dv, model, y_v)
        auc = roc_auc_score(y_v, y_pred)
        scores.append(auc)
        print(f"Score on fold {fold_num}: {auc}")
    return scores

In [22]:
# Tuned hyperparameters carried over from the model-selection step.
n_splits = 5
num_boost_round_final = 43
eta_final = 0.3
max_depth_final = 4
min_child_weight_final = 5

# Cross-validate on the full training set (target column still present).
totals = perform_validation(
    df_full_train,
    kfold_n_splits=n_splits,
    eta=eta_final,
    max_depth=max_depth_final,
    min_child_weight=min_child_weight_final,
    num_boost_round=num_boost_round_final,
)

Score on fold 0: 0.6278771981084864
Score on fold 1: 0.6257791507700057
Score on fold 2: 0.6205236281538566
Score on fold 3: 0.62936066745586
Score on fold 4: 0.6199174512530055


In [23]:
# Mean and standard deviation of the per-fold AUC scores returned by perform_validation.
np.mean(totals), np.std(totals)

(np.float64(0.6246916191482429), np.float64(0.0038287198502983356))

## Train final model

In [24]:
# Redo the same 80/20 split (same seed) so this section starts from frames that
# still contain the target, then separate features from labels.
df_full_train, df_test = train_test_split(df_collision, test_size=0.2, random_state=11)

df_full_train = df_full_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() removes the target column from the frame and returns it.
y_full_train = df_full_train.pop("is_severe").values
y_test = df_test.pop("is_severe").values

In [25]:
n_splits = 5
num_boost_round_final = 43
eta_final = 0.3
max_depth_final = 4
min_child_weight_final = 5

# Fit on the full training set with the tuned hyperparameters, then score the held-out test set.
dv, model_final = train(
    df_full_train,
    y_full_train,
    eta=eta_final,
    max_depth=max_depth_final,
    min_child_weight=min_child_weight_final,
    num_boost_round=num_boost_round_final,
)
y_pred = predict(df_test, dv, model_final, y_test)
auc = roc_auc_score(y_test, y_pred)

auc

np.float64(0.6288534117462088)

## Save model

In [26]:
# Encode the hyperparameters in the file name so saved models are self-describing.
output_file = (
    "collision_severity_model"
    f"_eta={eta_final}"
    f"_md={max_depth_final}"
    f"_mcw={min_child_weight_final}"
    f"_nboost={num_boost_round_final}"
    ".bin"
)

# The vectorizer is stored alongside the model because both are needed to score new rows.
with open(output_file, "wb") as f_out:
    pickle.dump((dv, model_final), f_out)