In [171]:
from src.data.sets import load_sets

In [172]:
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

../data/processed/


In [173]:
import xgboost as xgb
import numpy as np
import pandas as pd

In [174]:
xgboost1 = xgb.XGBClassifier()

In [175]:
df = pd.DataFrame(X_train)
X_train = df.fillna(0).drop(columns=[1,2,6]).to_numpy()

In [176]:
df = pd.DataFrame(X_val)
X_val = df.fillna(0).drop(columns=[1,2,6]).to_numpy()

In [177]:
print(X_train.shape)
print(X_val.shape)

(142795, 15)
(47599, 15)


In [178]:
notna_idx = pd.DataFrame(y_train).notna().to_numpy().flatten()
X_train, y_train = X_train[notna_idx], y_train[notna_idx]

xgboost1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [180]:
from joblib import dump 

dump(xgboost1,  '../models/xgboost_default.joblib')

['../models/xgboost_default.joblib']

In [181]:
notna_idx = pd.DataFrame(y_val).notna().to_numpy().flatten()
X_val, y_val = X_val[notna_idx], y_val[notna_idx]


y_train_preds = xgboost1.predict(X_train)
y_val_preds = xgboost1.predict(X_val)

In [182]:
print(y_train_preds)
print(y_val_preds)
print(y_train)

['x<10min' 'x<5min' 'x<30min' ... 'x<30min' 'x<30min' 'x<5min']
['x<5min' 'x<30min' 'x<30min' ... 'x<30min' 'x<30min' 'x<5min']
['x<10min' 'x<5min' 'x<30min' ... 'x<30min' 'x<30min' 'x<5min']


In [185]:
from src.models.performance import print_class_perf

print_class_perf(y_preds=y_train_preds, y_actuals=y_train, set_name='Training', average='weighted')
print_class_perf(y_preds=y_val_preds, y_actuals=y_val, set_name='Validation', average='weighted')

Accuracy Training: 0.9281387163270743
F1 Training: 0.927929800867642
Accuracy Validation: 0.9113612455291394
F1 Validation: 0.9108423630196816


In [186]:
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

In [187]:
space = {
    'max_depth' : hp.choice('max_depth', range(5, 20, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.05),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.05),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.05)
}

In [188]:
def objective(space):
    from sklearn.model_selection import cross_val_score
    
    xgboost = xgb.XGBClassifier(
        max_depth = int(space['max_depth']),
        learning_rate = space['learning_rate'],
        min_child_weight = space['min_child_weight'],
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree']
    )
    
    acc = cross_val_score(xgboost, X_train, y_train, cv=10, scoring="accuracy").mean()

    return{'loss': 1-acc, 'status': STATUS_OK }

In [189]:
best = fmin(
    fn=objective,   
    space=space,       
    algo=tpe.suggest,       
    max_evals=5
)

100%|██████████| 5/5 [09:56<00:00, 119.36s/trial, best loss: 0.09387050041454859]


In [190]:
print("Best: ", best)

Best:  {'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.2, 'max_depth': 8, 'min_child_weight': 1.0, 'subsample': 0.6000000000000001}


In [192]:
xgboost2 = xgb.XGBClassifier(
    max_depth = best['max_depth'],
    learning_rate = best['learning_rate'],
    min_child_weight = best['min_child_weight'],
    subsample = best['subsample'],
    colsample_bytree = best['colsample_bytree']
)

In [193]:
xgboost2.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8500000000000001, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=8,
              min_child_weight=1.0, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.6000000000000001,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [194]:
print_class_perf(y_preds=xgboost2.predict(X_train), y_actuals=y_train, set_name='Training', average='weighted')
print_class_perf(y_preds=xgboost2.predict(X_val), y_actuals=y_val, set_name='Validation', average='weighted')

Accuracy Training: 0.9385859124118299
F1 Training: 0.938431660611072
Accuracy Validation: 0.9115295602777194
F1 Validation: 0.9110171687372137


In [195]:
dump(xgboost2,  '../models/xgboost_best.joblib')

['../models/xgboost_best.joblib']