In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import pandas as pd 
import numpy as np 
import joblib

In [5]:
import os 

project_dir = os.path.dirname(os.getcwd()) 
data_dir = os.path.join(project_dir, "data") 
sets_dir = os.path.join(data_dir, "processed") 

models_dir = os.path.join(project_dir, "models")

In [3]:
from draft.data.sets import load_sets

In [6]:
X_train, X_val, X_test, y_train, y_val, y_test = load_sets(
    load_dir=sets_dir,
    return_dict=False
)

In [7]:
import xgboost as xgb

In [8]:
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin 

rstate = np.random.default_rng(5)

  import pkg_resources


In [9]:
space = {
    'max_depth': hp.choice('max_depth', range(3, 15, 1)), 
    'min_child_weight': hp.choice('min_child_weight', range(3, 15, 1)), 
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.4, 0.05), 
    'subsample': hp.quniform('subsample', 0.7, 1, 0.05), 
    'colsample_bytree': hp.quniform('colsample_bytree', 0.2, 0.5, 0.05) 
}

In [10]:
def objective(space): 
    from sklearn.model_selection import cross_val_score 

    xgboost = xgb.XGBClassifier(
        max_depth=int(space['max_depth']), 
        learning_rate=space['learning_rate'], 
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'], 
        colsample_bytree=space['colsample_bytree'] 
    ) 

    acc = cross_val_score(xgboost, X_train, y_train, cv=5, scoring='accuracy').mean()

    return { 'loss': 1-acc, 'status': STATUS_OK }

In [11]:
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=3,
    rstate=rstate
)

100%|██████████| 3/3 [01:46<00:00, 35.51s/trial, best loss: 0.15504744563885287]


In [15]:
from pprint import pprint

pprint("Best: ") 
pprint(best)

'Best: '
{'colsample_bytree': np.float64(0.30000000000000004),
 'learning_rate': np.float64(0.35000000000000003),
 'max_depth': np.int64(6),
 'min_child_weight': np.int64(9),
 'subsample': np.float64(0.75)}


In [16]:
xgboost_ho = xgb.XGBClassifier(**best) 

In [18]:
from draft.model.eval import fit_assess_classifier

xgboost_ho, xbg_ho_scores = fit_assess_classifier(
    xgboost_ho,
    X_train,
    y_train, 
    X_val, 
    y_val, 
    metrics={"accuracy_score": {}, "f1_score":
                {"average": "weighted"} 
             }
)

Evaluation metrics for <class 'xgboost.sklearn.XGBClassifier'>on train
***************************************************************************
Evaluation metrics for train
***************************************************************************
accuracy_score: 0.9112784061066564
f1_score: 0.9118691665590464
Evaluation metrics for <class 'xgboost.sklearn.XGBClassifier'>on val
***************************************************************************
Evaluation metrics for val
***************************************************************************
accuracy_score: 0.9052711191411584
f1_score: 0.9058459012944218


In [19]:
joblib.dump(xgboost_ho, os.path.join(models_dir, "xgboost_best.joblib"))

['/home/bened/DataScience/AMLA/labs/5/model-interpretation/models/xgboost_best.joblib']