In [1]:
import os
os.chdir("../..")

In [2]:
import random
import optuna
import pickle as pkl
import pandas as pd
import numpy as np
from models.utils.metrics import get_metrics
from models.utils.model_wrapper import ModelWrapper

In [3]:
X_train = pd.read_parquet("resources/data/balanced/X_train.parquet")
y_train = pd.read_parquet("resources/data/balanced/y_train.parquet")
X_test = pd.read_parquet("resources/data/balanced/X_test.parquet")
y_test = pd.read_parquet("resources/data/balanced/y_test.parquet")
X_calib = pd.read_parquet("resources/data/balanced/X_calib.parquet")
y_calib = pd.read_parquet("resources/data/balanced/y_calib.parquet")

In [4]:
study = optuna.create_study(
    study_name="xgboost", 
    storage="sqlite:///resources/models/studies/xgboost/xgboost-2022-10-07-19_10_48_colab.db", 
    load_if_exists=True,
    directions=["maximize", "minimize"]
)

[32m[I 2022-10-07 23:41:58,077][0m Using an existing study with name 'xgboost' instead of creating a new one.[0m


In [5]:
fig = optuna.visualization.plot_pareto_front(study)
fig.show(renderer="browser")

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=.3)

In [14]:
params = study.get_trials()[663].params
from xgboost import XGBClassifier
model = XGBClassifier(**params)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=10)


`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.68556
[1]	validation_0-logloss:0.67872
[2]	validation_0-logloss:0.67253
[3]	validation_0-logloss:0.66685
[4]	validation_0-logloss:0.66168
[5]	validation_0-logloss:0.65698
[6]	validation_0-logloss:0.65272
[7]	validation_0-logloss:0.64872
[8]	validation_0-logloss:0.64505
[9]	validation_0-logloss:0.64173
[10]	validation_0-logloss:0.63864
[11]	validation_0-logloss:0.63578
[12]	validation_0-logloss:0.63318
[13]	validation_0-logloss:0.63077
[14]	validation_0-logloss:0.62849
[15]	validation_0-logloss:0.62636
[16]	validation_0-logloss:0.62438
[17]	validation_0-logloss:0.62253
[18]	validation_0-logloss:0.62085
[19]	validation_0-logloss:0.61919
[20]	validation_0-logloss:0.61762
[21]	validation_0-logloss:0.61619
[22]	validation_0-logloss:0.61481
[23]	validation_0-logloss:0.61347
[24]	validation_0-logloss:0.61223
[25]	validation_0-logloss:0.61107
[26]	validation_0-logloss:0.60990
[27]	validation_0-logloss:0.60865
[28]	validation_0-logloss:0.60748
[29]	validation_0-loglos

In [15]:
get_metrics(model, X_train, y_train)

{'accuracy': 0.7068218319014964,
 'f1_score': 0.7084581486629796,
 'precision': 0.7058011398553714,
 'recall': 0.7111352378367375,
 'ROC_AUC': 0.7068139370018993}

In [18]:
get_metrics(model, X_valid, y_valid)

{'accuracy': 0.7050163576881134,
 'f1_score': 0.7043568980078494,
 'precision': 0.7029251363411007,
 'recall': 0.705794504181601,
 'ROC_AUC': 0.7050196607778686}

In [19]:
get_metrics(model, X_test, y_test)

{'accuracy': 0.7017597153210233,
 'f1_score': 0.26658129235366307,
 'precision': 0.16631158455392808,
 'recall': 0.6713249126578876,
 'ROC_AUC': 0.6878788562722946}

In [21]:
def random_proba(X):
    return np.array([[0, random.uniform(0, 1)] for _ in range(X.shape[0])])

baseline = ModelWrapper(random_proba, threshold=.5)

In [22]:
get_metrics(baseline, X_test, y_test)

{'accuracy': 0.49907783105864995,
 'f1_score': 0.1339935479030685,
 'precision': 0.07786545755765793,
 'recall': 0.47997850040311746,
 'ROC_AUC': 0.49036691153375905}

In [23]:
from utils.io import save_model
save_model(
    model=model,
    model_serialized_path="resources/models/serialized/xgboost_final_2.pkl", 
    model_parameters_path="resources/models/parameters/xgboost_final_2.json"
)

2022-10-08 00:15:29,300 utils.io: Saving XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eta=0.05583946280622387, eval_metric=None,
              gamma=7.950724483766072, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              lambda=6.849414841615746, learning_rate=0.0558394641, max_bin=256,
              max_cat_to_onehot=4, max_delta_step=0, max_depth=5, max_leaves=0,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=128, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=0, ...) object to resources/models/serialized/xgboost_final_2.pkl
2022-10-08 00:15:29,308 utils.io: Saving to resources/models/serialized/xgboost_final_2.pkl succeeded
2022-10-08 00:15:29,312 utils.io: Saving dict

slightly better than baseline