In [2]:
import os
os.chdir("../..")

In [3]:
import random
import optuna
import pickle as pkl
import pandas as pd
import numpy as np
from models.utils.metrics import get_metrics
from models.utils.model_wrapper import ModelWrapper

In [4]:
X_train = pd.read_parquet("resources/data/balanced/X_train.parquet")
y_train = pd.read_parquet("resources/data/balanced/y_train.parquet")
X_test = pd.read_parquet("resources/data/balanced/X_test.parquet")
y_test = pd.read_parquet("resources/data/balanced/y_test.parquet")
X_calib = pd.read_parquet("resources/data/balanced/X_calib.parquet")
y_calib = pd.read_parquet("resources/data/balanced/y_calib.parquet")

In [5]:
study = optuna.create_study(
    study_name="xgboost", 
    storage="sqlite:///resources/models/studies/xgboost/xgboost-2022-10-03-20_10_18_colab.db", 
    load_if_exists=True,
    directions=["maximize", "minimize"]
)

[32m[I 2022-10-04 21:24:56,234][0m Using an existing study with name 'xgboost' instead of creating a new one.[0m


In [6]:
fig = optuna.visualization.plot_pareto_front(study)
fig.show(renderer="browser")

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=.3)

In [8]:
params = study.get_trials()[1394].params
params["n_estimators"] = 5
params["max_depth"] = 8
from xgboost import XGBClassifier
model = XGBClassifier(**params)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=10)


`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.67852
[1]	validation_0-logloss:0.66773
[2]	validation_0-logloss:0.65870
[3]	validation_0-logloss:0.65071
[4]	validation_0-logloss:0.64473


In [9]:
get_metrics(model, X_train, y_train)

{'accuracy': 0.6636472608562587,
 'f1_score': 0.6615222068705915,
 'precision': 0.6663880175118067,
 'recall': 0.6567269391810978,
 'ROC_AUC': 0.6636540331877679}

In [10]:
get_metrics(model, X_valid, y_valid)

{'accuracy': 0.6544275172577958,
 'f1_score': 0.651775005996642,
 'precision': 0.6552985389577765,
 'recall': 0.6482891623754896,
 'ROC_AUC': 0.654413546295509}

In [11]:
get_metrics(model, X_test, y_test)

{'accuracy': 0.6536550437216568,
 'f1_score': 0.2057916210568216,
 'precision': 0.12627465347743788,
 'recall': 0.555764579414136,
 'ROC_AUC': 0.6090086646303556}

In [12]:
def random_proba(X):
    return np.array([[0, random.uniform(0, 1)] for _ in range(X.shape[0])])

baseline = ModelWrapper(random_proba, threshold=.5)

In [13]:
get_metrics(baseline, X_test, y_test)

{'accuracy': 0.49925141580055116,
 'f1_score': 0.1399716777222926,
 'precision': 0.08125297451650586,
 'recall': 0.5047030368180596,
 'ROC_AUC': 0.5017378187442043}

In [14]:
from utils.io import save_model
save_model(model=model, model_serialized_path="resources/models/serialized/xgboost_final.pkl", model_parameters_path="resources/models/parameters/xgboost_final.json")

2022-10-04 21:28:59,037 utils.io: Saving XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eta=0.1636715892395978, eval_metric=None, gamma=7.024507940975488,
              gpu_id=-1, grow_policy='depthwise', importance_type=None,
              interaction_constraints='', lambda=4.8444792553750275,
              learning_rate=0.163671583, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=5, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...) object to resources/models/serialized/xgboost_final.pkl
2022-10-04 21:28:59,043 utils.io: Saving to resources/models/serialized/xgboost_final.pkl succeeded
2022-10-04 21:28:59,049 utils.io: Saving dictionary into resources

slightly better than baseline