In [1]:
import os
os.chdir("../..")

In [2]:
import random
import optuna
import pickle as pkl
import pandas as pd
import numpy as np
from models.utils.metrics import get_metrics
from models.utils.model_wrapper import ModelWrapper

In [3]:
X_train = pd.read_parquet("resources/data/reduced/X_train.parquet")
y_train = pd.read_parquet("resources/data/reduced/y_train.parquet")
X_test = pd.read_parquet("resources/data/reduced/X_test.parquet")
y_test = pd.read_parquet("resources/data/reduced/y_test.parquet")
X_valid = pd.read_parquet("resources/data/reduced/X_valid.parquet")
y_valid = pd.read_parquet("resources/data/reduced/y_valid.parquet")

In [4]:
study = optuna.create_study(
    study_name="xgboost", 
    storage="sqlite:///resources/models/studies/xgboost/xgboost-2022-10-16-21_10_01_colab.db", 
    load_if_exists=True,
    directions=["maximize", "minimize"]
)

[32m[I 2022-10-17 00:41:50,708][0m Using an existing study with name 'xgboost' instead of creating a new one.[0m


In [6]:
fig = optuna.visualization.plot_pareto_front(study)
fig.show(renderer="browser")

In [12]:
params = study.get_trials()[424].params
params["scale_pos_weight"] = 11.381618
from xgboost import XGBClassifier
model = XGBClassifier(**params)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=10)


`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.66737
[1]	validation_0-logloss:0.65157
[2]	validation_0-logloss:0.64094
[3]	validation_0-logloss:0.63299
[4]	validation_0-logloss:0.62802
[5]	validation_0-logloss:0.62327
[6]	validation_0-logloss:0.62022
[7]	validation_0-logloss:0.61716
[8]	validation_0-logloss:0.61402
[9]	validation_0-logloss:0.61148
[10]	validation_0-logloss:0.60904
[11]	validation_0-logloss:0.60767
[12]	validation_0-logloss:0.60610
[13]	validation_0-logloss:0.60481
[14]	validation_0-logloss:0.60391
[15]	validation_0-logloss:0.60297
[16]	validation_0-logloss:0.60161
[17]	validation_0-logloss:0.60066
[18]	validation_0-logloss:0.59873
[19]	validation_0-logloss:0.59816
[20]	validation_0-logloss:0.59754
[21]	validation_0-logloss:0.59715
[22]	validation_0-logloss:0.59643
[23]	validation_0-logloss:0.59585
[24]	validation_0-logloss:0.59522
[25]	validation_0-logloss:0.59461
[26]	validation_0-logloss:0.59427
[27]	validation_0-logloss:0.59396
[28]	validation_0-logloss:0.59325
[29]	validation_0-loglos

In [13]:
get_metrics(model, X_train, y_train)

{'accuracy': 0.6987008397578373,
 'f1_score': 0.27149056731050303,
 'precision': 0.16866897630755476,
 'recall': 0.6954267941481396,
 'ROC_AUC': 0.6972075801797855}

In [14]:
get_metrics(model, X_valid, y_valid)

{'accuracy': 0.6931976037506512,
 'f1_score': 0.26145566644025287,
 'precision': 0.16227785705020106,
 'recall': 0.6723998925020156,
 'ROC_AUC': 0.6837124016829929}

In [15]:
(1 - y_valid).sum()/y_valid.sum()

TARGET    11.381618
dtype: float64

In [16]:
get_metrics(model, X_test, y_test)

{'accuracy': 0.6969427387332654,
 'f1_score': 0.26678565803979215,
 'precision': 0.1657750521920668,
 'recall': 0.6828809459822628,
 'ROC_AUC': 0.690529365027198}

In [17]:
def random_proba(X):
    return np.array([[0, random.uniform(0, 1)] for _ in range(X.shape[0])])

baseline = ModelWrapper(random_proba, threshold=.5)

In [18]:
get_metrics(baseline, X_test, y_test)

{'accuracy': 0.5004231128083841,
 'f1_score': 0.1358006155694017,
 'precision': 0.07892325814755029,
 'recall': 0.486159634506853,
 'ROC_AUC': 0.49391775333424603}

In [19]:
from utils.io import save_model
save_model(
    model=model,
    model_serialized_path="resources/models/serialized/xgboost_final_3.pkl", 
    model_parameters_path="resources/models/parameters/xgboost_final_3.json"
)

2022-10-17 10:15:18,672 utils.io: Saving XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eta=0.26151120075621215, eval_metric=None,
              gamma=1.6873264119365516, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              lambda=7.9007947788123305, learning_rate=0.261511207, max_bin=256,
              max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=88, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=0, ...) object to resources/models/serialized/xgboost_final_3.pkl
2022-10-17 10:15:18,687 utils.io: Saving to resources/models/serialized/xgboost_final_3.pkl succeeded
2022-10-17 10:15:18,693 utils.io: Saving dict

slightly better than baseline