In [2]:
import os
os.chdir("..")

In [27]:
import pandas as pd
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from models.lib.utils.metrics import get_metrics

In [24]:
X_train = pd.read_parquet("resources/data/categorical_transformed/X_train.parquet")
y_train = pd.read_parquet("resources/data/categorical_transformed/y_train.parquet")
X_valid = pd.read_parquet("resources/data/categorical_transformed/X_valid.parquet")
y_valid = pd.read_parquet("resources/data/categorical_transformed/y_valid.parquet")

In [36]:
def objective(trial):
    parameters = {
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 10, 100),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "class_weight": "balanced",
        "max_features": "sqrt",
        "random_state": 42,
    }
    model = DecisionTreeClassifier(**parameters)
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_roc_auc = roc_auc_score(y_train, y_train_pred)
    valid_roc_auc = roc_auc_score(y_valid, y_valid_pred)

    return valid_roc_auc, train_roc_auc - valid_roc_auc

In [44]:
study = optuna.create_study(
    directions=["maximize", "minimize"],
    study_name="optimize_rfe_estimator",
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(
    objective,
    n_jobs=-1,
    n_trials=1000
)

[32m[I 2022-10-19 17:10:15,768][0m A new study created in memory with name: optimize_rfe_estimator[0m
[32m[I 2022-10-19 17:10:23,027][0m Trial 0 finished with values: [0.6162341088622121, 0.0881769739640954] and parameters: {'max_depth': 16, 'min_samples_split': 21, 'min_samples_leaf': 10}. [0m
[32m[I 2022-10-19 17:10:23,096][0m Trial 2 finished with values: [0.6154450707122368, 0.07310577466505652] and parameters: {'max_depth': 15, 'min_samples_split': 74, 'min_samples_leaf': 2}. [0m
[32m[I 2022-10-19 17:10:23,172][0m Trial 1 finished with values: [0.601508876265133, 0.09455056919383908] and parameters: {'max_depth': 16, 'min_samples_split': 76, 'min_samples_leaf': 1}. [0m
[32m[I 2022-10-19 17:10:23,317][0m Trial 7 finished with values: [0.6226436644433575, 0.04766079671481693] and parameters: {'max_depth': 13, 'min_samples_split': 29, 'min_samples_leaf': 6}. [0m
[32m[I 2022-10-19 17:10:23,377][0m Trial 3 finished with values: [0.6132275646676327, 0.06896220150465482]

In [45]:
fig = optuna.visualization.plot_pareto_front(study)
fig.show(renderer="browser")

In [49]:
study.get_trials()[375].params

{'max_depth': 10, 'min_samples_split': 28, 'min_samples_leaf': 10}

In [50]:
final_model_params = {
    "max_depth": 10,
    "min_samples_split": 28,
    "min_samples_leaf": 10,
    "class_weight": "balanced",
    "max_features": "sqrt",
    "random_state": 42,
}

In [52]:
estimator = DecisionTreeClassifier(**final_model_params)
estimator.fit(X_train, y_train)

In [53]:
get_metrics(estimator, X_train, y_train)

{'accuracy': 0.6291069552036157,
 'f1_score': 0.2269859575334102,
 'precision': 0.1364520827264783,
 'recall': 0.6745190646238912,
 'ROC_AUC': 0.6498189657970241}

In [54]:
get_metrics(estimator, X_val, y_val)

{'accuracy': 0.621592290328182,
 'f1_score': 0.21595610721352762,
 'precision': 0.12967863894139886,
 'recall': 0.6452566514377855,
 'ROC_AUC': 0.632384884005592}

### Estimator is properly fitted and significantly better than random