In [1]:
import os
os.chdir("..")

In [2]:
import pandas as pd
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from models.lib.utils.metrics import get_metrics

In [14]:
X_train = pd.read_parquet("resources/data/transformed_categorical_column/X_train.parquet")
y_train = pd.read_parquet("resources/data/transformed_categorical_column/y_train.parquet")
X_valid = pd.read_parquet("resources/data/transformed_categorical_column/X_valid.parquet")
y_valid = pd.read_parquet("resources/data/transformed_categorical_column/y_valid.parquet")

In [15]:
def objective(trial):
    parameters = {
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 10, 100),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "class_weight": "balanced",
        "max_features": "sqrt",
        "random_state": 42,
    }
    model = DecisionTreeClassifier(**parameters)
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_roc_auc = roc_auc_score(y_train, y_train_pred)
    valid_roc_auc = roc_auc_score(y_valid, y_valid_pred)

    return valid_roc_auc, train_roc_auc - valid_roc_auc

In [17]:
study = optuna.create_study(
    directions=["maximize", "minimize"],
    study_name="optimize_rfe_estimator",
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(
    objective,
    n_jobs=-1,
    n_trials=400
)

[32m[I 2022-11-14 00:59:53,390][0m A new study created in memory with name: optimize_rfe_estimator[0m
[32m[I 2022-11-14 00:59:54,983][0m Trial 2 finished with values: [0.8935653012909972, -0.0022922205897647485] and parameters: {'max_depth': 5, 'min_samples_split': 20, 'min_samples_leaf': 2}. [0m
[32m[I 2022-11-14 00:59:55,395][0m Trial 7 finished with values: [0.8935653012909972, -0.0022922205897647485] and parameters: {'max_depth': 5, 'min_samples_split': 94, 'min_samples_leaf': 1}. [0m
[32m[I 2022-11-14 00:59:56,020][0m Trial 5 finished with values: [0.954816338541617, 0.03301122127607714] and parameters: {'max_depth': 18, 'min_samples_split': 48, 'min_samples_leaf': 5}. [0m
[32m[I 2022-11-14 00:59:56,334][0m Trial 1 finished with values: [0.9246413542089877, 0.06512436662829935] and parameters: {'max_depth': 14, 'min_samples_split': 45, 'min_samples_leaf': 8}. [0m
[32m[I 2022-11-14 00:59:56,382][0m Trial 3 finished with values: [0.9683868320970291, 0.02594725607090

In [18]:
fig = optuna.visualization.plot_pareto_front(study)
fig.show(renderer="browser")

In [19]:
study.get_trials()[333].params

{'max_depth': 14, 'min_samples_split': 53, 'min_samples_leaf': 9}

In [20]:
final_model_params = {
    "max_depth": 14,
    "min_samples_split": 53,
    "min_samples_leaf": 9,
    "class_weight": "balanced",
    "max_features": "sqrt",
    "random_state": 42,
}

In [21]:
estimator = DecisionTreeClassifier(**final_model_params)
estimator.fit(X_train, y_train)

In [22]:
get_metrics(estimator, X_train, y_train)

{'accuracy': 0.9927377046403018,
 'f1_score': 0.9799294565658456,
 'precision': 0.9689207123236937,
 'recall': 0.9911912354371231,
 'ROC_AUC': 0.9921328995363683}

In [23]:
get_metrics(estimator, X_valid, y_valid)

{'accuracy': 0.9828967480103304,
 'f1_score': 0.9532117367168914,
 'precision': 0.9332298136645962,
 'recall': 0.9740680713128039,
 'ROC_AUC': 0.979443933278277}