In [17]:
import numpy as np
import os
import pandas as pd
import pickle

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import optuna

import custom_map

In [18]:
import importlib

importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-Hill-Climbing/src/custom_map.py'>

In [19]:
storage_url = "sqlite:///optuna_studies.db"

# Data Preprocessing

In [20]:
data = pd.read_csv("../dataset/train.csv")

target = "Heart Disease"
data['Heart Disease'] = np.where(data['Heart Disease'] == 'Presence', 1, 0)

data.drop('id', axis=1, inplace=True)
categorical_cols = data.columns[(data.nunique() <= 10) & (data.nunique() > 2)]
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=float)

data_encoded

Unnamed: 0,Age,Sex,BP,Cholesterol,FBS over 120,Max HR,Exercise angina,ST depression,Heart Disease,Chest pain type_2,...,Chest pain type_4,EKG results_1,EKG results_2,Slope of ST_2,Slope of ST_3,Number of vessels fluro_1,Number of vessels fluro_2,Number of vessels fluro_3,Thallium_6,Thallium_7
0,58,1,152,239,0,158,1,3.6,1,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,52,1,125,325,0,171,0,0.0,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,56,0,160,188,0,151,0,0.0,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,44,0,134,229,0,150,0,1.0,0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,58,1,140,234,0,125,1,3.8,1,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629995,56,0,110,226,0,132,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
629996,54,1,128,249,1,150,0,0.0,0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
629997,67,1,130,275,0,149,0,0.0,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
629998,52,1,140,199,0,157,0,0.0,1,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
all_cols = data_encoded.columns.tolist()

MI_cols = ['Chest pain type_3', 'EKG results_2', 'Number of vessels fluro_2',
    'Number of vessels fluro_1', 'Age', 'Chest pain type_2',
    'Number of vessels fluro_3', 'Cholesterol', 'BP', 'Slope of ST_3',
    'FBS over 120', 'Thallium_6', 'EKG results_1']

RFECV_cols = ['Chest pain type_2', 'Chest pain type_3', 'Chest pain type_4',
       'EKG results_1', 'EKG results_2', 'Slope of ST_2', 'Slope of ST_3',
       'Number of vessels fluro_1', 'Number of vessels fluro_2',
       'Number of vessels fluro_3', 'Thallium_6', 'Thallium_7', 'Sex',
       'FBS over 120', 'Exercise angina', 'Cholesterol', 'ST depression',
       'Age', 'Max HR']

boruta_cols = ['Chest pain type_4', 'Slope of ST_2', 'Thallium_7', 'Exercise angina',
        'Cholesterol', 'ST depression', 'Age', 'Max HR']

feature_sets = {
    "ALL": all_cols,
    "MI": MI_cols,
    "RFECV": RFECV_cols,
    "BORUTA": boruta_cols
}

models = {
    #"LogReg": LogisticRegression,
    #"RF": RandomForestClassifier,
    "ExtraTrees": ExtraTreesClassifier,
    "KNN": KNeighborsClassifier,
    "SVC": SVC,
    "XGB": XGBClassifier,
    "LGBM": LGBMClassifier,
    "CatBoost": CatBoostClassifier
}

In [22]:
X = data_encoded
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optuna function

In [23]:
def run_optuna_all_models(
    X_train,
    y_train,
    cv,
    data_encoded,
    n_trials=10,
    storage_url="sqlite:///optuna_studies.db",
    models_dir="models1"
):
    os.makedirs(models_dir, exist_ok=True)

    def create_objective(model_name, model_class, X, y):

        def objective(trial):

            if model_name == "LogReg":
                params = {
                    "C": trial.suggest_float("C", 1e-3, 10, log=True),
                    "max_iter": 3000,
                    "solver": "lbfgs"
                }

            elif model_name == "RF":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 150, 350),
                    "max_depth": trial.suggest_int("max_depth", 8, 25),
                    "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                    "max_features": trial.suggest_float("max_features", 0.3, 0.9),
                    "bootstrap": True,
                    "max_samples": trial.suggest_float("max_samples", 0.5, 0.9),
                    "n_jobs": -1,
                }

            elif model_name == "ExtraTrees":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 200, 400),
                    "max_depth": trial.suggest_int("max_depth", 10, 30),
                    "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                    "max_features": trial.suggest_float("max_features", 0.3, 1.0),
                    "bootstrap": False,
                    "n_jobs": -1,
                }

            elif model_name == "KNN":
                params = {
                    "n_neighbors": trial.suggest_int("n_neighbors", 3, 25),
                    "weights": trial.suggest_categorical("weights", ["uniform", "distance"])
                }

            elif model_name == "SVC":
                params = {
                    "C": trial.suggest_float("C", 1e-3, 10, log=True),
                    "gamma": trial.suggest_float("gamma", 1e-4, 1, log=True),
                    "kernel": "rbf",
                    "probability": True
                }

            elif model_name == "XGB":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                    "eval_metric": "auc",
                    "use_label_encoder": False,
                }

            elif model_name == "LGBM":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                    "max_depth": trial.suggest_int("max_depth", -1, 15),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "num_leaves": trial.suggest_int("num_leaves", 20, 200),
                }

            elif model_name == "CatBoost":
                params = {
                    "iterations": trial.suggest_int("iterations", 200, 600),
                    "depth": trial.suggest_int("depth", 4, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "verbose": 0,
                }

            model = model_class(**params)

            scores = cross_val_score(
                model,
                X,
                y,
                cv=cv,
                scoring="roc_auc",
                n_jobs=-1
            )

            return scores.mean()

        return objective

    for model_name, model_class in models.items():
        for fs_name, cols in feature_sets.items():

            X_subset = X_train[cols]
            study_name = f"{model_name}_{fs_name}"

            study = optuna.create_study(
                direction="maximize",
                study_name=study_name,
                storage=storage_url,
                load_if_exists=True
            )

            remaining_trials = n_trials
            print(f"{study_name} existing trials: {len(study.trials)}")

            if remaining_trials > 0:
                study.optimize(create_objective(model_name, model_class, X_subset, y_train),
                    n_trials=remaining_trials, show_progress_bar=True)

            print(f"Best ROC AUC for {study_name}: {study.best_value:.4f}")

            best_model = model_class(**study.best_params)
            best_model.fit(X_subset, y_train)

            model_path = os.path.join(models_dir, f"{study_name}.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(best_model, f)

            print(f"Saved model to {model_path}")

In [24]:
run_optuna_all_models(
    X_train=X_train,
    y_train=y_train,
    cv=cv,
    data_encoded=data_encoded,
    n_trials=10
)

[32m[I 2026-02-28 22:02:17,856][0m A new study created in RDB with name: ExtraTrees_ALL[0m


ExtraTrees_ALL existing trials: 0


Best trial: 0. Best value: 1:  10%|█         | 1/10 [00:10<01:31, 10.15s/it]

[32m[I 2026-02-28 22:02:28,002][0m Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 244, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  20%|██        | 2/10 [00:16<01:05,  8.18s/it]

[32m[I 2026-02-28 22:02:34,813][0m Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 177, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  30%|███       | 3/10 [00:29<01:10, 10.01s/it]

[32m[I 2026-02-28 22:02:46,992][0m Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 251, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  40%|████      | 4/10 [00:48<01:22, 13.79s/it]

[32m[I 2026-02-28 22:03:06,566][0m Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 355, 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  50%|█████     | 5/10 [01:06<01:15, 15.09s/it]

[32m[I 2026-02-28 22:03:23,968][0m Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 330, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  60%|██████    | 6/10 [01:27<01:08, 17.14s/it]

[32m[I 2026-02-28 22:03:45,086][0m Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 427, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  70%|███████   | 7/10 [01:51<00:57, 19.31s/it]

[32m[I 2026-02-28 22:04:08,858][0m Trial 6 finished with value: 1.0 and parameters: {'n_estimators': 489, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  80%|████████  | 8/10 [02:02<00:33, 16.79s/it]

[32m[I 2026-02-28 22:04:20,244][0m Trial 7 finished with value: 1.0 and parameters: {'n_estimators': 314, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  90%|█████████ | 9/10 [02:20<00:17, 17.29s/it]

[32m[I 2026-02-28 22:04:38,657][0m Trial 8 finished with value: 1.0 and parameters: {'n_estimators': 472, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1: 100%|██████████| 10/10 [02:34<00:00, 15.47s/it]


[32m[I 2026-02-28 22:04:52,507][0m Trial 9 finished with value: 1.0 and parameters: {'n_estimators': 272, 'max_depth': 20, 'min_samples_split': 7, 'min_samples_leaf': 5}. Best is trial 0 with value: 1.0.[0m
Best ROC AUC for ExtraTrees_ALL: 1.0000


[32m[I 2026-02-28 22:05:01,447][0m A new study created in RDB with name: ExtraTrees_MI[0m


Saved model to models1/ExtraTrees_ALL.pkl
ExtraTrees_MI existing trials: 0


Best trial: 0. Best value: 0.858019:  10%|█         | 1/10 [00:35<05:16, 35.20s/it]

[32m[I 2026-02-28 22:05:36,645][0m Trial 0 finished with value: 0.8580189016766729 and parameters: {'n_estimators': 467, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8580189016766729.[0m


Best trial: 0. Best value: 0.858019:  20%|██        | 2/10 [01:01<03:57, 29.70s/it]

[32m[I 2026-02-28 22:06:02,492][0m Trial 1 finished with value: 0.8579189734704439 and parameters: {'n_estimators': 336, 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8580189016766729.[0m


Best trial: 0. Best value: 0.858019:  30%|███       | 3/10 [01:32<03:34, 30.60s/it]

[32m[I 2026-02-28 22:06:34,159][0m Trial 2 finished with value: 0.8572355491353421 and parameters: {'n_estimators': 444, 'max_depth': 13, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8580189016766729.[0m


Best trial: 3. Best value: 0.858314:  40%|████      | 4/10 [02:04<03:07, 31.26s/it]

[32m[I 2026-02-28 22:07:06,423][0m Trial 3 finished with value: 0.8583139226112498 and parameters: {'n_estimators': 369, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.8583139226112498.[0m


Best trial: 3. Best value: 0.858314:  50%|█████     | 5/10 [02:10<01:49, 21.85s/it]

[32m[I 2026-02-28 22:07:11,598][0m Trial 4 finished with value: 0.8514988713012273 and parameters: {'n_estimators': 138, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.8583139226112498.[0m


Best trial: 5. Best value: 0.858628:  60%|██████    | 6/10 [02:33<01:28, 22.24s/it]

[32m[I 2026-02-28 22:07:34,579][0m Trial 5 finished with value: 0.8586277817718051 and parameters: {'n_estimators': 261, 'max_depth': 17, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 5 with value: 0.8586277817718051.[0m


Best trial: 5. Best value: 0.858628:  70%|███████   | 7/10 [02:47<00:58, 19.59s/it]

[32m[I 2026-02-28 22:07:48,712][0m Trial 6 finished with value: 0.8520118144865114 and parameters: {'n_estimators': 493, 'max_depth': 4, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.8586277817718051.[0m


Best trial: 5. Best value: 0.858628:  80%|████████  | 8/10 [03:05<00:38, 19.09s/it]

[32m[I 2026-02-28 22:08:06,745][0m Trial 7 finished with value: 0.8579743690909476 and parameters: {'n_estimators': 248, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.8586277817718051.[0m


Best trial: 5. Best value: 0.858628:  90%|█████████ | 9/10 [03:30<00:21, 21.09s/it]

[32m[I 2026-02-28 22:08:32,241][0m Trial 8 finished with value: 0.8565580341697304 and parameters: {'n_estimators': 411, 'max_depth': 11, 'min_samples_split': 5, 'min_samples_leaf': 5}. Best is trial 5 with value: 0.8586277817718051.[0m


Best trial: 5. Best value: 0.858628: 100%|██████████| 10/10 [03:45<00:00, 22.56s/it]


[32m[I 2026-02-28 22:08:47,053][0m Trial 9 finished with value: 0.8585817221992533 and parameters: {'n_estimators': 167, 'max_depth': 17, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 5 with value: 0.8586277817718051.[0m
Best ROC AUC for ExtraTrees_MI: 0.8586


[32m[I 2026-02-28 22:09:08,356][0m A new study created in RDB with name: ExtraTrees_RFECV[0m


Saved model to models1/ExtraTrees_MI.pkl
ExtraTrees_RFECV existing trials: 0


Best trial: 0. Best value: 0.941478:  10%|█         | 1/10 [00:14<02:12, 14.69s/it]

[32m[I 2026-02-28 22:09:23,048][0m Trial 0 finished with value: 0.9414778685751036 and parameters: {'n_estimators': 232, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9414778685751036.[0m


Best trial: 0. Best value: 0.941478:  20%|██        | 2/10 [00:20<01:13,  9.19s/it]

[32m[I 2026-02-28 22:09:28,376][0m Trial 1 finished with value: 0.9327055304682694 and parameters: {'n_estimators': 161, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9414778685751036.[0m


Best trial: 0. Best value: 0.941478:  20%|██        | 2/10 [30:41<2:02:44, 920.50s/it]


[33m[W 2026-02-28 22:39:49,357][0m Trial 2 failed with parameters: {'n_estimators': 155, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/Users/dominikmika/PycharmProjects/Ridge-Hill-Climbing/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 206, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/8d/gk38hstj1gg1tk319gmlzgm00000gn/T/ipykernel_14580/4209175738.py", line 79, in objective
    scores = cross_val_score(
             ^^^^^^^^^^^^^^^^
  File "/Users/dominikmika/PycharmProjects/Ridge-Hill-Climbing/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 218, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dominikmika/PycharmProjects/Ridge-Hill-Climbing/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", lin

KeyboardInterrupt: 

In [None]:
def summarize_optuna_results(
    storage_url="sqlite:///optuna_studies.db"
):

    storage = optuna.storages.RDBStorage(url=storage_url)
    study_summaries = optuna.study.get_all_study_summaries(storage=storage)

    results = []

    for summary in study_summaries:
        study_name = summary.study_name

        study = optuna.load_study(
            study_name=study_name,
            storage=storage_url
        )

        if len(study.trials) == 0:
            continue

        results.append({
            "Study": study_name,
            "Trials": len(study.trials),
            "Best ROC AUC": study.best_value,
            "Best Params": study.best_params
        })

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values("Best ROC AUC", ascending=False)

    return df_results

In [None]:
results_df = summarize_optuna_results()
results_df