In [9]:
import numpy as np
import os
import pandas as pd
import pickle

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import optuna

import custom_map

In [10]:
import importlib

importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-Hill-Climbing/src/custom_map.py'>

In [11]:
storage_url = "sqlite:///optuna_studies.db"

# Data Preprocessing

In [12]:
data = pd.read_csv("../dataset/train.csv")

target = "Heart Disease"
data['Heart Disease'] = np.where(data['Heart Disease'] == 'Presence', 1, 0)

data.drop('id', axis=1, inplace=True)
categorical_cols = data.columns[(data.nunique() <= 10) & (data.nunique() > 2)]
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=float)

data_encoded

Unnamed: 0,Age,Sex,BP,Cholesterol,FBS over 120,Max HR,Exercise angina,ST depression,Heart Disease,Chest pain type_2,...,Chest pain type_4,EKG results_1,EKG results_2,Slope of ST_2,Slope of ST_3,Number of vessels fluro_1,Number of vessels fluro_2,Number of vessels fluro_3,Thallium_6,Thallium_7
0,58,1,152,239,0,158,1,3.6,1,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,52,1,125,325,0,171,0,0.0,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,56,0,160,188,0,151,0,0.0,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,44,0,134,229,0,150,0,1.0,0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,58,1,140,234,0,125,1,3.8,1,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629995,56,0,110,226,0,132,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
629996,54,1,128,249,1,150,0,0.0,0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
629997,67,1,130,275,0,149,0,0.0,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
629998,52,1,140,199,0,157,0,0.0,1,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
all_cols = data_encoded.columns.tolist()

MI_cols = ['Chest pain type_3', 'EKG results_2', 'Number of vessels fluro_2',
    'Number of vessels fluro_1', 'Age', 'Chest pain type_2',
    'Number of vessels fluro_3', 'Cholesterol', 'BP', 'Slope of ST_3',
    'FBS over 120', 'Thallium_6', 'EKG results_1']

RFECV_cols = ['Chest pain type_2', 'Chest pain type_3', 'Chest pain type_4',
       'EKG results_1', 'EKG results_2', 'Slope of ST_2', 'Slope of ST_3',
       'Number of vessels fluro_1', 'Number of vessels fluro_2',
       'Number of vessels fluro_3', 'Thallium_6', 'Thallium_7', 'Sex',
       'FBS over 120', 'Exercise angina', 'Cholesterol', 'ST depression',
       'Age', 'Max HR']

boruta_cols = ['Chest pain type_4', 'Slope of ST_2', 'Thallium_7', 'Exercise angina',
        'Cholesterol', 'ST depression', 'Age', 'Max HR']

feature_sets = {
    "ALL": all_cols,
    "MI": MI_cols,
    "RFECV": RFECV_cols,
    "BORUTA": boruta_cols
}

models = {
    "LogReg": LogisticRegression,
    "RF": RandomForestClassifier,
    "ExtraTrees": ExtraTreesClassifier,
    "KNN": KNeighborsClassifier,
    "SVC": SVC,
    "XGB": XGBClassifier,
    "LGBM": LGBMClassifier,
    "CatBoost": CatBoostClassifier
}

In [14]:
X = data_encoded
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optuna function

In [15]:
def run_optuna_all_models(
    X_train,
    y_train,
    cv,
    data_encoded,
    n_trials=10,
    storage_url="sqlite:///optuna_studies.db",
    models_dir="models1"
):
    os.makedirs(models_dir, exist_ok=True)

    def create_objective(model_name, model_class, X, y):

        def objective(trial):

            if model_name == "LogReg":
                params = {
                    "C": trial.suggest_float("C", 1e-3, 10, log=True),
                    "max_iter": 3000,
                    "solver": "lbfgs"
                }

            elif model_name in ["RF", "ExtraTrees"]:
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                    "max_depth": trial.suggest_int("max_depth", 3, 20),
                    "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
                    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
                    "n_jobs": -1,
                    "random_state": 42
                }

            elif model_name == "KNN":
                params = {
                    "n_neighbors": trial.suggest_int("n_neighbors", 3, 25),
                    "weights": trial.suggest_categorical("weights", ["uniform", "distance"])
                }

            elif model_name == "SVC":
                params = {
                    "C": trial.suggest_float("C", 1e-3, 10, log=True),
                    "gamma": trial.suggest_float("gamma", 1e-4, 1, log=True),
                    "kernel": "rbf",
                    "probability": True
                }

            elif model_name == "XGB":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                    "eval_metric": "auc",
                    "use_label_encoder": False,
                    "random_state": 42
                }

            elif model_name == "LGBM":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                    "max_depth": trial.suggest_int("max_depth", -1, 15),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "num_leaves": trial.suggest_int("num_leaves", 20, 200),
                    "random_state": 42
                }

            elif model_name == "CatBoost":
                params = {
                    "iterations": trial.suggest_int("iterations", 200, 600),
                    "depth": trial.suggest_int("depth", 4, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "verbose": 0,
                    "random_state": 42
                }

            model = model_class(**params)

            scores = cross_val_score(
                model,
                X,
                y,
                cv=cv,
                scoring="roc_auc",
                n_jobs=-1
            )

            return scores.mean()

        return objective

    for model_name, model_class in models.items():
        for fs_name, cols in feature_sets.items():

            X_subset = X_train[cols]
            study_name = f"{model_name}_{fs_name}"

            study = optuna.create_study(
                direction="maximize",
                study_name=study_name,
                storage=storage_url,
                load_if_exists=True
            )

            remaining_trials = n_trials
            print(f"{study_name} existing trials: {len(study.trials)}")

            if remaining_trials > 0:
                study.optimize(create_objective(model_name, model_class, X_subset, y_train),
                    n_trials=remaining_trials, show_progress_bar=True)

            print(f"Best ROC AUC for {study_name}: {study.best_value:.4f}")

            best_model = model_class(**study.best_params)
            best_model.fit(X_subset, y_train)

            model_path = os.path.join(models_dir, f"{study_name}.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(best_model, f)

            print(f"Saved model to {model_path}")

In [None]:
run_optuna_all_models(
    X_train=X_train,
    y_train=y_train,
    cv=cv,
    data_encoded=data_encoded,
    n_trials=10
)

[32m[I 2026-02-28 20:29:18,072][0m Using an existing study with name 'LogReg_ALL' instead of creating a new one.[0m


LogReg_ALL → existing trials: 50


[32m[I 2026-02-28 20:29:21,512][0m Trial 50 finished with value: 1.0 and parameters: {'C': 0.053970260462046594}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:29:24,138][0m Trial 51 finished with value: 1.0 and parameters: {'C': 0.035903523182380444}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:29:33,039][0m Trial 52 finished with value: 1.0 and parameters: {'C': 0.003416415874508888}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:29:37,348][0m Trial 53 finished with value: 1.0 and parameters: {'C': 0.010597329680259098}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:29:39,209][0m Trial 54 finished with value: 1.0 and parameters: {'C': 0.7035047546715393}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:29:41,388][0m Trial 55 finished with value: 1.0 and parameters: {'C': 0.14502246887063835}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:29:43,249][0m Trial 56 finished with value: 1.0 and parameters

Best ROC AUC for LogReg_ALL: 1.0000


[32m[I 2026-02-28 20:29:49,392][0m Using an existing study with name 'LogReg_MI' instead of creating a new one.[0m


Saved model to models1/LogReg_ALL.pkl
LogReg_MI → existing trials: 13


[32m[I 2026-02-28 20:30:00,242][0m Trial 13 finished with value: 0.8586104526631135 and parameters: {'C': 0.681764221536764}. Best is trial 7 with value: 0.8589574037944366.[0m
[32m[I 2026-02-28 20:30:10,688][0m Trial 14 finished with value: 0.8586133636053674 and parameters: {'C': 0.04639875978949585}. Best is trial 7 with value: 0.8589574037944366.[0m
[32m[I 2026-02-28 20:30:21,193][0m Trial 15 finished with value: 0.8586114609580733 and parameters: {'C': 2.419075854179531}. Best is trial 7 with value: 0.8589574037944366.[0m
[32m[I 2026-02-28 20:30:32,187][0m Trial 16 finished with value: 0.8586110660605175 and parameters: {'C': 0.07866325656574862}. Best is trial 7 with value: 0.8589574037944366.[0m
[32m[I 2026-02-28 20:30:41,380][0m Trial 17 finished with value: 0.8586107556726155 and parameters: {'C': 0.36984598841429706}. Best is trial 7 with value: 0.8589574037944366.[0m
[32m[I 2026-02-28 20:30:50,674][0m Trial 18 finished with value: 0.8586113282570021 and para

Best ROC AUC for LogReg_MI: 0.8590


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[32m[I 2026-02-28 20:31:29,074][0m A new study created in RDB with name: LogReg_RFECV[0m


Saved model to models1/LogReg_MI.pkl
LogReg_RFECV → existing trials: 0


[32m[I 2026-02-28 20:31:44,496][0m Trial 0 finished with value: 0.9529139408135979 and parameters: {'C': 0.5364811657666937}. Best is trial 0 with value: 0.9529139408135979.[0m
[32m[I 2026-02-28 20:31:50,119][0m Trial 1 finished with value: 0.951712890690177 and parameters: {'C': 0.00108810105948105}. Best is trial 0 with value: 0.9529139408135979.[0m
[32m[I 2026-02-28 20:32:01,231][0m Trial 2 finished with value: 0.9523553898885517 and parameters: {'C': 0.0022386927210717978}. Best is trial 0 with value: 0.9529139408135979.[0m
[32m[I 2026-02-28 20:32:16,530][0m Trial 3 finished with value: 0.9529159303741082 and parameters: {'C': 2.0995825052863126}. Best is trial 3 with value: 0.9529159303741082.[0m
[32m[I 2026-02-28 20:32:33,432][0m Trial 4 finished with value: 0.9529117621627627 and parameters: {'C': 0.049851862000354956}. Best is trial 3 with value: 0.9529159303741082.[0m
[32m[I 2026-02-28 20:32:48,414][0m Trial 5 finished with value: 0.9529162189250681 and parame

Best ROC AUC for LogReg_RFECV: 0.9529


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[32m[I 2026-02-28 20:33:46,844][0m A new study created in RDB with name: LogReg_BORUTA[0m


Saved model to models1/LogReg_RFECV.pkl
LogReg_BORUTA → existing trials: 0


[32m[I 2026-02-28 20:33:50,034][0m Trial 0 finished with value: 0.941192595667534 and parameters: {'C': 7.7953274647161495}. Best is trial 0 with value: 0.941192595667534.[0m
[32m[I 2026-02-28 20:33:53,165][0m Trial 1 finished with value: 0.9411925151074941 and parameters: {'C': 6.867845177816759}. Best is trial 0 with value: 0.941192595667534.[0m
[32m[I 2026-02-28 20:33:56,679][0m Trial 2 finished with value: 0.9411923478570049 and parameters: {'C': 0.04780427118049554}. Best is trial 0 with value: 0.941192595667534.[0m
[32m[I 2026-02-28 20:34:00,404][0m Trial 3 finished with value: 0.9411917457922888 and parameters: {'C': 0.018598106214998703}. Best is trial 0 with value: 0.941192595667534.[0m
[32m[I 2026-02-28 20:34:03,445][0m Trial 4 finished with value: 0.9411918536901769 and parameters: {'C': 0.12478912816803762}. Best is trial 0 with value: 0.941192595667534.[0m
[32m[I 2026-02-28 20:34:06,888][0m Trial 5 finished with value: 0.9411827177003673 and parameters: {'

Best ROC AUC for LogReg_BORUTA: 0.9412


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[32m[I 2026-02-28 20:34:21,275][0m A new study created in RDB with name: RF_ALL[0m


Saved model to models1/LogReg_BORUTA.pkl
RF_ALL → existing trials: 0


[32m[I 2026-02-28 20:34:48,362][0m Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 424, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:35:06,712][0m Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 426, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:35:39,604][0m Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 471, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:36:04,916][0m Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 398, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 0 with value: 1.0.[0m
[32m[I 2026-02-28 20:36:14,105][0m Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 156, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_l

Best ROC AUC for RF_ALL: 1.0000


[32m[I 2026-02-28 20:37:47,313][0m A new study created in RDB with name: RF_MI[0m


Saved model to models1/RF_ALL.pkl
RF_MI → existing trials: 0


[32m[I 2026-02-28 20:38:35,214][0m Trial 0 finished with value: 0.8602363123785249 and parameters: {'n_estimators': 417, 'max_depth': 13, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8602363123785249.[0m
[32m[I 2026-02-28 20:39:18,602][0m Trial 1 finished with value: 0.8601183300403832 and parameters: {'n_estimators': 367, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8602363123785249.[0m
[32m[I 2026-02-28 20:40:12,693][0m Trial 2 finished with value: 0.8598303472497593 and parameters: {'n_estimators': 428, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8602363123785249.[0m
[32m[I 2026-02-28 20:40:41,688][0m Trial 3 finished with value: 0.8581299317537919 and parameters: {'n_estimators': 197, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8602363123785249.[0m
[32m[I 2026-02-28 20:41:37,672][0m Tri

Best ROC AUC for RF_MI: 0.8602


[32m[I 2026-02-28 20:44:34,689][0m A new study created in RDB with name: RF_RFECV[0m


Saved model to models1/RF_MI.pkl
RF_RFECV → existing trials: 0


[32m[I 2026-02-28 20:45:10,863][0m Trial 0 finished with value: 0.9503095768508857 and parameters: {'n_estimators': 360, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9503095768508857.[0m


In [None]:
def summarize_optuna_results(
    storage_url="sqlite:///optuna_studies.db"
):

    storage = optuna.storages.RDBStorage(url=storage_url)
    study_summaries = optuna.study.get_all_study_summaries(storage=storage)

    results = []

    for summary in study_summaries:
        study_name = summary.study_name

        study = optuna.load_study(
            study_name=study_name,
            storage=storage_url
        )

        if len(study.trials) == 0:
            continue

        results.append({
            "Study": study_name,
            "Trials": len(study.trials),
            "Best ROC AUC": study.best_value,
            "Best Params": study.best_params
        })

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values("Best ROC AUC", ascending=False)

    return df_results

In [None]:
results_df = summarize_optuna_results()
results_df