In [41]:
import numpy as np
import os
import pandas as pd
import pickle

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import optuna

import custom_map

In [42]:
import importlib

importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-Hill-Climbing/src/custom_map.py'>

In [43]:
storage_url = "sqlite:///optuna_studies.db"

# Data Preprocessing

In [44]:
data = pd.read_csv("../dataset/train.csv")

target = "Heart Disease"
data['Heart Disease'] = np.where(data['Heart Disease'] == 'Presence', 1, 0)

data.drop('id', axis=1, inplace=True)
categorical_cols = data.columns[(data.nunique() <= 10) & (data.nunique() > 2)]
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=float)

X = data_encoded.drop(columns=[target])
y = data_encoded[target]

data_encoded

Unnamed: 0,Age,Sex,BP,Cholesterol,FBS over 120,Max HR,Exercise angina,ST depression,Heart Disease,Chest pain type_2,...,Chest pain type_4,EKG results_1,EKG results_2,Slope of ST_2,Slope of ST_3,Number of vessels fluro_1,Number of vessels fluro_2,Number of vessels fluro_3,Thallium_6,Thallium_7
0,58,1,152,239,0,158,1,3.6,1,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,52,1,125,325,0,171,0,0.0,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,56,0,160,188,0,151,0,0.0,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,44,0,134,229,0,150,0,1.0,0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,58,1,140,234,0,125,1,3.8,1,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629995,56,0,110,226,0,132,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
629996,54,1,128,249,1,150,0,0.0,0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
629997,67,1,130,275,0,149,0,0.0,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
629998,52,1,140,199,0,157,0,0.0,1,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [45]:
all_cols = X.columns.tolist()

MI_cols = ['Chest pain type_3', 'EKG results_2', 'Number of vessels fluro_2',
    'Number of vessels fluro_1', 'Age', 'Chest pain type_2',
    'Number of vessels fluro_3', 'Cholesterol', 'BP', 'Slope of ST_3',
    'FBS over 120', 'Thallium_6', 'EKG results_1']

RFECV_cols = ['Chest pain type_2', 'Chest pain type_3', 'Chest pain type_4',
       'EKG results_1', 'EKG results_2', 'Slope of ST_2', 'Slope of ST_3',
       'Number of vessels fluro_1', 'Number of vessels fluro_2',
       'Number of vessels fluro_3', 'Thallium_6', 'Thallium_7', 'Sex',
       'FBS over 120', 'Exercise angina', 'Cholesterol', 'ST depression',
       'Age', 'Max HR']

boruta_cols = ['Chest pain type_4', 'Slope of ST_2', 'Thallium_7', 'Exercise angina',
        'Cholesterol', 'ST depression', 'Age', 'Max HR']

feature_sets = {
    "ALL": all_cols,
    #"MI": MI_cols,
    #"RFECV": RFECV_cols,
    #"BORUTA": boruta_cols
}

models = {
    "LogReg": LogisticRegression,
    "RF": RandomForestClassifier,
    "ExtraTrees": ExtraTreesClassifier,
    "KNN": KNeighborsClassifier,
    "SVC": SVC,
    "XGB": XGBClassifier,
    "LGBM": LGBMClassifier,
    "CatBoost": CatBoostClassifier
}

In [46]:
X = data_encoded
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optuna function

In [47]:
def run_optuna_all_models(
    X_train,
    y_train,
    cv,
    data_encoded,
    n_trials=10,
    storage_url="sqlite:///optuna_studies.db",
    models_dir="models1"
):
    os.makedirs(models_dir, exist_ok=True)

    def create_objective(model_name, model_class, X, y):

        def objective(trial):

            if model_name == "LogReg":
                params = {
                    "C": trial.suggest_float("C", 1e-3, 10, log=True),
                    "max_iter": 3000,
                    "solver": "lbfgs"
                }

            elif model_name == "RF":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 150, 350),
                    "max_depth": trial.suggest_int("max_depth", 8, 25),
                    "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                    "max_features": trial.suggest_float("max_features", 0.3, 0.9),
                    "bootstrap": True,
                    "max_samples": trial.suggest_float("max_samples", 0.5, 0.9),
                    "n_jobs": -1,
                }

            elif model_name == "ExtraTrees":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 200, 400),
                    "max_depth": trial.suggest_int("max_depth", 10, 30),
                    "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                    "max_features": trial.suggest_float("max_features", 0.3, 1.0),
                    "bootstrap": False,
                    "n_jobs": -1,
                }

            elif model_name == "KNN":
                params = {
                    "n_neighbors": trial.suggest_int("n_neighbors", 3, 25),
                    "weights": trial.suggest_categorical("weights", ["uniform", "distance"])
                }

            elif model_name == "SVC":
                params = {
                    "C": trial.suggest_float("C", 1e-3, 10, log=True),
                    "gamma": trial.suggest_float("gamma", 1e-4, 1, log=True),
                    "kernel": "rbf",
                    "probability": True
                }

            elif model_name == "XGB":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                    "eval_metric": "auc",
                    "use_label_encoder": False,
                }

            elif model_name == "LGBM":
                params = {
                    "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                    "max_depth": trial.suggest_int("max_depth", -1, 15),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "num_leaves": trial.suggest_int("num_leaves", 20, 200),
                }

            elif model_name == "CatBoost":
                params = {
                    "iterations": trial.suggest_int("iterations", 200, 600),
                    "depth": trial.suggest_int("depth", 4, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "verbose": 0,
                }

            model = model_class(**params)

            scores = cross_val_score(
                model,
                X,
                y,
                cv=cv,
                scoring="roc_auc",
                n_jobs=-1
            )

            return scores.mean()

        return objective

    for model_name, model_class in models.items():
        for fs_name, cols in feature_sets.items():

            X_subset = X_train[cols]
            study_name = f"{model_name}_{fs_name}"

            study = optuna.create_study(
                direction="maximize",
                study_name=study_name,
                storage=storage_url,
                load_if_exists=True
            )

            remaining_trials = n_trials
            print(f"{study_name} existing trials: {len(study.trials)}")

            if remaining_trials > 0:
                study.optimize(create_objective(model_name, model_class, X_subset, y_train),
                    n_trials=remaining_trials, show_progress_bar=True)

            print(f"Best ROC AUC for {study_name}: {study.best_value:.4f}")

            best_model = model_class(**study.best_params)
            best_model.fit(X_subset, y_train)

            model_path = os.path.join(models_dir, f"{study_name}.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(best_model, f)

            print(f"Saved model to {model_path}")

In [None]:
run_optuna_all_models(
    X_train=X_train,
    y_train=y_train,
    cv=cv,
    data_encoded=data_encoded,
    n_trials=10
)

[32m[I 2026-02-28 23:57:46,728][0m Using an existing study with name 'LogReg_ALL' instead of creating a new one.[0m


LogReg_ALL existing trials: 60


Best trial: 0. Best value: 1:  10%|█         | 1/10 [00:28<04:13, 28.12s/it]

[32m[I 2026-02-28 23:58:14,859][0m Trial 60 finished with value: 0.9529793088859299 and parameters: {'C': 0.4772531772986087}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  20%|██        | 2/10 [01:07<04:40, 35.01s/it]

[32m[I 2026-02-28 23:58:54,677][0m Trial 61 finished with value: 0.9529714957278864 and parameters: {'C': 2.8509531633794447}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  30%|███       | 3/10 [01:54<04:42, 40.31s/it]

[32m[I 2026-02-28 23:59:41,306][0m Trial 62 finished with value: 0.9529730507342808 and parameters: {'C': 9.803895358018107}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  40%|████      | 4/10 [02:51<04:40, 46.72s/it]

[32m[I 2026-03-01 00:00:37,845][0m Trial 63 finished with value: 0.9529738407201525 and parameters: {'C': 4.388242252032665}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  50%|█████     | 5/10 [03:40<03:58, 47.78s/it]

[32m[I 2026-03-01 00:01:27,519][0m Trial 64 finished with value: 0.9529771091112738 and parameters: {'C': 1.3812693341031903}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  60%|██████    | 6/10 [04:37<03:22, 50.66s/it]

[32m[I 2026-03-01 00:02:23,764][0m Trial 65 finished with value: 0.9529743805024985 and parameters: {'C': 0.8394135139837903}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  70%|███████   | 7/10 [05:32<02:36, 52.28s/it]

[32m[I 2026-03-01 00:03:19,398][0m Trial 66 finished with value: 0.952973076013991 and parameters: {'C': 2.476671731542627}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  80%|████████  | 8/10 [06:26<01:45, 52.72s/it]

[32m[I 2026-03-01 00:04:13,060][0m Trial 67 finished with value: 0.9529718893893462 and parameters: {'C': 7.300549357512272}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1:  90%|█████████ | 9/10 [07:31<00:56, 56.60s/it]

[32m[I 2026-03-01 00:05:18,174][0m Trial 68 finished with value: 0.952975315162276 and parameters: {'C': 1.891288548651111}. Best is trial 0 with value: 1.0.[0m


Best trial: 0. Best value: 1: 100%|██████████| 10/10 [08:41<00:00, 52.12s/it]


[32m[I 2026-03-01 00:06:27,952][0m Trial 69 finished with value: 0.9529698147920364 and parameters: {'C': 3.6618447184378766}. Best is trial 0 with value: 1.0.[0m
Best ROC AUC for LogReg_ALL: 1.0000


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[32m[I 2026-03-01 00:06:30,804][0m Using an existing study with name 'RF_ALL' instead of creating a new one.[0m


Saved model to models1/LogReg_ALL.pkl
RF_ALL existing trials: 10


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
def summarize_optuna_results(
    storage_url="sqlite:///optuna_studies.db"
):

    storage = optuna.storages.RDBStorage(url=storage_url)
    study_summaries = optuna.study.get_all_study_summaries(storage=storage)

    results = []

    for summary in study_summaries:
        study_name = summary.study_name

        study = optuna.load_study(
            study_name=study_name,
            storage=storage_url
        )

        if len(study.trials) == 0:
            continue

        results.append({
            "Study": study_name,
            "Trials": len(study.trials),
            "Best ROC AUC": study.best_value,
            "Best Params": study.best_params
        })

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values("Best ROC AUC", ascending=False)

    return df_results

In [None]:
results_df = summarize_optuna_results()
results_df