In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import pingouin as pg
import seaborn as sns
import pandas as pd
from scipy import stats
from statannotations.Annotator import Annotator

%matplotlib widget
sns.set_palette("pastel")
palette = sns.color_palette("pastel")

<h1 style="text-align:center">Dataset Loading</h1>

In [17]:
import os
import shutil
import optuna
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

SEED = 108
N_FOLDS = 3


def objective(trial):
    data = pd.read_csv("data/hamd_data.csv")
    y = data.pop("hamd_response")
    X = data

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": SEED,
    }

    if param["booster"] in ["gbtree", "dart"]:
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        )

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical(
            "sample_type", ["uniform", "weighted"]
        )
        param["normalize_type"] = trial.suggest_categorical(
            "normalize_type", ["tree", "forest"]
        )
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    mcc_scores = []
    best_n_estimators_list = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        clf = xgb.XGBClassifier(
            **param,
            n_estimators=10000,
            early_stopping_rounds=100,
        )

        clf.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False,
        )

        preds = clf.predict(X_valid)
        mcc = matthews_corrcoef(y_valid, preds)
        mcc_scores.append(mcc)
        best_n_estimators_list.append(clf.best_iteration)

    trial.set_user_attr("n_estimators", int(np.mean(best_n_estimators_list)))
    return np.mean(mcc_scores)


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20, timeout=600)

    print("Number of finished trials:", len(study.trials))
    print("Best trial:")
    trial = study.best_trial
    print("  Value (MCC): {:.4f}".format(trial.value))
    print("  Params:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
    print("  Avg. number of estimators:", trial.user_attrs["n_estimators"])

[I 2025-07-07 18:41:41,906] A new study created in memory with name: no-name-13e49dbd-174e-48bf-acb1-c34539153847
[W 2025-07-07 18:47:01,272] Trial 0 failed with parameters: {'booster': 'dart', 'lambda': 1.397482656351756e-06, 'alpha': 3.552117312322106e-06, 'subsample': 0.3053230912344269, 'colsample_bytree': 0.34407720607407094, 'max_depth': 5, 'min_child_weight': 2, 'eta': 1.4763917829367005e-06, 'gamma': 8.649687398261285e-05, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.0002992682391829122, 'skip_drop': 0.00020464932388442145} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\barbie\Desktop\telomere-mental-health\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\barbie\AppData\Local\Temp\ipykernel_20592\3832988141.py", line 65, in objective
    clf.fit(
  File "c:\User

KeyboardInterrupt: 