# **EEG Brain Age – Random Forest + BAG  (Final 2025)**

### Librerias

In [7]:
import os
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib
matplotlib.use("Agg")  
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
import optuna
import warnings
warnings.filterwarnings("ignore")

### Load Dataset

In [14]:
DATA_PATH = r"C:\Users\Ale\Downloads\Jade\Ale\Data_Parametrizada\EEG_features_final.csv"
SAVE_DIR = r"C:\Users\Ale\Downloads\Jade\Ale\Jade_Saves\Results_RF_Bayessian"
os.makedirs(SAVE_DIR, exist_ok=True)

df = pd.read_csv(DATA_PATH)
df = df.select_dtypes(include=[np.number])
print(f" Dataset loaded: {df.shape[0]} samples, {df.shape[1]} columns")

 Dataset loaded: 510 samples, 10 columns


## **Feature engineering**
Feature engineering consiste en crear nuevas variables a partir de las bandas EEG para representar mejor la actividad cerebral. En este caso, se usan sumas, razones entre bandas y transformaciones matemáticas para capturar relaciones neurofisiológicas relevantes, reducir ruido y facilitar el aprendizaje de los modelos.

In [3]:
if all(col in df.columns for col in ["Delta","Theta","Alpha","Beta","Gamma"]):
    df["TotalPower"] = df[["Delta","Theta","Alpha","Beta","Gamma"]].sum(axis=1)
    df["Theta_Alpha"] = df["Theta"] / (df["Alpha"] + 1e-6)
    df["Alpha_Beta"]  = df["Alpha"] / (df["Beta"] + 1e-6)
    df["Delta_Alpha"] = df["Delta"] / (df["Alpha"] + 1e-6)
    df["Beta_Gamma"]  = df["Beta"]  / (df["Gamma"] + 1e-6)
    df["Slow_Fast"]   = (df["Delta"] + df["Theta"]) / (df["Alpha"] + df["Beta"] + 1e-6)

    for col in ["Delta","Theta","Alpha","Beta","Gamma"]:
        df[f"log_{col}"] = np.log1p(df[col])
        df[f"sqrt_{col}"] = np.sqrt(df[col])

X = df.drop(columns=["Age"])
y = df["Age"]

Z-score filtering was applied to remove outliers, features were standardized to ensure equal scaling, and balanced sample weights were used to reduce bias caused by uneven target distribution. 

In [4]:
z_scores = np.abs(stats.zscore(X))
filtered_entries = (z_scores < 3).all(axis=1)
X = X[filtered_entries].reset_index(drop=True)
y = y[filtered_entries].reset_index(drop=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Balanced sample weights
bins = pd.cut(y, bins=5, labels=False)
weights = 1 / np.bincount(bins)
sample_weights = np.array([weights[b] for b in bins])

# **Training MLP**

## **Bayesian hyperparameter optimization using Optuna**

In [5]:
def objective(trial):
    rf = RandomForestRegressor(
        n_estimators=trial.suggest_int("n_estimators", 500, 2000),
        max_depth=trial.suggest_int("max_depth", 10, 80),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 5),
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    )

    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
    scores = []
    for train_idx, test_idx in cv.split(X_scaled, y):
        X_tr, X_te = X_scaled[train_idx], X_scaled[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
        w_tr = sample_weights[train_idx]
        rf.fit(X_tr, y_tr, sample_weight=w_tr)
        y_pred = rf.predict(X_te)
        scores.append(r2_score(y_te, y_pred))
    return np.mean(scores)

In [8]:
print("\n Running Optuna Optimization (Random Forest + EEG)...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n[Optuna Results]")
print(f"Best CV R²: {study.best_value:.3f}")
for k, v in study.best_params.items():
    print(f"  - {k}: {v}")

[I 2026-01-06 19:07:14,041] A new study created in memory with name: no-name-9f524b53-290c-4590-ab42-f4871ff18bd3



 Running Optuna Optimization (Random Forest + EEG)...


Best trial: 0. Best value: 0.883202:   2%|▎         | 1/40 [01:58<1:17:16, 118.88s/it]

[I 2026-01-06 19:09:12,939] Trial 0 finished with value: 0.8832024822962707 and parameters: {'n_estimators': 1872, 'max_depth': 63, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.8832024822962707.


Best trial: 0. Best value: 0.883202:   5%|▌         | 2/40 [03:01<54:10, 85.54s/it]   

[I 2026-01-06 19:10:15,130] Trial 1 finished with value: 0.7165230185774624 and parameters: {'n_estimators': 1514, 'max_depth': 61, 'max_features': 'sqrt', 'min_samples_split': 7, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8832024822962707.


Best trial: 0. Best value: 0.883202:   8%|▊         | 3/40 [05:09<1:04:48, 105.10s/it]

[I 2026-01-06 19:12:23,504] Trial 2 finished with value: 0.8266243274560264 and parameters: {'n_estimators': 1921, 'max_depth': 44, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8832024822962707.


Best trial: 3. Best value: 0.939515:  10%|█         | 4/40 [06:34<58:23, 97.31s/it]   

[I 2026-01-06 19:13:48,882] Trial 3 finished with value: 0.9395145052054973 and parameters: {'n_estimators': 1235, 'max_depth': 36, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9395145052054973.


Best trial: 3. Best value: 0.939515:  12%|█▎        | 5/40 [08:10<56:30, 96.88s/it]

[I 2026-01-06 19:15:24,989] Trial 4 finished with value: 0.8029266739260701 and parameters: {'n_estimators': 1408, 'max_depth': 21, 'max_features': 'sqrt', 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9395145052054973.


Best trial: 3. Best value: 0.939515:  15%|█▌        | 6/40 [10:07<58:41, 103.56s/it]

[I 2026-01-06 19:17:21,522] Trial 5 finished with value: 0.8531449998459322 and parameters: {'n_estimators': 1918, 'max_depth': 21, 'max_features': 'sqrt', 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9395145052054973.


Best trial: 3. Best value: 0.939515:  18%|█▊        | 7/40 [11:09<49:33, 90.12s/it] 

[I 2026-01-06 19:18:23,960] Trial 6 finished with value: 0.750293868413376 and parameters: {'n_estimators': 860, 'max_depth': 45, 'max_features': 'sqrt', 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9395145052054973.


Best trial: 3. Best value: 0.939515:  20%|██        | 8/40 [13:31<56:43, 106.37s/it]

[I 2026-01-06 19:20:45,124] Trial 7 finished with value: 0.8265812255735832 and parameters: {'n_estimators': 1908, 'max_depth': 65, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9395145052054973.


Best trial: 3. Best value: 0.939515:  22%|██▎       | 9/40 [14:11<44:14, 85.63s/it] 

[I 2026-01-06 19:21:25,144] Trial 8 finished with value: 0.7697280552881843 and parameters: {'n_estimators': 508, 'max_depth': 73, 'max_features': 'sqrt', 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9395145052054973.


Best trial: 3. Best value: 0.939515:  25%|██▌       | 10/40 [16:13<48:28, 96.95s/it]

[I 2026-01-06 19:23:27,452] Trial 9 finished with value: 0.7623837989135847 and parameters: {'n_estimators': 1691, 'max_depth': 69, 'max_features': 'log2', 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9395145052054973.


Best trial: 10. Best value: 0.940343:  28%|██▊       | 11/40 [17:22<42:42, 88.35s/it]

[I 2026-01-06 19:24:36,307] Trial 10 finished with value: 0.9403431304353016 and parameters: {'n_estimators': 1085, 'max_depth': 32, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 10 with value: 0.9403431304353016.


Best trial: 11. Best value: 0.940355:  30%|███       | 12/40 [18:17<36:33, 78.34s/it]

[I 2026-01-06 19:25:31,731] Trial 11 finished with value: 0.9403550798776596 and parameters: {'n_estimators': 1054, 'max_depth': 31, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  32%|███▎      | 13/40 [19:35<35:14, 78.33s/it]

[I 2026-01-06 19:26:50,035] Trial 12 finished with value: 0.9403292393817533 and parameters: {'n_estimators': 1047, 'max_depth': 30, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  35%|███▌      | 14/40 [20:39<32:01, 73.91s/it]

[I 2026-01-06 19:27:53,742] Trial 13 finished with value: 0.8719331144018321 and parameters: {'n_estimators': 827, 'max_depth': 32, 'max_features': 'log2', 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  38%|███▊      | 15/40 [22:00<31:42, 76.09s/it]

[I 2026-01-06 19:29:14,903] Trial 14 finished with value: 0.9375871716999679 and parameters: {'n_estimators': 1093, 'max_depth': 13, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  40%|████      | 16/40 [22:57<28:09, 70.39s/it]

[I 2026-01-06 19:30:12,044] Trial 15 finished with value: 0.8813361277566334 and parameters: {'n_estimators': 782, 'max_depth': 52, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  42%|████▎     | 17/40 [24:21<28:30, 74.37s/it]

[I 2026-01-06 19:31:35,675] Trial 16 finished with value: 0.9015592383684634 and parameters: {'n_estimators': 1102, 'max_depth': 23, 'max_features': 'log2', 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  45%|████▌     | 18/40 [25:06<24:03, 65.62s/it]

[I 2026-01-06 19:32:20,929] Trial 17 finished with value: 0.8815185251566267 and parameters: {'n_estimators': 593, 'max_depth': 38, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  48%|████▊     | 19/40 [26:49<26:53, 76.86s/it]

[I 2026-01-06 19:34:03,957] Trial 18 finished with value: 0.8203895648632059 and parameters: {'n_estimators': 1353, 'max_depth': 52, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  50%|█████     | 20/40 [27:58<24:47, 74.35s/it]

[I 2026-01-06 19:35:12,476] Trial 19 finished with value: 0.8776045023778103 and parameters: {'n_estimators': 898, 'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  52%|█████▎    | 21/40 [29:19<24:08, 76.24s/it]

[I 2026-01-06 19:36:33,102] Trial 20 finished with value: 0.8201550651811975 and parameters: {'n_estimators': 1254, 'max_depth': 28, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  55%|█████▌    | 22/40 [30:29<22:20, 74.46s/it]

[I 2026-01-06 19:37:43,412] Trial 21 finished with value: 0.9403100409411533 and parameters: {'n_estimators': 1033, 'max_depth': 27, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  57%|█████▊    | 23/40 [31:05<17:52, 63.08s/it]

[I 2026-01-06 19:38:19,941] Trial 22 finished with value: 0.9365530329526012 and parameters: {'n_estimators': 949, 'max_depth': 41, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  60%|██████    | 24/40 [32:41<19:23, 72.69s/it]

[I 2026-01-06 19:39:55,067] Trial 23 finished with value: 0.8815549170546196 and parameters: {'n_estimators': 1199, 'max_depth': 17, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  62%|██████▎   | 25/40 [33:34<16:45, 67.00s/it]

[I 2026-01-06 19:40:48,799] Trial 24 finished with value: 0.9361460419216724 and parameters: {'n_estimators': 686, 'max_depth': 31, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  65%|██████▌   | 26/40 [34:52<16:22, 70.18s/it]

[I 2026-01-06 19:42:06,404] Trial 25 finished with value: 0.872399217319447 and parameters: {'n_estimators': 1024, 'max_depth': 34, 'max_features': 'log2', 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  68%|██████▊   | 27/40 [36:44<17:57, 82.91s/it]

[I 2026-01-06 19:43:59,019] Trial 26 finished with value: 0.9288592625848202 and parameters: {'n_estimators': 1424, 'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  70%|███████   | 28/40 [38:29<17:52, 89.40s/it]

[I 2026-01-06 19:45:43,570] Trial 27 finished with value: 0.881689936740025 and parameters: {'n_estimators': 1566, 'max_depth': 26, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  72%|███████▎  | 29/40 [39:10<13:45, 75.00s/it]

[I 2026-01-06 19:46:24,973] Trial 28 finished with value: 0.7045464049050086 and parameters: {'n_estimators': 1181, 'max_depth': 40, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  75%|███████▌  | 30/40 [39:41<10:15, 61.55s/it]

[I 2026-01-06 19:46:55,146] Trial 29 finished with value: 0.8815870992970832 and parameters: {'n_estimators': 973, 'max_depth': 57, 'max_features': 'log2', 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  78%|███████▊  | 31/40 [40:34<08:51, 59.01s/it]

[I 2026-01-06 19:47:48,224] Trial 30 finished with value: 0.8847019182741256 and parameters: {'n_estimators': 730, 'max_depth': 17, 'max_features': 'log2', 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  80%|████████  | 32/40 [41:51<08:37, 64.63s/it]

[I 2026-01-06 19:49:05,976] Trial 31 finished with value: 0.9403458636955515 and parameters: {'n_estimators': 1048, 'max_depth': 28, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  82%|████████▎ | 33/40 [43:12<08:06, 69.51s/it]

[I 2026-01-06 19:50:26,872] Trial 32 finished with value: 0.9403187825781999 and parameters: {'n_estimators': 1113, 'max_depth': 30, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  85%|████████▌ | 34/40 [44:49<07:45, 77.52s/it]

[I 2026-01-06 19:52:03,081] Trial 33 finished with value: 0.9369300262736817 and parameters: {'n_estimators': 1314, 'max_depth': 35, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  88%|████████▊ | 35/40 [45:56<06:12, 74.52s/it]

[I 2026-01-06 19:53:10,589] Trial 34 finished with value: 0.9401409106050319 and parameters: {'n_estimators': 939, 'max_depth': 23, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 11. Best value: 0.940355:  90%|█████████ | 36/40 [47:16<05:04, 76.10s/it]

[I 2026-01-06 19:54:30,397] Trial 35 finished with value: 0.881401761463793 and parameters: {'n_estimators': 1151, 'max_depth': 47, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 11 with value: 0.9403550798776596.


Best trial: 36. Best value: 0.940397:  92%|█████████▎| 37/40 [48:32<03:48, 76.02s/it]

[I 2026-01-06 19:55:46,227] Trial 36 finished with value: 0.9403969429264177 and parameters: {'n_estimators': 1036, 'max_depth': 18, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 36 with value: 0.9403969429264177.


Best trial: 36. Best value: 0.940397:  95%|█████████▌| 38/40 [49:17<02:13, 66.93s/it]

[I 2026-01-06 19:56:31,950] Trial 37 finished with value: 0.9284103791732808 and parameters: {'n_estimators': 1321, 'max_depth': 17, 'max_features': 'sqrt', 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 36 with value: 0.9403969429264177.


Best trial: 36. Best value: 0.940397:  98%|█████████▊| 39/40 [50:03<01:00, 60.45s/it]

[I 2026-01-06 19:57:17,262] Trial 38 finished with value: 0.88160953804219 and parameters: {'n_estimators': 1533, 'max_depth': 25, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 36 with value: 0.9403969429264177.


Best trial: 36. Best value: 0.940397: 100%|██████████| 40/40 [50:41<00:00, 76.04s/it]

[I 2026-01-06 19:57:55,797] Trial 39 finished with value: 0.9395399085862195 and parameters: {'n_estimators': 1272, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 36 with value: 0.9403969429264177.

[Optuna Results]
Best CV R²: 0.940
  - n_estimators: 1036
  - max_depth: 18
  - max_features: log2
  - min_samples_split: 2
  - min_samples_leaf: 1





### **Train Final Model**

In [15]:
best_params = study.best_params
rf_best = RandomForestRegressor(**best_params, bootstrap=True, random_state=42, n_jobs=-1)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, shuffle=True)
rf_best.fit(X_train, y_train, sample_weight=sample_weights[y_train.index])
y_pred = rf_best.predict(X_test)

In [16]:
mae = mean_absolute_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)
r   = np.corrcoef(np.array(y_test), np.array(y_pred))[0, 1]

print("\n================= FINAL TEST RESULTS =================")
print(f"MAE (years): {mae:.2f}")
print(f"R²: {r2:.3f}")
print(f"Pearson r: {r:.3f}")


MAE (years): 4.46
R²: 0.898
Pearson r: 0.960


### **BAG Calculation and Categorization**

In [18]:
BAG = y_pred - y_test.values
bag_mean, bag_std = BAG.mean(), BAG.std()
print(f"\n Brain Age Gap (BAG): mean={bag_mean:.2f}, std={bag_std:.2f}")

def categorize_bag(bag):
    if bag < -3: return "Resilient"
    elif bag > 3: return "Accelerated"
    else: return "Normal"

bag_categories = np.vectorize(categorize_bag)(BAG)
bag_df = pd.DataFrame({
    "Chronological_Age": y_test.values,
    "Predicted_Age": y_pred,
    "BAG": BAG,
    "Category": bag_categories
})
bag_df.to_csv(os.path.join(SAVE_DIR, "EEG_Brain_Age_Gap_RF.csv"), index=False)


 Brain Age Gap (BAG): mean=-0.61, std=6.73


### Histograms

In [None]:
# BAG histogram
plt.figure(figsize=(7,5))
plt.hist(BAG, bins=25, color="lightcoral", edgecolor="black")
plt.axvline(0, color="blue", linestyle="--")
plt.title("Brain Age Gap (Predicted - Chronological) – Random Forest")
plt.xlabel("BAG (years)")
plt.ylabel("Count")
plt.grid(alpha=0.4)
plt.tight_layout()
plt.savefig(os.path.join(SAVE_DIR, "bag_hist_RF.png"))
plt.close()

# BAG categories
plt.figure(figsize=(6,4))
pd.Series(bag_categories).value_counts().reindex(["Resilient","Normal","Accelerated"]).plot(kind="bar", color=["#66c2a5","#fc8d62","#8da0cb"])
plt.title("BAG Categories Distribution (±3 years)")
plt.ylabel("Count")
plt.grid(axis="y", alpha=0.4)
plt.tight_layout()
plt.savefig(os.path.join(SAVE_DIR, "bag_categories_RF.png"))
plt.close()



### Feature Importance

In [19]:
feat_imp = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf_best.feature_importances_
}).sort_values("Importance", ascending=False)

feat_imp.to_csv(os.path.join(SAVE_DIR, "RF_feature_importance.csv"), index=False)
plt.figure(figsize=(8,6))
plt.barh(feat_imp["Feature"][:10], feat_imp["Importance"][:10], color="darkorange")
plt.gca().invert_yaxis()
plt.title("Top 10 Feature Importances – Random Forest")
plt.tight_layout()
plt.savefig(os.path.join(SAVE_DIR, "RF_feature_importance.png"))
plt.close()

### Permutation Importance

In [20]:

print("\n Computing permutation importance...")
perm_result = permutation_importance(
    rf_best, X_test, y_test, n_repeats=10, random_state=42, scoring="r2"
)
perm_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": perm_result.importances_mean
}).sort_values("Importance", ascending=False)

perm_df.to_csv(os.path.join(SAVE_DIR, "RF_permutation_importance.csv"), index=False)
plt.figure(figsize=(8,6))
plt.barh(perm_df["Feature"][:10], perm_df["Importance"][:10], color="goldenrod")
plt.gca().invert_yaxis()
plt.title("Top 10 Features – Permutation Importance (RF)")
plt.tight_layout()
plt.savefig(os.path.join(SAVE_DIR, "RF_permutation_importance.png"))
plt.close()




 Computing permutation importance...


In [21]:

pd.DataFrame([{
    "MAE": mae, "R2": r2, "r": r, "BAG_mean": bag_mean, "BAG_std": bag_std,
    "Best_Params": best_params
}]).to_csv(os.path.join(SAVE_DIR, "RF_final_results.csv"), index=False)

print(f"\n All Random Forest results saved in: {SAVE_DIR}")


 All Random Forest results saved in: C:\Users\Ale\Downloads\Jade\Ale\Jade_Saves\Results_RF_Bayessian


### Comparison Trials

In [25]:
def BAG_stadistics(y_pred, y_test):
    BAG = y_pred - y_test.values
    bag_mean, bag_std = BAG.mean(), BAG.std()
    print(f"\n Brain Age Gap (BAG): mean={bag_mean:.2f}, std={bag_std:.2f}")
    return bag_mean, bag_std

    
def comparison_trials(n_trials, best_params, X_train, y_train, X_test, y_test):
    results = []
    for i in range(n_trials):
        print("\nTraining final model with best hyperparameters...")
        print("Trial:", i+1)
        
        rf_best = RandomForestRegressor(**best_params, bootstrap=True, random_state=42, n_jobs=-1)

        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, shuffle=True)
        rf_best.fit(X_train, y_train, sample_weight=sample_weights[y_train.index])
        y_pred = rf_best.predict(X_test)
        
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        r = np.corrcoef(y_test, y_pred)[0, 1]
        bag_mean, bag_std = BAG_stadistics(y_pred, y_test)

        results.append({
    "Trial": i + 1,
    "MAE": mae,
    "R2": r2,
    "Pearson_r": r,
    "BAG_mean": bag_mean,
    "BAG_std": bag_std,
    "Model": "RandomForest",
    "N_estimators": best_params["n_estimators"],
    "Max_depth": best_params["max_depth"]
        })


    # save once at the end
    results_df = pd.DataFrame(results)
    results_df.to_csv(
        os.path.join(SAVE_DIR, "TRIALS_RF_Optuna_Bayessian.csv"),
        index=False
    )

    print(f"\nAll results saved in: {SAVE_DIR}")

In [27]:
import time
star_time = time.time()
n=20
comparison_trials(20, best_params, X_train, y_train, X_test, y_test)
end_time = time.time()
print(f"\n Total comparison trials time: {end_time - star_time:.2f} seconds") 


Training final model with best hyperparameters...
Trial: 1

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 2

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 3

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 4

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 5

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 6

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 7

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 8

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 9

 Brain Age Gap (BAG): mean=-0.61, std=6.73

Training final model with best hyperparameters...
Trial: 10

 B

Trials Time: 32.46 seconds