In [2]:
import sys, os
sys.path.insert(0, os.path.abspath("../src"))
from BRAT.algorithms import BRATD, BRATP
from BRAT.utils import load_and_clean_uci_data, train_all_models, plot_mean_std_trajectories
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from pathlib import Path

This notebook provides the MSE horserace we presented in the paper. We segment this notebook into 9 sections, one per dataset, with each section containing the experiments for that dataset. The datasets we used are:
1. [Air Quality](https://archive.ics.uci.edu/dataset/360/air+quality)
2. [Abalone](https://archive.ics.uci.edu/dataset/1/abalone)
3. [Obesity Level](https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition)
4. [Infrared Thermography Temperature](https://archive.ics.uci.edu/dataset/925/infrared+thermography+temperature+dataset)
5. [Wine Quality](https://archive.ics.uci.edu/dataset/186/wine+quality)
6. [Communities and Crimes](https://archive.ics.uci.edu/dataset/183/communities+and+crime)
7. [AIDS Clinical Trials Group Study 175](https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175)
8. [Automobile](https://archive.ics.uci.edu/dataset/10/automobile)
9. [Student Performance](https://archive.ics.uci.edu/dataset/320/student+performance)

For each dataset, the first cell creates an Optuna study over the hyperparameters according to the following configuration:

In [3]:
# Notebook-level placeholders for the current dataset split. Each dataset
# section rebinds these names before tuning, and the objective functions
# defined in this cell read them as globals at call time.
X_train = y_train = X_test = y_test = None
def objective_gbt(trial):
    """Optuna objective for scikit-learn's GradientBoostingRegressor.

    Samples a learning rate and a tree depth (the number of estimators is
    pinned to 500 via a degenerate range), fits on the notebook-level
    ``X_train``/``y_train`` and returns the mean squared error on the
    notebook-level ``X_test``/``y_test``.

    NOTE(review): the score is computed on ``X_test``; confirm that tuning
    against the split that is also used for reporting is intended.
    """
    hparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 500, 500),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 16),
    )
    regressor = GradientBoostingRegressor(**hparams)
    regressor.fit(X_train, y_train)
    return mean_squared_error(y_test, regressor.predict(X_test))

def objective_xgb(trial):
    """Optuna objective for XGBoost's XGBRegressor.

    Samples learning rate, tree depth and row subsampling ratio (the number of
    estimators is pinned to 500 via a degenerate range), fits on the
    notebook-level ``X_train``/``y_train`` and returns the mean squared error
    on the notebook-level ``X_test``/``y_test``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The MSE of the fitted model's predictions on ``X_test``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    }
    # `use_label_encoder` was a (now deprecated) XGBClassifier-only option; on a
    # regressor it has no effect and only produces an "unused parameter"
    # warning, so it is no longer passed.
    model = XGBRegressor(**params, eval_metric='rmse')
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_squared_error(y_test, preds)

def objective_lgbm(trial):
    """Optuna objective for LightGBM's LGBMRegressor.

    Samples learning rate, maximum depth (range starts at -1) and number of
    leaves, with the number of estimators pinned to 500. Fits on the
    notebook-level ``X_train``/``y_train`` and returns the mean squared error
    on the notebook-level ``X_test``/``y_test``.
    """
    hparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 500, 500),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', -1, 16),
        num_leaves=trial.suggest_int('num_leaves', 20, 50),
    )
    regressor = LGBMRegressor(**hparams)
    regressor.fit(X_train, y_train)
    return mean_squared_error(y_test, regressor.predict(X_test))

def objective_rf(trial):
    """Optuna objective for scikit-learn's RandomForestRegressor.

    Only the tree depth is searched; the number of estimators is pinned to
    500. Fits on the notebook-level ``X_train``/``y_train`` and returns the
    mean squared error on the notebook-level ``X_test``/``y_test``.
    """
    hparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 500, 500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
    )
    forest = RandomForestRegressor(**hparams)
    forest.fit(X_train, y_train)
    return mean_squared_error(y_test, forest.predict(X_test))

def objective_elasticnet(trial):
    """Optuna objective for scikit-learn's ElasticNet.

    Samples the regularisation strength ``alpha`` and the ``l1_ratio`` mix,
    fits on the notebook-level ``X_train``/``y_train`` and returns the mean
    squared error on the notebook-level ``X_test``/``y_test``.
    """
    hparams = dict(
        alpha=trial.suggest_float('alpha', 0.1, 10.0),
        l1_ratio=trial.suggest_float('l1_ratio', 0.1, 1.0),
    )
    linear_model = ElasticNet(**hparams)
    linear_model.fit(X_train, y_train)
    return mean_squared_error(y_test, linear_model.predict(X_test))

def objective_bratd(trial):
    """Optuna objective for the BRATD model with a searched dropout rate.

    Samples depth, minimum split size, learning rate, subsample rate and a
    dropout rate in [0.1, 0.99]; the number of estimators is pinned to 500.
    Returns the mean squared error on the notebook-level ``X_test``/``y_test``.
    """
    hparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 500, 500),
        max_depth=trial.suggest_int('max_depth', 3, 16),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 50),
        learning_rate=trial.suggest_float('learning_rate', 0.1, 1.0),
        subsample_rate=trial.suggest_float('subsample_rate', 0.5, 1.0),
        dropout_rate=trial.suggest_float('dropout_rate', 0.1, 0.99),
    )
    brat = BRATD(**hparams, disable_tqdm=True)
    # The test split is handed to fit() as well; presumably used for tracking
    # held-out error during training -- confirm against the BRAT package.
    brat.fit(X_train, y_train, X_test, y_test)
    return mean_squared_error(y_test, brat.predict(X_test))

def objective_boulevard(trial):
    """Optuna objective for the Boulevard baseline.

    Uses the same BRATD class and search space as ``objective_bratd`` except
    that the dropout rate is pinned to 0.0 through a degenerate range.
    Returns the mean squared error on the notebook-level ``X_test``/``y_test``.
    """
    hparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 500, 500),
        max_depth=trial.suggest_int('max_depth', 3, 16),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 50),
        learning_rate=trial.suggest_float('learning_rate', 0.1, 1.0),
        subsample_rate=trial.suggest_float('subsample_rate', 0.5, 1.0),
        dropout_rate=trial.suggest_float('dropout_rate', 0.0, 0.0),
    )
    boulevard = BRATD(**hparams, disable_tqdm=True)
    # The test split is handed to fit() as well; presumably used for tracking
    # held-out error during training -- confirm against the BRAT package.
    boulevard.fit(X_train, y_train, X_test, y_test)
    return mean_squared_error(y_test, boulevard.predict(X_test))

def objective_bratp(trial):
    """Optuna objective for the BRATP model.

    Samples depth, minimum split size, learning rate, subsample rate and the
    number of trees per group; the number of estimators is pinned to 500.
    Returns the mean squared error on the notebook-level ``X_test``/``y_test``.
    """
    hparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 500, 500),
        max_depth=trial.suggest_int('max_depth', 3, 16),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 50),
        learning_rate=trial.suggest_float('learning_rate', 0.1, 1.0),
        subsample_rate=trial.suggest_float('subsample_rate', 0.5, 1.0),
        n_trees_per_group=trial.suggest_int('n_trees_per_group', 2, 30),
    )
    brat = BRATP(**hparams, disable_tqdm=True)
    # The test split is handed to fit() as well; presumably used for tracking
    # held-out error during training -- confirm against the BRAT package.
    brat.fit(X_train, y_train, X_test, y_test)
    return mean_squared_error(y_test, brat.predict(X_test))

We save the suggestions to:

In [4]:
# Output locations, relative to the notebook's working directory.
storage_dir = Path("optuna_suggestions")   # Optuna SQLite studies and best-parameter CSVs
mse_dir = Path("mse_trajectory/data/")     # per-dataset MSE trajectory CSVs
plot_dir = Path("mse_trajectory/plots/")   # handed to plot_mean_std_trajectories

# Create all three directories up front. plot_dir used to be the only one that
# was not created here, leaving its existence up to the plotting helper.
for directory in (storage_dir, mse_dir, plot_dir):
    directory.mkdir(parents=True, exist_ok=True)

For a detailed walkthrough of how to reproduce the results, see the Air Quality section below as an example.

# Air Quality

The following cell optimizes the hyperparameters of each model and saves them under the storage directory `../experiments/optuna_suggestions/`. After running it, you should see a corresponding `db` file and a `csv` file. The `db` file stores the trajectory of the optimization executed by Optuna; you can open it with Optuna Dashboard, which is also available as a VS Code extension. The `csv` file stores the best parameters.

In [None]:
# Hyperparameter search for every model on the Air Quality dataset (UCI id 360).
uci_id = 360
# NOTE(review): this call relies on the loader's defaults for test_size,
# random_state and normalize, while the evaluation cell passes them explicitly
# (normalize=True) -- confirm the defaults produce the same split/scaling.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id, target_column='C6H6(GT)')

# Display name -> Optuna objective; the name doubles as the study name in the DB.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

best_params = {}
storage_url = f"sqlite:///{storage_dir / 'air_quality.db'}"
n_trials = 20  # target number of completed trials per model

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    # The objectives never call trial.report(), so this pruner is currently
    # inert; it is kept so the stored study configuration stays unchanged.
    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    # Try to load an existing study; if it doesn't exist, create a new one.
    # The sampler/pruner are passed to load_study as well: without them a
    # reloaded study silently falls back to Optuna's default, unseeded sampler.
    try:
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Count only COMPLETE trials. Failed or interrupted trials used to make the
    # study look "done", which skipped the search and could make
    # study.best_params raise because no trial had actually finished.
    completed_trials = study.get_trials(
        deepcopy=False,
        states=(optuna.trial.TrialState.COMPLETE,)
    )
    remaining_trials = n_trials - len(completed_trials)

    if remaining_trials > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=remaining_trials)
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
    best_params[model_name] = study.best_params

# Persist one row per model with its best hyperparameters.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

Fill in the tuned hyperparameters in the next cell and run it to obtain the results reported in the paper. The stored trajectories can be found at `../experiments/mse_trajectory/`.

In [4]:
# Air Quality (UCI id 360): retrain every model with its tuned configuration,
# collect the MSE trajectories of five repeated runs, then save and plot them.
uci_id = 360
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    target_column='C6H6(GT)',
    test_size=0.2,
    random_state=42,
    normalize=True,
)

epoch = 500

models = ['GBT', 'XGBoost', 'LightGBM', 'RF', 'ElasticNet', 'BRATD', 'Boulevard', 'BRATP']

# Tuned hyperparameters, one entry per model name listed above.
manual_configs = {
    'GBT': dict(n_estimators=500, learning_rate=0.029, max_depth=5),
    'XGBoost': dict(n_estimators=500, learning_rate=0.07, max_depth=11, subsample=0.55),
    'LightGBM': dict(n_estimators=500, learning_rate=0.02, max_depth=6, num_leaves=21),
    'RF': dict(n_estimators=500, max_depth=8),
    'ElasticNet': dict(alpha=0.13, l1_ratio=1.0),
    'BRATD': dict(
        n_estimators=500, max_depth=13, min_samples_split=13,
        learning_rate=0.94, subsample_rate=0.98, dropout_rate=0.11,
    ),
    'Boulevard': dict(
        n_estimators=500, max_depth=13, min_samples_split=8,
        learning_rate=0.73, subsample_rate=0.99, dropout_rate=0.0,
    ),
    'BRATP': dict(
        n_estimators=500, max_depth=16, min_samples_split=20,
        learning_rate=0.93, subsample_rate=0.79, n_trees_per_group=6,
    ),
}

# Repeat the whole training five times; each run yields {model name: MSE sequence}.
mse_runs = []
for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train, X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i,
    )
    mse_runs.append(mse_dict)

# Long-format records: one row per (model, run, epoch), with 1-based run/epoch ids.
flattened_data = [
    {'Model': name, 'Run_ID': run_number, 'Epoch': epoch_number, 'MSE': value}
    for run_number, run_mse in enumerate(mse_runs, start=1)
    for name, trajectory in run_mse.items()
    for epoch_number, value in enumerate(trajectory, start=1)
]

mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Aggregate the runs into a single figure saved under plot_dir.
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Air Quality")

  y_train_arr = y_train.astype(np.float32).ravel()
  y_test_arr = y_test.astype(np.float32).ravel()
Run 1: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:05<00:00, 94.44it/s]
Run 1: Training Models:  12%|█▎        | 1/8 [00:06<00:47,  6.75s/it]

Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [06:08<00:00,  1.36it/s]

Run 1: Training Models:  25%|██▌       | 2/8 [06:15<21:57, 219.61s/it]


Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4252.06it/s]
Run 1: Training Models:  38%|███▊      | 3/8 [06:49<11:14, 134.92s/it]

Training ElasticNet with manual configuration...
Training ElasticNet...


Run 1: Training Models:  50%|█████     | 4/8 [06:49<05:27, 81.78s/it] 

Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:03<00:00, 150.05it/s]
Run 1: Training Models:  62%|██████▎   | 5/8 [06:56<02:44, 54.71s/it]

Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 3454.78it/s]
Run 1: Training Models:  75%|███████▌  | 6/8 [07:31<01:35, 47.89s/it]

Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [02:42<00:00,  3.07it/s]

Run 1: Training Models:  88%|████████▊ | 7/8 [10:13<01:25, 85.46s/it]


Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [06:30<00:00,  1.28it/s]

Run 1: Training Models: 100%|██████████| 8/8 [16:44<00:00, 125.53s/it]





Run 1: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:05<00:00, 91.48it/s]
Run 1: Training Models:  12%|█▎        | 1/8 [00:06<00:47,  6.78s/it]

Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [06:07<00:00,  1.36it/s]

Run 1: Training Models:  25%|██▌       | 2/8 [06:13<21:52, 218.74s/it]


Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4244.23it/s]
Run 1: Training Models:  38%|███▊      | 3/8 [06:48<11:13, 134.74s/it]

Training ElasticNet with manual configuration...
Training ElasticNet...


Run 1: Training Models:  50%|█████     | 4/8 [06:49<05:26, 81.68s/it] 

Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:02<00:00, 173.57it/s]
Run 1: Training Models:  62%|██████▎   | 5/8 [06:54<02:42, 54.29s/it]

Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 3620.77it/s]
Run 1: Training Models:  75%|███████▌  | 6/8 [07:29<01:35, 47.58s/it]

Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [02:44<00:00,  3.04it/s]

Run 1: Training Models:  88%|████████▊ | 7/8 [10:13<01:25, 85.75s/it]


Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [06:29<00:00,  1.28it/s]

Run 1: Training Models: 100%|██████████| 8/8 [16:42<00:00, 125.36s/it]





Run 2: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:04<00:00, 120.76it/s]
Run 2: Training Models:  12%|█▎        | 1/8 [00:05<00:38,  5.52s/it]

Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [06:08<00:00,  1.36it/s]

Run 2: Training Models:  25%|██▌       | 2/8 [06:14<21:55, 219.24s/it]


Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4324.59it/s]
Run 2: Training Models:  38%|███▊      | 3/8 [06:48<11:13, 134.75s/it]

Training ElasticNet with manual configuration...
Training ElasticNet...


Run 2: Training Models:  50%|█████     | 4/8 [06:48<05:26, 81.68s/it] 

Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:03<00:00, 151.88it/s]
Run 2: Training Models:  62%|██████▎   | 5/8 [06:55<02:43, 54.66s/it]

Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 3612.90it/s]
Run 2: Training Models:  75%|███████▌  | 6/8 [07:30<01:35, 47.86s/it]

Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [02:43<00:00,  3.06it/s]

Run 2: Training Models:  88%|████████▊ | 7/8 [10:13<01:25, 85.58s/it]


Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [06:26<00:00,  1.29it/s]

Run 2: Training Models: 100%|██████████| 8/8 [16:39<00:00, 124.96s/it]





Run 3: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:04<00:00, 110.16it/s]
Run 3: Training Models:  12%|█▎        | 1/8 [00:05<00:40,  5.83s/it]

Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [06:07<00:00,  1.36it/s]

Run 3: Training Models:  25%|██▌       | 2/8 [06:12<21:50, 218.38s/it]


Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4389.66it/s]
Run 3: Training Models:  38%|███▊      | 3/8 [06:46<11:10, 134.16s/it]

Training ElasticNet with manual configuration...
Training ElasticNet...


Run 3: Training Models:  50%|█████     | 4/8 [06:47<05:25, 81.31s/it] 

Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:02<00:00, 191.43it/s]
Run 3: Training Models:  62%|██████▎   | 5/8 [06:52<02:41, 53.98s/it]

Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 3668.08it/s]
Run 3: Training Models:  75%|███████▌  | 6/8 [07:27<01:34, 47.32s/it]

Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [02:42<00:00,  3.09it/s]

Run 3: Training Models:  88%|████████▊ | 7/8 [10:09<01:24, 84.83s/it]


Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [06:24<00:00,  1.30it/s]

Run 3: Training Models: 100%|██████████| 8/8 [16:33<00:00, 124.22s/it]





Run 4: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:04<00:00, 114.53it/s]
Run 4: Training Models:  12%|█▎        | 1/8 [00:05<00:39,  5.66s/it]

Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [06:02<00:00,  1.38it/s]

Run 4: Training Models:  25%|██▌       | 2/8 [06:08<21:34, 215.76s/it]


Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4419.62it/s]
Run 4: Training Models:  38%|███▊      | 3/8 [06:42<11:03, 132.67s/it]

Training ElasticNet with manual configuration...
Training ElasticNet...


Run 4: Training Models:  50%|█████     | 4/8 [06:42<05:21, 80.42s/it] 

Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:02<00:00, 173.61it/s]
Run 4: Training Models:  62%|██████▎   | 5/8 [06:49<02:41, 53.75s/it]

Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 3632.00it/s]
Run 4: Training Models:  75%|███████▌  | 6/8 [07:23<01:34, 47.14s/it]

Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [02:40<00:00,  3.11it/s]

Run 4: Training Models:  88%|████████▊ | 7/8 [10:03<01:24, 84.21s/it]


Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [06:34<00:00,  1.27it/s]

Run 4: Training Models: 100%|██████████| 8/8 [16:38<00:00, 124.84s/it]







# Abalone

In [None]:
# Hyperparameter search for every model on the Abalone dataset (UCI id 1).
uci_id = 1
# NOTE(review): this call relies on the loader's defaults for test_size,
# random_state and normalize, while the evaluation cell passes them explicitly
# (normalize=True) -- confirm the defaults produce the same split/scaling.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id)

# Display name -> Optuna objective; the name doubles as the study name in the DB.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

best_params = {}
storage_url = f"sqlite:///{storage_dir / 'abalone.db'}"
n_trials = 20  # target number of completed trials per model

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    # The objectives never call trial.report(), so this pruner is currently
    # inert; it is kept so the stored study configuration stays unchanged.
    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    # Load the study if it already exists, otherwise create it. The
    # sampler/pruner are passed to load_study as well: without them a reloaded
    # study silently falls back to Optuna's default, unseeded sampler.
    try:
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Count only COMPLETE trials. Failed or interrupted trials used to make the
    # study look "done", which skipped the search and could make
    # study.best_params raise because no trial had actually finished.
    completed_trials = study.get_trials(
        deepcopy=False,
        states=(optuna.trial.TrialState.COMPLETE,)
    )
    remaining_trials = n_trials - len(completed_trials)

    if remaining_trials > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=remaining_trials)
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
    best_params[model_name] = study.best_params

# Persist one row per model with its best hyperparameters.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

In [None]:
# Abalone (UCI id 1): retrain every model with its tuned configuration,
# collect the MSE trajectories of five repeated runs, then save and plot them.
uci_id = 1
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    test_size=0.2,
    random_state=42,
    normalize=True,
)

epoch = 500

models = ['GBT', 'XGBoost', 'LightGBM', 'RF', 'ElasticNet', 'BRATD', 'Boulevard', 'BRATP']

# Tuned hyperparameters, one entry per model name listed above.
manual_configs = {
    'GBT': dict(n_estimators=500, learning_rate=0.02, max_depth=3),
    'XGBoost': dict(n_estimators=500, learning_rate=0.02, max_depth=4, subsample=0.63),
    'LightGBM': dict(n_estimators=500, learning_rate=0.01, max_depth=9, num_leaves=37),
    'RF': dict(n_estimators=500, max_depth=11),
    'ElasticNet': dict(alpha=0.13, l1_ratio=0.38),
    'BRATD': dict(
        n_estimators=500, max_depth=10, min_samples_split=43,
        learning_rate=0.55, subsample_rate=0.51, dropout_rate=0.48,
    ),
    'Boulevard': dict(
        n_estimators=500, max_depth=16, min_samples_split=42,
        learning_rate=0.57, subsample_rate=0.51, dropout_rate=0.0,
    ),
    'BRATP': dict(
        n_estimators=500, max_depth=3, min_samples_split=5,
        learning_rate=0.27, subsample_rate=0.63, n_trees_per_group=16,
    ),
}

# Repeat the whole training five times; each run yields {model name: MSE sequence}.
mse_runs = []
for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train, X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i,
    )
    mse_runs.append(mse_dict)

# Long-format records: one row per (model, run, epoch), with 1-based run/epoch ids.
flattened_data = [
    {'Model': name, 'Run_ID': run_number, 'Epoch': epoch_number, 'MSE': value}
    for run_number, run_mse in enumerate(mse_runs, start=1)
    for name, trajectory in run_mse.items()
    for epoch_number, value in enumerate(trajectory, start=1)
]

mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Aggregate the runs into a single figure saved under plot_dir.
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Abalone")

# Obesity Level

In [None]:
# Hyperparameter search for every model on the Obesity Level dataset (UCI id 544).
uci_id = 544
# NOTE(review): this call relies on the loader's defaults for test_size,
# random_state and normalize, while the evaluation cell passes them explicitly
# (normalize=True) -- confirm the defaults produce the same split/scaling.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id)

# Display name -> Optuna objective; the name doubles as the study name in the DB.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

best_params = {}
storage_url = f"sqlite:///{storage_dir / 'obesity.db'}"
n_trials = 20  # target number of completed trials per model

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    # The objectives never call trial.report(), so this pruner is currently
    # inert; it is kept so the stored study configuration stays unchanged.
    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    # Load the study if it already exists, otherwise create it. The
    # sampler/pruner are passed to load_study as well: without them a reloaded
    # study silently falls back to Optuna's default, unseeded sampler.
    try:
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Count only COMPLETE trials. Failed or interrupted trials used to make the
    # study look "done", which skipped the search and could make
    # study.best_params raise because no trial had actually finished.
    completed_trials = study.get_trials(
        deepcopy=False,
        states=(optuna.trial.TrialState.COMPLETE,)
    )
    remaining_trials = n_trials - len(completed_trials)

    if remaining_trials > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=remaining_trials)
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
    best_params[model_name] = study.best_params

# Persist one row per model with its best hyperparameters.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

In [None]:
# Obesity Level (UCI id 544): retrain every model with its tuned configuration,
# collect the MSE trajectories of five repeated runs, then save and plot them.
uci_id = 544
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    test_size=0.2,
    random_state=42,
    normalize=True,
)

epoch = 500

models = ['GBT', 'XGBoost', 'LightGBM', 'RF', 'ElasticNet', 'BRATD', 'Boulevard', 'BRATP']

# Tuned hyperparameters, one entry per model name listed above.
manual_configs = {
    'GBT': dict(n_estimators=500, learning_rate=0.19, max_depth=7),
    'XGBoost': dict(n_estimators=500, learning_rate=0.09, max_depth=7, subsample=0.81),
    'LightGBM': dict(n_estimators=500, learning_rate=0.08, max_depth=16, num_leaves=25),
    'RF': dict(n_estimators=500, max_depth=16),
    'ElasticNet': dict(alpha=0.21, l1_ratio=0.12),
    'BRATD': dict(
        n_estimators=500, max_depth=16, min_samples_split=3,
        learning_rate=0.25, subsample_rate=0.65, dropout_rate=0.81,
    ),
    'Boulevard': dict(
        n_estimators=500, max_depth=16, min_samples_split=3,
        learning_rate=0.54, subsample_rate=0.79, dropout_rate=0.0,
    ),
    'BRATP': dict(
        n_estimators=500, max_depth=11, min_samples_split=4,
        learning_rate=0.36, subsample_rate=0.56, n_trees_per_group=4,
    ),
}

# Repeat the whole training five times; each run yields {model name: MSE sequence}.
mse_runs = []
for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train, X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i,
    )
    mse_runs.append(mse_dict)

# Long-format records: one row per (model, run, epoch), with 1-based run/epoch ids.
flattened_data = [
    {'Model': name, 'Run_ID': run_number, 'Epoch': epoch_number, 'MSE': value}
    for run_number, run_mse in enumerate(mse_runs, start=1)
    for name, trajectory in run_mse.items()
    for epoch_number, value in enumerate(trajectory, start=1)
]

mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Aggregate the runs into a single figure saved under plot_dir.
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Obesity")

# Infrared Thermography Temperature

In [None]:
# Hyperparameter search for every model on the Infrared Thermography
# Temperature dataset (UCI id 925).
uci_id = 925
# NOTE(review): this call relies on the loader's defaults for test_size,
# random_state and normalize, while the evaluation cell passes them explicitly
# (normalize=True) -- confirm the defaults produce the same split/scaling.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id, target_column='aveOralF')

# Display name -> Optuna objective; the name doubles as the study name in the DB.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

best_params = {}
storage_url = f"sqlite:///{storage_dir / 'temperature.db'}"
n_trials = 20  # target number of completed trials per model

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    # The objectives never call trial.report(), so this pruner is currently
    # inert; it is kept so the stored study configuration stays unchanged.
    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    # Load the study if it already exists, otherwise create it. The
    # sampler/pruner are passed to load_study as well: without them a reloaded
    # study silently falls back to Optuna's default, unseeded sampler.
    try:
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Count only COMPLETE trials. Failed or interrupted trials used to make the
    # study look "done", which skipped the search and could make
    # study.best_params raise because no trial had actually finished.
    completed_trials = study.get_trials(
        deepcopy=False,
        states=(optuna.trial.TrialState.COMPLETE,)
    )
    remaining_trials = n_trials - len(completed_trials)

    if remaining_trials > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=remaining_trials)
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
    best_params[model_name] = study.best_params

# Persist one row per model with its best hyperparameters.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

In [None]:
# Infrared Thermography Temperature (UCI id 925): retrain every model with its
# tuned configuration, collect the MSE trajectories of five repeated runs, then
# save and plot them.
uci_id = 925
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    test_size=0.2,
    random_state=42,
    target_column='aveOralF',
    normalize=True,
)

epoch = 500

models = ['GBT', 'XGBoost', 'LightGBM', 'RF', 'ElasticNet', 'BRATD', 'Boulevard', 'BRATP']

# Tuned hyperparameters, one entry per model name listed above.
manual_configs = {
    'GBT': dict(n_estimators=500, learning_rate=0.02, max_depth=5),
    'XGBoost': dict(n_estimators=500, learning_rate=0.01, max_depth=3, subsample=0.70),
    # NOTE(review): max_depth=1 combined with num_leaves=24 looks unusual (the
    # search range starts at -1) -- confirm this is not a typo for -1.
    'LightGBM': dict(n_estimators=500, learning_rate=0.18, max_depth=1, num_leaves=24),
    'RF': dict(n_estimators=500, max_depth=20),
    'ElasticNet': dict(alpha=3.81, l1_ratio=0.96),
    'BRATD': dict(
        n_estimators=500, max_depth=3, min_samples_split=4,
        learning_rate=0.76, subsample_rate=0.51, dropout_rate=0.23,
    ),
    'Boulevard': dict(
        n_estimators=500, max_depth=3, min_samples_split=4,
        learning_rate=0.65, subsample_rate=0.54, dropout_rate=0.0,
    ),
    'BRATP': dict(
        n_estimators=500, max_depth=3, min_samples_split=9,
        learning_rate=0.98, subsample_rate=0.64, n_trees_per_group=2,
    ),
}

# Repeat the whole training five times; each run yields {model name: MSE sequence}.
mse_runs = []
for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train, X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i,
    )
    mse_runs.append(mse_dict)

# Long-format records: one row per (model, run, epoch), with 1-based run/epoch ids.
flattened_data = [
    {'Model': name, 'Run_ID': run_number, 'Epoch': epoch_number, 'MSE': value}
    for run_number, run_mse in enumerate(mse_runs, start=1)
    for name, trajectory in run_mse.items()
    for epoch_number, value in enumerate(trajectory, start=1)
]

mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Aggregate the runs into a single figure saved under plot_dir.
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Infrared Thermography Temperature")

# Wine Quality

In [None]:
# Hyperparameter search for the Wine Quality dataset (UCI id 186): one Optuna
# study per model, stored in a shared SQLite file, best parameters exported to CSV.
uci_id = 186
# Presumably the objective_* functions read these notebook-level splits — confirm.
# NOTE(review): the loader's defaults are used here while the evaluation cell
# passes test_size/random_state/normalize explicitly — confirm they agree.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id)

# Model name (also used as the study name) -> Optuna objective.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

# Target number of finished trials per study.
n_trials = 20
# Only these states count towards the budget; failed or interrupted trials do not.
finished_states = (
    optuna.trial.TrialState.COMPLETE,
    optuna.trial.TrialState.PRUNED
)

best_params = {}

# Create the storage folder up front: SQLite cannot create missing parent
# directories when the first study opens the database file.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
storage_url = f"sqlite:///{storage_dir / 'wine_quality.db'}"

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    try:
        # Re-attach the seeded sampler and the pruner: they are not stored with
        # the study, so a bare load_study would fall back to Optuna's defaults.
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        # load_study raises KeyError when no study with this name is stored.
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Top up to the trial budget instead of skipping any non-empty study, so a
    # search cut short by an interrupted run is resumed rather than trusted.
    n_finished = len(study.get_trials(deepcopy=False, states=finished_states))
    n_remaining = n_trials - n_finished

    if n_remaining > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=n_remaining)
        best_params[model_name] = study.best_params
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
        best_params[model_name] = study.best_params

output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

# One row per model, one column per hyperparameter name.
best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

In [None]:
# Evaluate the manually configured models on the Wine Quality data (UCI id 186)
# and persist the per-epoch test MSE.
uci_id = 186
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    test_size=0.2,
    random_state=42,
    normalize=True
)

# Passed to train_all_models and the plotting helper; equal to every
# n_estimators value in manual_configs below.
epoch = 500

# Models to include in the comparison.
models = [
    'GBT', 'XGBoost', 'LightGBM',
    'RF', 'ElasticNet',
    'BRATD', 'Boulevard', 'BRATP'
]

# Fixed hyperparameters per model (tune=False below, so no search is run here).
manual_configs = {
    'GBT': {
        'n_estimators': 500,
        'learning_rate': 0.17,
        'max_depth': 7
    },
    'XGBoost': {
        'n_estimators': 500,
        'learning_rate': 0.04,
        'max_depth': 9,
        'subsample': 0.60
    },
    'LightGBM': {
        'n_estimators': 500,
        'learning_rate': 0.05,
        'max_depth': -1,
        'num_leaves': 47
    },
    'RF': {
        'n_estimators': 500,
        'max_depth': 20
    },
    'ElasticNet': {
        'alpha': 3.81,
        'l1_ratio': 0.96
    },
    'BRATD': {
        'n_estimators': 500,
        'max_depth': 16,
        'min_samples_split': 8,
        'learning_rate': 0.71,
        'subsample_rate': 0.75,
        'dropout_rate': 0.41
    },
    'Boulevard': {
        'n_estimators': 500,
        'max_depth': 15,
        'min_samples_split': 5,
        'learning_rate': 0.97,
        'subsample_rate': 0.59,
        'dropout_rate': 0.0
    },
    'BRATP': {
        'n_estimators': 500,
        'max_depth': 16,
        'min_samples_split': 13,
        'learning_rate': 0.82,
        'subsample_rate': 0.67,
        'n_trees_per_group': 10
    }
}

# One entry per repetition; each entry maps model name -> sequence of MSE values.
mse_runs = []

for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train,
        X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i
    )
    mse_runs.append(mse_dict)

# Flatten mse_runs into long format: one row per (model, run, epoch).
# Run_ID and Epoch are 1-based.
flattened_data = [
    {'Model': model_name, 'Run_ID': run_id, 'Epoch': epoch_idx, 'MSE': mse}
    for run_id, mse_dict in enumerate(mse_runs, start=1)
    for model_name, mse_values in mse_dict.items()
    for epoch_idx, mse in enumerate(mse_values, start=1)
]

# Sort by Model, Run_ID and Epoch so the CSV is grouped per trajectory.
mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(mse_dir, exist_ok=True)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Plot and save the aggregated results
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Wine Quality")

# Communities and Crime

In [None]:
# Hyperparameter search for the Communities and Crime dataset (UCI id 183): one
# Optuna study per model, stored in a shared SQLite file, best parameters
# exported to CSV.
uci_id = 183
# Presumably the objective_* functions read these notebook-level splits — confirm.
# NOTE(review): the loader's defaults are used here while the evaluation cell
# passes test_size/random_state/normalize explicitly — confirm they agree.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id)

# Model name (also used as the study name) -> Optuna objective.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

# Target number of finished trials per study.
n_trials = 20
# Only these states count towards the budget; failed or interrupted trials do not.
finished_states = (
    optuna.trial.TrialState.COMPLETE,
    optuna.trial.TrialState.PRUNED
)

best_params = {}

# Create the storage folder up front: SQLite cannot create missing parent
# directories when the first study opens the database file.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
storage_url = f"sqlite:///{storage_dir / 'community_crime.db'}"

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    try:
        # Re-attach the seeded sampler and the pruner: they are not stored with
        # the study, so a bare load_study would fall back to Optuna's defaults.
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        # load_study raises KeyError when no study with this name is stored.
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Top up to the trial budget instead of skipping any non-empty study, so a
    # search cut short by an interrupted run is resumed rather than trusted.
    n_finished = len(study.get_trials(deepcopy=False, states=finished_states))
    n_remaining = n_trials - n_finished

    if n_remaining > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=n_remaining)
        best_params[model_name] = study.best_params
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
        best_params[model_name] = study.best_params

output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

# One row per model, one column per hyperparameter name.
best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

In [None]:
# Evaluate the manually configured models on the Communities and Crime data
# (UCI id 183) and persist the per-epoch test MSE.
uci_id = 183
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    test_size=0.2,
    random_state=42,
    normalize=True
)

# Passed to train_all_models and the plotting helper; equal to every
# n_estimators value in manual_configs below.
epoch = 500

# Models to include in the comparison.
models = [
    'GBT', 'XGBoost', 'LightGBM',
    'RF', 'ElasticNet',
    'BRATD', 'Boulevard', 'BRATP'
]

# Fixed hyperparameters per model (tune=False below, so no search is run here).
# NOTE(review): the BRATD, Boulevard and ElasticNet settings are identical to
# the ones in the Wine Quality evaluation cell — confirm they come from the
# community_crime study and are not a copy-paste leftover.
manual_configs = {
    'GBT': {
        'n_estimators': 500,
        'learning_rate': 0.15,
        'max_depth': 7
    },
    'XGBoost': {
        'n_estimators': 500,
        'learning_rate': 0.07,
        'max_depth': 4,
        'subsample': 0.87
    },
    'LightGBM': {
        'n_estimators': 500,
        'learning_rate': 0.015,
        'max_depth': 6,
        'num_leaves': 38
    },
    'RF': {
        'n_estimators': 500,
        'max_depth': 18
    },
    'ElasticNet': {
        'alpha': 3.81,
        'l1_ratio': 0.96
    },
    'BRATD': {
        'n_estimators': 500,
        'max_depth': 16,
        'min_samples_split': 8,
        'learning_rate': 0.71,
        'subsample_rate': 0.75,
        'dropout_rate': 0.41
    },
    'Boulevard': {
        'n_estimators': 500,
        'max_depth': 15,
        'min_samples_split': 5,
        'learning_rate': 0.97,
        'subsample_rate': 0.59,
        'dropout_rate': 0.0
    },
    'BRATP': {
        'n_estimators': 500,
        'max_depth': 4,
        'min_samples_split': 5,
        'learning_rate': 0.35,
        'subsample_rate': 0.51,
        'n_trees_per_group': 11
    }
}

# One entry per repetition; each entry maps model name -> sequence of MSE values.
mse_runs = []

for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train,
        X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i
    )
    mse_runs.append(mse_dict)

# Flatten mse_runs into long format: one row per (model, run, epoch).
# Run_ID and Epoch are 1-based.
flattened_data = [
    {'Model': model_name, 'Run_ID': run_id, 'Epoch': epoch_idx, 'MSE': mse}
    for run_id, mse_dict in enumerate(mse_runs, start=1)
    for model_name, mse_values in mse_dict.items()
    for epoch_idx, mse in enumerate(mse_values, start=1)
]

# Sort by Model, Run_ID and Epoch so the CSV is grouped per trajectory.
mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(mse_dir, exist_ok=True)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Plot and save the aggregated results
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Communities and Crime")

# AIDS Clinical Trials Group Study 175

In [None]:
# Hyperparameter search for the AIDS Clinical Trials Group Study 175 dataset
# (UCI id 890): one Optuna study per model, stored in a shared SQLite file,
# best parameters exported to CSV.
uci_id = 890
# Presumably the objective_* functions read these notebook-level splits — confirm.
# NOTE(review): the loader's defaults are used here while the evaluation cell
# passes test_size/random_state/normalize explicitly — confirm they agree.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id)

# Model name (also used as the study name) -> Optuna objective.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

# Target number of finished trials per study.
n_trials = 20
# Only these states count towards the budget; failed or interrupted trials do not.
finished_states = (
    optuna.trial.TrialState.COMPLETE,
    optuna.trial.TrialState.PRUNED
)

best_params = {}

# Create the storage folder up front: SQLite cannot create missing parent
# directories when the first study opens the database file.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
storage_url = f"sqlite:///{storage_dir / 'aids.db'}"

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    try:
        # Re-attach the seeded sampler and the pruner: they are not stored with
        # the study, so a bare load_study would fall back to Optuna's defaults.
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        # load_study raises KeyError when no study with this name is stored.
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Top up to the trial budget instead of skipping any non-empty study, so a
    # search cut short by an interrupted run is resumed rather than trusted.
    n_finished = len(study.get_trials(deepcopy=False, states=finished_states))
    n_remaining = n_trials - n_finished

    if n_remaining > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=n_remaining)
        best_params[model_name] = study.best_params
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
        best_params[model_name] = study.best_params

output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

# One row per model, one column per hyperparameter name.
best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

In [None]:
# Evaluate the manually configured models on the AIDS Clinical Trials Group
# Study 175 data (UCI id 890) and persist the per-epoch test MSE.
uci_id = 890
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    test_size=0.2,
    random_state=42,
    normalize=True
)

# Passed to train_all_models and the plotting helper; equal to every
# n_estimators value in manual_configs below.
epoch = 500

# Models to include in the comparison.
models = [
    'GBT', 'XGBoost', 'LightGBM',
    'RF', 'ElasticNet',
    'BRATD', 'Boulevard', 'BRATP'
]

# Fixed hyperparameters per model (tune=False below, so no search is run here).
manual_configs = {
    'GBT': {
        'n_estimators': 500,
        'learning_rate': 0.01,
        'max_depth': 3
    },
    'XGBoost': {
        'n_estimators': 500,
        'learning_rate': 0.02,
        'max_depth': 3,
        'subsample': 0.75
    },
    'LightGBM': {
        'n_estimators': 500,
        'learning_rate': 0.01,
        'max_depth': 9,
        'num_leaves': 37
    },
    'RF': {
        'n_estimators': 500,
        'max_depth': 5
    },
    'ElasticNet': {
        'alpha': 3.81,
        'l1_ratio': 0.96
    },
    'BRATD': {
        'n_estimators': 500,
        'max_depth': 4,
        'min_samples_split': 12,
        'learning_rate': 0.85,
        'subsample_rate': 0.53,
        'dropout_rate': 0.34
    },
    'Boulevard': {
        'n_estimators': 500,
        'max_depth': 4,
        'min_samples_split': 27,
        'learning_rate': 0.91,
        'subsample_rate': 0.73,
        'dropout_rate': 0.0
    },
    'BRATP': {
        'n_estimators': 500,
        'max_depth': 4,
        'min_samples_split': 46,
        'learning_rate': 0.3,
        'subsample_rate': 0.53,
        'n_trees_per_group': 3
    }
}

# One entry per repetition; each entry maps model name -> sequence of MSE values.
mse_runs = []

for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train,
        X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i
    )
    mse_runs.append(mse_dict)

# Flatten mse_runs into long format: one row per (model, run, epoch).
# Run_ID and Epoch are 1-based.
flattened_data = [
    {'Model': model_name, 'Run_ID': run_id, 'Epoch': epoch_idx, 'MSE': mse}
    for run_id, mse_dict in enumerate(mse_runs, start=1)
    for model_name, mse_values in mse_dict.items()
    for epoch_idx, mse in enumerate(mse_values, start=1)
]

# Sort by Model, Run_ID and Epoch so the CSV is grouped per trajectory.
mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(mse_dir, exist_ok=True)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Plot and save the aggregated results
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="AIDS Clinical Trials Group Study 175")

# Automobile

In [5]:
# Hyperparameter search for the Automobile dataset (UCI id 10): one Optuna
# study per model, stored in a shared SQLite file, best parameters exported to CSV.
uci_id = 10
# Presumably the objective_* functions read these notebook-level splits — confirm.
# NOTE(review): the loader's defaults are used here while the evaluation cell
# passes test_size/random_state/normalize explicitly — confirm they agree.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id)

# Model name (also used as the study name) -> Optuna objective.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

# Target number of finished trials per study.
n_trials = 20
# Only these states count towards the budget; failed or interrupted trials do not.
finished_states = (
    optuna.trial.TrialState.COMPLETE,
    optuna.trial.TrialState.PRUNED
)

best_params = {}

# Create the storage folder up front: SQLite cannot create missing parent
# directories when the first study opens the database file.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
storage_url = f"sqlite:///{storage_dir / 'automobile.db'}"

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    try:
        # Re-attach the seeded sampler and the pruner: they are not stored with
        # the study, so a bare load_study would fall back to Optuna's defaults.
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        # load_study raises KeyError when no study with this name is stored.
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Top up to the trial budget instead of skipping any non-empty study. The
    # recorded run of this cell ended in a KeyboardInterrupt, which is exactly
    # the case where a partially filled study would otherwise be trusted.
    n_finished = len(study.get_trials(deepcopy=False, states=finished_states))
    n_remaining = n_trials - n_finished

    if n_remaining > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=n_remaining)
        best_params[model_name] = study.best_params
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
        best_params[model_name] = study.best_params

output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

# One row per model, one column per hyperparameter name.
best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

KeyboardInterrupt: 

In [6]:
# Evaluate the manually configured models on the Automobile data (UCI id 10)
# and persist the per-epoch test MSE.
uci_id = 10
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    test_size=0.2,
    random_state=42,
    normalize=True
)

# Passed to train_all_models and the plotting helper; equal to every
# n_estimators value in manual_configs below.
epoch = 500

# Models to include in the comparison.
models = [
    'GBT', 'XGBoost', 'LightGBM',
    'RF', 'ElasticNet',
    'BRATD', 'Boulevard', 'BRATP'
]

# Fixed hyperparameters per model (tune=False below, so no search is run here).
manual_configs = {
    'GBT': {
        'n_estimators': 500,
        'learning_rate': 0.2,
        'max_depth': 3
    },
    'XGBoost': {
        'n_estimators': 500,
        'learning_rate': 0.22,
        'max_depth': 3,
        'subsample': 0.98
    },
    'LightGBM': {
        'n_estimators': 500,
        'learning_rate': 0.23,
        'max_depth': 8,
        'num_leaves': 37
    },
    'RF': {
        'n_estimators': 500,
        'max_depth': 14
    },
    'ElasticNet': {
        'alpha': 0.36,
        'l1_ratio': 0.14
    },
    'BRATD': {
        'n_estimators': 500,
        'max_depth': 6,
        'min_samples_split': 23,
        'learning_rate': 0.94,
        'subsample_rate': 0.94,
        'dropout_rate': 0.21
    },
    'Boulevard': {
        'n_estimators': 500,
        'max_depth': 11,
        'min_samples_split': 9,
        'learning_rate': 0.14,
        'subsample_rate': 0.98,
        'dropout_rate': 0.0
    },
    'BRATP': {
        'n_estimators': 500,
        'max_depth': 3,
        'min_samples_split': 30,
        'learning_rate': 0.57,
        'subsample_rate': 0.73,
        'n_trees_per_group': 16
    }
}

# One entry per repetition; each entry maps model name -> sequence of MSE values.
mse_runs = []

# NOTE(review): the recorded output of this cell labels the first two
# iterations both as "Run 1" — check how train_all_models uses run_idx.
for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train,
        X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i
    )
    mse_runs.append(mse_dict)

# Flatten mse_runs into long format: one row per (model, run, epoch).
# Run_ID and Epoch are 1-based.
flattened_data = [
    {'Model': model_name, 'Run_ID': run_id, 'Epoch': epoch_idx, 'MSE': mse}
    for run_id, mse_dict in enumerate(mse_runs, start=1)
    for model_name, mse_values in mse_dict.items()
    for epoch_idx, mse in enumerate(mse_values, start=1)
]

# Sort by Model, Run_ID and Epoch so the CSV is grouped per trajectory.
mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(mse_dir, exist_ok=True)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Plot and save the aggregated results
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Automobile")

  y_train_arr = y_train.astype(np.float32).ravel()
  y_test_arr = y_test.astype(np.float32).ravel()
Run 1: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training ElasticNet with manual configuration...
Training ElasticNet...
Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [00:28<00:00, 17.80it/s]

Run 1: Training Models:  25%|██▌       | 2/8 [00:28<01:24, 14.05s/it]


Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [00:16<00:00, 30.01it/s]

Run 1: Training Models:  38%|███▊      | 3/8 [00:44<01:15, 15.14s/it]


Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 4127.22it/s]
Run 1: Training Models:  50%|█████     | 4/8 [00:45<00:38,  9.71s/it]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:00<00:00, 800.74it/s]
Run 1: Training Models:  62%|██████▎   | 5/8 [00:46<00:20,  6.67s/it]

Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [00:32<00:00, 15.17it/s]

Run 1: Training Models:  75%|███████▌  | 6/8 [01:19<00:30, 15.40s/it]


Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:00<00:00, 1988.94it/s]
Run 1: Training Models:  88%|████████▊ | 7/8 [01:19<00:10, 10.65s/it]

Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 1904.14it/s]
Run 1: Training Models: 100%|██████████| 8/8 [01:20<00:00, 10.11s/it]
Run 1: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training ElasticNet with manual configuration...
Training ElasticNet...
Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [00:27<00:00, 17.91it/s]

Run 1: Training Models:  25%|██▌       | 2/8 [00:27<01:23, 13.96s/it]


Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [00:16<00:00, 30.75it/s]

Run 1: Training Models:  38%|███▊      | 3/8 [00:44<01:14, 14.95s/it]


Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 2610.86it/s]
Run 1: Training Models:  50%|█████     | 4/8 [00:44<00:38,  9.59s/it]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:00<00:00, 1108.01it/s]
Run 1: Training Models:  62%|██████▎   | 5/8 [00:45<00:19,  6.52s/it]

Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [00:31<00:00, 15.82it/s]

Run 1: Training Models:  75%|███████▌  | 6/8 [01:17<00:29, 14.85s/it]


Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:00<00:00, 2004.18it/s]
Run 1: Training Models:  88%|████████▊ | 7/8 [01:17<00:10, 10.26s/it]

Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4508.98it/s]
Run 1: Training Models: 100%|██████████| 8/8 [01:18<00:00,  9.82s/it]
Run 2: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training ElasticNet with manual configuration...
Training ElasticNet...
Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [00:28<00:00, 17.73it/s]

Run 2: Training Models:  25%|██▌       | 2/8 [00:28<01:24, 14.10s/it]


Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [00:16<00:00, 29.86it/s]

Run 2: Training Models:  38%|███▊      | 3/8 [00:44<01:16, 15.20s/it]


Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 4054.43it/s]
Run 2: Training Models:  50%|█████     | 4/8 [00:45<00:38,  9.72s/it]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:00<00:00, 836.59it/s]
Run 2: Training Models:  62%|██████▎   | 5/8 [00:46<00:19,  6.66s/it]

Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [00:35<00:00, 14.26it/s]

Run 2: Training Models:  75%|███████▌  | 6/8 [01:21<00:32, 16.09s/it]


Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:00<00:00, 1708.25it/s]
Run 2: Training Models:  88%|████████▊ | 7/8 [01:22<00:11, 11.24s/it]

Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4588.95it/s]
Run 2: Training Models: 100%|██████████| 8/8 [01:23<00:00, 10.40s/it]
Run 3: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training ElasticNet with manual configuration...
Training ElasticNet...
Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [00:28<00:00, 17.68it/s]

Run 3: Training Models:  25%|██▌       | 2/8 [00:28<01:24, 14.14s/it]


Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [00:16<00:00, 29.56it/s]

Run 3: Training Models:  38%|███▊      | 3/8 [00:45<01:16, 15.30s/it]


Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 2751.02it/s]
Run 3: Training Models:  50%|█████     | 4/8 [00:45<00:39,  9.81s/it]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:00<00:00, 1109.76it/s]
Run 3: Training Models:  62%|██████▎   | 5/8 [00:46<00:20,  6.67s/it]

Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [00:31<00:00, 15.96it/s]

Run 3: Training Models:  75%|███████▌  | 6/8 [01:17<00:29, 14.86s/it]


Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:00<00:00, 1999.45it/s]
Run 3: Training Models:  88%|████████▊ | 7/8 [01:18<00:10, 10.31s/it]

Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4408.47it/s]
Run 3: Training Models: 100%|██████████| 8/8 [01:19<00:00,  9.92s/it]
Run 4: Training Models:   0%|          | 0/8 [00:00<?, ?it/s]

Training ElasticNet with manual configuration...
Training ElasticNet...
Training BRATD with manual configuration...
Training BRATD...
Building BRATD trees: 100%|██████████| 500/500 [00:27<00:00, 17.88it/s]

Run 4: Training Models:  25%|██▌       | 2/8 [00:27<01:23, 13.99s/it]


Training BRATP with manual configuration...
Training BRATP...
Building BRAT-P trees: 100%|██████████| 500/500 [00:16<00:00, 30.09it/s]

Run 4: Training Models:  38%|███▊      | 3/8 [00:44<01:15, 15.08s/it]


Training GBT with manual configuration...
Training GBT...


GBT staged_predict: 100%|██████████| 500/500 [00:00<00:00, 2713.29it/s]
Run 4: Training Models:  50%|█████     | 4/8 [00:45<00:38,  9.67s/it]

Training LightGBM with manual configuration...
Training LightGBM...


LightGBM staged_predict: 100%|██████████| 500/500 [00:00<00:00, 1112.77it/s]
Run 4: Training Models:  62%|██████▎   | 5/8 [00:45<00:19,  6.58s/it]

Training Boulevard with manual configuration...
Training Boulevard...
Building BRATD trees: 100%|██████████| 500/500 [00:31<00:00, 15.86it/s]

Run 4: Training Models:  75%|███████▌  | 6/8 [01:17<00:29, 14.86s/it]


Training XGBoost with manual configuration...
Training XGBoost...


XGBoost staged_predict: 100%|██████████| 500/500 [00:00<00:00, 2017.61it/s]
Run 4: Training Models:  88%|████████▊ | 7/8 [01:18<00:10, 10.27s/it]

Training RF with manual configuration...
Training RF...


RF incremental MSE: 100%|██████████| 500/500 [00:00<00:00, 4600.11it/s]
Run 4: Training Models: 100%|██████████| 8/8 [01:18<00:00,  9.87s/it]


# Student Performance

In [None]:
# Hyperparameter search for the Student Performance dataset (UCI id 320, target
# column 'G3'): one Optuna study per model, stored in a shared SQLite file,
# best parameters exported to CSV.
uci_id = 320
# Presumably the objective_* functions read these notebook-level splits — confirm.
# NOTE(review): apart from target_column the loader's defaults are used here,
# while the evaluation cell passes test_size/random_state/normalize
# explicitly — confirm they agree.
X_train, X_test, y_train, y_test = load_and_clean_uci_data(uci_id, target_column='G3')

# Model name (also used as the study name) -> Optuna objective.
models = {
    'GBT': objective_gbt,
    'XGBoost': objective_xgb,
    'LightGBM': objective_lgbm,
    'RF': objective_rf,
    'ElasticNet': objective_elasticnet,
    'BRATD':objective_bratd,
    'Boulevard': objective_boulevard,
    'BRATP': objective_bratp
}

# Target number of finished trials per study.
n_trials = 20
# Only these states count towards the budget; failed or interrupted trials do not.
finished_states = (
    optuna.trial.TrialState.COMPLETE,
    optuna.trial.TrialState.PRUNED
)

best_params = {}

# Create the storage folder up front: SQLite cannot create missing parent
# directories when the first study opens the database file.
output_dir = storage_dir
os.makedirs(output_dir, exist_ok=True)
storage_url = f"sqlite:///{storage_dir / 'student.db'}"

for model_name, objective in models.items():
    sampler = TPESampler(
        n_startup_trials=5,
        n_ei_candidates=50,
        consider_prior=True,
        multivariate=True,
        seed=42
    )

    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        n_min_trials=5,
        interval_steps=1
    )

    try:
        # Re-attach the seeded sampler and the pruner: they are not stored with
        # the study, so a bare load_study would fall back to Optuna's defaults.
        study = optuna.load_study(
            study_name=model_name,
            storage=storage_url,
            sampler=sampler,
            pruner=pruner
        )
        print(f"Loaded existing study for {model_name}.")
    except KeyError:
        # load_study raises KeyError when no study with this name is stored.
        study = optuna.create_study(
            study_name=model_name,
            storage=storage_url,
            direction='minimize',
            sampler=sampler,
            pruner=pruner
        )
        print(f"Created new study for {model_name}.")

    # Top up to the trial budget instead of skipping any non-empty study, so a
    # search cut short by an interrupted run is resumed rather than trusted.
    n_finished = len(study.get_trials(deepcopy=False, states=finished_states))
    n_remaining = n_trials - n_finished

    if n_remaining > 0:
        print(f"Running optimization for {model_name}...")
        study.optimize(objective, n_trials=n_remaining)
        best_params[model_name] = study.best_params
        print(f"Best params for {model_name}: {study.best_params}")
        print(f"Best MSE for {model_name}: {study.best_value}")
    else:
        print(f"Study for {model_name} already has trials. Skipping optimization.")
        best_params[model_name] = study.best_params

output_path = os.path.join(output_dir, f'{uci_id}_manual_tuning.csv')

# One row per model, one column per hyperparameter name.
best_params_df = pd.DataFrame.from_dict(best_params, orient='index')
best_params_df.to_csv(output_path)

In [None]:
# Evaluate the manually configured models on the Student Performance data
# (UCI id 320, target column 'G3') and persist the per-epoch test MSE.
uci_id = 320
X_train, X_test, y_train, y_test = load_and_clean_uci_data(
    dataset_id=uci_id,
    target_column='G3',
    test_size=0.2,
    random_state=42,
    normalize=True
)

# Passed to train_all_models and the plotting helper; equal to every
# n_estimators value in manual_configs below.
epoch = 500

# Models to include in the comparison.
models = [
    'GBT', 'XGBoost', 'LightGBM',
    'RF', 'ElasticNet',
    'BRATD', 'Boulevard', 'BRATP'
]

# Fixed hyperparameters per model (tune=False below, so no search is run here).
manual_configs = {
    'GBT': {
        'n_estimators': 500,
        'learning_rate': 0.01,
        'max_depth': 3
    },
    'XGBoost': {
        'n_estimators': 500,
        'learning_rate': 0.02,
        'max_depth': 5,
        'subsample': 0.57
    },
    'LightGBM': {
        'n_estimators': 500,
        'learning_rate': 0.03,
        'max_depth': 5,
        'num_leaves': 38
    },
    'RF': {
        'n_estimators': 500,
        'max_depth': 11
    },
    'ElasticNet': {
        'alpha': 0.49,
        'l1_ratio': 0.14
    },
    'BRATD': {
        'n_estimators': 500,
        'max_depth': 3,
        'min_samples_split': 39,
        'learning_rate': 0.93,
        'subsample_rate': 0.84,
        'dropout_rate': 0.12
    },
    'Boulevard': {
        'n_estimators': 500,
        'max_depth': 9,
        'min_samples_split': 32,
        'learning_rate': 0.87,
        'subsample_rate': 0.52,
        'dropout_rate': 0.0
    },
    'BRATP': {
        'n_estimators': 500,
        'max_depth': 3,
        'min_samples_split': 48,
        'learning_rate': 0.98,
        'subsample_rate': 0.60,
        'n_trees_per_group': 13
    }
}

# One entry per repetition; each entry maps model name -> sequence of MSE values.
mse_runs = []

for i in range(5):
    mse_dict, best_params = train_all_models(
        X_train, y_train,
        X_test, y_test,
        epoch=epoch,
        tune=False,
        models=models,
        manual_configs=manual_configs,
        run_idx=i
    )
    mse_runs.append(mse_dict)

# Flatten mse_runs into long format: one row per (model, run, epoch).
# Run_ID and Epoch are 1-based.
flattened_data = [
    {'Model': model_name, 'Run_ID': run_id, 'Epoch': epoch_idx, 'MSE': mse}
    for run_id, mse_dict in enumerate(mse_runs, start=1)
    for model_name, mse_values in mse_dict.items()
    for epoch_idx, mse in enumerate(mse_values, start=1)
]

# Sort by Model, Run_ID and Epoch so the CSV is grouped per trajectory.
mse_df = (
    pd.DataFrame(flattened_data)
    .sort_values(by=['Model', 'Run_ID', 'Epoch'])
    .reset_index(drop=True)
)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(mse_dir, exist_ok=True)
output_path = os.path.join(mse_dir, f'{uci_id}.csv')
mse_df.to_csv(output_path, index=False)

# Plot and save the aggregated results
plot_mean_std_trajectories(mse_runs, epoch, dataset_id=uci_id, plot_dir=plot_dir, title="Student Performance")