# Set Up

In [18]:
import pandas as pd
import json
import numpy as np
import xgboost as xgb
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import random

In [19]:
# Seed Python's built-in and NumPy's global random number generators with the
# same fixed value so that any code drawing from them is repeatable across runs.
random.seed(42)
np.random.seed(42)

In [20]:
# Number of `split_<k>` directories (k = 0 .. n_splits - 1) that
# train_test_metrics evaluates for every feature set.
n_splits = 10

In [21]:
# Lookup table translating raw data-field codes into readable column names.
rename = pd.read_csv('/Users/baileyng/MIND_models/region_names/col_renames.csv')
rename_dict = {
    code: name
    for code, name in zip(rename['datafield_code'], rename['datafield_name'])
}

In [30]:
def _read_region_list(path):
    """Return the stripped, non-empty lines of the text file at *path*.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped so they cannot turn into an empty-string feature name, which would
    raise ``KeyError`` in the ``rename_dict`` lookup used for the CT regions.
    """
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]


# One feature-name list per feature set, one name per line in each file.
MIND_avg_regions = _read_region_list('/Users/baileyng/MIND_models/region_names/MIND_avg_regions.txt')
MIND_regions = _read_region_list('/Users/baileyng/MIND_models/region_names/MIND_regions.txt')

# The CT file stores raw data-field codes; translate them with rename_dict.
CT_regions_base = _read_region_list('/Users/baileyng/MIND_models/region_names/CT_regions.txt')
CT_regions = [rename_dict[region] for region in CT_regions_base]

FC_regions = _read_region_list('/Users/baileyng/MIND_models/region_names/FC_regions.txt')

# Feature set that adds no regional features on top of the base variables.
demo = []

# `regions` and `region_names` are parallel lists: regions[i] holds the feature
# names for the run labelled region_names[i]. Alternative configurations are
# kept below; enable exactly one pair.
# regions = [MIND_avg_regions, MIND_regions, CT_regions, FC_regions, demo]
# region_names = ['MIND_avg_regions', 'MIND_regions', 'CT_regions', 'FC_regions', 'demo']
# regions = [MIND_avg_regions, CT_regions, FC_regions, demo]
# region_names = ['MIND_avg_regions', 'CT_regions', 'FC_regions', 'demo']
regions = [MIND_regions]
region_names = ['MIND_regions']


In [23]:
# Base columns that train_test_metrics standardises with StandardScaler
# (the per-run region features are appended to this list inside the function).
numerical_variables = ["age"]

# Columns that train_test_metrics casts to the pandas `category` dtype.
categorical_variables = ["assessment_centre"]

# Columns that train_test_metrics coerces with pd.to_numeric.
binary_variables = ["sex"]

In [24]:
def _align_categorical_columns(X_train, X_test, columns):
    """Cast each named column to one categorical dtype shared by both frames.

    Both DataFrames are modified in place.

    Calling ``astype('category')`` on each frame separately derives the
    category list from that frame alone, so a level that is absent from one
    partition shifts the integer codes of the other. XGBoost's categorical
    support can consume those codes directly (see its categorical-data
    tutorial), in which case train and test must share a single encoding.
    The shared category list is built the same way pandas would build it for
    the concatenated column, so the training codes are unchanged whenever the
    training partition already contains every level.
    """
    for col in columns:
        combined = pd.concat([X_train[col], X_test[col]], ignore_index=True)
        shared_dtype = pd.CategoricalDtype(combined.astype('category').cat.categories)
        X_train[col] = X_train[col].astype(shared_dtype)
        X_test[col] = X_test[col].astype(shared_dtype)


def train_test_metrics(hyperparameter_dir, categorical_variables, binary_variables, numerical_variables):
    """Refit XGBoost on every stored split and report test-set MAE, RMSE and R².

    The function iterates over the module-level ``region_names`` / ``regions``
    lists and, for each feature set, over ``n_splits`` splits. Each split is
    read from ``<hyperparameter_dir>/<region_name>/split_<k>/``, which must
    contain:

    * ``train_test_data.npz`` with the arrays ``column_names``, ``x_train``,
      ``x_test``, ``y_train`` and ``y_test``;
    * ``best_hyperparameters.json`` with the booster parameters, including an
      ``n_estimators`` entry that is used as the number of boosting rounds.

    Args:
        hyperparameter_dir: Root directory holding one sub-directory per
            feature set.
        categorical_variables: Column names cast to a categorical dtype.
        binary_variables: Column names coerced to numeric with
            ``pd.to_numeric(errors='coerce')``.
        numerical_variables: Base column names to standardise. The current
            feature set's region names are appended for each run, plus
            ``'head_motion'`` for ``'FC_regions'``. The argument itself is not
            modified.

    Returns:
        dict mapping each evaluated region name to a dict with the per-split
        lists ``mae_list``, ``rmse_list``, ``r2_list`` and their ``*_mean`` /
        ``*_std`` summaries (``np.std``, i.e. population standard deviation).
        Feature sets whose directory does not exist are reported and skipped.
    """
    all_results = {}

    for i, region_name in enumerate(region_names):
        print(f"\n{'='*60}")
        print(f"Running analysis for: {region_name}")
        print(f"{'='*60}")

        # Build a fresh list so the caller's list is never mutated.
        numerical_variables_copy = list(numerical_variables) + list(regions[i])

        if region_name == 'FC_regions':
            numerical_variables_copy = numerical_variables_copy + ['head_motion']

        # Set membership is constant per feature set; build it once instead of
        # scanning a (potentially very long) list for every column and split.
        numerical_lookup = set(numerical_variables_copy)

        region_hyperparameter_dir = os.path.join(hyperparameter_dir, region_name)

        # Skip feature sets that were never tuned for this target.
        if not os.path.exists(region_hyperparameter_dir):
            print(f"Directory not found: {region_hyperparameter_dir}")
            continue

        mae_list, rmse_list, r2_list = [], [], []

        for split_idx in range(n_splits):
            split_dir = os.path.join(region_hyperparameter_dir, f'split_{split_idx}')

            # Load data. NOTE: allow_pickle=True executes pickled payloads on
            # load, so only point this at archives you produced yourself.
            data = np.load(os.path.join(split_dir, 'train_test_data.npz'), allow_pickle=True)
            cols = data['column_names']
            X_train = pd.DataFrame(data=data['x_train'], columns=cols)
            X_test = pd.DataFrame(data=data['x_test'], columns=cols)
            y_train = data['y_train']
            y_test = data['y_test']

            # Cast types; categoricals share one encoding across train/test.
            _align_categorical_columns(X_train, X_test, categorical_variables)
            for b in binary_variables:
                X_train[b] = pd.to_numeric(X_train[b], errors='coerce')
                X_test[b] = pd.to_numeric(X_test[b], errors='coerce')

            # Load the tuned hyperparameters for this split.
            with open(os.path.join(split_dir, 'best_hyperparameters.json'), 'r') as f:
                params = json.load(f)

            # Targets with three or more distinct values are treated as
            # regression, anything else as binary classification.
            # NOTE(review): the metrics below are regression metrics; for the
            # binary objective they are computed on the raw predictions.
            if np.unique(y_train).shape[0] >= 3:
                params.update({'eval_metric': 'rmse', 'objective': 'reg:squarederror'})
            else:
                params.update({'eval_metric': 'auc', 'objective': 'binary:logistic'})

            # n_estimators is passed to xgb.train as num_boost_round rather
            # than as a booster parameter.
            n_estimators = int(params.pop('n_estimators'))

            # Standardise numeric columns; the scaler is fitted on train only.
            scaler = StandardScaler()
            num_vars = [v for v in cols if v in numerical_lookup]
            if num_vars:
                X_train[num_vars] = scaler.fit_transform(X_train[num_vars])
                X_test[num_vars] = scaler.transform(X_test[num_vars])

            # Train XGBoost.
            dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
            booster = xgb.train(params, dtrain, num_boost_round=n_estimators)

            # Predict; labels are not needed to score the held-out rows.
            dtest = xgb.DMatrix(X_test, enable_categorical=True)
            preds = booster.predict(dtest)

            # Compute metrics.
            mae = mean_absolute_error(y_test, preds)
            rmse = np.sqrt(mean_squared_error(y_test, preds))
            r2 = r2_score(y_test, preds)

            mae_list.append(mae)
            rmse_list.append(rmse)
            r2_list.append(r2)

            print(f"Split {split_idx:02d} → MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f}")

        # Summarise once and reuse the values for both printing and storage.
        mae_mean, mae_std = np.mean(mae_list), np.std(mae_list)
        rmse_mean, rmse_std = np.mean(rmse_list), np.std(rmse_list)
        r2_mean, r2_std = np.mean(r2_list), np.std(r2_list)

        print(f"\nOverall performance for {region_name}:")
        print(f"MAE  : {mae_mean:.3f} ± {mae_std:.3f}")
        print(f"RMSE : {rmse_mean:.3f} ± {rmse_std:.3f}")
        print(f"R²   : {r2_mean:.3f} ± {r2_std:.3f}")

        # Store results for this region.
        all_results[region_name] = {
            'mae_list': mae_list,
            'rmse_list': rmse_list,
            'r2_list': r2_list,
            'mae_mean': mae_mean,
            'mae_std': mae_std,
            'rmse_mean': rmse_mean,
            'rmse_std': rmse_std,
            'r2_mean': r2_mean,
            'r2_std': r2_std
        }

    return all_results

# GF

In [31]:
# Evaluate the tuned models stored for target 20016-2.0 (the "GF" section).
hyperparameter_dir = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_20016-2.0_reg_07-21'
results = train_test_metrics(
    hyperparameter_dir,
    categorical_variables,
    binary_variables,
    numerical_variables,
)


Running analysis for: MIND_regions
Split 00 → MAE: 1.601, RMSE: 2.000, R²: 0.049
Split 01 → MAE: 1.627, RMSE: 2.019, R²: 0.035
Split 02 → MAE: 1.641, RMSE: 2.050, R²: 0.017
Split 03 → MAE: 1.646, RMSE: 2.047, R²: 0.011
Split 04 → MAE: 1.621, RMSE: 2.019, R²: 0.040
Split 05 → MAE: 1.645, RMSE: 2.041, R²: 0.040
Split 06 → MAE: 2.090, RMSE: 2.633, R²: -0.655
Split 07 → MAE: 1.574, RMSE: 1.976, R²: 0.028
Split 08 → MAE: 1.625, RMSE: 2.040, R²: 0.009
Split 09 → MAE: 1.581, RMSE: 1.985, R²: 0.048

Overall performance for MIND_regions:
MAE  : 1.665 ± 0.144
RMSE : 2.081 ± 0.186
R²   : -0.038 ± 0.206


# PAL

In [32]:
# Evaluate the tuned models stored for target 20197-2.0 (the "PAL" section).
hyperparameter_dir = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_20197-2.0_reg_07-21'
results = train_test_metrics(
    hyperparameter_dir,
    categorical_variables,
    binary_variables,
    numerical_variables,
)


Running analysis for: MIND_regions
Split 00 → MAE: 2.150, RMSE: 2.653, R²: -0.022


KeyboardInterrupt: 

# DSST

In [None]:
# Evaluate the tuned models stored for target 23324-2.0 (the "DSST" section).
hyperparameter_dir = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_23324-2.0_reg_07-21'
results = train_test_metrics(
    hyperparameter_dir,
    categorical_variables,
    binary_variables,
    numerical_variables,
)


Running analysis for: MIND_avg_regions
Split 00 → MAE: 3.724, RMSE: 4.739, R²: 0.183
Split 01 → MAE: 3.808, RMSE: 4.827, R²: 0.164
Split 02 → MAE: 3.867, RMSE: 4.870, R²: 0.137
Split 03 → MAE: 3.733, RMSE: 4.744, R²: 0.179
Split 04 → MAE: 3.876, RMSE: 4.938, R²: 0.143
Split 05 → MAE: 3.669, RMSE: 4.690, R²: 0.208
Split 06 → MAE: 3.726, RMSE: 4.749, R²: 0.184
Split 07 → MAE: 3.744, RMSE: 4.766, R²: 0.142
Split 08 → MAE: 3.699, RMSE: 4.682, R²: 0.167
Split 09 → MAE: 3.776, RMSE: 4.775, R²: 0.207

Overall performance for MIND_avg_regions:
MAE  : 3.762 ± 0.065
RMSE : 4.778 ± 0.076
R²   : 0.171 ± 0.024

Running analysis for: CT_regions
Split 00 → MAE: 3.985, RMSE: 5.071, R²: 0.065
Split 01 → MAE: 3.757, RMSE: 4.760, R²: 0.186
Split 02 → MAE: 4.562, RMSE: 5.764, R²: -0.209
Split 03 → MAE: 3.737, RMSE: 4.754, R²: 0.176
Split 04 → MAE: 3.770, RMSE: 4.803, R²: 0.189
Split 05 → MAE: 3.812, RMSE: 4.917, R²: 0.129
Split 06 → MAE: 3.718, RMSE: 4.728, R²: 0.191
Split 07 → MAE: 3.660, RMSE: 4.684, R

# TMT

In [None]:
# Evaluate the tuned models stored for the trailmaking_score target (the "TMT" section).
hyperparameter_dir = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_trailmaking_score_reg_07-21'
results = train_test_metrics(
    hyperparameter_dir,
    categorical_variables,
    binary_variables,
    numerical_variables,
)


Running analysis for: MIND_avg_regions
Split 00 → MAE: 167.295, RMSE: 223.983, R²: -0.079
Split 01 → MAE: 146.083, RMSE: 199.679, R²: 0.129
Split 02 → MAE: 151.973, RMSE: 204.294, R²: 0.070
Split 03 → MAE: 158.418, RMSE: 214.876, R²: 0.093
Split 04 → MAE: 144.591, RMSE: 200.149, R²: 0.121
Split 05 → MAE: 158.676, RMSE: 211.436, R²: -0.019
Split 06 → MAE: 142.486, RMSE: 192.248, R²: 0.120
Split 07 → MAE: 145.699, RMSE: 199.869, R²: 0.140
Split 08 → MAE: 157.977, RMSE: 209.115, R²: 0.039
Split 09 → MAE: 147.816, RMSE: 200.824, R²: 0.043

Overall performance for MIND_avg_regions:
MAE  : 152.101 ± 7.700
RMSE : 205.647 ± 8.779
R²   : 0.066 ± 0.068

Running analysis for: CT_regions
Split 00 → MAE: 162.727, RMSE: 216.580, R²: -0.009
Split 01 → MAE: 151.447, RMSE: 207.030, R²: 0.064
Split 02 → MAE: 148.737, RMSE: 201.301, R²: 0.097
Split 03 → MAE: 154.384, RMSE: 210.738, R²: 0.128
Split 04 → MAE: 149.310, RMSE: 205.382, R²: 0.075
Split 05 → MAE: 161.697, RMSE: 214.849, R²: -0.052
Split 06 → M