In [1]:
import pandas as pd
import json
import numpy as np
import xgboost as xgb
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import random

In [2]:
# Seed Python's and NumPy's global random number generators with one shared value.
_SEED = 42
random.seed(_SEED)
np.random.seed(_SEED)

In [3]:
# Number of split folders (split_0 ... split_{n_splits - 1}) that train_test_metrics iterates over.
n_splits: int = 10

In [4]:
def train_test_metrics(output_variable):
    """Retrain XGBoost on every saved split and report held-out regression metrics.

    For each ``split_<i>`` folder under ``BASE_FOLDER/<output_variable>`` the
    function loads ``train_test_data.npz`` and ``best_hyperparameters.json``,
    standardises the numerical features (scaler fitted on the training rows
    only), trains a booster for ``n_estimators`` rounds and scores it on the
    test rows. Per-split metrics and a mean ± std summary are printed.

    The function reads these notebook-level globals, which must be defined
    before it is called: ``BASE_FOLDER``, ``n_splits``, ``numerical_variables``,
    ``categorical_variables`` and ``binary_variables``.

    Parameters
    ----------
    output_variable : str
        Name of the sub-folder of ``BASE_FOLDER`` that holds the split folders.

    Returns
    -------
    tuple of (list, list, list)
        Per-split MAE, RMSE and R² values, in split order.
    """
    mae_list, rmse_list, r2_list = [], [], []

    for split_idx in range(n_splits):
        split_dir = os.path.join(BASE_FOLDER, output_variable, f'split_{split_idx}')

        # load data; the context manager closes the .npz archive once the arrays
        # have been read, so no file handle is leaked per split.
        # NOTE: allow_pickle=True can execute arbitrary code while loading --
        # only use it on files from a trusted source.
        with np.load(os.path.join(split_dir, 'train_test_data.npz'), allow_pickle=True) as data:
            cols     = data['column_names']
            X_train  = pd.DataFrame(data=data['x_train'], columns=cols)
            X_test   = pd.DataFrame(data=data['x_test'],  columns=cols)
            y_train  = data['y_train']
            y_test   = data['y_test']

        # cast types
        for c in categorical_variables:
            if c in cols:
                X_train[c] = X_train[c].astype('category')
                # Reuse the training categories so the integer codes XGBoost sees
                # mean the same level in both frames; levels that only occur in
                # the test rows become NaN (treated as missing by XGBoost).
                X_test[c]  = X_test[c].astype(X_train[c].dtype)
        for b in binary_variables:
            if b in cols:
                X_train[b] = pd.to_numeric(X_train[b], errors='coerce')
                X_test[b]  = pd.to_numeric(X_test[b], errors='coerce')

        # load best hyperparams
        with open(os.path.join(split_dir, 'best_hyperparameters.json'), 'r') as f:
            params = json.load(f)

        # choose objective/metric: targets with fewer than three distinct values
        # are modelled with a logistic objective, everything else as regression.
        # NOTE(review): the metrics below are regression metrics in both cases;
        # for the logistic branch they are computed on predicted probabilities.
        if np.unique(y_train).shape[0] >= 3:
            params.update({'eval_metric':'rmse', 'objective':'reg:squarederror'})
        else:
            params.update({'eval_metric':'auc',  'objective':'binary:logistic'})

        # extract and remove n_estimators (passed to xgb.train as num_boost_round)
        n_estimators = int(params.pop('n_estimators'))

        # scale numerics: fit on the training rows only, then apply to the test rows
        num_vars = [v for v in cols if v in numerical_variables]
        if num_vars:  # StandardScaler raises on an empty column selection
            scaler = StandardScaler()
            X_train[num_vars] = scaler.fit_transform(X_train[num_vars])
            X_test[num_vars]  = scaler.transform(X_test[num_vars])

        # train XGBoost
        dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        booster = xgb.train(params, dtrain, num_boost_round=n_estimators)

        # predict
        dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
        preds = booster.predict(dtest)

        # compute metrics
        mae  = mean_absolute_error(y_test, preds)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        r2   = r2_score(y_test, preds)

        mae_list.append(mae)
        rmse_list.append(rmse)
        r2_list.append(r2)

        print(f"Split {split_idx:02d} → MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f}")

    # after all splits, summary
    print("\nOverall performance:")
    print(f"MAE  : {np.mean(mae_list):.3f} ± {np.std(mae_list):.3f}")
    print(f"RMSE : {np.mean(rmse_list):.3f} ± {np.std(rmse_list):.3f}")
    print(f"R²   : {np.mean(r2_list):.3f} ± {np.std(r2_list):.3f}")

    return mae_list, rmse_list, r2_list

### FIS vs. MIND (avg)

In [5]:
# Feature groups and target for the MIND (avg) models.

# Region names shared by both hemispheres, in the order they appear in the feature list.
_MIND_REGIONS = (
    'bankssts', 'caudalanteriorcingulate', 'caudalmiddlefrontal', 'cuneus',
    'entorhinal', 'fusiform', 'inferiorparietal', 'inferiortemporal',
    'isthmuscingulate', 'lateraloccipital', 'lateralorbitofrontal', 'lingual',
    'medialorbitofrontal', 'middletemporal', 'parahippocampal', 'paracentral',
    'parsopercularis', 'parsorbitalis', 'parstriangularis', 'pericalcarine',
    'postcentral', 'posteriorcingulate', 'precentral', 'precuneus',
    'rostralanteriorcingulate', 'rostralmiddlefrontal', 'superiorfrontal',
    'superiorparietal', 'superiortemporal', 'supramarginal', 'frontalpole',
    'temporalpole', 'transversetemporal', 'insula',
)

# 'age' first, then every left-hemisphere column, then every right-hemisphere column.
numerical_variables = ['age'] + [
    f'{hemisphere}_{region}'
    for hemisphere in ('lh', 'rh')
    for region in _MIND_REGIONS
]

# Columns cast to pandas 'category' dtype by train_test_metrics.
categorical_variables = ['assessment centre']

# Columns converted with pd.to_numeric by train_test_metrics.
binary_variables = ['sex']

# Output variable; also the name of the sub-folder under BASE_FOLDER holding its splits.
OUT_VAR = '20016-2.0'

In [6]:
# Point train_test_metrics (which reads the BASE_FOLDER global) at the
# 'best_hyperparameters_MIND_06-28' results folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
BASE_FOLDER = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_MIND_06-28'
# Per-split MAE / RMSE / R² lists for OUT_VAR; the function also prints them.
maes, rmses, r2s = train_test_metrics(OUT_VAR)

Split 00 → MAE: 1.644, RMSE: 2.049, R²: 0.033
Split 01 → MAE: 1.760, RMSE: 2.190, R²: -0.127
Split 02 → MAE: 1.639, RMSE: 2.044, R²: 0.039
Split 03 → MAE: 1.695, RMSE: 2.107, R²: -0.064
Split 04 → MAE: 1.641, RMSE: 2.054, R²: 0.030
Split 05 → MAE: 1.698, RMSE: 2.132, R²: -0.095
Split 06 → MAE: 1.709, RMSE: 2.151, R²: -0.090
Split 07 → MAE: 1.677, RMSE: 2.100, R²: -0.041
Split 08 → MAE: 1.621, RMSE: 2.023, R²: 0.024
Split 09 → MAE: 1.624, RMSE: 2.021, R²: 0.036

Overall performance:
MAE  : 1.671 ± 0.043
RMSE : 2.087 ± 0.055
R²   : -0.025 ± 0.061


In [7]:
# Same evaluation, now reading splits and hyperparameters from the
# 'best_hyperparameters_MIND_reg_06-30' folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
BASE_FOLDER = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_MIND_reg_06-30'
# Rebinds maes / rmses / r2s, replacing whatever these names held before this cell ran.
maes, rmses, r2s = train_test_metrics(OUT_VAR)

Split 00 → MAE: 1.703, RMSE: 2.124, R²: -0.040
Split 01 → MAE: 1.628, RMSE: 2.029, R²: 0.032
Split 02 → MAE: 1.639, RMSE: 2.045, R²: 0.038
Split 03 → MAE: 1.677, RMSE: 2.086, R²: -0.044
Split 04 → MAE: 1.652, RMSE: 2.069, R²: 0.016
Split 05 → MAE: 1.612, RMSE: 2.010, R²: 0.026
Split 06 → MAE: 1.609, RMSE: 2.014, R²: 0.045
Split 07 → MAE: 1.613, RMSE: 2.018, R²: 0.040
Split 08 → MAE: 1.621, RMSE: 2.023, R²: 0.024
Split 09 → MAE: 1.620, RMSE: 2.015, R²: 0.041

Overall performance:
MAE  : 1.638 ± 0.030
RMSE : 2.043 ± 0.036
R²   : 0.018 ± 0.031


### CT Models

In [8]:
# Feature groups and target for the CT models.

# Region names, in the order they appear in the feature list.
_CT_REGIONS = (
    'caudalanteriorcingulate', 'caudalmiddlefrontal', 'cuneus', 'entorhinal',
    'fusiform', 'inferiorparietal', 'inferiortemporal', 'insula',
    'isthmuscingulate', 'lateraloccipital', 'lateralorbitofrontal', 'lingual',
    'medialorbitofrontal', 'middletemporal', 'paracentral', 'parahippocampal',
    'parsopercularis', 'parsorbitalis', 'parstriangularis', 'pericalcarine',
    'postcentral', 'posteriorcingulate', 'precentral', 'precuneus',
    'rostralanteriorcingulate', 'rostralmiddlefrontal', 'superiorfrontal',
    'superiorparietal', 'superiortemporal', 'supramarginal', 'transversetemporal',
)

# 'age' first, then an lh/rh pair of '<hemisphere>_<region>_thickness' columns per region.
numerical_variables = ['age'] + [
    f'{hemisphere}_{region}_thickness'
    for region in _CT_REGIONS
    for hemisphere in ('lh', 'rh')
]

# Columns cast to pandas 'category' dtype by train_test_metrics.
categorical_variables = ['assessment centre']

# Columns converted with pd.to_numeric by train_test_metrics.
binary_variables = ['sex']

# Output variable; also the name of the sub-folder under BASE_FOLDER holding its splits.
OUT_VAR = '20016-2.0'

In [9]:
# Point train_test_metrics (which reads the BASE_FOLDER global) at the
# 'best_hyperparameters_CT_06-28' results folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
BASE_FOLDER = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_CT_06-28'
# Rebinds maes / rmses / r2s, replacing whatever these names held before this cell ran.
maes, rmses, r2s = train_test_metrics(OUT_VAR)

Split 00 → MAE: 1.757, RMSE: 2.202, R²: -0.118
Split 01 → MAE: 1.631, RMSE: 2.032, R²: 0.030
Split 02 → MAE: 1.647, RMSE: 2.054, R²: 0.030
Split 03 → MAE: 1.629, RMSE: 2.020, R²: 0.021
Split 04 → MAE: 1.645, RMSE: 2.059, R²: 0.026
Split 05 → MAE: 1.663, RMSE: 2.081, R²: -0.043
Split 06 → MAE: 1.621, RMSE: 2.024, R²: 0.034
Split 07 → MAE: 1.616, RMSE: 2.020, R²: 0.037
Split 08 → MAE: 1.628, RMSE: 2.030, R²: 0.017
Split 09 → MAE: 1.813, RMSE: 2.278, R²: -0.226

Overall performance:
MAE  : 1.665 ± 0.063
RMSE : 2.080 ± 0.084
R²   : -0.019 ± 0.083


### FC Models

In [10]:
# Feature groups and target for the FC models.

# Highest IC index used in the pairwise column names.
_N_IC = 21

# 'age' first, then one 'IC<i>IC<j>' column for every pair with 1 <= i < j <= 21,
# ordered by i and then by j (210 pairs in total).
numerical_variables = ['age'] + [
    f'IC{first}IC{second}'
    for first in range(1, _N_IC + 1)
    for second in range(first + 1, _N_IC + 1)
]

# Columns cast to pandas 'category' dtype by train_test_metrics.
categorical_variables = ['assessment centre']

# Columns converted with pd.to_numeric by train_test_metrics.
binary_variables = ['sex']

# Output variable; also the name of the sub-folder under BASE_FOLDER holding its splits.
OUT_VAR = '20016-2.0'

In [11]:
# Point train_test_metrics (which reads the BASE_FOLDER global) at the
# 'best_hyperparameters_FC_06-28' results folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
BASE_FOLDER = '/Users/baileyng/MIND_data/hyperparameters/best_hyperparameters_FC_06-28'
# Rebinds maes / rmses / r2s, replacing whatever these names held before this cell ran.
maes, rmses, r2s = train_test_metrics(OUT_VAR)

Split 00 → MAE: 1.613, RMSE: 2.026, R²: 0.054
Split 01 → MAE: 1.652, RMSE: 2.072, R²: -0.009
Split 02 → MAE: 1.725, RMSE: 2.152, R²: -0.065
Split 03 → MAE: 1.594, RMSE: 1.977, R²: 0.063
Split 04 → MAE: 1.616, RMSE: 2.026, R²: 0.057
Split 05 → MAE: 1.591, RMSE: 1.985, R²: 0.050
Split 06 → MAE: 1.697, RMSE: 2.116, R²: -0.055
Split 07 → MAE: 1.578, RMSE: 1.974, R²: 0.081
Split 08 → MAE: 1.593, RMSE: 1.991, R²: 0.054
Split 09 → MAE: 1.592, RMSE: 1.981, R²: 0.073

Overall performance:
MAE  : 1.625 ± 0.048
RMSE : 2.030 ± 0.060
R²   : 0.030 ± 0.051
