In [None]:
import pandas as pd
import json
import numpy as np
import xgboost as xgb
import os
from sklearn.preprocessing import RobustScaler
import copy
import random

In [2]:
# Pin the global RNG state of both NumPy and the standard library so that
# any stochastic step executed after this cell is repeatable across runs.
np.random.seed(42)
random.seed(42)

In [None]:
# Feature groups used to decide per-column preprocessing.
# The numerical group is 'age' followed by one column per region name for the
# 'lh_' prefix and then the same regions, in the same order, for the 'rh_'
# prefix (presumably left/right hemisphere measures — confirm against the data).
numerical_variables = ['age'] + [
    f'{hemisphere}_{region}'
    for hemisphere in ('lh', 'rh')
    for region in (
        'bankssts', 'caudalanteriorcingulate', 'caudalmiddlefrontal',
        'cuneus', 'entorhinal', 'fusiform', 'inferiorparietal',
        'inferiortemporal', 'isthmuscingulate', 'lateraloccipital',
        'lateralorbitofrontal', 'lingual', 'medialorbitofrontal',
        'middletemporal', 'parahippocampal', 'paracentral',
        'parsopercularis', 'parsorbitalis', 'parstriangularis',
        'pericalcarine', 'postcentral', 'posteriorcingulate',
        'precentral', 'precuneus', 'rostralanteriorcingulate',
        'rostralmiddlefrontal', 'superiorfrontal', 'superiorparietal',
        'superiortemporal', 'supramarginal', 'frontalpole',
        'temporalpole', 'transversetemporal', 'insula',
    )
]

# No categorical features are currently declared.
categorical_variables = list()

# Columns that get coerced to numeric rather than scaled.
binary_variables = ['sex']

In [8]:
# Root folder holding one sub-folder per output variable, each of which
# contains the split_<i> directories read by train_test_function.
# Set the MIND_BASE_FOLDER environment variable to run the notebook on another
# machine; when it is unset the original hard-coded location is used.
BASE_FOLDER = os.environ.get(
    'MIND_BASE_FOLDER', '/Users/baileyng/MIND_data/best_hyperparameters'
)

# Number of split_<i> directories iterated for each output variable.
n_splits = 10

In [6]:
def train_test_function(output_variable):
    """Retrain the tuned XGBoost model on every saved split and pool the test outputs.

    For each ``split_<i>`` directory (``i`` in ``range(n_splits)``) under
    ``BASE_FOLDER/<output_variable>`` this loads ``train_test_data.npz`` and
    ``best_hyperparameters.json``, scales the numerical columns with a
    ``RobustScaler`` fitted on the training rows only, trains a booster with the
    stored hyperparameters, and collects the test-set predictions, SHAP values
    and SHAP interaction values.

    Parameters
    ----------
    output_variable : str
        Name of the sub-folder of ``BASE_FOLDER`` that holds the splits.

    Returns
    -------
    shap_all : np.ndarray
        ``booster.predict(..., pred_contribs=True)`` for every test row, stacked
        over splits along axis 0.
    shap_int_all : np.ndarray
        ``booster.predict(..., pred_interactions=True)`` for every test row,
        stacked over splits along axis 0.
    y_pred_all : np.ndarray
        Test predictions as a column vector of shape ``(n_rows, 1)``.
    y_test_all : np.ndarray
        Test targets as a column vector of shape ``(n_rows, 1)``.
    cols : np.ndarray
        The ``column_names`` array of the last split that was processed.

    Raises
    ------
    ValueError
        If the module-level ``n_splits`` is smaller than 1.
    KeyError
        If a hyperparameter file has no ``n_estimators`` entry.
    """
    if n_splits < 1:
        # Without this guard the function would fall through to the return
        # statement with `cols` undefined and fail with an opaque NameError.
        raise ValueError(f'n_splits must be at least 1, got {n_splits}')

    # Per-split results are collected in lists and stacked once at the end;
    # stacking inside the loop re-copies everything gathered so far each time.
    y_test_parts = []
    y_pred_parts = []
    shap_parts = []
    shap_int_parts = []

    for split_idx in range(n_splits):
        split_dir = os.path.join(
            BASE_FOLDER, output_variable, f'split_{split_idx}'
        )

        # --- load train/test arrays + column names ---
        # NOTE(security): allow_pickle=True can execute arbitrary code while
        # unpickling; only point this at archives produced by a trusted pipeline.
        # The context manager closes the archive's file handle once the arrays
        # have been read into memory.
        with np.load(
            os.path.join(split_dir, 'train_test_data.npz'),
            allow_pickle=True
        ) as data:
            cols = data['column_names']
            X_train = pd.DataFrame(data=data['x_train'], columns=cols)
            y_train = data['y_train']
            X_test = pd.DataFrame(data=data['x_test'], columns=cols)
            y_test = data['y_test']

        # --- cast categories & binaries ---
        for c in categorical_variables:
            if c in cols:
                X_train[c] = X_train[c].astype('category')
                X_test[c] = X_test[c].astype('category')
        for b in binary_variables:
            if b in cols:
                X_train[b] = pd.to_numeric(X_train[b], errors='coerce')
                X_test[b] = pd.to_numeric(X_test[b], errors='coerce')

        # --- load best hyperparams ---
        with open(
            os.path.join(split_dir, 'best_hyperparameters.json'), 'r'
        ) as f:
            params = json.load(f)

        # Three or more distinct training targets are treated as regression,
        # anything else as binary classification.
        if np.unique(y_train).shape[0] >= 3:
            params['eval_metric'] = 'rmse'
            params['objective'] = 'reg:squarederror'
        else:
            params['eval_metric'] = 'auc'
            params['objective'] = 'binary:logistic'

        # n_estimators is passed to xgb.train as num_boost_round, so it is
        # removed from the parameter dict handed to the booster.
        n_estimators = int(params.pop('n_estimators'))

        # --- scale numeric features (fit on train only, reuse on test) ---
        num_vars = [v for v in cols if v in numerical_variables]
        if num_vars:
            # RobustScaler rejects a zero-column input, so skip scaling when
            # this split has no numerical features.
            scaler = RobustScaler()
            X_train[num_vars] = scaler.fit_transform(X_train[num_vars])
            X_test[num_vars] = scaler.transform(X_test[num_vars])

        # --- train & predict ---
        dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        booster = xgb.train(params, dtrain, num_boost_round=n_estimators)

        dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
        preds = booster.predict(dtest)
        shap_vals = booster.predict(dtest, pred_contribs=True)
        shap_ints = booster.predict(dtest, pred_interactions=True)

        # --- accumulate ---
        y_test_parts.append(y_test.reshape(-1, 1))
        y_pred_parts.append(preds.reshape(-1, 1))
        shap_parts.append(shap_vals)
        shap_int_parts.append(shap_ints)

    # The empty float64 seed keeps the dtype promotion of the pooled targets and
    # predictions identical to stacking onto an initially empty (0, 1) array.
    y_test_all = np.vstack([np.empty((0, 1)), *y_test_parts])
    y_pred_all = np.vstack([np.empty((0, 1)), *y_pred_parts])
    shap_all = np.vstack(shap_parts)
    shap_int_all = np.vstack(shap_int_parts)

    return shap_all, shap_int_all, y_pred_all, y_test_all, cols

In [None]:
# output_variables = ['20016-2.0']

# binary_output_variables = []
# continuous_output_variables = ['20016-2.0']

In [None]:

# Target whose splits are retrained and explained in this run.
OUT_VAR = '20016-2.0'

# Pool predictions, targets and SHAP outputs over all splits of OUT_VAR.
shap_vals, shap_ints, y_pred, y_true, feature_names = train_test_function(OUT_VAR)

# Bundle every result into a single .npz archive next to the split folders.
out_path = os.path.join(BASE_FOLDER, OUT_VAR, 'shap_values_output.npz')
np.savez(
    out_path,
    **{
        'test_pred': y_pred,
        'test_true': y_true,
        'shap_values': shap_vals,
        'shap_values_interaction': shap_ints,
        'column_names': np.array(feature_names),
    },
)
print(f"Saved consolidated SHAP outputs to {out_path}")