# Set up

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import random

In [2]:
# Seed both the standard-library and the NumPy global random generators with
# the same value so that code drawing from either one is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Lookup table with one row per column: 'datafield_code' holds the label used
# in the data files, 'datafield_name' the readable replacement.
rename = pd.read_csv('/external/rprshnas01/tigrlab/scratch/bng/cartbind/code/MIND_models/region_names/col_renames.csv')
# Mapping code -> name, later passed to DataFrame.rename.
rename_dict = rename.set_index('datafield_code')['datafield_name'].to_dict()

# ElasticNet Analysis Function

In [4]:
def elasticnet_analysis(X, y, continuous_vars, categorical_vars, n_splits=10):
    """Evaluate an ElasticNet regression with nested cross-validation.

    The outer loop is a shuffled ``KFold`` with ``n_splits`` folds. Within each
    outer training set an ``ElasticNetCV`` picks ``alpha`` and ``l1_ratio``
    with its own 10-fold CV; only that inner search is parallelised
    (``n_jobs=-1``), the outer folds run sequentially. Per-fold metrics and a
    final summary are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor table; must contain every column named in
        ``continuous_vars`` and ``categorical_vars``.
    y : pandas.Series
        Target values, positionally aligned with the rows of ``X``.
    continuous_vars : list
        Columns that are standardised with ``StandardScaler``.
    categorical_vars : list
        Columns that are one-hot encoded (first level dropped).
    n_splits : int, default 10
        Number of outer cross-validation folds.

    Returns
    -------
    dict
        ``'mae'``, ``'rmse'``, ``'r2'``: lists with one score per outer fold.
        ``'best_params'``: list of ``{'alpha': ..., 'l1_ratio': ...}`` dicts.
        ``'coefs'``: list of ``pandas.Series`` (one per fold) holding the
        fitted coefficients indexed by the transformed feature names.
        ``'nonzero_predictors'``: list of lists with the transformed feature
        names whose coefficient is non-zero in that fold.
    """
    preprocessor = ColumnTransformer(transformers=[
        # scale continuous features
        ('num', StandardScaler(), continuous_vars),
        # one-hot encode the categorical features (drop one level per feature
        # to avoid collinearity)
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_vars),
    ])

    # Cross-validation set-up
    outer_cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    outer_mae, outer_rmse, outer_r2 = [], [], []
    best_params_per_fold = []
    nonzero_predictors = []
    coefs_list = []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Inner CV: hyper-parameters are tuned on the outer training fold only,
        # so the outer test fold stays untouched until prediction.
        pipe = make_pipeline(
            preprocessor,
            ElasticNetCV(
                l1_ratio=np.linspace(0.3, 0.9, 7),
                alphas=np.logspace(-4, 1, 11),
                cv=10, max_iter=30000, random_state=42,
                n_jobs=-1
            )
        )

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # --- metrics ---
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        outer_mae.append(mae)
        outer_rmse.append(rmse)
        outer_r2.append(r2)

        # --- store best α & l1_ratio for this fold ---
        est = pipe.named_steps['elasticnetcv']
        best_params_per_fold.append(
            {'alpha': est.alpha_, 'l1_ratio': est.l1_ratio_}
        )

        # --- predictors that survived ---
        # The ColumnTransformer reorders the columns (continuous block first,
        # then the one-hot dummies) and expands each categorical column, so
        # the coefficients must be labelled with the transformer's output
        # names rather than with X.columns.
        feature_names = pipe.named_steps['columntransformer'].get_feature_names_out()
        coefs = pd.Series(est.coef_, index=feature_names, name=f'fold_{fold:02d}')
        coefs_list.append(coefs)
        nonzero_predictors.append(coefs.index[coefs != 0].tolist())

        print(f'Fold {fold:02d} • MAE={mae:.3f} • RMSE={rmse:.3f} • R²={r2:.3f} '
              f'• α={est.alpha_:.4g} • l1_ratio={est.l1_ratio_:.2f}')

    # Aggregate results (np.std is the population standard deviation, ddof=0)
    print(f'\n=== {n_splits}-fold CV summary ===')
    print(f'Mean MAE :  {np.mean(outer_mae):.3f}  ± {np.std(outer_mae):.3f}')
    print(f'Mean RMSE:  {np.mean(outer_rmse):.3f} ± {np.std(outer_rmse):.3f}')
    print(f'Mean R²  :  {np.mean(outer_r2):.3f}  ± {np.std(outer_r2):.3f}')

    return {
        'mae': outer_mae,
        'rmse': outer_rmse,
        'r2': outer_r2,
        'best_params': best_params_per_fold,
        'coefs': coefs_list,
        'nonzero_predictors': nonzero_predictors,
    }



# GF

In [None]:
# Load the GF master table (the file name indicates outliers were already
# removed); the first CSV column is used as the row index.
GF_DATA_PATH = '/external/rprshnas01/tigrlab/scratch/bng/cartbind/data/ukb_master_GF_no_outliers.csv'
df = pd.read_csv(GF_DATA_PATH, index_col=0)

## GF vs. MIND

In [None]:
# Set X and y
# The region file lists one column name per line. Blank lines are skipped so
# they cannot turn into empty column labels, which would make the column
# selection below fail with a KeyError.
with open('/external/rprshnas01/tigrlab/scratch/bng/cartbind/code/MIND_models/region_names/MIND_regions.txt', 'r') as f:
    brain_regions = [line.strip() for line in f if line.strip()]

# Define demographic/clinical features
# NOTE(review): presumably sex, age and assessment centre (the later cell
# treats 'sex' and 'assessment_centre' as categorical) — confirm against
# col_renames.csv.
demographic_vars = ['31-0.0', '21003-2.0', '54-2.0']

# Combine demographic features with brain region features
all_vars = demographic_vars + brain_regions

# Predictors: demographic columns followed by the brain-region columns
X = df[all_vars]
# Target column
y = df['20016-2.0']

print(X.shape)
print(y.shape)

(33977, 2281)
(33977,)


In [9]:
# Replace the raw datafield codes in the column labels with readable names
X = X.rename(rename_dict, axis='columns')

# Columns sent to the one-hot encoder; every remaining column is treated as
# continuous (original column order is kept).
categorical_vars = ['sex', 'assessment_centre']
continuous_vars = X.columns[~X.columns.isin(categorical_vars)].tolist()

In [10]:
# Run the nested-CV ElasticNet analysis on the GF target with 10 outer folds
elasticnet_analysis(
    X,
    y,
    continuous_vars=continuous_vars,
    categorical_vars=categorical_vars,
    n_splits=10,
)

Fold 01 • MAE=1.605 • RMSE=2.004 • R²=0.045 • α=0.01389 • l1_ratio=0.60
Fold 02 • MAE=1.623 • RMSE=2.019 • R²=0.045 • α=0.03162 • l1_ratio=0.30
Fold 03 • MAE=1.626 • RMSE=2.029 • R²=0.052 • α=0.01389 • l1_ratio=0.60
Fold 04 • MAE=1.645 • RMSE=2.049 • R²=0.042 • α=0.01389 • l1_ratio=0.70
Fold 05 • MAE=1.594 • RMSE=1.988 • R²=0.045 • α=0.01389 • l1_ratio=0.60
Fold 06 • MAE=1.566 • RMSE=1.953 • R²=0.065 • α=0.01389 • l1_ratio=0.60
Fold 07 • MAE=1.600 • RMSE=2.001 • R²=0.047 • α=0.01389 • l1_ratio=0.70
Fold 08 • MAE=1.647 • RMSE=2.038 • R²=0.036 • α=0.03162 • l1_ratio=0.30
Fold 09 • MAE=1.597 • RMSE=1.991 • R²=0.046 • α=0.01389 • l1_ratio=0.60
Fold 10 • MAE=1.620 • RMSE=2.026 • R²=0.042 • α=0.01389 • l1_ratio=0.50

=== 10-fold CV summary ===
Mean MAE :  1.612  ± 0.024
Mean RMSE:  2.010 ± 0.027
Mean R²  :  0.046  ± 0.007
