# ML testing: experiment #3b-requested harm (NeuroComBat)

This notebook involves testing for the clinical harmonization paper. This notebook shows mixed_dataset (TOP + StrokeMRI) based models with NeuroComBat harmonization to Insight 46 and SABRE datasets (the StrokeMRI and TOP become one dataset)

Harmonisation: NeuroComBat Combat

Training data: NORMENT whichh is StrokeMRI and TOP togehter

Testing data: test set from NORMENT

Futher data applied to: SABRE, Insight46, EDIS, (HELIUS pending)

Validation method: K-fold, double-stratified

Brain-age algorithms: LR, lasso, extra trees, elasticCV net fully tested (but not optimized parameters), additionals partly

Outputs: SubjectID, real age, predicted age of validation and testing sets

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [11]:
filepath_mri_for_ids = '../open_work/internal_results/cleaned_pvc2s' 
filename_mri_for_ids = os.path.join(filepath_mri_for_ids,'StrokeMRI_pvc2c.csv') 

filepath_top_for_ids = '../open_work/internal_results/cleaned_pvc2s/' 
filename_top_for_ids = os.path.join(filepath_top_for_ids,'TOP_pvc2c.csv') 

filepath_topmri = '../open_work/internal_results/harmonized_pvc2s/requested_harm/' 
filename_topmri = os.path.join(filepath_topmri,'Rneuro_harm3way_topmri.csv') 
filepath_sabre = '../open_work/internal_results/harmonized_pvc2s/requested_harm/' 
filename_sabre = os.path.join(filepath_topmri,'Rneuro_harm3way_sabre.csv') 
filepath_insight46 = '../open_work/internal_results/harmonized_pvc2s/requested_harm/' 
filename_insight46 =  os.path.join(filepath_topmri,'Rneuro_harm3way_insight.csv') 
# read in data
# TOP = pd.read_csv(filename_top)
# StrokeMRI = pd.read_csv(filename_mri)
TOPMRI = pd.read_csv(filename_topmri)
SABRE = pd.read_csv(filename_sabre)
Insight46 = pd.read_csv(filename_insight46)
# take extra column off
TOPMRI = TOPMRI.drop(TOPMRI.columns[0],axis=1)
SABRE = SABRE.drop(SABRE.columns[0],axis=1)
#StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
Insight46 = Insight46.drop(Insight46.columns[0],axis=1)
IDS_TOP =  pd.read_csv(filename_top_for_ids)
IDS_MRI =  pd.read_csv(filename_mri_for_ids)

In [12]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOPMRI = TOPMRI.assign(sex = TOPMRI.sex.map(sex_mapping))
TOPMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.689489,0.508614,0.277173,0.464584,0.464584,-2.815951,20.392622,21.711238,3.864851,3.899129,5.62034,4.367357,17.903946,75.351056,63.920141,49.334352,57.364434
1,sub-0002_1_ses-1_run-1,38.3,0,0.696103,0.60549,0.220125,0.453424,0.453424,-1.921805,20.37845,23.111355,2.908789,4.986781,2.169642,3.26527,19.353158,82.275813,73.601804,61.074299,68.184063
2,sub-0019_1_ses-1_run-1,32.3,1,0.684261,0.513501,0.304087,0.453051,0.453051,-3.306224,7.729002,15.244295,2.969291,1.960339,3.873684,3.389996,22.329981,88.908492,81.812966,59.787357,70.534912


In [13]:
Insight46 = Insight46.assign(sex = Insight46.sex.map(sex_mapping))
#Insight46.head(3)

In [14]:
coly = TOPMRI.columns
SABRE = SABRE.assign(sex = SABRE.sex.map(sex_mapping))
SABRE = SABRE[coly]
SABRE.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-100096_1_ses-1_run-1,78,1,0.610012,0.482679,0.468053,0.387788,0.387788,9.071137,23.835812,6.123046,2.646049,2.824262,3.364733,2.815808,24.110961,84.6646,75.440922,50.074196,63.48628
1,sub-100331_1_ses-1_run-1,71,1,0.587432,0.460086,0.521217,0.370808,0.370808,5.741298,25.789411,8.420838,3.567374,3.018827,3.007536,2.901729,17.390889,44.479919,37.446087,31.890621,35.017552
2,sub-102285_1_ses-1_run-1,72,1,0.61711,0.524304,0.40898,0.397435,0.397435,6.510194,27.461341,11.441511,3.52137,3.312764,3.073432,3.13133,22.45342,61.511213,50.596122,40.008961,45.73423


In [7]:
SABRE =SABRE.dropna()
#SABRE.isna().sum()

# Now we need to break up the TOP and StrokeMRI datasets as well, and format them like the others

In [15]:
set_top_ids = set(IDS_TOP.participant_id)
set_mri_ids = set(IDS_MRI.participant_id)
StrokeMRI = TOPMRI[TOPMRI['participant_id'].isin(list(set_mri_ids))]
TOP = TOPMRI[TOPMRI['participant_id'].isin(list(set_top_ids))]
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.689489,0.508614,0.277173,0.464584,0.464584,-2.815951,20.392622,21.711238,3.864851,3.899129,5.62034,4.367357,17.903946,75.351056,63.920141,49.334352,57.364434
1,sub-0002_1_ses-1_run-1,38.3,0,0.696103,0.60549,0.220125,0.453424,0.453424,-1.921805,20.37845,23.111355,2.908789,4.986781,2.169642,3.26527,19.353158,82.275813,73.601804,61.074299,68.184063
2,sub-0019_1_ses-1_run-1,32.3,1,0.684261,0.513501,0.304087,0.453051,0.453051,-3.306224,7.729002,15.244295,2.969291,1.960339,3.873684,3.389996,22.329981,88.908492,81.812966,59.787357,70.534912


In [9]:
StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
527,sub-59082_1_ses-1_run-1,43.172603,,0.619625,0.475143,0.282154,0.447639,0.447639,-2.768004,21.237884,21.007739,2.69443,3.116515,2.494894,3.00189,22.072651,99.092903,85.901007,71.422761,80.740415
528,sub-59083_1_ses-1_run-1,66.367123,,0.577321,0.466753,0.394898,0.402171,0.402171,11.695904,20.710557,14.234537,3.145519,1.8785,5.508759,2.75405,20.647118,70.595906,60.898762,63.114804,63.590018
529,sub-59085_1_ses-1_run-1,55.838356,,0.58934,0.51888,0.302086,0.415564,0.415564,2.449953,27.648636,14.348818,2.771326,3.100342,3.604489,3.081356,22.857563,75.977349,68.547695,57.958144,60.921869


In [10]:
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,,0.689489,0.508614,0.277173,0.464584,0.464584,-2.815951,20.392622,21.711238,3.864851,3.899129,5.62034,4.367357,17.903946,75.351056,63.920141,49.334352,57.364434
1,sub-0002_1_ses-1_run-1,38.3,,0.696103,0.60549,0.220125,0.453424,0.453424,-1.921805,20.37845,23.111355,2.908789,4.986781,2.169642,3.26527,19.353158,82.275813,73.601804,61.074299,68.184063
2,sub-0019_1_ses-1_run-1,32.3,,0.684261,0.513501,0.304087,0.453051,0.453051,-3.306224,7.729002,15.244295,2.969291,1.960339,3.873684,3.389996,22.329981,88.908492,81.812966,59.787357,70.534912


In [16]:
# check for any duplicated patients between stroke and mri
strokers = set(StrokeMRI.participant_id)
topers = set(TOP.participant_id)
z = strokers.intersection(topers)
print(z) 

set()


In [85]:
# make mixed StrokeMRI and TOP dataset
#mixed_data = pd.concat([TOP, StrokeMRI], sort=False)
mixed_data = TOPMRI

In [181]:
output_folder = '3_NeuroComBat'

os.makedirs(output_folder, exist_ok=True)

## Build ML models

In [182]:
ml_matrix = mixed_data #.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
y = ml_matrix['age'].values
y=y.astype('float')

In [183]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'req_neurocomb_mix_linr', LinearRegression(), ml_matrix, X[:,1:], y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [184]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,req_neurocomb_mix_linr.0,3.062994,0.94867,0.948688
0,linear regression-1,1,req_neurocomb_mix_linr.1,3.034374,0.727204,0.727226
0,linear regression-2,2,req_neurocomb_mix_linr.2,3.561189,0.673634,0.673716
0,linear regression-3,3,req_neurocomb_mix_linr.3,2.99355,0.949535,0.949639
0,linear regression-4,4,req_neurocomb_mix_linr.4,2.887502,0.95486,0.957222


In [185]:
linr_k_frame.to_csv(output_folder + '/linr_k_frame_neuro_comb.csv')

In [186]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 req_neurocomb_mix_linr.0 0 req_neuroco...,3.107922,0.850781,0.851298


In [187]:
linr_y_frame.to_csv(output_folder + '/linr_y_frame_neuro_comb.csv')
linr_y_frame

Unnamed: 0,y_test,y_pred
0,55.850000,53.581509
1,39.990000,41.857185
2,68.106849,68.989462
3,40.650000,41.979171
4,20.505464,27.811911
...,...,...
256,85.816139,74.825562
257,56.357923,55.104675
258,27.498630,27.918071
259,19.110000,26.206772


In [188]:
linr_y_frame.to_csv(output_folder + '/linr_y_frame_neuro_comb.csv')

In [189]:
linr = models[0]
linr[0]

In [190]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [191]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'comb_harm_mix_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'comb_harm_mix_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'comb_harm_mix_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'comb_harm_mix_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'comb_harm_mix_linr4.sav'))

In [192]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'comb_harm_mix_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X[:,1:], y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,comb_harm_mix_llreg.0,3.070981,0.94851,0.948517
0,lasso regression-1,1,comb_harm_mix_llreg.1,3.108576,0.724376,0.724405
0,lasso regression-2,2,comb_harm_mix_llreg.2,3.535065,0.667551,0.667662
0,lasso regression-3,3,comb_harm_mix_llreg.3,3.006077,0.948788,0.948917
0,lasso regression-4,4,comb_harm_mix_llreg.4,2.920018,0.953782,0.956098


In [193]:
llreg_k_frame.to_csv(output_folder + '/llreg_k_frame_neuro_comb.csv')

In [194]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 comb_harm_mix_llreg.0 0 comb_harm_mix_...,3.128143,0.848601,0.84912


In [195]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,55.850000,53.872435
1,39.990000,41.844090
2,68.106849,68.995274
3,40.650000,41.795470
4,20.505464,27.853244
...,...,...
256,85.816139,74.698500
257,56.357923,54.901020
258,27.498630,27.997394
259,19.110000,26.186224


In [196]:
llreg_y_frame.to_csv(output_folder + '/llreg_y_frame_neuro_comb.csv')

In [197]:
llreg = models[0]
llreg[0]

In [198]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'comb_harm_mix_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'comb_harm_mix_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'comb_harm_mix_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'comb_harm_mix_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'comb_harm_mix_linr4.sav'))

In [199]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'comb_harm_mix_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X[:,1:], y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,comb_harm_mix_dtree.0,4.257149,0.894266,0.894422
0,decision tree-1,1,comb_harm_mix_dtree.1,3.691361,0.915465,0.915753
0,decision tree-2,2,comb_harm_mix_dtree.2,3.856316,0.912808,0.912865
0,decision tree-3,3,comb_harm_mix_dtree.3,4.006433,0.902817,0.906169
0,decision tree-4,4,comb_harm_mix_dtree.4,3.93275,0.915342,0.917007


In [200]:
dtree_k_frame.to_csv(output_folder + '/dtree_k_frame_neuro_comb.csv')

In [201]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 comb_harm_mix_dtree.0 0 comb_harm_mix_...,3.948802,0.90814,0.909243


In [202]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,55.850000,47.410000
1,39.990000,46.500000
2,68.106849,68.435616
3,40.650000,38.940000
4,20.505464,26.270000
...,...,...
256,85.816139,84.624358
257,56.357923,58.164384
258,27.498630,26.740000
259,19.110000,30.130000


In [203]:
dtree_y_frame.to_csv(output_folder + '/dtree_y_frame_neuro_comb.csv')

In [204]:
dtree = models[0]
dtree[0]

In [205]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'comb_harm_mix_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X[:,1:], y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,comb_harm_mix_regr.0,3.311913,0.943876,0.94587
0,MLP regression-1,1,comb_harm_mix_regr.1,4.333628,-0.956266,-0.948575
0,MLP regression-2,2,comb_harm_mix_regr.2,5.300812,-1.634511,-1.611688
0,MLP regression-3,3,comb_harm_mix_regr.3,3.060846,0.945278,0.945792
0,MLP regression-4,4,comb_harm_mix_regr.4,3.13828,0.947208,0.949127


In [206]:
regr_k_frame.to_csv(output_folder + '/regr_k_frame_neuro_comb.csv')

In [207]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 comb_harm_mix_regr.0 0 comb_harm_mix_r...,3.829096,0.049117,0.056105


In [208]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,55.850000,54.944923
1,39.990000,39.224768
2,68.106849,70.888838
3,40.650000,39.745709
4,20.505464,26.551826
...,...,...
256,85.816139,77.037860
257,56.357923,49.593454
258,27.498630,27.867804
259,19.110000,26.385696


In [209]:
regr_y_frame.to_csv(output_folder + '/regr_y_frame_neuro_comb.csv')

In [210]:
regr = models[0]
regr[0]

In [211]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'combharm_mix_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X[:,1:], y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,combharm_mix_svrp2.0,10.659703,0.452423,0.458868
0,support vector reg poly2-1,1,combharm_mix_svrp2.1,11.759952,-1.747855,-1.71156
0,support vector reg poly2-2,2,combharm_mix_svrp2.2,11.458133,-1.482858,-1.452537
0,support vector reg poly2-3,3,combharm_mix_svrp2.3,9.864051,0.463812,0.467605
0,support vector reg poly2-4,4,combharm_mix_svrp2.4,10.648682,0.42251,0.432394


In [212]:
svrp2_k_frame.to_csv(output_folder + '/svrp2_k_frame_neuro_comb.csv')

In [213]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 combharm_mix_svrp2.0 0 combharm_mix_sv...,10.878104,-0.378393,-0.361046


In [214]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,55.850000,50.337346
1,39.990000,42.932059
2,68.106849,50.094154
3,40.650000,44.792323
4,20.505464,43.550864
...,...,...
256,85.816139,60.847038
257,56.357923,48.762584
258,27.498630,41.476076
259,19.110000,35.085332


In [215]:
svrp2_y_frame.to_csv(output_folder + '/svrp2_y_frame_neuro_comb.csv')

In [216]:
svrp2 = models[0]
svrp2[0]

In [217]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'combharm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X[:,1:], y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,combharm_mix_eregr.0,3.489464,0.934296,0.934297
0,elasticnetCV-1,1,combharm_mix_eregr.1,3.879499,0.311146,0.311751
0,elasticnetCV-2,2,combharm_mix_eregr.2,4.167636,0.314983,0.315954
0,elasticnetCV-3,3,combharm_mix_eregr.3,3.315185,0.931571,0.931772
0,elasticnetCV-4,4,combharm_mix_eregr.4,3.425403,0.934553,0.937067


In [218]:
eregr_k_frame.to_csv(output_folder + '/eregr_k_frame_neuro_comb.csv')

In [219]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 combharm_mix_eregr.0 0 combharm_mix_er...,3.655437,0.68531,0.686168


In [220]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,55.850000,54.973015
1,39.990000,40.969158
2,68.106849,67.624950
3,40.650000,40.870158
4,20.505464,28.213682
...,...,...
256,85.816139,74.380789
257,56.357923,53.466100
258,27.498630,28.845698
259,19.110000,25.524879


In [221]:
eregr_y_frame.to_csv(output_folder + '/eregr_y_frame_neuro_comb.csv')

In [222]:
eregr = models[0]
eregr[0]

In [223]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'comb_harm_mix_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X[:,1:], y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,comb_harm_mix_etreg.0,3.123319,0.946758,0.946777
0,extra trees-1,1,comb_harm_mix_etreg.1,2.811637,0.955372,0.955653
0,extra trees-2,2,comb_harm_mix_etreg.2,3.095524,0.945754,0.945959
0,extra trees-3,3,comb_harm_mix_etreg.3,3.082519,0.947937,0.948935
0,extra trees-4,4,comb_harm_mix_etreg.4,3.015037,0.952518,0.953918


In [224]:
etreg_k_frame.to_csv(output_folder + '/etreg_k_frame_neuro_comb.csv')

In [225]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 comb_harm_mix_etreg.0 0 comb_harm_mix_...,3.025607,0.949668,0.950248


In [226]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,55.850000,50.724203
1,39.990000,42.053954
2,68.106849,69.836970
3,40.650000,42.163190
4,20.505464,27.993642
...,...,...
256,85.816139,80.191360
257,56.357923,52.411488
258,27.498630,27.030192
259,19.110000,26.709585


In [227]:
etreg_y_frame.to_csv(output_folder + '/etreg_y_frame_neuro_comb.csv')

In [228]:
etreg = models[0]
etreg[0]

In [65]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'comb_harm_mix_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'comb_harm_mix_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'comb_harm_mix_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'comb_harm_mix_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'comb_harm_mix_etreg4.sav'))

Show results ON AVERAGE for each model

In [66]:
mixed_based_neuro_harmonized_on_testmix =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
mixed_based_neuro_harmonized_on_testmix

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 req_neurocomb_mix_linr.0 0 req_neuroco...,5.276376,0.229775,0.231148
0,0 lasso regression-0 0 lasso regression-...,0 comb_harm_mix_llreg.0 0 comb_harm_mix_...,5.299704,0.202292,0.203733
0,0 decision tree-0 0 decision tree-1 0 ...,0 comb_harm_mix_dtree.0 0 comb_harm_mix_...,6.825966,0.702965,0.70562
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 comb_harm_mix_regr.0 0 comb_harm_mix_r...,6.009869,-1.117649,-1.114435
0,0 support vector reg poly2-0 0 support v...,0 combharm_mix_svrp2.0 0 combharm_mix_sv...,10.869624,-0.336582,-0.319797
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 combharm_mix_eregr.0 0 combharm_mix_er...,8.527797,-0.644499,-0.641867
0,0 extra trees-0 0 extra trees-1 0 ext...,0 comb_harm_mix_etreg.0 0 comb_harm_mix_...,4.723614,0.862779,0.863657


In [67]:
mixed_based_neuro_harmonized_on_testmix.to_csv(output_folder + '/mixed_based_neurocomb_harmonized_on_testmix.csv')

# Running mixed model over SABRE dataset

## Here we will do an example of running  models made of the entire StrokeMRI and TOP dataset mixed as one

#### Build new models

In [68]:
TOPMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,...,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,binned,fuse_bin
0,sub-0001_1_ses-1_run-1,43.49,1,0.689489,0.508614,0.277173,0.464584,0.464584,-2.815951,20.392622,...,3.899129,5.62034,4.367357,17.903946,75.351056,63.920141,49.334352,57.364434,1,5
1,sub-0002_1_ses-1_run-1,38.3,0,0.696103,0.60549,0.220125,0.453424,0.453424,-1.921805,20.37845,...,4.986781,2.169642,3.26527,19.353158,82.275813,73.601804,61.074299,68.184063,1,1
2,sub-0019_1_ses-1_run-1,32.3,1,0.684261,0.513501,0.304087,0.453051,0.453051,-3.306224,7.729002,...,1.960339,3.873684,3.389996,22.329981,88.908492,81.812966,59.787357,70.534912,0,4


In [112]:
ml_matrix = TOPMRI.drop('participant_id', axis=1)
ml_matrix = ml_matrix.drop('binned', axis =1) # these were still in the training dataset
ml_matrix = ml_matrix.drop('fuse_bin', axis =1) # these were still in the training dataset
X = ml_matrix.drop('age', axis =1)

X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')

In [114]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [115]:
MIXlinr = LinearRegression()
MIXlinr.fit(X_train, y_train)

In [116]:
MIXllreg = linear_model.LassoLars(alpha=0.01)
MIXllreg.fit(X_train, y_train)

In [117]:
MIXeregr = ElasticNetCV(cv=5, random_state=17)
MIXeregr.fit(X_train, y_train)


In [118]:
MIXetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
MIXetreg.fit(X_train, y_train)

In [119]:
##  Save these four best models

In [120]:
## optional save models
#joblib.dump(MIXlinr, ('../result_models/'+  'neurocomb_harm_mix_MIXlinr.sav'))
#joblib.dump(MIXllreg, ('../result_models/'+ 'neurocomb_harmm_mix_MIXllreg.sav'))
#joblib.dump(MIXeregr, ('../result_models/'+ 'neurocomb_harm_mix_MIXeregr.sav'))
#joblib.dump(MIXetreg, ('../result_models/'+ 'neurocomb_harm_mix_MIXetreg.sav'))

In [121]:
sabre_ml_matrix = SABRE.drop('participant_id', axis=1)
X_sabre = sabre_ml_matrix.drop('age', axis =1)
X_sabre = X_sabre.values
X_sabre = X_sabre.astype('float')
y_sabre = sabre_ml_matrix['age'].values
y_sabre=y_sabre.astype('float')

In [122]:
X_sabre_test = X_sabre
y_sabre_test = y_sabre

In [123]:
y_sabre_pred = MIXlinr.predict(X_sabre_test)

In [124]:
data= [[
    'Linear Reg',
    'combharm_mix_linr0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXlinr.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
linr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,combharm_mix_linr0.sav,8.867819,-2.019415,-1.009992


In [125]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_linr_sabre = pd.DataFrame(data)
y_frame_linr_sabre

Unnamed: 0,real,predicted
0,78.0,61.460211
1,71.0,74.198341
2,72.0,66.540866
3,78.0,76.390665
4,75.0,55.630847
...,...,...
689,72.0,71.180162
690,73.0,54.219680
691,71.0,61.581142
692,72.0,68.056745


In [126]:
y_frame_linr_sabre.to_csv(output_folder + '/y_frame_linr_sabre_neurocomb_harm.csv')

In [127]:
y_sabre_pred = MIXllreg.predict(X_sabre_test)

In [128]:
data= [[
    'Lasso',
    'comb_harm_mix_lassor0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXllreg.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
llreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Lasso,comb_harm_mix_lassor0.sav,8.985311,-2.102653,-1.050823


In [129]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_llreg_sabre = pd.DataFrame(data)
y_frame_llreg_sabre

Unnamed: 0,real,predicted
0,78.0,60.942565
1,71.0,73.642822
2,72.0,66.698701
3,78.0,75.703910
4,75.0,55.210859
...,...,...
689,72.0,71.112621
690,73.0,54.331853
691,71.0,61.460702
692,72.0,68.400222


In [130]:
y_frame_llreg_sabre.to_csv(output_folder + '/y_frame_llreg_sabre_neurocomb_harm.csv')

In [131]:
y_sabre_pred = MIXeregr.predict(X_sabre_test)

In [132]:
data= [[
    'ElasticnetCV',
    'harm_mix_elasticregr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXeregr.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
eregr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,ElasticnetCV,harm_mix_elasticregr.sav,15.446176,-6.047914,-1.104048


In [133]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_eregr_sabre = pd.DataFrame(data)
y_frame_eregr_sabre

Unnamed: 0,real,predicted
0,78.0,48.721277
1,71.0,60.933254
2,72.0,59.996144
3,78.0,57.247832
4,75.0,54.188824
...,...,...
689,72.0,64.159605
690,73.0,48.839050
691,71.0,59.332221
692,72.0,57.182051


In [134]:
y_frame_eregr_sabre.to_csv(output_folder + '/y_frame_eregr_sabre_neurocomb_harm.csv')

In [135]:
y_sabre_pred = MIXetreg.predict(X_sabre_test)

In [136]:
data= [[
    'Extra trees',
    'combharm_mix_etreg.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXetreg.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
etreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Extra trees,combharm_mix_etreg.sav,5.940721,-0.318866,0.085635


In [137]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_etregr_sabre = pd.DataFrame(data)
y_frame_etregr_sabre

Unnamed: 0,real,predicted
0,78.0,68.188265
1,71.0,70.456534
2,72.0,67.548164
3,78.0,72.998984
4,75.0,64.096686
...,...,...
689,72.0,68.631251
690,73.0,58.812657
691,71.0,69.427784
692,72.0,66.525442


In [138]:
y_frame_etregr_sabre.to_csv(output_folder + '/y_frame_etregr_sabre_neurocomb_harm.csv')

In [230]:
mix_based_neurocombat_on_sabre =pd.concat([linr_results_sabre,
                   llreg_results_sabre,
                   eregr_results_sabre,
                  etreg_results_sabre],
                  axis=0)
mix_based_neurocombat_on_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,combharm_mix_linr0.sav,8.867819,-2.019415,-1.009992
0,Lasso,comb_harm_mix_lassor0.sav,8.985311,-2.102653,-1.050823
0,ElasticnetCV,harm_mix_elasticregr.sav,15.446176,-6.047914,-1.104048
0,Extra trees,combharm_mix_etreg.sav,5.940721,-0.318866,0.085635


In [231]:
mix_based_neurocombat_on_sabre.to_csv(output_folder + '/mix_based_neurocombat_harmonized_on_sabre.csv')

# Running mixed model over Insight46 dataset

## Here we will do an example of running allthe [0] models

In [141]:
insight_ml_matrix = Insight46.drop('participant_id', axis=1)
X_insight = insight_ml_matrix.drop('age', axis =1)
X_insight = X_insight.values
X_insight = X_insight.astype('float')
y_insight = insight_ml_matrix['age'].values
y_insight= y_insight.astype('float')

In [142]:
X_insight_test = X_insight
y_insight_test = y_insight

In [143]:
y_insight_pred = MIXlinr.predict(X_insight_test)

In [144]:
data= [[
    'Linear Reg',
    'neurocomb_harm_mix_linr0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    MIXlinr.score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
linr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,neurocomb_harm_mix_linr0.sav,7.771475,-206.606995,-106.87859


In [145]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_linr_insight = pd.DataFrame(data)
y_frame_linr_insight

Unnamed: 0,real,predicted
0,69.733333,63.731145
1,70.288889,58.475271
2,69.883333,58.959491
3,69.866667,73.053213
4,70.661111,61.385174
...,...,...
277,71.705556,72.588862
278,70.822222,61.947158
279,71.341667,54.562406
280,70.741667,66.714081


In [146]:
y_frame_linr_insight.to_csv(output_folder + '/y_frame_linr_insight_neurocomb_harm.csv')

In [147]:
y_insight_pred = MIXllreg.predict(X_insight_test)

In [148]:
data= [[
    'Lasso',
    'neuroharm_mix_lassor0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    MIXllreg.score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
llreg_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Lasso,neuroharm_mix_lassor0.sav,7.86115,-208.74498,-104.954627


In [149]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_llreg_insight = pd.DataFrame(data)
y_frame_llreg_insight

Unnamed: 0,real,predicted
0,69.733333,63.689629
1,70.288889,57.883509
2,69.883333,58.919855
3,69.866667,72.374668
4,70.661111,61.938655
...,...,...
277,71.705556,72.747622
278,70.822222,61.316005
279,71.341667,54.410687
280,70.741667,66.542079


In [150]:
y_frame_llreg_insight.to_csv(output_folder + '/y_frame_llreg_insight_neurocomb_harm.csv')

In [151]:
y_insight_pred = MIXeregr.predict(X_insight_test)

In [152]:
data= [[
    'ElasticnetCV',
    'neurocomb_mix_elasticregr.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    MIXeregr.score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
eregr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,ElasticnetCV,neurocomb_mix_elasticregr.sav,15.672773,-680.37295,-186.938435


In [153]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_eregr_insight = pd.DataFrame(data)
y_frame_eregr_insight

Unnamed: 0,real,predicted
0,69.733333,50.583988
1,70.288889,46.104633
2,69.883333,54.299817
3,69.866667,53.608621
4,70.661111,65.031955
...,...,...
277,71.705556,62.230016
278,70.822222,53.881738
279,71.341667,63.038697
280,70.741667,68.733552


In [154]:
y_frame_eregr_insight.to_csv(output_folder + '/y_frame_eregr_insight_neuro_harm.csv')

In [155]:
y_insight_pred = MIXetreg.predict(X_insight_test)

In [156]:
data= [[
    'Extra trees',
    'neurocomb_mix_etreg.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    MIXetreg.score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
etreg_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Extra trees,neurocomb_mix_etreg.sav,3.707949,-61.746563,-46.788014


In [157]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_etreg_insight = pd.DataFrame(data)
y_frame_etreg_insight

Unnamed: 0,real,predicted
0,69.733333,67.689630
1,70.288889,69.733410
2,69.883333,64.842466
3,69.866667,73.071327
4,70.661111,66.951275
...,...,...
277,71.705556,72.154170
278,70.822222,66.763980
279,71.341667,73.708623
280,70.741667,78.084435


In [161]:
y_frame_etreg_insight.to_csv(output_folder + '/y_frame_etreg_insight_neuro_comb.csv')

In [159]:
mix_based_neurocombat_on_insight =pd.concat([linr_results_insight,
                   llreg_results_insight,
                   eregr_results_insight,
                   etreg_results_insight],
                  axis=0)
mix_based_neurocombat_on_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,neurocomb_harm_mix_linr0.sav,7.771475,-206.606995,-106.87859
0,Lasso,neuroharm_mix_lassor0.sav,7.86115,-208.74498,-104.954627
0,ElasticnetCV,neurocomb_mix_elasticregr.sav,15.672773,-680.37295,-186.938435
0,Extra trees,neurocomb_mix_etreg.sav,3.707949,-61.746563,-46.788014


In [160]:
mix_based_neurocombat_on_insight.to_csv(output_folder + '/mix_based_neurocombat_on_insight.csv')