# ML testing: experiment #2dc- harmonized with auto combat (TOPMRI on HELIUS)

This notebook involves testing of the autoombat algorithm and ML for age prediction. This notebook shows harmonized StrokeMRI+TOP based models, and how they perform on HELIUSUS.  The autocombat algorithm requires that samples are not unique even in continous variables. Therefore before harmonization, a variable for age group was created, and then removed after harmonization. Age group can currently be split by tens or by twos, a more fine grained split. A widget allows the choice. 


Data: StrokeMRI, TOP, HELIUS

Harmonisation: Auto-combat

Training data: StrokeMRI and TOP together

Testing data: StrokeMRI and TOP together, test subsets

Futher data applied to: HELIUS

Validation method: K-fold double-stratified

Brain-age algorithms: LR, lasso, extra trees, elasticCV net fully tested (but not optimized parameters), additionals partly

Outputs: ? what do we want to have here?

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
# pick how you want the data split by age for autocombat 
possibilities = ['split_into_tens','split_into_twos']

split = widgets.Dropdown(options= possibilities,
                    value='split_into_tens',
                    description='split on age',
                    disabled=False)
split

Dropdown(description='split on age', options=('split_into_tens', 'split_into_twos'), value='split_into_tens')

In [3]:
filepath = 'harmonizations/harm_results/autocombat/'  

if split.value == 'split_into_tens':
    filename_topmri = os.path.join(filepath, 'autocom_harm_topmri_v_h.csv')
    filename_helius = os.path.join(filepath,'autocom_harm_HELIUS.csv') 
else: 
    filename_topmri = os.path.join(filepath, 'fg_autocom_harm_topmri_v_h.csv')
    filename_helius = os.path.join(filepath,'fg_autocom_harm_HELIUS1.csv') 

In [4]:
TOPMRI = pd.read_csv(filename_topmri)
HELIUS= pd.read_csv(filename_helius)

In [5]:
TOPMRI = TOPMRI.drop(TOPMRI.columns[0],axis=1)
HELIUS = HELIUS.drop(HELIUS.columns[0],axis=1)
HELIUS.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
0,sub-207605_1,39.0,1,0.699977,0.552782,0.285122,0.455446,0.817184,0.011864,3.061378,-1.216813,-1.157043,-1.071701,-1.054588,80.436367,74.83755,57.247185,63.82286
1,sub-142310_1,41.0,0,0.631171,0.499824,0.339141,0.425034,0.76882,0.00951,3.011773,-1.473246,-1.372576,-1.517795,-1.378591,91.149244,81.495367,66.907085,73.555564
2,sub-163538_1,41.0,0,0.686535,0.612683,0.36587,0.40859,0.780585,0.012602,3.801654,-1.502907,-1.33849,-1.315878,-1.267576,94.76479,83.65981,63.951053,73.359603


In [6]:
TOPMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
0,sub-0386_1,17.72,0,0.72149,0.508297,0.261765,0.481656,0.822599,-0.000853,3.058211,-1.609578,-1.378041,-1.289478,-1.315064,91.800203,88.406787,67.132154,76.671131
1,sub-0337_1,18.15,1,0.684754,0.428084,0.171019,0.527823,0.862277,-0.000692,3.010848,-1.478056,-1.381734,-1.450733,-1.351722,96.34232,89.814694,67.101798,81.364618
2,sub-0239_1,18.4,0,0.67202,0.452412,0.23001,0.493561,0.827667,0.036596,5.157816,-1.461353,-1.257479,-1.55836,-1.245212,109.543958,105.324892,74.527566,90.329177


In [7]:
output_folder = '2dd_loged_outputs'
os.makedirs(output_folder, exist_ok=True)

## Build ML models based on auto combat StrokeMRI-TOP mixed set (TOPMRI)

In [8]:
ml_matrix = TOPMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')

In [9]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'auto_harm_topmri_linr', LinearRegression(), ml_matrix, X, y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [10]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,auto_harm_topmri_linr.0,6.019386,0.735471,0.736588
0,linear regression-1,1,auto_harm_topmri_linr.1,6.417635,0.763341,0.764641
0,linear regression-2,2,auto_harm_topmri_linr.2,5.992828,0.784781,0.785247
0,linear regression-3,3,auto_harm_topmri_linr.3,5.739361,0.835429,0.835833
0,linear regression-4,4,auto_harm_topmri_linr.4,5.90616,0.802398,0.802415


In [11]:
linr_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_linr_k_frame.csv')

In [12]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 auto_harm_topmri_linr.0 0 auto_harm_to...,6.015074,0.784284,0.784945


In [13]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,31.940000,33.910106
1,67.178082,65.341509
2,74.448087,56.836596
3,53.189041,45.242783
4,46.750000,42.440208
...,...,...
217,18.150000,16.539463
218,70.715068,62.934239
219,66.208219,52.927250
220,23.930000,29.372465


In [14]:
linr_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_linr_y_frame.csv')

In [15]:
linr = models[0]
linr[0]

In [16]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [17]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'auto_harm_topmri_HELIUSUS_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'auto_harm_topmri_HELIUS_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'auto_harm_topmri_HELIUSUS_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'auto_harm_topmri_HELIUS_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'auto_harm_topmri_HELIUSUS_linr4.sav'))

In [18]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'auto_harm_topmri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,auto_harm_topmri_llreg.0,6.051026,0.726887,0.727915
0,lasso regression-1,1,auto_harm_topmri_llreg.1,6.46491,0.759762,0.76175
0,lasso regression-2,2,auto_harm_topmri_llreg.2,5.996762,0.784011,0.784487
0,lasso regression-3,3,auto_harm_topmri_llreg.3,5.845306,0.827409,0.827816
0,lasso regression-4,4,auto_harm_topmri_llreg.4,5.941891,0.800276,0.800283


In [19]:
llreg_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_llreg_k_frame.csv')

In [20]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 auto_harm_topmri_llreg.0 0 auto_harm_t...,6.059979,0.779669,0.78045


In [21]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,31.940000,33.131852
1,67.178082,65.089970
2,74.448087,56.476769
3,53.189041,45.311068
4,46.750000,42.466697
...,...,...
217,18.150000,20.218783
218,70.715068,64.378906
219,66.208219,53.036900
220,23.930000,28.187983


In [22]:
llreg_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_llreg_y_frame.csv')

In [23]:
llreg = models[0]
llreg[0]

In [24]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'auto_harm_topmri_HELIUSUS_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'auto_harm_topmri_HELIUS_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'auto_harm_topmri_HELIUS_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'auto_harm_topmri_HELIUS_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'auto_harm_topmri_HELIUS_llreg4.sav'))

In [25]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'auto_harm_topmri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,auto_harm_topmri_dtree.0,6.986556,0.69466,0.695553
0,decision tree-1,1,auto_harm_topmri_dtree.1,7.051548,0.694991,0.695101
0,decision tree-2,2,auto_harm_topmri_dtree.2,6.938686,0.681425,0.681679
0,decision tree-3,3,auto_harm_topmri_dtree.3,6.70042,0.745621,0.745946
0,decision tree-4,4,auto_harm_topmri_dtree.4,7.259061,0.677964,0.678038


In [26]:
dtree_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_dtree_k_frame.csv')

In [27]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 auto_harm_topmri_dtree.0 0 auto_harm_t...,6.987254,0.698932,0.699263


In [28]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,31.940000,44.350000
1,67.178082,51.660000
2,74.448087,66.446575
3,53.189041,40.650000
4,46.750000,56.632083
...,...,...
217,18.150000,26.340000
218,70.715068,72.205479
219,66.208219,65.523288
220,23.930000,37.160000


In [29]:
dtree_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_dtree_y_frame.csv')

In [30]:
dtree = models[0]
dtree[0]

In [31]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'auto_harm_topmri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,auto_harm_topmri_regr.0,7.598647,0.677573,0.683343
0,MLP regression-1,1,auto_harm_topmri_regr.1,8.574429,0.607757,0.610866
0,MLP regression-2,2,auto_harm_topmri_regr.2,8.534308,0.582714,0.605163
0,MLP regression-3,3,auto_harm_topmri_regr.3,8.259865,0.658293,0.662879
0,MLP regression-4,4,auto_harm_topmri_regr.4,8.356809,0.630129,0.630142


In [32]:
regr_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_regr_k_frame.csv')

In [33]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 auto_harm_topmri_regr.0 0 auto_harm_to...,8.264812,0.631293,0.638478


In [34]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,31.940000,38.408203
1,67.178082,72.825115
2,74.448087,58.133981
3,53.189041,46.289458
4,46.750000,39.190618
...,...,...
217,18.150000,24.644396
218,70.715068,76.734094
219,66.208219,45.989480
220,23.930000,32.530456


In [35]:
regr_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_regr_y_frame.csv')

In [36]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [37]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'auto_harm_topmri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,auto_harm_topmri_svrp2.0,14.672366,-4.366201,-4.325845
0,support vector reg poly2-1,1,auto_harm_topmri_svrp2.1,12.434815,0.17131,0.184064
0,support vector reg poly2-2,2,auto_harm_topmri_svrp2.2,12.269147,0.207944,0.212277
0,support vector reg poly2-3,3,auto_harm_topmri_svrp2.3,13.132871,0.151488,0.164905
0,support vector reg poly2-4,4,auto_harm_topmri_svrp2.4,12.254816,0.225654,0.2304


In [38]:
svrp2_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_svrp2_k_frame.csv')

In [39]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 auto_harm_topmri_svrp2.0 0 auto_harm_t...,12.952803,-0.721961,-0.70684


In [40]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,31.940000,49.919381
1,67.178082,56.334035
2,74.448087,51.899517
3,53.189041,47.208653
4,46.750000,47.020939
...,...,...
217,18.150000,40.029240
218,70.715068,54.865394
219,66.208219,43.954541
220,23.930000,47.300297


In [41]:
svrp2_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_svrp2_y_frame.csv')

In [42]:
svrp2 = models[0]
svrp2[0]

In [43]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'auto_harm_topmri_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,auto_harm_topmri_eregr.0,9.170685,0.128204,0.131838
0,elasticnetCV-1,1,auto_harm_topmri_eregr.1,9.8578,0.472961,0.477372
0,elasticnetCV-2,2,auto_harm_topmri_eregr.2,9.610719,0.479846,0.479862
0,elasticnetCV-3,3,auto_harm_topmri_eregr.3,9.398829,0.552105,0.552279
0,elasticnetCV-4,4,auto_harm_topmri_eregr.4,9.807457,0.485547,0.486537


In [44]:
eregr_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_eregr_k_frame.csv')

In [45]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 auto_harm_topmri_eregr.0 0 auto_harm_t...,9.569098,0.423733,0.425578


In [46]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,31.940000,41.234296
1,67.178082,70.207204
2,74.448087,59.999949
3,53.189041,47.189730
4,46.750000,38.441697
...,...,...
217,18.150000,28.151759
218,70.715068,66.440207
219,66.208219,44.929928
220,23.930000,36.183855


In [47]:
eregr_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_HELIUSUS_eregr_y_frame.csv')

In [48]:
eregr = models[0]
eregr[0]

In [49]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'auto_harm_topmri_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,auto_harm_topmri_etreg.0,4.984195,0.856222,0.856484
0,extra trees-1,1,auto_harm_topmri_etreg.1,5.355654,0.828255,0.828753
0,extra trees-2,2,auto_harm_topmri_etreg.2,4.915555,0.85594,0.856024
0,extra trees-3,3,auto_harm_topmri_etreg.3,4.947049,0.874092,0.874313
0,extra trees-4,4,auto_harm_topmri_etreg.4,4.975621,0.860187,0.860273


In [50]:
etreg_k_frame.to_csv(output_folder + '/auto_haromized_topmri_HELIUS_etreg_k_frame.csv')

In [51]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 auto_harm_topmri_etreg.0 0 auto_harm_t...,5.035615,0.854939,0.85517


In [52]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,31.940000,35.047023
1,67.178082,64.858246
2,74.448087,63.477783
3,53.189041,42.216363
4,46.750000,45.252602
...,...,...
217,18.150000,25.145748
218,70.715068,71.964763
219,66.208219,51.860179
220,23.930000,29.208436


In [53]:
etreg_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_HELIUS_etreg_y_frame.csv')

In [54]:
etreg = models[0]
etreg[0]

In [55]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'auto_harm_topmri_HELIUS_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'auto_harm_topmri_HELIUS_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'auto_harm_topmri_HELIUS_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'auto_harm_topmri_HELIUS_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'auto_harm_topmri_HELIUS_etreg4.sav'))

Show results ON AVERAGE for each model

In [56]:
topmri_based_auto_harmonized_on_testtopmri =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
topmri_based_auto_harmonized_on_testtopmri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 auto_harm_topmri_linr.0 0 auto_harm_to...,6.015074,0.784284,0.784945
0,0 lasso regression-0 0 lasso regression-...,0 auto_harm_topmri_llreg.0 0 auto_harm_t...,6.059979,0.779669,0.78045
0,0 decision tree-0 0 decision tree-1 0 ...,0 auto_harm_topmri_dtree.0 0 auto_harm_t...,6.987254,0.698932,0.699263
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 auto_harm_topmri_regr.0 0 auto_harm_to...,8.264812,0.631293,0.638478
0,0 support vector reg poly2-0 0 support v...,0 auto_harm_topmri_svrp2.0 0 auto_harm_t...,12.952803,-0.721961,-0.70684
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 auto_harm_topmri_eregr.0 0 auto_harm_t...,9.569098,0.423733,0.425578
0,0 extra trees-0 0 extra trees-1 0 ext...,0 auto_harm_topmri_etreg.0 0 auto_harm_t...,5.035615,0.854939,0.85517


In [57]:
topmri_based_auto_harmonized_on_testtopmri.to_csv(output_folder + '/topmri_based_auto_harmonized_on_testtopmri_AVERAGES.csv')

## Now we will build  models based on the whole  autocombat harmonized TOPMRI dataset, and apply them to HELIUS. 

In [58]:
ml_matrix = TOPMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [59]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [60]:
TOPMRIlinr = LinearRegression()
TOPMRIlinr.fit(X_train, y_train)

In [61]:
TOPMRIllreg = linear_model.LassoLars(alpha=0.01)
TOPMRIllreg.fit(X_train, y_train)

In [62]:
TOPMRIeregr = ElasticNetCV(cv=5, random_state=17)
TOPMRIeregr.fit(X_train, y_train)


In [63]:
TOPMRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPMRIetreg.fit(X_train, y_train)

In [64]:
##  Save these four best models

In [65]:

## optional save models
#joblib.dump(TOPMRIlinr, ('../result_models/' + 'auto_harm_topmri_HELIUS_linr.sav'))
#joblib.dump(TOPMRIllreg, ('../result_models/'+ 'auto_harm_topmri_HELIUSUS_llreg1.sav'))
#joblib.dump(TOPMRIeregr, ('../result_models/'+ 'auto_harm_topmri_HELIUS_eregr3.sav'))
#joblib.dump(TOPMRIetreg, ('../result_models/'+ 'auto_harm_topmri_HELIUS_etreg4.sav'))

# Running whole TOPMRI model over HELIUSUS dataset

In [66]:
HELIUS_ml_matrix = HELIUS.drop('participant_id', axis=1)
X_HELIUS = HELIUS_ml_matrix.drop('age', axis =1)
X_HELIUS = X_HELIUS.values
X_HELIUS = X_HELIUS.astype('float')
y_HELIUS= HELIUS_ml_matrix['age'].values
y_HELIUS=y_HELIUS.astype('float')

In [67]:
X_HELIUS_test = X_HELIUS
y_HELIUS_test = y_HELIUS


In [68]:
y_HELIUS_pred = TOPMRIlinr.predict(X_HELIUS_test)

In [69]:
data= [[
    'linear regression',
    'auto_harm_topmri_HELIUSUS_linr.sav',
    mean_absolute_error(y_HELIUS_test, y_HELIUS_pred),
    TOPMRIlinr.score(X_HELIUS_test,y_HELIUS_test),
    metrics.explained_variance_score(y_HELIUS_test, y_HELIUS_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [70]:
linr_y_test = y_HELIUS_test
linr_y_pred = y_HELIUS_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
#linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,39.0,43.641417
1,41.0,50.337025
2,41.0,53.551197
3,41.0,38.056219
4,41.0,48.572915
...,...,...
551,73.0,59.727742
552,73.0,75.644257
553,73.0,78.750854
554,74.0,72.542523


In [71]:
linr_compare.to_csv(output_folder + '/whole_auto_harm_topmri_HELIUSUS_linr_compare_on_top.csv')

In [72]:
y_HELIUS_pred = TOPMRIllreg.predict(X_HELIUS_test)

In [73]:
data= [[
    'lasso regression',
    'auto_harm_whole_topmri_HELIUSUS_llreg.sav',
    mean_absolute_error(y_HELIUS_test, y_HELIUS_pred),
    TOPMRIllreg.score(X_HELIUS_test,y_HELIUS_test),
    metrics.explained_variance_score(y_HELIUS_test, y_HELIUS_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,auto_harm_whole_topmri_HELIUSUS_llreg.sav,6.355752,-0.075617,0.195683


In [74]:
llreg_y_test = y_HELIUS_test
llreg_y_pred = y_HELIUS_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,39.0,42.942676
1,41.0,47.889521
2,41.0,51.935128
3,41.0,37.459963
4,41.0,46.256117
...,...,...
551,73.0,57.874821
552,73.0,71.459768
553,73.0,74.244903
554,74.0,67.862633


In [75]:
llreg_compare.to_csv(output_folder + '/whole_auto_harm_topmri_HELIUS_llreg_compare_on_top.csv')

In [76]:
y_HELIUS_pred = TOPMRIeregr.predict(X_HELIUS_test)

In [78]:
data= [[
    'elasticnetCV',
    'auto_harm_whole_topmri_HELIUSlinr.sav',
    mean_absolute_error(y_HELIUS_test, y_HELIUS_pred),
    TOPMRIeregr.score (X_HELIUS_test, y_HELIUS_test),
    metrics.explained_variance_score(y_HELIUS_test, y_HELIUS_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [79]:
eregr_y_test = y_HELIUS_test
eregr_y_pred = y_HELIUS_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,39.0,55.894804
1,41.0,49.600154
2,41.0,55.519189
3,41.0,55.977905
4,41.0,37.126466
...,...,...
551,73.0,69.533703
552,73.0,71.957543
553,73.0,63.347884
554,74.0,62.740989


In [80]:
eregr_compare.to_csv(output_folder + '/whole_auto_harm_topmri_HELIUS_eregr_compare_on_top.csv')

In [81]:
y_HELIUS_pred = TOPMRIetreg.predict(X_HELIUS_test)

In [82]:
data= [[
    'extra trees',
    'auto_harm_topmri_HELIUS_linr.sav',
    mean_absolute_error(y_HELIUS_test, y_HELIUS_pred),
    TOPMRIetreg.score(X_HELIUS_test,y_HELIUS_test),
    metrics.explained_variance_score(y_HELIUS_test, y_HELIUS_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [83]:
etreg_y_test = y_HELIUS_test
etreg_y_pred = y_HELIUS_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

Unnamed: 0,y_test_real_age,etreg_y_pred_age
0,39.0,43.755141
1,41.0,50.563872
2,41.0,49.989869
3,41.0,39.702638
4,41.0,52.088913
...,...,...
551,73.0,64.373390
552,73.0,70.306096
553,73.0,74.393662
554,74.0,68.590922


In [84]:
etreg_compare.to_csv(output_folder + '/whole_auto_harm_topmri_HELIUSUS_etreg_compare_on_top.csv')

In [85]:
topmri_based_auto_combat_harmonized_on_HELIUS =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
topmri_based_auto_combat_harmonized_on_HELIUS

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,auto_harm_topmri_HELIUSUS_linr.sav,6.166411,-0.023054,0.010401
0,lasso regression,auto_harm_whole_topmri_HELIUSUS_llreg.sav,6.355752,-0.075617,0.195683
0,elasticnetCV,auto_harm_whole_topmri_HELIUSlinr.sav,8.19469,-0.843058,-0.754434
0,extra trees,auto_harm_topmri_HELIUS_linr.sav,5.730209,0.10287,0.105131


In [86]:
topmri_based_auto_combat_harmonized_on_HELIUS.to_csv(output_folder + '/topmri_based_auto_combat_harmonized_on_HELIUS.csv')