# ML testing: experiment #1f- harmonized with covbat

This notebook involves testing for ML algorithms. This notebook shows harmonized StrokeMRI and TOP based models, and how they perform on each other, once harmonized by covbat

Data: StrokeMRI, TOP

Harmonisation: covbat

Training data: StrokeMRI and/or TOP

Testing data: StrokeMRI and/or TOP test subsets

Futher data applied to: none

Validation method: K-fold double-stratified

Brain-age algorithms: LR, lasso, extra trees, elasticCV net fully tested (but not optimized parameters), additionals partly

Outputs: predicted brain age

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
 
# # demo stuff
import ipywidgets as widgets
from ipywidgets import interactive
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri = 'harmonizations/harm_results/covbat/'
filename_mri = os.path.join(filepath_mri,'mri_covbat_a_top.csv') 

filepath_top = 'harmonizations/harm_results/covbat/'
filename_top = os.path.join(filepath_top,'top_covbat_a_mri.csv') 

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [4]:
TOP = TOP.drop(TOP.columns[0],axis=1)
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
#StrokeMRI

In [5]:
TOP.columns = TOP.columns.str.lower() 
StrokeMRI.columns = StrokeMRI.columns.str.lower() 

In [6]:
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
0,sub-0668_1,50.4,1.0,0.644714,0.485144,0.264279,0.461614,0.810027,-0.001297,2.942878,-1.727323,-1.543032,-1.363236,-1.399086,90.038649,80.288877,58.693835,71.922282
1,sub-0532_1,37.02,1.0,0.719501,0.594294,0.318721,0.441579,0.807254,0.004726,3.257428,-1.733821,-1.428012,-1.545853,-1.42918,100.027804,88.939212,67.716732,79.450077
2,sub-0529_1,30.57,1.0,0.669736,0.459098,0.289128,0.473445,0.796244,0.015227,3.071877,-1.624623,-1.551906,-1.620578,-1.515124,106.722013,97.130204,71.910797,85.735866


In [7]:
StrokeMRI.tail(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
575,sub-5931002_1,54.769863,0.0,0.665912,0.561129,0.41317,0.406431,0.748141,0.01584,3.671001,-1.401653,-1.374196,-1.395804,-1.297663,66.87705,55.310985,43.932404,50.380975
576,sub-5925701_1,66.639344,1.0,0.581445,0.475718,0.349962,0.413974,0.751754,0.013604,3.410599,-1.729377,-1.351074,-1.2333,-1.278247,55.740392,54.957862,36.428311,41.220589
577,sub-5934201_1,45.336986,1.0,0.622442,0.447141,0.298671,0.455055,0.782394,0.005032,2.720773,-1.769296,-1.615429,-1.581923,-1.601978,99.248205,86.260016,77.306051,83.787168


In [8]:
#give a checkbox for out put folder
loged_feat = widgets.ToggleButton(
    value=False,
    description='Click me if features logged',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Description',
    icon='check' # (FontAwesome names without the `fa-` prefix)
)
loged_feat

ToggleButton(value=False, description='Click me if features logged', icon='check', tooltip='Description')

In [9]:
loged_feat.value

False

In [10]:
if loged_feat.value == False:
    output_folder = '1f_no_log_outputs'
else:
    output_folder = '1f_loged_outputs'

os.makedirs(output_folder, exist_ok=True)

## Build ML models based on covbat StrokeMRI

In [11]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [12]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'covbatharm_mri_linr', LinearRegression(), ml_matrix, X, y)

In [13]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,covbatharm_mri_linr.0,5.37184,0.767762,0.770358
0,linear regression-1,1,covbatharm_mri_linr.1,5.331662,0.778582,0.778582
0,linear regression-2,2,covbatharm_mri_linr.2,5.596723,0.760982,0.761527
0,linear regression-3,3,covbatharm_mri_linr.3,5.57875,0.740903,0.741458
0,linear regression-4,4,covbatharm_mri_linr.4,5.336366,0.779735,0.786369


In [14]:
linr_k_frame.to_csv(output_folder + '/covbatharmonized_mri_linr_k_frame.csv')

In [15]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_mri_linr.0 0 covbatharm_mri...,5.443068,0.765593,0.767659


In [16]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,77.010959,84.102835
1,68.849315,73.549078
2,46.512329,43.875211
3,58.164384,57.704186
4,46.808219,49.113060
...,...,...
111,72.770559,58.102365
112,79.290411,90.440150
113,51.531507,55.297760
114,73.090411,74.949212


In [17]:
linr_y_frame.to_csv(output_folder + '/covbatharmonized_mri_linr_y_frame.csv')

In [18]:
linr = models[0]
linr[0]

In [19]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [20]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'covbatharm_mri_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'covbatharm_mri_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'covbatharm_mri_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'covbatharm_mri_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'covbatharm_mri_linr4.sav'))

In [21]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'covbatharm_mri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,covbatharm_mri_llreg.0,5.529257,0.754282,0.757169
0,lasso regression-1,1,covbatharm_mri_llreg.1,5.570103,0.75124,0.751452
0,lasso regression-2,2,covbatharm_mri_llreg.2,5.60973,0.746688,0.746869
0,lasso regression-3,3,covbatharm_mri_llreg.3,5.50444,0.739859,0.741097
0,lasso regression-4,4,covbatharm_mri_llreg.4,5.338254,0.767774,0.774743


In [22]:
llreg_k_frame.to_csv(output_folder + '/covbatharmonized_mri_llreg_k_frame.csv')

In [23]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_mri_llreg.0 0 covbatharm_mr...,5.510357,0.751969,0.754266


In [24]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,77.010959,83.909229
1,68.849315,72.359366
2,46.512329,43.449398
3,58.164384,57.372639
4,46.808219,49.268177
...,...,...
111,72.770559,60.898814
112,79.290411,88.805216
113,51.531507,54.884158
114,73.090411,73.817150


In [25]:
llreg_y_frame.to_csv(output_folder + '/covbatharmonized_mri_llreg_y_frame.csv')

In [26]:
llreg = models[0]
llreg[0]

In [27]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'covbatharm_mri_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'covbatharm_mri_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'covbatharm_mri_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'covbatharm_mri_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'covbatharm_mri_llreg4.sav'))

In [28]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'covbatharm_mri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,covbatharm_mri_dtree.0,6.073572,0.69837,0.699729
0,decision tree-1,1,covbatharm_mri_dtree.1,5.089484,0.768053,0.770324
0,decision tree-2,2,covbatharm_mri_dtree.2,5.705819,0.731854,0.7326
0,decision tree-3,3,covbatharm_mri_dtree.3,5.251523,0.76192,0.761923
0,decision tree-4,4,covbatharm_mri_dtree.4,5.212483,0.787222,0.787791


In [29]:
dtree_k_frame.to_csv(output_folder + '/covbatharmonized_mri_dtree_k_frame.csv')

In [30]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_mri_dtree.0 0 covbatharm_mr...,5.466576,0.749484,0.750473


In [31]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,77.010959,74.852055
1,68.849315,77.035616
2,46.512329,43.778082
3,58.164384,54.471233
4,46.808219,34.150273
...,...,...
111,72.770559,68.273973
112,79.290411,79.972603
113,51.531507,58.164384
114,73.090411,74.671233


In [32]:
dtree_y_frame.to_csv(output_folder + '/covbatharmonized_mri_dtree_y_frame.csv')

In [33]:
dtree = models[0]
dtree[0]

In [34]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'covbatharm_mri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame



Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,covbatharm_mri_regr.0,8.090152,0.474594,0.47909
0,MLP regression-1,1,covbatharm_mri_regr.1,9.058213,0.394319,0.39862
0,MLP regression-2,2,covbatharm_mri_regr.2,7.786187,0.511599,0.531745
0,MLP regression-3,3,covbatharm_mri_regr.3,7.773526,0.475418,0.485244
0,MLP regression-4,4,covbatharm_mri_regr.4,7.955084,0.51547,0.515588


In [35]:
regr_k_frame.to_csv(output_folder + '/covbatharmonized_mri_regr_k_frame.csv')

In [36]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_mri_regr.0 0 covbatharm_mri...,8.132632,0.47428,0.482057


In [37]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,77.010959,71.340034
1,68.849315,68.161738
2,46.512329,42.263678
3,58.164384,61.311356
4,46.808219,58.702322
...,...,...
111,72.770559,58.333602
112,79.290411,77.911620
113,51.531507,47.833302
114,73.090411,66.534119


In [38]:
regr_y_frame.to_csv(output_folder + '/covbatharmonized_mri_regr_y_frame.csv')

In [39]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [40]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'covbatharm_mri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,covbatharm_mri_svrp2.0,11.107606,-0.010352,0.012482
0,support vector reg poly2-1,1,covbatharm_mri_svrp2.1,10.2673,0.11348,0.161844
0,support vector reg poly2-2,2,covbatharm_mri_svrp2.2,10.292231,0.062934,0.188866
0,support vector reg poly2-3,3,covbatharm_mri_svrp2.3,10.178085,0.070509,0.110043
0,support vector reg poly2-4,4,covbatharm_mri_svrp2.4,10.819924,0.00944,0.057283


In [41]:
svrp2_k_frame.to_csv(output_folder + '/covbatharmonized_mri_svrp2_k_frame.csv')

In [42]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_mri_svrp2.0 0 covbatharm_mr...,10.533029,0.049202,0.106104


In [43]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,77.010959,66.605196
1,68.849315,58.311219
2,46.512329,55.234642
3,58.164384,65.882581
4,46.808219,66.325499
...,...,...
111,72.770559,67.584526
112,79.290411,69.466314
113,51.531507,52.772534
114,73.090411,68.024671


In [44]:
svrp2_y_frame.to_csv(output_folder + '/covbatharmonized_mrisvrp2_y_frame.csv')

In [45]:
svrp2 = models[0]
svrp2[0]

In [46]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'covbatharm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,covbatharm_mix_eregr.0,8.762163,0.367449,0.368459
0,elasticnetCV-1,1,covbatharm_mix_eregr.1,8.564816,0.426907,0.427237
0,elasticnetCV-2,2,covbatharm_mix_eregr.2,8.278262,0.456909,0.46118
0,elasticnetCV-3,3,covbatharm_mix_eregr.3,8.276638,0.408323,0.408335
0,elasticnetCV-4,4,covbatharm_mix_eregr.4,8.570228,0.400881,0.401421


In [47]:
eregr_k_frame.to_csv(output_folder + '/covbatharmonized_eregr_k_frame.csv')

In [48]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_mix_eregr.0 0 covbatharm_mi...,8.490421,0.412094,0.413326


In [49]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,77.010959,67.307035
1,68.849315,66.704931
2,46.512329,46.438512
3,58.164384,63.715564
4,46.808219,62.930519
...,...,...
111,72.770559,62.088156
112,79.290411,72.969592
113,51.531507,49.042650
114,73.090411,65.869812


In [50]:
eregr_y_frame.to_csv(output_folder + '/covbatharmonized_mri_eregr_y_frame.csv')

In [51]:
eregr = models[0]
eregr[0]

In [52]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'covbatharm_mri_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,covbatharm_mri_etreg.0,3.904841,0.885072,0.885074
0,extra trees-1,1,covbatharm_mri_etreg.1,3.608328,0.895927,0.896645
0,extra trees-2,2,covbatharm_mri_etreg.2,3.961536,0.885692,0.885694
0,extra trees-3,3,covbatharm_mri_etreg.3,3.58726,0.890633,0.890696
0,extra trees-4,4,covbatharm_mri_etreg.4,3.787002,0.886259,0.887807


In [53]:
etreg_k_frame.to_csv(output_folder + '/covbatharomized_etreg_k_frame.csv')

In [54]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_mri_etreg.0 0 covbatharm_mr...,3.769794,0.888717,0.889183


In [55]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,77.010959,74.933354
1,68.849315,71.367039
2,46.512329,44.702069
3,58.164384,56.742892
4,46.808219,36.734105
...,...,...
111,72.770559,64.909215
112,79.290411,78.838382
113,51.531507,54.879503
114,73.090411,70.382488


In [56]:
etreg_y_frame.to_csv(output_folder + '/covbatharmonized_etreg_y_frame.csv')

In [57]:
etreg = models[0]
etreg[0]

In [58]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'covbatharm_mri_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'covbatharm_mri_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'covbatharm_mri_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'covbatharm_mri_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'covbatharm_mri_etreg4.sav'))

Show results ON AVERAGE for each model

In [59]:
mri_based_covbatharmonized_on_testmri =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
mri_based_covbatharmonized_on_testmri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_mri_linr.0 0 covbatharm_mri...,5.443068,0.765593,0.767659
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_mri_llreg.0 0 covbatharm_mr...,5.510357,0.751969,0.754266
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_mri_dtree.0 0 covbatharm_mr...,5.466576,0.749484,0.750473
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_mri_regr.0 0 covbatharm_mri...,8.132632,0.47428,0.482057
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_mri_svrp2.0 0 covbatharm_mr...,10.533029,0.049202,0.106104
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_mix_eregr.0 0 covbatharm_mi...,8.490421,0.412094,0.413326
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_mri_etreg.0 0 covbatharm_mr...,3.769794,0.888717,0.889183


In [60]:
mri_based_covbatharmonized_on_testmri.to_csv(output_folder + '/mri_based_covbatharmonized_on_testmri_AVERAGES.csv')

## Now we will build  models based on the whole  covbat harmonized StrokeMRI dataset, and apply them to TOP . 

In [61]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [62]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [63]:
MRIlinr = LinearRegression()
MRIlinr.fit(X_train, y_train)

In [64]:
MRIllreg = linear_model.LassoLars(alpha=0.01)
MRIllreg.fit(X_train, y_train)

In [65]:
MRIeregr = ElasticNetCV(cv=5, random_state=17)
MRIeregr.fit(X_train, y_train)


In [66]:
MRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
MRIetreg.fit(X_train, y_train)

In [67]:
##  Save these four best models

In [68]:

## optional save models
#joblib.dump(MRIlinr, ('../result_models/' + 'covbatharm_whole_mri_linr.sav'))
#joblib.dump(MRIllreg, ('../result_models/'+ 'covbatharm_whole_mri_llreg1.sav'))
#joblib.dump(MRIeregr, ('../result_models/'+ 'covbatharm_whole_mri_eregr3.sav'))
#joblib.dump(MRIetreg, ('../result_models/'+ 'covbatharm_whole_mri_etreg4.sav'))

# Running whole MRI model over TOP dataset

In [69]:
top_ml_matrix = TOP.drop('participant_id', axis=1)
X_top = top_ml_matrix.drop('age', axis =1)
X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [70]:
X_top_test = X_top
y_top_test = y_top


In [71]:
y_top_pred = MRIlinr.predict(X_top_test)

In [72]:
# print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
# print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [73]:
data= [[
    'linear regression',
    'covbatharm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIlinr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [74]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,index,y_test_real_age,linr_y_pred_age
0,0,50.40,47.136124
1,1,37.02,44.441973
2,2,30.57,41.056320
3,3,47.05,40.461009
4,4,44.63,45.076491
...,...,...,...
524,524,33.55,45.817999
525,525,44.43,49.053509
526,526,45.60,41.640417
527,527,46.20,45.116790


In [75]:
linr_compare.to_csv(output_folder + '/whole_covbatharm_mri_linr_compare_on_top.csv')

In [76]:
y_top_pred = MRIllreg.predict(X_top_test)

In [77]:
data= [[
    'lasso regression',
    'covbatharm_whole_mri_llreg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIllreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,covbatharm_whole_mri_llreg.sav,7.715981,0.018581,0.48334


In [78]:
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,47.118538
1,37.02,43.786260
2,30.57,43.502503
3,47.05,40.602647
4,44.63,47.476150
...,...,...
524,33.55,47.124343
525,44.43,49.805216
526,45.60,44.530724
527,46.20,45.185786


In [79]:
llreg_compare.to_csv(output_folder + '/whole_covbat_harm_mri_llreg_compare_on_top.csv')

In [80]:
y_top_pred = MRIeregr.predict(X_top_test)

In [81]:
data= [[
    'elasticnetCV',
    'covbatharm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIeregr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [82]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,50.907792
1,37.02,54.196240
2,30.57,50.054157
3,47.05,54.699280
4,44.63,59.851988
...,...,...
524,33.55,57.261870
525,44.43,54.190135
526,45.60,55.203219
527,46.20,53.988842


In [83]:
eregr_compare.to_csv(output_folder + '/whole_covbatharm_mri_eregr_compare_on_top.csv')

In [84]:
y_top_pred = MRIetreg.predict(X_top_test)

In [85]:
data= [[
    'extra trees',
    'neruo_harm_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIetreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [86]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,32.276659
1,37.02,44.333138
2,30.57,50.765399
3,47.05,39.801166
4,44.63,44.987139
...,...,...
524,33.55,51.454496
525,44.43,42.801617
526,45.60,29.532715
527,46.20,33.450420


In [87]:
etreg_compare.to_csv(output_folder + '/whole_covbatharm_mri_etreg_compare_on_top.csv')

In [88]:
mri_based_covbat_harmonized_on_top =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
mri_based_covbat_harmonized_on_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,covbatharm_whole_mri_linr.sav,7.636848,0.043398,0.465883
0,lasso regression,covbatharm_whole_mri_llreg.sav,7.715981,0.018581,0.48334
0,elasticnetCV,covbatharm_whole_mri_linr.sav,16.263614,-2.556164,0.276664
0,extra trees,neruo_harm_mri_linr.sav,10.649719,-0.958258,-0.750368


## Now we will run the exact opposite process.
1. We will explore TOP based models via k-folded results, 
2. We will make a general unahrmonized TOP model (based off all TOP)
3. We will apply the best of these model to the StrokeMRI dataset

### Build ML models based on covbat harmonized TOP 

In [89]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [90]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'covbatharm_top_linr', LinearRegression(), ml_matrix, X, y)

In [91]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,covbatharm_top_linr.0,4.235216,0.732673,0.741286
0,linear regression-1,1,covbatharm_top_linr.1,4.603715,0.545623,0.545804
0,linear regression-2,2,covbatharm_top_linr.2,3.751912,0.726267,0.72924
0,linear regression-3,3,covbatharm_top_linr.3,4.281317,0.70147,0.704557
0,linear regression-4,4,covbatharm_top_linr.4,3.88187,0.739439,0.73972


In [92]:
linr_k_frame.to_csv(output_folder + '/covbatharmonized_top_linr_k_frame.csv')

In [93]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_top_linr.0 0 covbatharm_top...,4.150806,0.689094,0.692121


In [94]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,40.06,40.023627
1,28.26,31.369056
2,30.75,34.442063
3,44.63,41.573050
4,30.47,33.878838
...,...,...
101,32.58,38.035917
102,50.40,40.667401
103,28.82,33.023753
104,27.09,31.135328


In [95]:
linr_y_frame.to_csv(output_folder + '/covbatharmonized_top_linr_y_frame.csv')

In [96]:
linr = models[0]
linr[0]

In [97]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [98]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'covbatharm_top_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'covbatharm_top_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'covbatharm_top_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'covbatharm_top_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'covbatharm_top_linr4.sav'))

In [99]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'covbatharm_top_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,covbatharm_top_llreg.0,4.548473,0.699748,0.707058
0,lasso regression-1,1,covbatharm_top_llreg.1,4.319586,0.629577,0.630073
0,lasso regression-2,2,covbatharm_top_llreg.2,3.87931,0.716306,0.717807
0,lasso regression-3,3,covbatharm_top_llreg.3,4.56918,0.657099,0.662619
0,lasso regression-4,4,covbatharm_top_llreg.4,4.202128,0.695943,0.695985


In [100]:
llreg_k_frame.to_csv(output_folder + '/covbatharmonized_top_llreg_k_frame.csv')

In [101]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_top_llreg.0 0 covbatharm_to...,4.303735,0.679735,0.682708


In [102]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,40.06,38.821206
1,28.26,31.332907
2,30.75,34.277174
3,44.63,41.395292
4,30.47,35.416207
...,...,...
101,32.58,38.823883
102,50.40,39.634450
103,28.82,32.856339
104,27.09,32.332973


In [103]:
llreg_y_frame.to_csv(output_folder + '/covbatharmonized_top_llreg_y_frame.csv')

In [104]:
llreg = models[0]
llreg[0]

In [105]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'covbatharm_top_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'covbatharm_top_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'covbatharm_top_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'covbatharm_top_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'covbatharm_top_llreg4.sav'))

In [106]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'covbatharm_top_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,covbatharm_top_dtree.0,4.978302,0.650317,0.650317
0,decision tree-1,1,covbatharm_top_dtree.1,5.005377,0.492913,0.503522
0,decision tree-2,2,covbatharm_top_dtree.2,4.523208,0.606029,0.606721
0,decision tree-3,3,covbatharm_top_dtree.3,5.300566,0.496984,0.499002
0,decision tree-4,4,covbatharm_top_dtree.4,4.913868,0.513424,0.535356


In [107]:
dtree_k_frame.to_csv(output_folder + '/covbatharmonized_top_dtree_k_frame.csv')

In [108]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_top_dtree.0 0 covbatharm_to...,4.944264,0.551933,0.558984


In [109]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,40.06,43.46
1,28.26,31.94
2,30.75,29.03
3,44.63,43.59
4,30.47,28.02
...,...,...
101,32.58,32.65
102,50.40,34.41
103,28.82,30.75
104,27.09,22.95


In [110]:
dtree_y_frame.to_csv(output_folder + '/covbatharmonized_top_dtree_y_frame.csv')

In [111]:
dtree = models[0]
dtree[0]

In [112]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'covbatharm_top_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,covbatharm_top_regr.0,7.461863,0.25751,0.267426
0,MLP regression-1,1,covbatharm_top_regr.1,6.873801,0.231808,0.246502
0,MLP regression-2,2,covbatharm_top_regr.2,6.447736,0.280917,0.282697
0,MLP regression-3,3,covbatharm_top_regr.3,10.843247,-0.761225,-0.61047
0,MLP regression-4,4,covbatharm_top_regr.4,7.287779,0.167926,0.171493


In [113]:
regr_k_frame.to_csv(output_folder + '/covbatharmonized_top_regr_k_frame.csv')

In [114]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_top_regr.0 0 covbatharm_top...,7.782885,0.035387,0.07153


In [115]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,40.06,37.813969
1,28.26,36.215981
2,30.75,39.676125
3,44.63,40.320846
4,30.47,35.480568
...,...,...
101,32.58,37.954439
102,50.40,35.269021
103,28.82,37.411935
104,27.09,34.173357


In [116]:
regr_y_frame.to_csv(output_folder + '/covbatharmonized_top_regr_y_frame.csv')

In [117]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [118]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'covbatharm_top_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,covbatharm_top_svrp2.0,8.266932,0.132376,0.132438
0,support vector reg poly2-1,1,covbatharm_top_svrp2.1,7.645173,0.083755,0.08547
0,support vector reg poly2-2,2,covbatharm_top_svrp2.2,7.130636,0.148197,0.148217
0,support vector reg poly2-3,3,covbatharm_top_svrp2.3,7.696211,0.121492,0.126783
0,support vector reg poly2-4,4,covbatharm_top_svrp2.4,7.619091,0.090907,0.099785


In [119]:
svrp2_k_frame.to_csv(output_folder + '/covbatharmonized_top_svrp2_k_frame.csv')

In [120]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_top_svrp2.0 0 covbatharm_to...,7.671608,0.115345,0.118539


In [121]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,40.06,37.870061
1,28.26,38.357449
2,30.75,38.164241
3,44.63,37.904329
4,30.47,37.020591
...,...,...
101,32.58,36.868189
102,50.40,37.439746
103,28.82,38.061032
104,27.09,34.386463


In [122]:
svrp2_y_frame.to_csv(output_folder + '/covbatharmonized_top_svrp2_y_frame.csv')

In [123]:
svrp2 = models[0]
svrp2[0]

In [124]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'covbatharm_top_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,covbatharm_top_eregr.0,8.602743,0.056034,0.056477
0,elasticnetCV-1,1,covbatharm_top_eregr.1,7.775574,0.058477,0.058487
0,elasticnetCV-2,2,covbatharm_top_eregr.2,7.369008,0.091656,0.092104
0,elasticnetCV-3,3,covbatharm_top_eregr.3,7.559489,0.149829,0.16535
0,elasticnetCV-4,4,covbatharm_top_eregr.4,7.827418,0.051441,0.055796


In [125]:
eregr_k_frame.to_csv(output_folder + '/covbatharmonized_top_eregr_k_frame.csv')

In [126]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_top_eregr.0 0 covbatharm_to...,7.826847,0.081487,0.085643


In [127]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,40.06,37.499062
1,28.26,37.690028
2,30.75,37.599246
3,44.63,37.529756
4,30.47,37.340638
...,...,...
101,32.58,37.172150
102,50.40,37.513828
103,28.82,37.750942
104,27.09,36.393301


In [128]:
eregr_y_frame.to_csv(output_folder + '/covbatharmonized_top_eregr_y_frame.csv')

In [129]:
eregr = models[0]
eregr[0]

In [130]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'covbatharm_top_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,covbatharm_top_etreg.0,3.775432,0.791439,0.791505
0,extra trees-1,1,covbatharm_top_etreg.1,3.748597,0.734811,0.738538
0,extra trees-2,2,covbatharm_top_etreg.2,3.50876,0.772733,0.773733
0,extra trees-3,3,covbatharm_top_etreg.3,3.735189,0.773941,0.773942
0,extra trees-4,4,covbatharm_top_etreg.4,3.564259,0.777539,0.782005


In [131]:
etreg_k_frame.to_csv(output_folder + '/covbatharomized_top_etreg_k_frame.csv')

In [132]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_top_etreg.0 0 covbatharm_to...,3.666448,0.770093,0.771945


In [133]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,40.06,40.0350
1,28.26,30.4289
2,30.75,33.8126
3,44.63,38.2709
4,30.47,32.1633
...,...,...
101,32.58,35.8931
102,50.40,44.5613
103,28.82,34.1495
104,27.09,25.8342


In [134]:
etreg_y_frame.to_csv(output_folder + '/covbatharmonized_top_etreg_y_frame.csv')

In [135]:
etreg = models[0]
etreg[0]

In [136]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'covbatharm_top_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'covbatharm_top_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'covbatharm_top_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'covbatharm_top_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'covbatharm_top_etreg4.sav'))

Show results ON AVERAGE for each model

In [137]:
top_based_covbatharmonized_on_testtop =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
top_based_covbatharmonized_on_testtop

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_top_linr.0 0 covbatharm_top...,4.150806,0.689094,0.692121
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_top_llreg.0 0 covbatharm_to...,4.303735,0.679735,0.682708
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_top_dtree.0 0 covbatharm_to...,4.944264,0.551933,0.558984
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_top_regr.0 0 covbatharm_top...,7.782885,0.035387,0.07153
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_top_svrp2.0 0 covbatharm_to...,7.671608,0.115345,0.118539
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_top_eregr.0 0 covbatharm_to...,7.826847,0.081487,0.085643
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_top_etreg.0 0 covbatharm_to...,3.666448,0.770093,0.771945


In [138]:
top_based_covbatharmonized_on_testtop.to_csv(output_folder + '/top_based_covbatharmonized_on_topt_AVERAGES.csv')

## Now we will build  models based on the whole harmonized StrokeTOP dataset, and apply them to StrokeMRI. 

In [139]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')

In [140]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [141]:
TOPlinr = LinearRegression()
TOPlinr.fit(X_train, y_train)

In [142]:
TOPllreg = linear_model.LassoLars(alpha=0.01)
TOPllreg.fit(X_train, y_train)

In [143]:
TOPeregr = ElasticNetCV(cv=5, random_state=17)
TOPeregr.fit(X_train, y_train)

In [144]:
TOPetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPetreg.fit(X_train, y_train)

##  Save these four best models

In [145]:
## optional save models
#joblib.dump(TOPlinr, ('../result_models/' + 'covbatharm_whole_top_linr.sav'))
#joblib.dump(TOPllreg, ('../result_models/'+ 'covbatharm_whole_top_llreg1.sav'))
#joblib.dump(TOPeregr, ('../result_models/'+ 'covbatharm_whole_top_eregr3.sav'))
#joblib.dump(TOPetreg, ('../result_models/'+ 'covbatharm_whole_top_etreg4.sav'))

# Running whole TOP model over MRI dataset

In [146]:
mri_ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X_mri = mri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = mri_ml_matrix['age'].values
y_mri= y_mri.astype('float')

In [147]:
X_mri_test = X_mri
y_mri_test = y_mri


In [148]:
y_mri_pred = TOPlinr.predict(X_mri_test)

In [149]:
data= [[
    'linear regression',
    'covbatharm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPlinr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [150]:
linr_y_test = y_mri_test
linr_y_pred = y_mri_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
#linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,38.691725
1,47.583562,43.565375
2,74.104110,44.892966
3,68.843836,38.103296
4,61.978082,46.714593
...,...,...
573,67.178082,46.057697
574,67.794521,51.342369
575,54.769863,43.875160
576,66.639344,49.998803


In [151]:
linr_compare.to_csv(output_folder + '/whole_covbat_harm_top_linr_compare_on_mti.csv')

In [152]:
y_mri_pred = TOPllreg.predict(X_mri_test)

In [153]:
data= [[
    'lasso regression',
    'covbatharm_whole_top_llreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPllreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,covbatharm_whole_top_llreg.sav,17.137091,-1.094108,0.214507


In [154]:
llreg_y_test = y_mri_test
llreg_y_pred = y_mri_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,42.641529
1,47.583562,43.553836
2,74.104110,52.074319
3,68.843836,43.429955
4,61.978082,47.738040
...,...,...
573,67.178082,49.130588
574,67.794521,53.967142
575,54.769863,47.541767
576,66.639344,52.173166


In [155]:
llreg_compare.to_csv(output_folder + '/whole_covbatharm_top_llreg_compare_on_mri.csv')

In [156]:
y_mri_pred = TOPeregr.predict(X_mri_test)

In [157]:
data= [[
    'elasticnetCV',
    'covbatharm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPeregr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [158]:
eregr_y_test = y_mri_test
eregr_y_pred = y_mri_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,37.419650
1,47.583562,37.001412
2,74.104110,38.884321
3,68.843836,38.822085
4,61.978082,35.488174
...,...,...
573,67.178082,39.253448
574,67.794521,39.253284
575,54.769863,39.484169
576,66.639344,39.535995


In [159]:
eregr_compare.to_csv(output_folder + '/whole_covbatharm_top_eregr_compare_on_mri.csv')

In [160]:
y_mri_pred = TOPetreg.predict(X_mri_test)

In [161]:
data= [[
    'extra trees',
    'covbatharm_mri_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPetreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [162]:
etreg_y_test = y_mri_test
etreg_y_pred = y_mri_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,47.8426
1,47.583562,40.4329
2,74.104110,51.4189
3,68.843836,51.6492
4,61.978082,41.6395
...,...,...
573,67.178082,49.8876
574,67.794521,50.0068
575,54.769863,47.7862
576,66.639344,49.2632


In [163]:
etreg_compare.to_csv(output_folder + '/whole_covbatharm_top_etreg_compare_on_mri.csv')

compile csvs of results

In [164]:
top_based_covbatharmonized_on_mri =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
top_based_covbatharmonized_on_mri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,covbatharm_whole_top_linr.sav,20.331347,-2.109943,-0.240329
0,lasso regression,covbatharm_whole_top_llreg.sav,17.137091,-1.094108,0.214507
0,elasticnetCV,covbatharm_whole_top_linr.sav,23.802267,-2.507116,0.04816
0,extra trees,covbatharm_mri_linr.sav,16.672375,-0.79841,0.388652


In [165]:
top_based_covbatharmonized_on_mri.to_csv(output_folder + '/whole_top_based_covbatharmonized_on_mri.csv')