# ML testing: experiment #1f- harmonized with covbat

This notebook involves testing for ML algorithms. This notebook shows harmonized StrokeMRI and TOP based models, and how they perform on each other, once harmonized by covbat

Data: StrokeMRI, TOP

Harmonisation: covbat

Training data: StrokeMRI and/or TOP

Testing data: StrokeMRI and/or TOP test subsets

Futher data applied to: none

Validation method: K-fold double-stratified

Brain-age algorithms: LR, lasso, extra trees, elasticCV net fully tested (but not optimized parameters), additionals partly

Outputs: predicted brain age

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
 
# # demo stuff
import ipywidgets as widgets
from ipywidgets import interactive
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri = 'harmonizations/harm_results/covbat/'
filename_mri = os.path.join(filepath_mri,'mri_covbat_a_top.csv') 

filepath_top = 'harmonizations/harm_results/covbat/'
filename_top = os.path.join(filepath_top,'top_covbat_a_mri.csv') 

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [4]:
TOP = TOP.drop(TOP.columns[0],axis=1)
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
#StrokeMRI

In [5]:
TOP.columns = TOP.columns.str.lower() 
StrokeMRI.columns = StrokeMRI.columns.str.lower() 

In [6]:
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
0,sub-0668_1,50.4,1.0,0.644095,0.485875,0.264301,0.461812,0.810756,-0.001395,16.788661,0.269203,0.177777,0.212461,0.258591,0.247665,25.200002,90.14575,80.434577,58.712359,71.928965
1,sub-0532_1,37.02,1.0,0.718853,0.594922,0.318534,0.441841,0.808059,0.005069,26.013332,0.255079,0.176816,0.238987,0.214094,0.240499,24.486767,100.134849,89.085626,67.735777,79.457414
2,sub-0529_1,30.57,1.0,0.669144,0.459872,0.288953,0.473679,0.797107,0.015837,21.356134,0.276472,0.196664,0.210531,0.198005,0.220283,26.144517,106.82901,97.277775,71.930078,85.744208


In [9]:
StrokeMRI.tail(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
575,sub-5931002_1,54.769863,0.0,0.666513,0.560411,0.413253,0.406221,0.747382,0.015895,37.941582,0.27049,0.248138,0.254496,0.246506,0.272979,22.647282,66.810349,55.237217,43.950321,50.426726
576,sub-5925701_1,66.639344,1.0,0.582027,0.475014,0.350055,0.413751,0.75098,0.013629,29.427311,0.282273,0.17759,0.260293,0.289598,0.278285,18.194845,55.713503,54.884233,36.483087,41.309589
577,sub-5934201_1,45.336986,1.0,0.623057,0.446491,0.298757,0.454832,0.781636,0.004873,14.876094,0.264497,0.170445,0.20034,0.20535,0.201884,24.617748,99.102444,86.081573,77.232301,83.71988


In [10]:
#give a checkbox for out put folder
loged_feat = widgets.ToggleButton(
    value=False,
    description='Click me if features logged',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Description',
    icon='check' # (FontAwesome names without the `fa-` prefix)
)
loged_feat

ToggleButton(value=False, description='Click me if features logged', icon='check', tooltip='Description')

In [11]:
loged_feat.value

False

In [12]:
if loged_feat.value == False:
    output_folder = '1f_no_log_outputs'
else:
    output_folder = '1f_loged_outputs'

os.makedirs(output_folder, exist_ok=True)

## Build ML models based on covbat StrokeMRI

In [13]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [14]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'covbatharm_mri_linr', LinearRegression(), ml_matrix, X, y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [15]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,covbatharm_mri_linr.0,5.373802,0.757511,0.757887
0,linear regression-1,1,covbatharm_mri_linr.1,5.305123,0.778792,0.789342
0,linear regression-2,2,covbatharm_mri_linr.2,6.035909,0.701416,0.701476
0,linear regression-3,3,covbatharm_mri_linr.3,5.730553,0.751179,0.75119
0,linear regression-4,4,covbatharm_mri_linr.4,5.325835,0.786022,0.786031


In [16]:
linr_k_frame.to_csv(output_folder + '/covbatharmonized_mri_linr_k_frame.csv')

In [17]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_mri_linr.0 0 covbatharm_mri...,5.554245,0.754984,0.757185


In [18]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,48.488790
1,66.471233,58.723580
2,46.084932,48.392058
3,73.564384,68.103728
4,51.357923,42.861085
...,...,...
140,74.191257,62.268662
141,67.515068,64.031080
142,49.827375,58.660817
143,71.254795,69.712085


In [19]:
linr_y_frame.to_csv(output_folder + '/covbatharmonized_mri_linr_y_frame.csv')

In [20]:
linr = models[0]
linr[0]

In [21]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [22]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'covbatharm_mri_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'covbatharm_mri_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'covbatharm_mri_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'covbatharm_mri_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'covbatharm_mri_linr4.sav'))

In [23]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'covbatharm_mri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,covbatharm_mri_llreg.0,5.677424,0.73053,0.731041
0,lasso regression-1,1,covbatharm_mri_llreg.1,5.437922,0.768942,0.777351
0,lasso regression-2,2,covbatharm_mri_llreg.2,6.168455,0.674268,0.674295
0,lasso regression-3,3,covbatharm_mri_llreg.3,6.109046,0.716988,0.717173
0,lasso regression-4,4,covbatharm_mri_llreg.4,5.466626,0.773764,0.773771


In [24]:
llreg_k_frame.to_csv(output_folder + '/covbatharmonized_mri_llreg_k_frame.csv')

In [25]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_mri_llreg.0 0 covbatharm_mr...,5.771895,0.732898,0.734726


In [26]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,48.425259
1,66.471233,58.853270
2,46.084932,49.096705
3,73.564384,67.734340
4,51.357923,43.020242
...,...,...
140,74.191257,63.533600
141,67.515068,64.372583
142,49.827375,61.402844
143,71.254795,71.495420


In [27]:
llreg_y_frame.to_csv(output_folder + '/covbatharmonized_mri_llreg_y_frame.csv')

In [28]:
llreg = models[0]
llreg[0]

In [29]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'covbatharm_mri_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'covbatharm_mri_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'covbatharm_mri_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'covbatharm_mri_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'covbatharm_mri_llreg4.sav'))

In [30]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'covbatharm_mri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,covbatharm_mri_dtree.0,5.342324,0.754224,0.756305
0,decision tree-1,1,covbatharm_mri_dtree.1,5.327961,0.78849,0.797313
0,decision tree-2,2,covbatharm_mri_dtree.2,5.616736,0.744188,0.746706
0,decision tree-3,3,covbatharm_mri_dtree.3,5.087252,0.79255,0.794696
0,decision tree-4,4,covbatharm_mri_dtree.4,5.541729,0.734328,0.735314


In [31]:
dtree_k_frame.to_csv(output_folder + '/covbatharmonized_mri_dtree_k_frame.csv')

In [32]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_mri_dtree.0 0 covbatharm_mr...,5.3832,0.762756,0.766067


In [33]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,42.248881
1,66.471233,57.756164
2,46.084932,43.487671
3,73.564384,68.106849
4,51.357923,54.134247
...,...,...
140,74.191257,79.528767
141,67.515068,66.821918
142,49.827375,51.052055
143,71.254795,75.122951


In [34]:
dtree_y_frame.to_csv(output_folder + '/covbatharmonized_mri_dtree_y_frame.csv')

In [35]:
dtree = models[0]
dtree[0]

In [36]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'covbatharm_mri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,covbatharm_mri_regr.0,7.435156,0.538995,0.540924
0,MLP regression-1,1,covbatharm_mri_regr.1,7.672557,0.543736,0.54511
0,MLP regression-2,2,covbatharm_mri_regr.2,8.01527,0.482461,0.494199
0,MLP regression-3,3,covbatharm_mri_regr.3,8.768018,0.37241,0.442435
0,MLP regression-4,4,covbatharm_mri_regr.4,8.300073,0.446091,0.448022


In [37]:
regr_k_frame.to_csv(output_folder + '/covbatharmonized_mri_regr_k_frame.csv')

In [38]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_mri_regr.0 0 covbatharm_mri...,8.038215,0.476739,0.494138


In [39]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,43.305703
1,66.471233,59.932401
2,46.084932,60.677646
3,73.564384,62.127756
4,51.357923,46.929374
...,...,...
140,74.191257,58.088114
141,67.515068,62.262742
142,49.827375,54.846188
143,71.254795,69.701178


In [40]:
regr_y_frame.to_csv(output_folder + '/covbatharmonized_mri_regr_y_frame.csv')

In [41]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [42]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'covbatharm_mri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,covbatharm_mri_svrp2.0,9.545307,0.200893,0.257413
0,support vector reg poly2-1,1,covbatharm_mri_svrp2.1,10.312043,0.193205,0.265822
0,support vector reg poly2-2,2,covbatharm_mri_svrp2.2,9.115734,0.219993,0.302283
0,support vector reg poly2-3,3,covbatharm_mri_svrp2.3,9.669306,0.249325,0.291508
0,support vector reg poly2-4,4,covbatharm_mri_svrp2.4,9.930901,0.23888,0.293172


In [43]:
svrp2_k_frame.to_csv(output_folder + '/covbatharmonized_mri_svrp2_k_frame.csv')

In [44]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_mri_svrp2.0 0 covbatharm_mr...,9.714658,0.220459,0.282039


In [45]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,58.532293
1,66.471233,69.074667
2,46.084932,56.512081
3,73.564384,64.963939
4,51.357923,63.784893
...,...,...
140,74.191257,60.843052
141,67.515068,62.438972
142,49.827375,65.462721
143,71.254795,67.447641


In [46]:
svrp2_y_frame.to_csv(output_folder + '/covbatharmonized_mrisvrp2_y_frame.csv')

In [47]:
svrp2 = models[0]
svrp2[0]

In [48]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'covbatharm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,covbatharm_mix_eregr.0,7.691024,0.502325,0.502356
0,elasticnetCV-1,1,covbatharm_mix_eregr.1,8.324058,0.467741,0.481695
0,elasticnetCV-2,2,covbatharm_mix_eregr.2,7.802489,0.469198,0.471463
0,elasticnetCV-3,3,covbatharm_mix_eregr.3,8.467575,0.435746,0.441641
0,elasticnetCV-4,4,covbatharm_mix_eregr.4,7.797016,0.531952,0.533054


In [49]:
eregr_k_frame.to_csv(output_folder + '/covbatharmonized_eregr_k_frame.csv')

In [50]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_mix_eregr.0 0 covbatharm_mi...,8.016432,0.481392,0.486042


In [51]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,48.501690
1,66.471233,65.156208
2,46.084932,53.896101
3,73.564384,65.086675
4,51.357923,50.672792
...,...,...
140,74.191257,53.700500
141,67.515068,60.542611
142,49.827375,60.339485
143,71.254795,70.870898


In [52]:
eregr_y_frame.to_csv(output_folder + '/covbatharmonized_mri_eregr_y_frame.csv')

In [53]:
eregr = models[0]
eregr[0]

In [54]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'covbatharm_mri_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,covbatharm_mri_etreg.0,3.585222,0.89968,0.899685
0,extra trees-1,1,covbatharm_mri_etreg.1,4.011828,0.88537,0.89067
0,extra trees-2,2,covbatharm_mri_etreg.2,4.516568,0.847402,0.847403
0,extra trees-3,3,covbatharm_mri_etreg.3,3.98581,0.87693,0.877412
0,extra trees-4,4,covbatharm_mri_etreg.4,4.040397,0.873739,0.873952


In [55]:
etreg_k_frame.to_csv(output_folder + '/covbatharomized_etreg_k_frame.csv')

In [56]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_mri_etreg.0 0 covbatharm_mr...,4.027965,0.876624,0.877824


In [57]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,38.719941
1,66.471233,61.389943
2,46.084932,46.160282
3,73.564384,66.292154
4,51.357923,42.679370
...,...,...
140,74.191257,70.968873
141,67.515068,67.431710
142,49.827375,48.989407
143,71.254795,72.480065


In [58]:
etreg_y_frame.to_csv(output_folder + '/covbatharmonized_etreg_y_frame.csv')

In [59]:
etreg = models[0]
etreg[0]

In [60]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'covbatharm_mri_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'covbatharm_mri_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'covbatharm_mri_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'covbatharm_mri_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'covbatharm_mri_etreg4.sav'))

Show results ON AVERAGE for each model

In [61]:
mri_based_covbatharmonized_on_testmri =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
mri_based_covbatharmonized_on_testmri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_mri_linr.0 0 covbatharm_mri...,5.554245,0.754984,0.757185
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_mri_llreg.0 0 covbatharm_mr...,5.771895,0.732898,0.734726
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_mri_dtree.0 0 covbatharm_mr...,5.3832,0.762756,0.766067
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_mri_regr.0 0 covbatharm_mri...,8.038215,0.476739,0.494138
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_mri_svrp2.0 0 covbatharm_mr...,9.714658,0.220459,0.282039
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_mix_eregr.0 0 covbatharm_mi...,8.016432,0.481392,0.486042
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_mri_etreg.0 0 covbatharm_mr...,4.027965,0.876624,0.877824


In [62]:
mri_based_covbatharmonized_on_testmri.to_csv(output_folder + '/mri_based_covbatharmonized_on_testmri_AVERAGES.csv')

## Now we will build  models based on the whole  covbat harmonized StrokeMRI dataset, and apply them to TOP . 

In [63]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [64]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [65]:
MRIlinr = LinearRegression()
MRIlinr.fit(X_train, y_train)

In [66]:
MRIllreg = linear_model.LassoLars(alpha=0.01)
MRIllreg.fit(X_train, y_train)

In [67]:
MRIeregr = ElasticNetCV(cv=5, random_state=17)
MRIeregr.fit(X_train, y_train)


In [68]:
MRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
MRIetreg.fit(X_train, y_train)

In [69]:
##  Save these four best models

In [70]:

## optional save models
#joblib.dump(MRIlinr, ('../result_models/' + 'covbatharm_whole_mri_linr.sav'))
#joblib.dump(MRIllreg, ('../result_models/'+ 'covbatharm_whole_mri_llreg1.sav'))
#joblib.dump(MRIeregr, ('../result_models/'+ 'covbatharm_whole_mri_eregr3.sav'))
#joblib.dump(MRIetreg, ('../result_models/'+ 'covbatharm_whole_mri_etreg4.sav'))

# Running whole MRI model over TOP dataset

In [71]:
top_ml_matrix = TOP.drop('participant_id', axis=1)
X_top = top_ml_matrix.drop('age', axis =1)
X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [72]:
X_top_test = X_top
y_top_test = y_top


In [73]:
y_top_pred = MRIlinr.predict(X_top_test)

In [74]:
# print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
# print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [75]:
data= [[
    'linear regression',
    'covbatharm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIlinr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [76]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,index,y_test_real_age,linr_y_pred_age
0,0,50.40,46.427821
1,1,37.02,44.040439
2,2,30.57,38.362159
3,3,47.05,40.944808
4,4,44.63,44.473204
...,...,...,...
524,524,33.55,42.973337
525,525,44.43,50.043239
526,526,45.60,43.189677
527,527,46.20,46.651173


In [77]:
linr_compare.to_csv(output_folder + '/whole_covbatharm_mri_linr_compare_on_top.csv')

In [78]:
y_top_pred = MRIllreg.predict(X_top_test)

In [79]:
data= [[
    'lasso regression',
    'covbatharm_whole_mri_llreg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIllreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,covbatharm_whole_mri_llreg.sav,7.676812,0.06955,0.604405


In [80]:
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,45.775602
1,37.02,43.631024
2,30.57,40.754899
3,47.05,41.886197
4,44.63,46.502472
...,...,...
524,33.55,45.763646
525,44.43,52.769617
526,45.60,45.921263
527,46.20,48.736642


In [81]:
llreg_compare.to_csv(output_folder + '/whole_covbat_harm_mri_llreg_compare_on_top.csv')

In [82]:
y_top_pred = MRIeregr.predict(X_top_test)

In [83]:
data= [[
    'elasticnetCV',
    'covbatharm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIeregr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [84]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,50.385095
1,37.02,48.492332
2,30.57,44.120782
3,47.05,54.046139
4,44.63,54.754251
...,...,...
524,33.55,54.203578
525,44.43,53.509992
526,45.60,53.908115
527,46.20,55.064625


In [85]:
eregr_compare.to_csv(output_folder + '/whole_covbatharm_mri_eregr_compare_on_top.csv')

In [86]:
y_top_pred = MRIetreg.predict(X_top_test)

In [87]:
data= [[
    'extra trees',
    'neruo_harm_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIetreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [88]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,32.650265
1,37.02,45.030858
2,30.57,46.650401
3,47.05,38.849484
4,44.63,46.253536
...,...,...
524,33.55,50.205740
525,44.43,42.935715
526,45.60,33.396624
527,46.20,33.315945


In [89]:
etreg_compare.to_csv(output_folder + '/whole_covbatharm_mri_etreg_compare_on_top.csv')

In [90]:
mri_based_covbat_harmonized_on_top =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
mri_based_covbat_harmonized_on_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,covbatharm_whole_mri_linr.sav,7.357418,0.125269,0.526608
0,lasso regression,covbatharm_whole_mri_llreg.sav,7.676812,0.06955,0.604405
0,elasticnetCV,covbatharm_whole_mri_linr.sav,14.518302,-1.932495,0.337209
0,extra trees,neruo_harm_mri_linr.sav,10.312571,-0.756728,-0.604434


## Now we will run the exact opposite process.
1. We will explore TOP based models via k-folded results, 
2. We will make a general unahrmonized TOP model (based off all TOP)
3. We will apply the best of these model to the StrokeMRI dataset

### Build ML models based on covbat harmonized TOP 

In [91]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [92]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'covbatharm_top_linr', LinearRegression(), ml_matrix, X, y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [93]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,covbatharm_top_linr.0,3.979201,0.76022,0.774406
0,linear regression-1,1,covbatharm_top_linr.1,5.940817,-7.899959,-7.833692
0,linear regression-2,2,covbatharm_top_linr.2,3.875129,0.70483,0.705445
0,linear regression-3,3,covbatharm_top_linr.3,3.996711,0.748828,0.751521
0,linear regression-4,4,covbatharm_top_linr.4,4.137086,0.732289,0.735101


In [94]:
linr_k_frame.to_csv(output_folder + '/covbatharmonized_top_linr_k_frame.csv')

In [95]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_top_linr.0 0 covbatharm_top...,4.385789,-0.990759,-0.973444


In [96]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,24.768520
1,30.75,34.257983
2,41.66,44.235433
3,31.64,36.898486
4,37.87,42.936560
...,...,...
128,22.95,28.374593
129,40.28,42.231115
130,44.15,43.709433
131,21.42,29.070493


In [97]:
linr_y_frame.to_csv(output_folder + '/covbatharmonized_top_linr_y_frame.csv')

In [98]:
linr = models[0]
linr[0]

In [99]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [100]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'covbatharm_top_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'covbatharm_top_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'covbatharm_top_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'covbatharm_top_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'covbatharm_top_linr4.sav'))

In [101]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'covbatharm_top_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,covbatharm_top_llreg.0,4.229659,0.725736,0.735865
0,lasso regression-1,1,covbatharm_top_llreg.1,6.087013,-7.749109,-7.683089
0,lasso regression-2,2,covbatharm_top_llreg.2,3.966088,0.702589,0.702691
0,lasso regression-3,3,covbatharm_top_llreg.3,4.318321,0.715458,0.720175
0,lasso regression-4,4,covbatharm_top_llreg.4,4.430446,0.693619,0.695462


In [102]:
llreg_k_frame.to_csv(output_folder + '/covbatharmonized_top_llreg_k_frame.csv')

In [103]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_top_llreg.0 0 covbatharm_to...,4.606305,-0.982341,-0.965779


In [104]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,25.210790
1,30.75,34.120376
2,41.66,43.544231
3,31.64,38.008651
4,37.87,43.222970
...,...,...
128,22.95,30.770376
129,40.28,41.010309
130,44.15,41.615009
131,21.42,30.542427


In [105]:
llreg_y_frame.to_csv(output_folder + '/covbatharmonized_top_llreg_y_frame.csv')

In [106]:
llreg = models[0]
llreg[0]

In [107]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'covbatharm_top_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'covbatharm_top_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'covbatharm_top_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'covbatharm_top_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'covbatharm_top_llreg4.sav'))

In [108]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'covbatharm_top_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,covbatharm_top_dtree.0,5.211353,0.597298,0.599601
0,decision tree-1,1,covbatharm_top_dtree.1,4.833684,0.599124,0.601789
0,decision tree-2,2,covbatharm_top_dtree.2,5.129023,0.401185,0.401263
0,decision tree-3,3,covbatharm_top_dtree.3,5.450752,0.483642,0.487325
0,decision tree-4,4,covbatharm_top_dtree.4,4.876692,0.558271,0.601207


In [109]:
dtree_k_frame.to_csv(output_folder + '/covbatharmonized_top_dtree_k_frame.csv')

In [110]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_top_dtree.0 0 covbatharm_to...,5.100301,0.527904,0.538237


In [111]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,30.28
1,30.75,34.09
2,41.66,40.65
3,31.64,39.44
4,37.87,41.94
...,...,...
128,22.95,31.72
129,40.28,38.12
130,44.15,45.61
131,21.42,24.74


In [112]:
dtree_y_frame.to_csv(output_folder + '/covbatharmonized_top_dtree_y_frame.csv')

In [113]:
dtree = models[0]
dtree[0]

In [114]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'covbatharm_top_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,covbatharm_top_regr.0,5.097301,0.598087,0.599621
0,MLP regression-1,1,covbatharm_top_regr.1,9.091053,-18.804416,-18.689384
0,MLP regression-2,2,covbatharm_top_regr.2,4.934712,0.499444,0.499797
0,MLP regression-3,3,covbatharm_top_regr.3,5.966147,0.478053,0.507289
0,MLP regression-4,4,covbatharm_top_regr.4,5.871565,0.477626,0.480358


In [115]:
regr_k_frame.to_csv(output_folder + '/covbatharmonized_top_regr_k_frame.csv')

In [116]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_top_regr.0 0 covbatharm_top...,6.192156,-3.350241,-3.320464


In [117]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,28.657071
1,30.75,37.856386
2,41.66,36.023606
3,31.64,37.764197
4,37.87,40.250031
...,...,...
128,22.95,32.313247
129,40.28,38.479886
130,44.15,39.070492
131,21.42,33.561040


In [118]:
regr_y_frame.to_csv(output_folder + '/covbatharmonized_top_regr_y_frame.csv')

In [119]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [120]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'covbatharm_top_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,covbatharm_top_svrp2.0,8.063533,0.142648,0.143008
0,support vector reg poly2-1,1,covbatharm_top_svrp2.1,9.199639,-4.067339,-4.051061
0,support vector reg poly2-2,2,covbatharm_top_svrp2.2,7.082794,0.13432,0.134576
0,support vector reg poly2-3,3,covbatharm_top_svrp2.3,7.84395,0.131618,0.136105
0,support vector reg poly2-4,4,covbatharm_top_svrp2.4,7.729496,0.107315,0.109915


In [121]:
svrp2_k_frame.to_csv(output_folder + '/covbatharmonized_top_svrp2_k_frame.csv')

In [122]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_top_svrp2.0 0 covbatharm_to...,7.983882,-0.710287,-0.705492


In [123]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,37.810981
1,30.75,38.017189
2,41.66,36.069141
3,31.64,37.036962
4,37.87,38.576819
...,...,...
128,22.95,35.889185
129,40.28,37.822876
130,44.15,37.954106
131,21.42,36.765758


In [124]:
svrp2_y_frame.to_csv(output_folder + '/covbatharmonized_top_svrp2_y_frame.csv')

In [125]:
svrp2 = models[0]
svrp2[0]

In [126]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'covbatharm_top_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,covbatharm_top_eregr.0,7.37356,0.270913,0.270913
0,elasticnetCV-1,1,covbatharm_top_eregr.1,8.536637,-16.62166,-16.468924
0,elasticnetCV-2,2,covbatharm_top_eregr.2,6.492185,0.25298,0.255882
0,elasticnetCV-3,3,covbatharm_top_eregr.3,7.27505,0.240155,0.244065
0,elasticnetCV-4,4,covbatharm_top_eregr.4,7.099332,0.218237,0.219443


In [127]:
eregr_k_frame.to_csv(output_folder + '/covbatharmonized_top_eregr_k_frame.csv')

In [128]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_top_eregr.0 0 covbatharm_to...,7.355353,-3.127875,-3.095724


In [129]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,37.018204
1,30.75,38.186605
2,41.66,34.723780
3,31.64,35.357215
4,37.87,39.152068
...,...,...
128,22.95,33.006870
129,40.28,37.938400
130,44.15,38.455811
131,21.42,34.765854


In [130]:
eregr_y_frame.to_csv(output_folder + '/covbatharmonized_top_eregr_y_frame.csv')

In [131]:
eregr = models[0]
eregr[0]

In [132]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'covbatharm_top_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,covbatharm_top_etreg.0,3.83078,0.770955,0.773368
0,extra trees-1,1,covbatharm_top_etreg.1,3.679321,0.770798,0.770866
0,extra trees-2,2,covbatharm_top_etreg.2,3.557675,0.74771,0.747952
0,extra trees-3,3,covbatharm_top_etreg.3,4.023246,0.744341,0.745305
0,extra trees-4,4,covbatharm_top_etreg.4,3.53935,0.779734,0.783744


In [133]:
etreg_k_frame.to_csv(output_folder + '/covbatharomized_top_etreg_k_frame.csv')

In [134]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_top_etreg.0 0 covbatharm_to...,3.726074,0.762708,0.764247


In [135]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,26.6389
1,30.75,31.2344
2,41.66,39.5583
3,31.64,37.0102
4,37.87,45.4427
...,...,...
128,22.95,29.1843
129,40.28,42.3808
130,44.15,44.8041
131,21.42,29.9308


In [136]:
etreg_y_frame.to_csv(output_folder + '/covbatharmonized_top_etreg_y_frame.csv')

In [137]:
etreg = models[0]
etreg[0]

In [138]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'covbatharm_top_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'covbatharm_top_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'covbatharm_top_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'covbatharm_top_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'covbatharm_top_etreg4.sav'))

Show results ON AVERAGE for each model

In [139]:
top_based_covbatharmonized_on_testtop =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
top_based_covbatharmonized_on_testtop

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 covbatharm_top_linr.0 0 covbatharm_top...,4.385789,-0.990759,-0.973444
0,0 lasso regression-0 0 lasso regression-...,0 covbatharm_top_llreg.0 0 covbatharm_to...,4.606305,-0.982341,-0.965779
0,0 decision tree-0 0 decision tree-1 0 ...,0 covbatharm_top_dtree.0 0 covbatharm_to...,5.100301,0.527904,0.538237
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 covbatharm_top_regr.0 0 covbatharm_top...,6.192156,-3.350241,-3.320464
0,0 support vector reg poly2-0 0 support v...,0 covbatharm_top_svrp2.0 0 covbatharm_to...,7.983882,-0.710287,-0.705492
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 covbatharm_top_eregr.0 0 covbatharm_to...,7.355353,-3.127875,-3.095724
0,0 extra trees-0 0 extra trees-1 0 ext...,0 covbatharm_top_etreg.0 0 covbatharm_to...,3.726074,0.762708,0.764247


In [140]:
top_based_covbatharmonized_on_testtop.to_csv(output_folder + '/top_based_covbatharmonized_on_topt_AVERAGES.csv')

## Now we will build  models based on the whole harmonized StrokeTOP dataset, and apply them to StrokeMRI. 

In [141]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')

In [142]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [143]:
TOPlinr = LinearRegression()
TOPlinr.fit(X_train, y_train)

In [144]:
TOPllreg = linear_model.LassoLars(alpha=0.01)
TOPllreg.fit(X_train, y_train)

In [145]:
TOPeregr = ElasticNetCV(cv=5, random_state=17)
TOPeregr.fit(X_train, y_train)

In [146]:
TOPetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPetreg.fit(X_train, y_train)

##  Save these four best models

In [147]:
## optional save models
#joblib.dump(TOPlinr, ('../result_models/' + 'covbatharm_whole_top_linr.sav'))
#joblib.dump(TOPllreg, ('../result_models/'+ 'covbatharm_whole_top_llreg1.sav'))
#joblib.dump(TOPeregr, ('../result_models/'+ 'covbatharm_whole_top_eregr3.sav'))
#joblib.dump(TOPetreg, ('../result_models/'+ 'covbatharm_whole_top_etreg4.sav'))

# Running whole TOP model over MRI dataset

In [148]:
mri_ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X_mri = mri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = mri_ml_matrix['age'].values
y_mri= y_mri.astype('float')

In [149]:
X_mri_test = X_mri
y_mri_test = y_mri


In [150]:
y_mri_pred = TOPlinr.predict(X_mri_test)

In [151]:
data= [[
    'linear regression',
    'covbatharm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPlinr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [152]:
linr_y_test = y_mri_test
linr_y_pred = y_mri_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
#linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,59.772277
1,47.583562,47.711865
2,74.104110,50.791299
3,68.843836,39.047400
4,61.978082,48.234098
...,...,...
573,67.178082,52.237004
574,67.794521,50.063421
575,54.769863,44.517143
576,66.639344,50.663021


In [153]:
linr_compare.to_csv(output_folder + '/whole_covbat_harm_top_linr_compare_on_mti.csv')

In [154]:
y_mri_pred = TOPllreg.predict(X_mri_test)

In [155]:
data= [[
    'lasso regression',
    'covbatharm_whole_top_llreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPllreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,covbatharm_whole_top_llreg.sav,14.894132,-0.54518,0.437238


In [156]:
llreg_y_test = y_mri_test
llreg_y_pred = y_mri_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,69.962917
1,47.583562,48.219039
2,74.104110,60.466370
3,68.843836,46.065969
4,61.978082,50.304959
...,...,...
573,67.178082,57.437348
574,67.794521,53.364707
575,54.769863,49.837886
576,66.639344,54.163563


In [157]:
llreg_compare.to_csv(output_folder + '/whole_covbatharm_top_llreg_compare_on_mri.csv')

In [158]:
y_mri_pred = TOPeregr.predict(X_mri_test)

In [159]:
data= [[
    'elasticnetCV',
    'covbatharm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPeregr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [160]:
eregr_y_test = y_mri_test
eregr_y_pred = y_mri_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,59.040807
1,47.583562,39.511680
2,74.104110,48.808541
3,68.843836,43.407044
4,61.978082,34.099892
...,...,...
573,67.178082,45.804551
574,67.794521,42.736274
575,54.769863,45.961676
576,66.639344,45.697266


In [161]:
eregr_compare.to_csv(output_folder + '/whole_covbatharm_top_eregr_compare_on_mri.csv')

In [162]:
y_mri_pred = TOPetreg.predict(X_mri_test)

In [163]:
data= [[
    'extra trees',
    'covbatharm_mri_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPetreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [164]:
etreg_y_test = y_mri_test
etreg_y_pred = y_mri_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,48.8202
1,47.583562,40.8444
2,74.104110,50.4111
3,68.843836,48.0489
4,61.978082,41.5862
...,...,...
573,67.178082,48.7266
574,67.794521,50.7632
575,54.769863,45.9259
576,66.639344,49.3995


In [165]:
etreg_compare.to_csv(output_folder + '/whole_covbatharm_top_etreg_compare_on_mri.csv')

compile csvs of results

In [166]:
top_based_covbatharmonized_on_mri =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
top_based_covbatharmonized_on_mri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,covbatharm_whole_top_linr.sav,19.197562,-1.772858,-0.106864
0,lasso regression,covbatharm_whole_top_llreg.sav,14.894132,-0.54518,0.437238
0,elasticnetCV,covbatharm_whole_top_linr.sav,20.542716,-1.602406,0.297321
0,extra trees,covbatharm_mri_linr.sav,17.128484,-0.896096,0.355845


In [167]:
top_based_covbatharmonized_on_mri.to_csv(output_folder + '/whole_top_based_covbatharmonized_on_mri.csv')