# ML testing: experiment #1b- harmonized with neurocombat

This notebook involves testing for the MRI conference abstract. This notebook shows harmonized StrokeMRI and TOP based models, and how they perform on each other, once harmonized by neurcombat

Data: StrokeMRI, TOP

Harmonisation: neurocombat

Training data: StrokeMRI and/or TOP

Testing data: StrokeMRI and/or TOP test subsets

Futher data applied to: none

Validation method: K-fold double-stratified

Brain-age algorithms: LR, lasso, extra trees, elasticCV net fully tested (but not optimized parameters), additionals partly

Outputs: ? what do we want to have here?

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
 
# # demo stuff
import ipywidgets as widgets
from ipywidgets import interactive
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri = 'harmonizations/harm_results/' 
filename_mri = os.path.join(filepath_mri,'neuro_harm_mri_for_1.csv') 

filepath_top = 'harmonizations/harm_results/' 
filename_top = os.path.join(filepath_top,'neuro_harm_top_for_1.csv') 

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [4]:
TOP = TOP.drop(TOP.columns[0],axis=1)
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
#StrokeMRI

In [5]:
TOP.columns = TOP.columns.str.lower() 
StrokeMRI.columns = StrokeMRI.columns.str.lower() 

In [6]:
# # Now we need to flip the sex back to numbers for a correlation
# sex_mapping = {'F':0,'M':1}
# TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
0,sub-0668_1,50.4,1.0,0.644113,0.485854,0.264304,0.461805,0.810731,-0.001404,16.788085,0.269229,0.177786,0.212513,0.258577,0.247661,25.196563,90.142975,80.430985,58.71188,71.928822
1,sub-0532_1,37.02,1.0,0.718871,0.594901,0.318537,0.441834,0.808033,0.005061,26.012756,0.255105,0.176826,0.239043,0.21408,0.240495,24.483328,100.132081,89.082044,67.735298,79.457272
2,sub-0529_1,30.57,1.0,0.669161,0.459851,0.288956,0.473672,0.797081,0.015829,21.355558,0.276498,0.196674,0.210583,0.197991,0.220279,26.141089,106.826247,97.274207,71.929599,85.744065


In [7]:
# StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
0,sub-5917601_1,70.713706,1.0,0.646423,0.439237,0.44631,0.418752,0.707111,0.023474,20.282829,0.307419,0.238407,0.19946,0.302574,0.262404,74.772396,89.276397,79.650183,62.680388,69.156034
1,sub-5931802_1,47.583562,1.0,0.639722,0.498391,0.315831,0.439477,0.781853,0.006808,26.008071,0.234268,0.168788,0.198894,0.191304,0.207276,34.905357,93.831345,84.560377,69.09442,75.51161
2,sub-5911901_1,74.10411,1.0,0.565678,0.488417,0.447893,0.375995,0.700929,0.040916,39.950425,0.204109,0.180748,0.221134,0.271064,0.223909,37.496158,75.070813,62.169504,61.03601,60.9052


In [8]:
#give a checkbox for out put folder
loged_feat = widgets.ToggleButton(
    value=False,
    description='Click me if features logged',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Description',
    icon='check' # (FontAwesome names without the `fa-` prefix)
)
loged_feat

ToggleButton(value=False, description='Click me if features logged', icon='check', tooltip='Description')

In [9]:
loged_feat.value

False

In [10]:
if loged_feat.value == False:
    output_folder = '1b_no_log_outputs'
else:
    output_folder = '1b_loged_outputs'

os.makedirs(output_folder, exist_ok=True)

## Build ML models based on neurocombat StrokeMRI

In [11]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [12]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'neuro_harm_mri_linr', LinearRegression(), ml_matrix, X, y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [13]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,neuro_harm_mri_linr.0,5.37384,0.75751,0.757885
0,linear regression-1,1,neuro_harm_mri_linr.1,5.305093,0.778802,0.789353
0,linear regression-2,2,neuro_harm_mri_linr.2,6.03591,0.701419,0.701479
0,linear regression-3,3,neuro_harm_mri_linr.3,5.730579,0.751178,0.751189
0,linear regression-4,4,neuro_harm_mri_linr.4,5.325815,0.786023,0.786033


In [14]:
linr_k_frame.to_csv(output_folder + '/neuro_harmonized_mri_linr_k_frame.csv')

In [15]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 neuro_harm_mri_linr.0 0 neuro_harm_mri...,5.554247,0.754986,0.757188


In [16]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,48.489600
1,66.471233,58.724097
2,46.084932,48.392709
3,73.564384,68.104044
4,51.357923,42.861305
...,...,...
140,74.191257,62.269679
141,67.515068,64.031025
142,49.827375,58.660132
143,71.254795,69.712890


In [17]:
linr_y_frame.to_csv(output_folder + '/neuro_harmonized_mri_linr_y_frame.csv')

In [18]:
linr = models[0]
linr[0]

In [19]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [20]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'neuro_harm_mri_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'neuro_harm_mri_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'neuro_harm_mri_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'neuro_harm_mri_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'neuro_harm_mri_linr4.sav'))

In [21]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'neuro_harm_mri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,neuro_harm_mri_llreg.0,5.67743,0.730529,0.73104
0,lasso regression-1,1,neuro_harm_mri_llreg.1,5.437946,0.76894,0.777349
0,lasso regression-2,2,neuro_harm_mri_llreg.2,6.168479,0.674264,0.674291
0,lasso regression-3,3,neuro_harm_mri_llreg.3,6.109054,0.716987,0.717172
0,lasso regression-4,4,neuro_harm_mri_llreg.4,5.466647,0.773761,0.773768


In [22]:
llreg_k_frame.to_csv(output_folder + '/neuro_harmonized_mri_llreg_k_frame.csv')

In [23]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 neuro_harm_mri_llreg.0 0 neuro_harm_mr...,5.771911,0.732896,0.734724


In [24]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,48.425238
1,66.471233,58.853288
2,46.084932,49.096615
3,73.564384,67.734372
4,51.357923,43.020244
...,...,...
140,74.191257,63.534075
141,67.515068,64.372385
142,49.827375,61.403330
143,71.254795,71.495612


In [25]:
llreg_y_frame.to_csv(output_folder + '/neuro_harmonized_mri_llreg_y_frame.csv')

In [26]:
llreg = models[0]
llreg[0]

In [27]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'neuro_harm_mri_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'neuro_harm_mri_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'neuro_harm_mri_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'neuro_harm_mri_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'neuro_harm_mri_llreg4.sav'))

In [28]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'neuro_harm_mri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,neuro_harm_mri_dtree.0,5.150051,0.762224,0.763275
0,decision tree-1,1,neuro_harm_mri_dtree.1,5.633059,0.770467,0.779987
0,decision tree-2,2,neuro_harm_mri_dtree.2,5.642214,0.724239,0.72516
0,decision tree-3,3,neuro_harm_mri_dtree.3,5.24082,0.771683,0.773889
0,decision tree-4,4,neuro_harm_mri_dtree.4,5.575503,0.729054,0.72916


In [29]:
dtree_k_frame.to_csv(output_folder + '/neuro_harmonized_mri_dtree_k_frame.csv')

In [30]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 neuro_harm_mri_dtree.0 0 neuro_harm_mr...,5.44833,0.751533,0.754294


In [31]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,42.248881
1,66.471233,59.706849
2,46.084932,37.568763
3,73.564384,68.106849
4,51.357923,48.872049
...,...,...
140,74.191257,79.210959
141,67.515068,64.367123
142,49.827375,51.052055
143,71.254795,71.060274


In [32]:
dtree_y_frame.to_csv(output_folder + '/neuro_harmonized_mri_dtree_y_frame.csv')

In [33]:
dtree = models[0]
dtree[0]

In [34]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'neuro_harm_mri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,neuro_harm_mri_regr.0,7.971571,0.47585,0.476062
0,MLP regression-1,1,neuro_harm_mri_regr.1,7.668439,0.543956,0.545398
0,MLP regression-2,2,neuro_harm_mri_regr.2,8.023781,0.481628,0.49408
0,MLP regression-3,3,neuro_harm_mri_regr.3,8.273465,0.43714,0.437145
0,MLP regression-4,4,neuro_harm_mri_regr.4,8.267472,0.461758,0.462236


In [35]:
regr_k_frame.to_csv('neuro_harmonized_mri_regr_k_frame.csv')

In [36]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 neuro_harm_mri_regr.0 0 neuro_harm_mri...,8.040946,0.480066,0.482984


In [37]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,44.971256
1,66.471233,58.063427
2,46.084932,65.116196
3,73.564384,60.878527
4,51.357923,48.231072
...,...,...
140,74.191257,57.235406
141,67.515068,60.986316
142,49.827375,54.256574
143,71.254795,68.718767


In [38]:
regr_y_frame.to_csv(output_folder + '/neuro_harmonized_mri_regr_y_frame.csv')

In [39]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [40]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'neuro_harm_mri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,neuro_harm_mri_svrp2.0,9.54538,0.200884,0.257404
0,support vector reg poly2-1,1,neuro_harm_mri_svrp2.1,10.312173,0.193176,0.265809
0,support vector reg poly2-2,2,neuro_harm_mri_svrp2.2,9.115738,0.219991,0.302282
0,support vector reg poly2-3,3,neuro_harm_mri_svrp2.3,9.669318,0.249324,0.291507
0,support vector reg poly2-4,4,neuro_harm_mri_svrp2.4,9.930957,0.238869,0.293164


In [41]:
svrp2_k_frame.to_csv(output_folder + '/neuro_harmonized_mri_svrp2_k_frame.csv')

In [42]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 neuro_harm_mri_svrp2.0 0 neuro_harm_mr...,9.714713,0.220449,0.282033


In [43]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,58.532441
1,66.471233,69.074798
2,46.084932,56.511949
3,73.564384,64.963975
4,51.357923,63.785075
...,...,...
140,74.191257,60.843281
141,67.515068,62.439014
142,49.827375,65.462966
143,71.254795,67.447645


In [44]:
svrp2_y_frame.to_csv(output_folder + '/neuro_harmonized_mrisvrp2_y_frame.csv')

In [45]:
svrp2 = models[0]
svrp2[0]

In [46]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'neuro_harm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,neuro_harm_mix_eregr.0,7.691023,0.502325,0.502356
0,elasticnetCV-1,1,neuro_harm_mix_eregr.1,8.324058,0.46774,0.481695
0,elasticnetCV-2,2,neuro_harm_mix_eregr.2,7.802488,0.469198,0.471463
0,elasticnetCV-3,3,neuro_harm_mix_eregr.3,8.467579,0.435746,0.441641
0,elasticnetCV-4,4,neuro_harm_mix_eregr.4,7.797014,0.531952,0.533054


In [47]:
eregr_k_frame.to_csv(output_folder + '/neuro_harmonized_eregr_k_frame.csv')

In [48]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 neuro_harm_mix_eregr.0 0 neuro_harm_mi...,8.016432,0.481392,0.486042


In [49]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,48.501642
1,66.471233,65.156220
2,46.084932,53.896036
3,73.564384,65.086669
4,51.357923,50.672770
...,...,...
140,74.191257,53.700513
141,67.515068,60.542611
142,49.827375,60.339480
143,71.254795,70.870900


In [50]:
eregr_y_frame.to_csv(output_folder + '/neuro_harmonized_mri_eregr_y_frame.csv')

In [51]:
eregr = models[0]
eregr[0]

In [52]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'neuro_harm_mri_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,neuro_harm_mri_etreg.0,3.590285,0.899469,0.899472
0,extra trees-1,1,neuro_harm_mri_etreg.1,4.012978,0.884934,0.890215
0,extra trees-2,2,neuro_harm_mri_etreg.2,4.514413,0.847289,0.84729
0,extra trees-3,3,neuro_harm_mri_etreg.3,3.980705,0.877118,0.877615
0,extra trees-4,4,neuro_harm_mri_etreg.4,4.042529,0.873652,0.873864


In [53]:
etreg_k_frame.to_csv(output_folder + '/neuro_haromized_etreg_k_frame.csv')

In [54]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 neuro_harm_mri_etreg.0 0 neuro_harm_mr...,4.028182,0.876492,0.877691


In [55]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,44.509589,38.719941
1,66.471233,61.389943
2,46.084932,46.160282
3,73.564384,66.299276
4,51.357923,42.679370
...,...,...
140,74.191257,70.968873
141,67.515068,67.431710
142,49.827375,48.989407
143,71.254795,72.464504


In [56]:
etreg_y_frame.to_csv(output_folder + '/neuro_harmonized_etreg_y_frame.csv')

In [57]:
etreg = models[0]
etreg[0]

In [58]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'neuro_harm_mri_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'neuro_harm_mri_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'neuro_harm_mri_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'neuro_harm_mri_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'neuro_harm_mri_etreg4.sav'))

Show results ON AVERAGE for each model

In [59]:
mri_based_neuro_harmonized_on_testmri =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
mri_based_neuro_harmonized_on_testmri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 neuro_harm_mri_linr.0 0 neuro_harm_mri...,5.554247,0.754986,0.757188
0,0 lasso regression-0 0 lasso regression-...,0 neuro_harm_mri_llreg.0 0 neuro_harm_mr...,5.771911,0.732896,0.734724
0,0 decision tree-0 0 decision tree-1 0 ...,0 neuro_harm_mri_dtree.0 0 neuro_harm_mr...,5.44833,0.751533,0.754294
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 neuro_harm_mri_regr.0 0 neuro_harm_mri...,8.040946,0.480066,0.482984
0,0 support vector reg poly2-0 0 support v...,0 neuro_harm_mri_svrp2.0 0 neuro_harm_mr...,9.714713,0.220449,0.282033
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 neuro_harm_mix_eregr.0 0 neuro_harm_mi...,8.016432,0.481392,0.486042
0,0 extra trees-0 0 extra trees-1 0 ext...,0 neuro_harm_mri_etreg.0 0 neuro_harm_mr...,4.028182,0.876492,0.877691


In [60]:
mri_based_neuro_harmonized_on_testmri.to_csv(output_folder + '/mri_based_neuro_harmonized_on_testmri_AVERAGES.csv')

## Now we will build  models based on the whole  neurocombat harmonized StrokeMRI dataset, and apply them to TOP . 

In [61]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [62]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [63]:
MRIlinr = LinearRegression()
MRIlinr.fit(X_train, y_train)

In [64]:
MRIllreg = linear_model.LassoLars(alpha=0.01)
MRIllreg.fit(X_train, y_train)

In [65]:
MRIeregr = ElasticNetCV(cv=5, random_state=17)
MRIeregr.fit(X_train, y_train)


In [66]:
MRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
MRIetreg.fit(X_train, y_train)

In [67]:
##  Save these four best models

In [68]:

## optional save models
#joblib.dump(MRIlinr, ('../result_models/' + 'neuro_harm_whole_mri_linr.sav'))
#joblib.dump(MRIllreg, ('../result_models/'+ 'neuro_harm_whole_mri_llreg1.sav'))
#joblib.dump(MRIeregr, ('../result_models/'+ 'neuro_harm_whole_mri_eregr3.sav'))
#joblib.dump(MRIetreg, ('../result_models/'+ 'neuro_harm_whole_mri_etreg4.sav'))

# Running whole MRI model over TOP dataset

In [69]:
top_ml_matrix = TOP.drop('participant_id', axis=1)
X_top = top_ml_matrix.drop('age', axis =1)
X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [70]:
X_top_test = X_top
y_top_test = y_top


In [71]:
y_top_pred = MRIlinr.predict(X_top_test)

In [72]:
# print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
# print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [73]:
data= [[
    'linear regression',
    'neuro_harm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIlinr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [74]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,index,y_test_real_age,linr_y_pred_age
0,0,50.40,46.425855
1,1,37.02,44.037350
2,2,30.57,38.356746
3,3,47.05,40.944554
4,4,44.63,44.470173
...,...,...,...
524,524,33.55,42.969257
525,525,44.43,50.040332
526,526,45.60,43.185934
527,527,46.20,46.648731


In [75]:
linr_compare.to_csv(output_folder + '/whole_neuro_harm_mri_linr_compare_on_top.csv')

In [76]:
y_top_pred = MRIllreg.predict(X_top_test)

In [77]:
data= [[
    'lasso regression',
    'neuro_harm_whole_mri_llreg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIllreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,neuro_harm_whole_mri_llreg.sav,7.670351,0.070876,0.604403


In [78]:
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,45.766892
1,37.02,43.622200
2,30.57,40.746230
3,47.05,41.877371
4,44.63,46.493809
...,...,...
524,33.55,45.755038
525,44.43,52.760915
526,45.60,45.912453
527,46.20,48.727915


In [79]:
llreg_compare.to_csv(output_folder + '/whole_neuroharm_mri_llreg_compare_on_top.csv')

In [80]:
y_top_pred = MRIeregr.predict(X_top_test)

In [81]:
data= [[
    'elasticnetCV',
    'neuro_harm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIeregr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [82]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,50.373300
1,37.02,48.480570
2,30.57,44.109027
3,47.05,54.034342
4,44.63,54.742495
...,...,...
524,33.55,54.191813
525,44.43,53.498214
526,45.60,53.896299
527,46.20,55.052826


In [83]:
eregr_compare.to_csv(output_folder + '/whole_neuro_harm_mri_eregr_compare_on_top.csv')

In [84]:
y_top_pred = MRIetreg.predict(X_top_test)

In [85]:
data= [[
    'extra trees',
    'neruo_harm_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIetreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [86]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,50.40,32.612278
1,37.02,44.629771
2,30.57,46.680123
3,47.05,38.943151
4,44.63,45.987756
...,...,...
524,33.55,50.281091
525,44.43,42.662737
526,45.60,33.084001
527,46.20,33.164025


In [87]:
etreg_compare.to_csv(output_folder + '/whole_neuro_harm_mri_etreg_compare_on_top.csv')

In [88]:
mri_based_neurocombat_harmonized_on_top =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
mri_based_neurocombat_harmonized_on_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,neuro_harm_whole_mri_linr.sav,7.355296,0.125719,0.526631
0,lasso regression,neuro_harm_whole_mri_llreg.sav,7.670351,0.070876,0.604403
0,elasticnetCV,neuro_harm_whole_mri_linr.sav,14.50715,-1.928793,0.33721
0,extra trees,neruo_harm_mri_linr.sav,10.304266,-0.753124,-0.607283


## Now we will run the exact opposite process.
1. We will explore TOP based models via k-folded results, 
2. We will make a general unahrmonized TOP model (based off all TOP)
3. We will apply the best of these model to the StrokeMRI dataset

### Build ML models based on neruocombat harmonized TOP 

In [89]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [90]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'neuro_harm_top_linr', LinearRegression(), ml_matrix, X, y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [91]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,neuro_harm_top_linr.0,3.979226,0.760218,0.774403
0,linear regression-1,1,neuro_harm_top_linr.1,5.940799,-7.899934,-7.833667
0,linear regression-2,2,neuro_harm_top_linr.2,3.875055,0.704843,0.705457
0,linear regression-3,3,neuro_harm_top_linr.3,3.996717,0.748827,0.751523
0,linear regression-4,4,neuro_harm_top_linr.4,4.13695,0.732303,0.735114


In [92]:
linr_k_frame.to_csv(output_folder + '/neuro_harmonized_top_linr_k_frame.csv')

In [93]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 neuro_harm_top_linr.0 0 neuro_harm_top...,4.385749,-0.990749,-0.973434


In [94]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,24.769312
1,30.75,34.256825
2,41.66,44.234658
3,31.64,36.896013
4,37.87,42.936674
...,...,...
128,22.95,28.374716
129,40.28,42.229713
130,44.15,43.709509
131,21.42,29.071398


In [95]:
linr_y_frame.to_csv(output_folder + '/neuro_harmonized_top_linr_y_frame.csv')

In [96]:
linr = models[0]
linr[0]

In [97]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [98]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'neuro_harm_top_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'neuro_harm_top_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'neuro_harm_top_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'neuro_harm_top_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'neuro_harm_top_linr4.sav'))

In [99]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'neuro_harm_top_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,neuro_harm_top_llreg.0,4.22966,0.725736,0.735865
0,lasso regression-1,1,neuro_harm_top_llreg.1,6.087008,-7.749052,-7.683032
0,lasso regression-2,2,neuro_harm_top_llreg.2,3.966091,0.702588,0.70269
0,lasso regression-3,3,neuro_harm_top_llreg.3,4.318321,0.715457,0.720175
0,lasso regression-4,4,neuro_harm_top_llreg.4,4.430447,0.693619,0.695462


In [100]:
llreg_k_frame.to_csv(output_folder + '/neuro_harmonized_top_llreg_k_frame.csv')

In [101]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 neuro_harm_top_llreg.0 0 neuro_harm_to...,4.606305,-0.98233,-0.965768


In [102]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,25.210756
1,30.75,34.120374
2,41.66,43.544281
3,31.64,38.008597
4,37.87,43.222901
...,...,...
128,22.95,30.770361
129,40.28,41.010344
130,44.15,41.615041
131,21.42,30.542464


In [103]:
llreg_y_frame.to_csv(output_folder + '/neuro_harmonized_top_llreg_y_frame.csv')

In [104]:
llreg = models[0]
llreg[0]

In [105]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'neuro_harm_top_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'neuro_harm_top_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'neuro_harm_top_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'neuro_harm_top_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'neuro_harm_top_llreg4.sav'))

In [106]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'neuro_harm_top_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,neuro_harm_top_dtree.0,5.230376,0.585434,0.587183
0,decision tree-1,1,neuro_harm_top_dtree.1,5.003308,0.546271,0.549006
0,decision tree-2,2,neuro_harm_top_dtree.2,5.348947,0.354419,0.354655
0,decision tree-3,3,neuro_harm_top_dtree.3,5.556391,0.482213,0.483139
0,decision tree-4,4,neuro_harm_top_dtree.4,4.920451,0.532895,0.574757


In [107]:
dtree_k_frame.to_csv(output_folder + '/neuro_harmonized_top_dtree_k_frame.csv')

In [108]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 neuro_harm_top_dtree.0 0 neuro_harm_to...,5.211895,0.500247,0.509748


In [109]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,30.28
1,30.75,34.88
2,41.66,40.65
3,31.64,39.44
4,37.87,41.94
...,...,...
128,22.95,31.72
129,40.28,41.30
130,44.15,45.50
131,21.42,24.74


In [110]:
dtree_y_frame.to_csv(output_folder + '/neuro_harmonized_top_dtree_y_frame.csv')

In [111]:
dtree = models[0]
dtree[0]

In [112]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'neuro_harm_top_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,neuro_harm_top_regr.0,5.175578,0.588204,0.588305
0,MLP regression-1,1,neuro_harm_top_regr.1,9.069018,-18.665297,-18.550246
0,MLP regression-2,2,neuro_harm_top_regr.2,4.914676,0.500836,0.501344
0,MLP regression-3,3,neuro_harm_top_regr.3,5.73836,0.508471,0.511603
0,MLP regression-4,4,neuro_harm_top_regr.4,5.721145,0.497922,0.500268


In [113]:
regr_k_frame.to_csv(output_folder + '/neuro_harmonized_top_regr_k_frame.csv')

In [114]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 neuro_harm_top_regr.0 0 neuro_harm_top...,6.123755,-3.313973,-3.289745


In [115]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,29.569763
1,30.75,38.485599
2,41.66,36.531063
3,31.64,37.812753
4,37.87,40.614137
...,...,...
128,22.95,32.018706
129,40.28,38.261211
130,44.15,38.975832
131,21.42,33.262887


In [116]:
regr_y_frame.to_csv(output_folder + '/neuro_harmonized_top_regr_y_frame.csv')

In [117]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [118]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'neuro_harm_top_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,neuro_harm_top_svrp2.0,8.063488,0.142657,0.143016
0,support vector reg poly2-1,1,neuro_harm_top_svrp2.1,9.199649,-4.067374,-4.051096
0,support vector reg poly2-2,2,neuro_harm_top_svrp2.2,7.082794,0.134321,0.134576
0,support vector reg poly2-3,3,neuro_harm_top_svrp2.3,7.843937,0.131621,0.136108
0,support vector reg poly2-4,4,neuro_harm_top_svrp2.4,7.729475,0.107319,0.109921


In [119]:
svrp2_k_frame.to_csv(output_folder + '/neuro_harmonized_top_svrp2_k_frame.csv')

In [120]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 neuro_harm_top_svrp2.0 0 neuro_harm_to...,7.983868,-0.710291,-0.705495


In [121]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,37.810885
1,30.75,38.017114
2,41.66,36.068921
3,31.64,37.036776
4,37.87,38.576758
...,...,...
128,22.95,35.889160
129,40.28,37.823123
130,44.15,37.954370
131,21.42,36.765821


In [122]:
svrp2_y_frame.to_csv(output_folder + '/neuro_harmonized_top_svrp2_y_frame.csv')

In [123]:
svrp2 = models[0]
svrp2[0]

In [124]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'neuro_harm_top_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,neuro_harm_top_eregr.0,7.373563,0.270913,0.270913
0,elasticnetCV-1,1,neuro_harm_top_eregr.1,8.536638,-16.621591,-16.468855
0,elasticnetCV-2,2,neuro_harm_top_eregr.2,6.492187,0.25298,0.255882
0,elasticnetCV-3,3,neuro_harm_top_eregr.3,7.275052,0.240154,0.244065
0,elasticnetCV-4,4,neuro_harm_top_eregr.4,7.099334,0.218236,0.219443


In [125]:
eregr_k_frame.to_csv(output_folder + '/neuro_harmonized_top_eregr_k_frame.csv')

In [126]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 neuro_harm_top_eregr.0 0 neuro_harm_to...,7.355355,-3.127861,-3.095711


In [127]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,37.018209
1,30.75,38.186611
2,41.66,34.723788
3,31.64,35.357204
4,37.87,39.152056
...,...,...
128,22.95,33.006853
129,40.28,37.938408
130,44.15,38.455808
131,21.42,34.765851


In [128]:
eregr_y_frame.to_csv(output_folder + '/neuro_harmonized_top_eregr_y_frame.csv')

In [129]:
eregr = models[0]
eregr[0]

In [130]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'neuro_harm_top_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,neuro_harm_top_etreg.0,3.83078,0.770955,0.773368
0,extra trees-1,1,neuro_harm_top_etreg.1,3.67949,0.770494,0.770561
0,extra trees-2,2,neuro_harm_top_etreg.2,3.56183,0.746806,0.747022
0,extra trees-3,3,neuro_harm_top_etreg.3,4.024142,0.744218,0.745178
0,extra trees-4,4,neuro_harm_top_etreg.4,3.539871,0.779604,0.783521


In [131]:
etreg_k_frame.to_csv(output_folder + '/neuro_haromized_top_etreg_k_frame.csv')

In [132]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 neuro_harm_top_etreg.0 0 neuro_harm_to...,3.727223,0.762416,0.76393


In [133]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,21.26,26.6389
1,30.75,31.2344
2,41.66,39.5583
3,31.64,37.0102
4,37.87,45.4427
...,...,...
128,22.95,29.1843
129,40.28,42.3808
130,44.15,44.8041
131,21.42,29.9308


In [134]:
etreg_y_frame.to_csv(output_folder + '/neuro_harmonized_top_etreg_y_frame.csv')

In [135]:
etreg = models[0]
etreg[0]

In [136]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'neuro_harm_top_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'neuro_harm_top_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'neuro_harm_top_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'neuro_harm_top_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'neuro_harm_top_etreg4.sav'))

Show results ON AVERAGE for each model

In [137]:
top_based_neuro_harmonized_on_testtop =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
top_based_neuro_harmonized_on_testtop

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 neuro_harm_top_linr.0 0 neuro_harm_top...,4.385749,-0.990749,-0.973434
0,0 lasso regression-0 0 lasso regression-...,0 neuro_harm_top_llreg.0 0 neuro_harm_to...,4.606305,-0.98233,-0.965768
0,0 decision tree-0 0 decision tree-1 0 ...,0 neuro_harm_top_dtree.0 0 neuro_harm_to...,5.211895,0.500247,0.509748
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 neuro_harm_top_regr.0 0 neuro_harm_top...,6.123755,-3.313973,-3.289745
0,0 support vector reg poly2-0 0 support v...,0 neuro_harm_top_svrp2.0 0 neuro_harm_to...,7.983868,-0.710291,-0.705495
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 neuro_harm_top_eregr.0 0 neuro_harm_to...,7.355355,-3.127861,-3.095711
0,0 extra trees-0 0 extra trees-1 0 ext...,0 neuro_harm_top_etreg.0 0 neuro_harm_to...,3.727223,0.762416,0.76393


In [138]:
top_based_neuro_harmonized_on_testtop.to_csv(output_folder + '/top_based_neuro_harmonized_on_topt_AVERAGES.csv')

## Now we will build  models based on the whole harmonized StrokeTOP dataset, and apply them to StrokeMRI. 

In [139]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')

In [140]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [141]:
TOPlinr = LinearRegression()
TOPlinr.fit(X_train, y_train)

In [142]:
TOPllreg = linear_model.LassoLars(alpha=0.01)
TOPllreg.fit(X_train, y_train)

In [143]:
TOPeregr = ElasticNetCV(cv=5, random_state=17)
TOPeregr.fit(X_train, y_train)

In [144]:
TOPetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPetreg.fit(X_train, y_train)

##  Save these four best models

In [145]:
## optional save models
#joblib.dump(TOPlinr, ('../result_models/' + 'neuro_harm_whole_top_linr.sav'))
#joblib.dump(TOPllreg, ('../result_models/'+ 'neuro_harm_whole_top_llreg1.sav'))
#joblib.dump(TOPeregr, ('../result_models/'+ 'neuro_harm_whole_top_eregr3.sav'))
#joblib.dump(TOPetreg, ('../result_models/'+ 'neuro_harm_whole_top_etreg4.sav'))

# Running whole TOP model over MRI dataset

In [146]:
mri_ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X_mri = mri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = mri_ml_matrix['age'].values
y_mri= y_mri.astype('float')

In [147]:
X_mri_test = X_mri
y_mri_test = y_mri


In [148]:
y_mri_pred = TOPlinr.predict(X_mri_test)

In [149]:
data= [[
    'linear regression',
    'neuro_harm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPlinr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [150]:
linr_y_test = y_mri_test
linr_y_pred = y_mri_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
#linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,59.769904
1,47.583562,47.708161
2,74.104110,50.787772
3,68.843836,39.045269
4,61.978082,48.231217
...,...,...
573,67.178082,52.232948
574,67.794521,50.059484
575,54.769863,44.513889
576,66.639344,50.658988


In [151]:
linr_compare.to_csv(output_folder + '/whole_neuro-_harm_top_linr_compare_on_mti.csv')

In [152]:
y_mri_pred = TOPllreg.predict(X_mri_test)

In [153]:
data= [[
    'lasso regression',
    'neuro_harm_whole_top_llreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPllreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,neuro_harm_whole_top_llreg.sav,14.89184,-0.544803,0.437229


In [154]:
llreg_y_test = y_mri_test
llreg_y_pred = y_mri_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,69.965647
1,47.583562,48.221902
2,74.104110,60.468991
3,68.843836,46.068648
4,61.978082,50.307780
...,...,...
573,67.178082,57.440137
574,67.794521,53.367508
575,54.769863,49.840629
576,66.639344,54.166383


In [155]:
llreg_compare.to_csv(output_folder + '/whole_neuro_harm_top_llreg_compare_on_mri.csv')

In [156]:
y_mri_pred = TOPeregr.predict(X_mri_test)

In [157]:
data= [[
    'elasticnetCV',
    'neuro_harm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPeregr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [158]:
eregr_y_test = y_mri_test
eregr_y_pred = y_mri_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,59.042759
1,47.583562,39.513585
2,74.104110,48.810421
3,68.843836,43.408929
4,61.978082,34.101805
...,...,...
573,67.178082,45.806459
574,67.794521,42.738157
575,54.769863,45.963562
576,66.639344,45.699180


In [159]:
eregr_compare.to_csv(output_folder + '/whole_neuro_harm_top_eregr_compare_on_mri.csv')

In [160]:
y_mri_pred = TOPetreg.predict(X_mri_test)

In [161]:
data= [[
    'extra trees',
    'neuro_harm_mri_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPetreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [162]:
etreg_y_test = y_mri_test
etreg_y_pred = y_mri_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,70.713706,48.9110
1,47.583562,40.7414
2,74.104110,50.4111
3,68.843836,48.1517
4,61.978082,41.5862
...,...,...
573,67.178082,48.7266
574,67.794521,50.7632
575,54.769863,45.9259
576,66.639344,49.3995


In [163]:
etreg_compare.to_csv(output_folder + '/whole_neuro_harm_top_etreg_compare_on_mri.csv')

compile csvs of results

In [164]:
top_based_neuro_harmonized_on_mri =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
top_based_neuro_harmonized_on_mri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,neuro_harm_whole_top_linr.sav,19.200313,-1.773389,-0.106785
0,lasso regression,neuro_harm_whole_top_llreg.sav,14.89184,-0.544803,0.437229
0,elasticnetCV,neuro_harm_whole_top_linr.sav,20.541108,-1.602038,0.297321
0,extra trees,neuro_harm_mri_linr.sav,17.128253,-0.895971,0.356594


In [165]:
top_based_neuro_harmonized_on_mri.to_csv(output_folder + '/whole_top_based_neuro_harmonized_on_mri.csv')