# ML testing: experiment #1e- harmonized with neuroharmony (neurocombat-GAM, non linear)

This notebook involves testing for the MRI conference abstract. This notebook shows harmonized StrokeMRI and TOP based models, and how they perform on each other, once harmonized by neurharmony

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
 
# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri = '../open_work/internal_results/neurocharm/' 
filename_mri = os.path.join(filepath_mri,'stroke_neuroharm_to_top.csv') 

filepath_top = '../open_work/internal_results/neurocharm/' 
filename_top = os.path.join(filepath_top,'stroke_neuroharm_to_top.csv') 

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [4]:
TOP.head(3)

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,527,sub-59082_1_ses-1_run-1,43.172603,0,0.6448,0.49387,0.309698,0.444971,0.785736,1.411956,...,8.267411,1.727019,1.96123,1.732827,1.912408,29.666115,107.419196,94.036854,75.96918,88.193616
1,528,sub-59083_1_ses-1_run-1,66.367123,0,0.60338,0.485493,0.417703,0.399662,0.72208,13.269906,...,3.915929,1.951211,1.719263,2.548152,1.884593,23.784183,74.846811,63.898945,67.378433,68.232488
2,529,sub-59085_1_ses-1_run-1,55.838356,0,0.614763,0.537819,0.335135,0.413834,0.77398,6.235503,...,5.277166,1.81164,1.965192,2.060881,1.945008,31.20807,80.532472,72.696054,60.616944,63.259764


In [5]:
StrokeMRI.head(3)

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,527,sub-59082_1_ses-1_run-1,43.172603,0,0.6448,0.49387,0.309698,0.444971,0.785736,1.411956,...,8.267411,1.727019,1.96123,1.732827,1.912408,29.666115,107.419196,94.036854,75.96918,88.193616
1,528,sub-59083_1_ses-1_run-1,66.367123,0,0.60338,0.485493,0.417703,0.399662,0.72208,13.269906,...,3.915929,1.951211,1.719263,2.548152,1.884593,23.784183,74.846811,63.898945,67.378433,68.232488
2,529,sub-59085_1_ses-1_run-1,55.838356,0,0.614763,0.537819,0.335135,0.413834,0.77398,6.235503,...,5.277166,1.81164,1.965192,2.060881,1.945008,31.20807,80.532472,72.696054,60.616944,63.259764


In [6]:
TOP = TOP.drop(TOP.columns[0],axis=1)
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
StrokeMRI

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.644800,0.493870,0.309698,0.444971,0.785736,1.411956,23.166862,8.267411,1.727019,1.961230,1.732827,1.912408,29.666115,107.419196,94.036854,75.969180,88.193616
1,sub-59083_1_ses-1_run-1,66.367123,0,0.603380,0.485493,0.417703,0.399662,0.722080,13.269906,25.398351,3.915929,1.951211,1.719263,2.548152,1.884593,23.784183,74.846811,63.898945,67.378433,68.232488
2,sub-59085_1_ses-1_run-1,55.838356,0,0.614763,0.537819,0.335135,0.413834,0.773980,6.235503,28.600948,5.277166,1.811640,1.965192,2.060881,1.945008,31.208070,80.532472,72.696054,60.616944,63.259764
3,sub-59086_1_ses-1_run-1,48.238356,0,0.631335,0.484823,0.298599,0.446481,0.788842,3.179088,22.712478,5.222266,1.673500,1.919052,2.010093,1.779866,24.598684,73.472966,68.702930,53.035827,59.135015
4,sub-59087_1_ses-1_run-1,58.616438,0,0.569735,0.455592,0.349758,0.414994,0.746442,3.010439,23.365088,6.881637,1.990617,1.864940,1.994002,1.913093,8.136629,59.417452,47.994307,40.104308,43.603597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,sub-59440_1_ses-1_run-1,73.928767,1,0.568202,0.525958,0.514156,0.354034,0.680986,24.006931,56.791394,2.353910,2.033042,1.999061,1.936157,2.067482,31.433102,84.877995,75.689139,55.400161,60.576995
510,sub-59440_2_ses-2_run-1,74.769863,1,0.571078,0.508225,0.522324,0.356979,0.674683,21.040159,32.110319,2.436769,2.512833,2.563213,2.236574,2.550664,33.350704,84.997781,81.147143,69.234088,63.970912
511,sub-59441_2_ses-2_run-1,74.512329,1,0.582930,0.474644,0.430297,0.392828,0.712397,7.738172,30.548252,4.001709,1.860833,1.863751,1.728481,1.771178,29.447091,64.984756,64.236570,53.387656,54.035965
512,sub-59442_1_ses-1_run-1,67.526027,1,0.636604,0.551915,0.398417,0.402065,0.749047,9.717590,13.877263,5.111400,1.714960,1.966579,2.150720,1.877366,22.110746,71.397583,65.141920,57.576579,59.579270


## Build ML models based on neurocombat StrokeMRI

In [7]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')

In [8]:
linr_k_frame, linr_y_frame, models = sep.stratified_one_category_shuffle_split('linear regression', 'neuro_harm_mri_linr', LinearRegression(), ml_matrix, X, y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [9]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,neuro_harm_mri_linr.0,5.600948,0.729061,0.729088
0,linear regression-1,1,neuro_harm_mri_linr.1,4.91882,0.804779,0.806379
0,linear regression-2,2,neuro_harm_mri_linr.2,5.4217,0.707509,0.71
0,linear regression-3,3,neuro_harm_mri_linr.3,4.886924,0.760421,0.76208
0,linear regression-4,4,neuro_harm_mri_linr.4,5.836918,0.723401,0.734491


In [10]:
linr_k_frame.to_csv('HARMneuro_harmonized_mri_linr_k_frame.csv')

In [11]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 neuro_harm_mri_linr.0 0 neuro_harm_mri...,5.333062,0.745034,0.748408


In [12]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,55.004233
1,77.601093,78.749024
2,66.041096,51.672796
3,71.479452,67.795418
4,72.630137,76.317562
...,...,...
124,72.331507,74.582645
125,50.918819,45.287759
126,70.180328,53.703098
127,42.016393,47.074433


In [13]:
linr_y_frame.to_csv('HARMneuro_harmonized_mri_linr_y_frame.csv')

In [14]:
linr = models[0]
linr[0]

In [15]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [16]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'HARMneuro_harm_mri_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'HARMneuro_harm_mri_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'HARMneuro_harm_mri_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'HARMneuro_harm_mri_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'HARMneuro_harm_mri_linr4.sav'))

In [17]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_one_category_shuffle_split('lasso regression', 'HARMneuro_harm_mri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,HARMneuro_harm_mri_llreg.0,5.483282,0.735343,0.735437
0,lasso regression-1,1,HARMneuro_harm_mri_llreg.1,4.797875,0.81063,0.812447
0,lasso regression-2,2,HARMneuro_harm_mri_llreg.2,5.216478,0.720623,0.723995
0,lasso regression-3,3,HARMneuro_harm_mri_llreg.3,4.808385,0.765028,0.767379
0,lasso regression-4,4,HARMneuro_harm_mri_llreg.4,5.637852,0.734984,0.744773


In [18]:
llreg_k_frame.to_csv('HARMneuro_harmonized_mri_llreg_k_frame.csv')

In [19]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 HARMneuro_harm_mri_llreg.0 0 HARMneuro...,5.188774,0.753322,0.756806


In [20]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,54.992712
1,77.601093,78.353980
2,66.041096,53.340831
3,71.479452,68.335367
4,72.630137,76.346995
...,...,...
124,72.331507,73.953625
125,50.918819,46.201334
126,70.180328,53.806488
127,42.016393,46.962212


In [21]:
llreg_y_frame.to_csv('HARMneuro_harmonized_mri_llreg_y_frame.csv')

In [22]:
llreg = models[0]
llreg[0]

In [23]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'HARMneuro_harm_mri_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'HARMneuro_harm_mri_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'HARMneuro_harm_mri_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'HARMneuro_harm_mri_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'HARMneuro_harm_mri_llreg4.sav'))

In [24]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_one_category_shuffle_split('decision tree', 'HARMneuro_harm_mri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,HARMneuro_harm_mri_dtree.0,5.526676,0.733339,0.733924
0,decision tree-1,1,HARMneuro_harm_mri_dtree.1,5.895963,0.725367,0.725539
0,decision tree-2,2,HARMneuro_harm_mri_dtree.2,5.499788,0.685763,0.687511
0,decision tree-3,3,HARMneuro_harm_mri_dtree.3,4.828597,0.774328,0.774805
0,decision tree-4,4,HARMneuro_harm_mri_dtree.4,5.114374,0.77355,0.774528


In [25]:
dtree_k_frame.to_csv('HARMneuro_harmonized_mri_dtree_k_frame.csv')

In [26]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 HARMneuro_harm_mri_dtree.0 0 HARMneuro...,5.373079,0.738469,0.739261


In [27]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,49.317808
1,77.601093,74.802740
2,66.041096,54.701370
3,71.479452,61.271233
4,72.630137,69.158904
...,...,...
124,72.331507,72.315068
125,50.918819,45.284932
126,70.180328,74.235616
127,42.016393,50.180822


In [28]:
dtree_y_frame.to_csv('HARMneuro_harmonized_mri_dtree_y_frame.csv')

In [29]:
dtree = models[0]
dtree[0]

In [30]:
regr_k_frame, regr_y_frame, models = sep.stratified_one_category_shuffle_split('MLP regression', 'HARMneuro_harm_mri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,HARMneuro_harm_mri_regr.0,5.90761,0.70122,0.702995
0,MLP regression-1,1,HARMneuro_harm_mri_regr.1,5.69563,0.751461,0.77122
0,MLP regression-2,2,HARMneuro_harm_mri_regr.2,5.151822,0.753565,0.760884
0,MLP regression-3,3,HARMneuro_harm_mri_regr.3,5.700283,0.707202,0.707286
0,MLP regression-4,4,HARMneuro_harm_mri_regr.4,6.156973,0.695705,0.696634


In [31]:
regr_k_frame.to_csv('HARMneuro_harmonized_mri_regr_k_frame.csv')

In [32]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 HARMneuro_harm_mri_regr.0 0 HARMneuro_...,5.722464,0.72183,0.727804


In [33]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,42.879222
1,77.601093,69.814808
2,66.041096,60.575170
3,71.479452,71.481277
4,72.630137,88.040937
...,...,...
124,72.331507,74.096130
125,50.918819,41.495514
126,70.180328,60.213695
127,42.016393,46.474635


In [34]:
regr_y_frame.to_csv('HARMneuro_harmonized_mri_regr_y_frame.csv')

In [35]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [36]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_one_category_shuffle_split('support vector reg poly2', 'HARMneuro_harm_mri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,HARMneuro_harm_mri_svrp2.0,9.312183,0.256375,0.26772
0,support vector reg poly2-1,1,HARMneuro_harm_mri_svrp2.1,8.730392,0.35389,0.425499
0,support vector reg poly2-2,2,HARMneuro_harm_mri_svrp2.2,8.456701,0.365014,0.373129
0,support vector reg poly2-3,3,HARMneuro_harm_mri_svrp2.3,8.157983,0.367104,0.36758
0,support vector reg poly2-4,4,HARMneuro_harm_mri_svrp2.4,8.583163,0.345462,0.381483


In [37]:
svrp2_k_frame.to_csv('HARMneuro_harmonized_mri_svrp2_k_frame.csv')

In [38]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 HARMneuro_harm_mri_svrp2.0 0 HARMneuro...,8.648084,0.337569,0.363082


In [39]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,63.788070
1,77.601093,65.674894
2,66.041096,58.324031
3,71.479452,64.745224
4,72.630137,68.506028
...,...,...
124,72.331507,64.937372
125,50.918819,59.015645
126,70.180328,51.509842
127,42.016393,62.842440


In [40]:
svrp2_y_frame.to_csv('HARMneuro_harmonized_mrisvrp2_y_frame.csv')

In [41]:
svrp2 = models[0]
svrp2[0]

In [42]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_one_category_shuffle_split('elasticnetCV', 'HARMneuro_harm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,HARMneuro_harm_mix_eregr.0,6.9869,0.591701,0.592039
0,elasticnetCV-1,1,HARMneuro_harm_mix_eregr.1,6.113391,0.69651,0.696662
0,elasticnetCV-2,2,HARMneuro_harm_mix_eregr.2,6.47692,0.622017,0.638156
0,elasticnetCV-3,3,HARMneuro_harm_mix_eregr.3,6.152406,0.638483,0.646
0,elasticnetCV-4,4,HARMneuro_harm_mix_eregr.4,6.791199,0.620769,0.622587


In [43]:
eregr_k_frame.to_csv('HARMneuro_harmonized_eregr_k_frame.csv')

In [44]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 HARMneuro_harm_mix_eregr.0 0 HARMneuro...,6.504163,0.633896,0.639089


In [45]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,52.670154
1,77.601093,75.042095
2,66.041096,59.506336
3,71.479452,66.602528
4,72.630137,66.677739
...,...,...
124,72.331507,71.117569
125,50.918819,45.111649
126,70.180328,53.413002
127,42.016393,50.174277


In [46]:
eregr_y_frame.to_csv('HARMneuro_harmonized_mri_eregr_y_frame.csv')

In [47]:
eregr = models[0]
eregr[0]

In [48]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_one_category_shuffle_split('extra trees', 'HARMneuro_harm_mri_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,HARMneuro_harm_mri_etreg.0,3.932544,0.864796,0.865758
0,extra trees-1,1,HARMneuro_harm_mri_etreg.1,4.06687,0.872082,0.872811
0,extra trees-2,2,HARMneuro_harm_mri_etreg.2,4.120503,0.832311,0.832373
0,extra trees-3,3,HARMneuro_harm_mri_etreg.3,3.667449,0.875932,0.876397
0,extra trees-4,4,HARMneuro_harm_mri_etreg.4,4.264123,0.855642,0.858889


In [49]:
etreg_k_frame.to_csv('HARMneuro_haromized_etreg_k_frame.csv')

In [50]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 HARMneuro_harm_mri_etreg.0 0 HARMneuro...,4.010298,0.860153,0.861245


In [51]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,53.845597
1,77.601093,74.848573
2,66.041096,56.487159
3,71.479452,71.354130
4,72.630137,71.169052
...,...,...
124,72.331507,71.959758
125,50.918819,45.904459
126,70.180328,61.916610
127,42.016393,45.984831


In [52]:
etreg_y_frame.to_csv('HARMneuro_harmonized_etreg_y_frame.csv')

In [53]:
etreg = models[0]
etreg[0]

In [54]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'HARMneuro_harm_mri_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'HARMneuro_harm_mri_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'HARMneuro_harm_mri_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'HARMneuro_harm_mri_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'HARMneuro_harm_mri_etreg4.sav'))

Show results ON AVERAGE for each model

In [55]:
mri_based_neuro_harmonized_on_testmri =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
mri_based_neuro_harmonized_on_testmri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 neuro_harm_mri_linr.0 0 neuro_harm_mri...,5.333062,0.745034,0.748408
0,0 lasso regression-0 0 lasso regression-...,0 HARMneuro_harm_mri_llreg.0 0 HARMneuro...,5.188774,0.753322,0.756806
0,0 decision tree-0 0 decision tree-1 0 ...,0 HARMneuro_harm_mri_dtree.0 0 HARMneuro...,5.373079,0.738469,0.739261
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 HARMneuro_harm_mri_regr.0 0 HARMneuro_...,5.722464,0.72183,0.727804
0,0 support vector reg poly2-0 0 support v...,0 HARMneuro_harm_mri_svrp2.0 0 HARMneuro...,8.648084,0.337569,0.363082
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 HARMneuro_harm_mix_eregr.0 0 HARMneuro...,6.504163,0.633896,0.639089
0,0 extra trees-0 0 extra trees-1 0 ext...,0 HARMneuro_harm_mri_etreg.0 0 HARMneuro...,4.010298,0.860153,0.861245


In [56]:
mri_based_neuro_harmonized_on_testmri.to_csv('HARMmri_based_neuro_harmonized_on_testmri_AVERAGES.csv')

## Now we will build  models based on the whole  neurocombat harmonized StrokeMRI dataset, and apply them to TOP . 

In [57]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [58]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [59]:
MRIlinr = LinearRegression()
MRIlinr.fit(X_train, y_train)

In [60]:
MRIllreg = linear_model.LassoLars(alpha=0.01)
MRIllreg.fit(X_train, y_train)

In [61]:
MRIeregr = ElasticNetCV(cv=5, random_state=17)
MRIeregr.fit(X_train, y_train)


In [62]:
MRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
MRIetreg.fit(X_train, y_train)

In [63]:
##  Save these four best models

In [64]:

## optional save models
#joblib.dump(MRIlinr, ('../result_models/' + 'HARMneuro_harm_whole_mri_linr.sav'))
#joblib.dump(MRIllreg, ('../result_models/'+ 'HARMneuro_harm_whole_mri_llreg1.sav'))
#joblib.dump(MRIeregr, ('../result_models/'+ 'HARMneuro_harm_whole_mri_eregr3.sav'))
#joblib.dump(MRIetreg, ('../result_models/'+ 'HARMneuro_harm_whole_mri_etreg4.sav'))

# Running whole MRI model over TOP dataset

In [65]:
top_ml_matrix = TOP.drop('participant_id', axis=1)
X_top = top_ml_matrix.drop('age', axis =1)
X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [66]:
X_top_test = X_top
y_top_test = y_top


In [67]:
y_top_pred = MRIlinr.predict(X_top_test)

In [68]:
# print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
# print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [69]:
data= [[
    'linear regression',
    'HARMneuro_harm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIlinr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [70]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,index,y_test_real_age,linr_y_pred_age
0,0,43.172603,44.197514
1,1,66.367123,65.468398
2,2,55.838356,62.248124
3,3,48.238356,55.176792
4,4,58.616438,60.046788
...,...,...,...
509,509,73.928767,80.942337
510,510,74.769863,77.577055
511,511,74.512329,69.828269
512,512,67.526027,62.122115


In [71]:
linr_compare.to_csv('HARMwhole_neuro_harm_mri_linr_compare_on_top.csv')

In [72]:
y_top_pred = MRIllreg.predict(X_top_test)

In [73]:
data= [[
    'lasso regression',
    'HARMneuro_harm_whole_mri_llreg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIllreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,HARMneuro_harm_whole_mri_llreg.sav,5.174376,0.783984,0.783984


In [74]:
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,43.172603,44.206128
1,66.367123,64.939883
2,55.838356,61.978512
3,48.238356,55.795700
4,58.616438,59.848291
...,...,...
509,73.928767,80.954217
510,74.769863,77.456079
511,74.512329,69.560991
512,67.526027,61.504458


In [75]:
llreg_compare.to_csv('HARMwhole_neuroharm_mri_llreg_compare_on_top.csv')

In [76]:
y_top_pred = MRIeregr.predict(X_top_test)

In [77]:
data= [[
    'elasticnetCV',
    'HARMneuro_harm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIeregr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [78]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,43.172603,41.556157
1,66.367123,61.275219
2,55.838356,62.752545
3,48.238356,59.359469
4,58.616438,55.656439
...,...,...
509,73.928767,79.558195
510,74.769863,79.507272
511,74.512329,66.835422
512,67.526027,59.504567


In [79]:
eregr_compare.to_csv('HARMwhole_neuro_harm_mri_eregr_compare_on_top.csv')

In [80]:
y_top_pred = MRIetreg.predict(X_top_test)

In [81]:
data= [[
    'extra trees',
    'HARMneuro_harm_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIetreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [82]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,43.172603,43.172603
1,66.367123,66.367123
2,55.838356,55.838356
3,48.238356,48.238356
4,58.616438,58.616438
...,...,...
509,73.928767,73.928767
510,74.769863,74.769863
511,74.512329,74.512329
512,67.526027,67.526027


In [83]:
etreg_compare.to_csv('HARMwhole_neuro_harm_mri_etreg_compare_on_top.csv')

In [84]:
mri_based_neurocombat_harmonized_on_top =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
mri_based_neurocombat_harmonized_on_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,HARMneuro_harm_whole_mri_linr.sav,5.197234,0.786353,0.786353
0,lasso regression,HARMneuro_harm_whole_mri_llreg.sav,5.174376,0.783984,0.783984
0,elasticnetCV,HARMneuro_harm_whole_mri_linr.sav,6.707943,0.653699,0.653699
0,extra trees,HARMneuro_harm_mri_linr.sav,7.774499e-14,1.0,1.0


## Now we will run the exact opposite process.
1. We will explore TOP based models via k-folded results, 
2. We will make a general unahrmonized TOP model (based off all TOP)
3. We will apply the best of these model to the StrokeMRI dataset

### Build ML models based on neruocombat harmonized TOP 

In [85]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [86]:
linr_k_frame, linr_y_frame, models = sep.stratified_one_category_shuffle_split('linear regression', 'HARMneuro_harm_top_linr', LinearRegression(), ml_matrix, X, y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [87]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,HARMneuro_harm_top_linr.0,5.600948,0.729061,0.729088
0,linear regression-1,1,HARMneuro_harm_top_linr.1,4.91882,0.804779,0.806379
0,linear regression-2,2,HARMneuro_harm_top_linr.2,5.4217,0.707509,0.71
0,linear regression-3,3,HARMneuro_harm_top_linr.3,4.886924,0.760421,0.76208
0,linear regression-4,4,HARMneuro_harm_top_linr.4,5.836918,0.723401,0.734491


In [88]:
linr_k_frame.to_csv('HARMneuro_harmonized_top_linr_k_frame.csv')

In [89]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 HARMneuro_harm_top_linr.0 0 HARMneuro_...,5.333062,0.745034,0.748408


In [90]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,55.004233
1,77.601093,78.749024
2,66.041096,51.672796
3,71.479452,67.795418
4,72.630137,76.317562
...,...,...
124,72.331507,74.582645
125,50.918819,45.287759
126,70.180328,53.703098
127,42.016393,47.074433


In [91]:
linr_y_frame.to_csv('HARMneuro_harmonized_top_linr_y_frame.csv')

In [92]:
linr = models[0]
linr[0]

In [93]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [94]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'HARMneuro_harm_top_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'HARMneuro_harm_top_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'HARMneuro_harm_top_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'HARMneuro_harm_top_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'HARMneuro_harm_top_linr4.sav'))

In [95]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_one_category_shuffle_split('lasso regression', 'HARMneuro_harm_top_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,HARMneuro_harm_top_llreg.0,5.483282,0.735343,0.735437
0,lasso regression-1,1,HARMneuro_harm_top_llreg.1,4.797875,0.81063,0.812447
0,lasso regression-2,2,HARMneuro_harm_top_llreg.2,5.216478,0.720623,0.723995
0,lasso regression-3,3,HARMneuro_harm_top_llreg.3,4.808385,0.765028,0.767379
0,lasso regression-4,4,HARMneuro_harm_top_llreg.4,5.637852,0.734984,0.744773


In [96]:
llreg_k_frame.to_csv('HARMneuro_harmonized_top_llreg_k_frame.csv')

In [97]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 HARMneuro_harm_top_llreg.0 0 HARMneuro...,5.188774,0.753322,0.756806


In [98]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,54.992712
1,77.601093,78.353980
2,66.041096,53.340831
3,71.479452,68.335367
4,72.630137,76.346995
...,...,...
124,72.331507,73.953625
125,50.918819,46.201334
126,70.180328,53.806488
127,42.016393,46.962212


In [99]:
llreg_y_frame.to_csv('HARMneuro_harmonized_top_llreg_y_frame.csv')

In [100]:
llreg = models[0]
llreg[0]

In [101]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'HARMneuro_harm_top_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'HARMneuro_harm_top_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'HARMneuro_harm_top_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'HARMneuro_harm_top_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'HARMneuro_harm_top_llreg4.sav'))

In [102]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_one_category_shuffle_split('decision tree', 'HARMneuro_harm_top_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,HARMneuro_harm_top_dtree.0,5.095191,0.759108,0.759174
0,decision tree-1,1,HARMneuro_harm_top_dtree.1,5.839779,0.717877,0.717877
0,decision tree-2,2,HARMneuro_harm_top_dtree.2,5.618324,0.665112,0.665709
0,decision tree-3,3,HARMneuro_harm_top_dtree.3,4.700625,0.790504,0.790609
0,decision tree-4,4,HARMneuro_harm_top_dtree.4,5.173173,0.772156,0.77382


In [103]:
dtree_k_frame.to_csv('HARMneuro_harmonized_top_dtree_k_frame.csv')

In [104]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 HARMneuro_harm_top_dtree.0 0 HARMneuro...,5.285419,0.740952,0.741438


In [105]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,49.317808
1,77.601093,74.802740
2,66.041096,56.931507
3,71.479452,68.235616
4,72.630137,69.158904
...,...,...
124,72.331507,75.183562
125,50.918819,44.142466
126,70.180328,74.271233
127,42.016393,50.180822


In [106]:
dtree_y_frame.to_csv('HARMneuro_harmonized_top_dtree_y_frame.csv')

In [107]:
dtree = models[0]
dtree[0]

In [108]:
regr_k_frame, regr_y_frame, models = sep.stratified_one_category_shuffle_split('MLP regression', 'HARMneuro_harm_top_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,HARMneuro_harm_top_regr.0,5.90761,0.70122,0.702995
0,MLP regression-1,1,HARMneuro_harm_top_regr.1,5.69563,0.751461,0.77122
0,MLP regression-2,2,HARMneuro_harm_top_regr.2,5.151822,0.753565,0.760884
0,MLP regression-3,3,HARMneuro_harm_top_regr.3,5.700283,0.707202,0.707286
0,MLP regression-4,4,HARMneuro_harm_top_regr.4,6.156973,0.695705,0.696634


In [109]:
regr_k_frame.to_csv('HARMneuro_harmonized_top_regr_k_frame.csv')

In [110]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 HARMneuro_harm_top_regr.0 0 HARMneuro_...,5.722464,0.72183,0.727804


In [111]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,42.879222
1,77.601093,69.814808
2,66.041096,60.575170
3,71.479452,71.481277
4,72.630137,88.040937
...,...,...
124,72.331507,74.096130
125,50.918819,41.495514
126,70.180328,60.213695
127,42.016393,46.474635


In [112]:
regr_y_frame.to_csv('HARMneuro_harmonized_top_regr_y_frame.csv')

In [113]:
regr = models[0]
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [114]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_one_category_shuffle_split('support vector reg poly2', 'HARMneuro_harm_top_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,HARMneuro_harm_top_svrp2.0,9.312183,0.256375,0.26772
0,support vector reg poly2-1,1,HARMneuro_harm_top_svrp2.1,8.730392,0.35389,0.425499
0,support vector reg poly2-2,2,HARMneuro_harm_top_svrp2.2,8.456701,0.365014,0.373129
0,support vector reg poly2-3,3,HARMneuro_harm_top_svrp2.3,8.157983,0.367104,0.36758
0,support vector reg poly2-4,4,HARMneuro_harm_top_svrp2.4,8.583163,0.345462,0.381483


In [115]:
svrp2_k_frame.to_csv('HARMneuro_harmonized_top_svrp2_k_frame.csv')

In [116]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 HARMneuro_harm_top_svrp2.0 0 HARMneuro...,8.648084,0.337569,0.363082


In [117]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,63.788070
1,77.601093,65.674894
2,66.041096,58.324031
3,71.479452,64.745224
4,72.630137,68.506028
...,...,...
124,72.331507,64.937372
125,50.918819,59.015645
126,70.180328,51.509842
127,42.016393,62.842440


In [118]:
svrp2_y_frame.to_csv('HARMneuro_harmonized_top_svrp2_y_frame.csv')

In [119]:
svrp2 = models[0]
svrp2[0]

In [120]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_one_category_shuffle_split('elasticnetCV', 'HARMneuro_harm_top_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,HARMneuro_harm_top_eregr.0,6.9869,0.591701,0.592039
0,elasticnetCV-1,1,HARMneuro_harm_top_eregr.1,6.113391,0.69651,0.696662
0,elasticnetCV-2,2,HARMneuro_harm_top_eregr.2,6.47692,0.622017,0.638156
0,elasticnetCV-3,3,HARMneuro_harm_top_eregr.3,6.152406,0.638483,0.646
0,elasticnetCV-4,4,HARMneuro_harm_top_eregr.4,6.791199,0.620769,0.622587


In [121]:
eregr_k_frame.to_csv('HARMneuro_harmonized_top_eregr_k_frame.csv')

In [122]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 HARMneuro_harm_top_eregr.0 0 HARMneuro...,6.504163,0.633896,0.639089


In [123]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,52.670154
1,77.601093,75.042095
2,66.041096,59.506336
3,71.479452,66.602528
4,72.630137,66.677739
...,...,...
124,72.331507,71.117569
125,50.918819,45.111649
126,70.180328,53.413002
127,42.016393,50.174277


In [124]:
eregr_y_frame.to_csv('HARMneuro_harmonized_top_eregr_y_frame.csv')

In [125]:
eregr = models[0]
eregr[0]

In [126]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_one_category_shuffle_split('extra trees', 'HARMneuro_harm_top_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,HARMneuro_harm_top_etreg.0,3.932544,0.864796,0.865758
0,extra trees-1,1,HARMneuro_harm_top_etreg.1,4.06687,0.872082,0.872811
0,extra trees-2,2,HARMneuro_harm_top_etreg.2,4.120503,0.832311,0.832373
0,extra trees-3,3,HARMneuro_harm_top_etreg.3,3.667449,0.875932,0.876397
0,extra trees-4,4,HARMneuro_harm_top_etreg.4,4.264123,0.855642,0.858889


In [127]:
etreg_k_frame.to_csv('HARMneuro_haromized_top_etreg_k_frame.csv')

In [128]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 HARMneuro_harm_top_etreg.0 0 HARMneuro...,4.010298,0.860153,0.861245


In [129]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,53.845597
1,77.601093,74.848573
2,66.041096,56.487159
3,71.479452,71.354130
4,72.630137,71.169052
...,...,...
124,72.331507,71.959758
125,50.918819,45.904459
126,70.180328,61.916610
127,42.016393,45.984831


In [130]:
etreg_y_frame.to_csv('HARMneuro_harmonized_top_etreg_y_frame.csv')

In [131]:
etreg = models[0]
etreg[0]

In [132]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'HARMneuro_harm_top_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'HARMneuro_harm_top_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'HARMneuro_harm_top_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'HARMneuro_harm_top_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'HARMneuro_harm_top_etreg4.sav'))

Show results ON AVERAGE for each model

In [133]:
top_based_neuro_harmonized_on_testtop =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
top_based_neuro_harmonized_on_testtop

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 HARMneuro_harm_top_linr.0 0 HARMneuro_...,5.333062,0.745034,0.748408
0,0 lasso regression-0 0 lasso regression-...,0 HARMneuro_harm_top_llreg.0 0 HARMneuro...,5.188774,0.753322,0.756806
0,0 decision tree-0 0 decision tree-1 0 ...,0 HARMneuro_harm_top_dtree.0 0 HARMneuro...,5.285419,0.740952,0.741438
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 HARMneuro_harm_top_regr.0 0 HARMneuro_...,5.722464,0.72183,0.727804
0,0 support vector reg poly2-0 0 support v...,0 HARMneuro_harm_top_svrp2.0 0 HARMneuro...,8.648084,0.337569,0.363082
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 HARMneuro_harm_top_eregr.0 0 HARMneuro...,6.504163,0.633896,0.639089
0,0 extra trees-0 0 extra trees-1 0 ext...,0 HARMneuro_harm_top_etreg.0 0 HARMneuro...,4.010298,0.860153,0.861245


In [134]:
top_based_neuro_harmonized_on_testtop.to_csv('HARMtop_based_neuro_harmonized_on_topt_AVERAGES.csv')

## Now we will build  models based on the whole harmonized StrokeTOP dataset, and apply them to StrokeMRI. 

In [135]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')

In [136]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [137]:
TOPlinr = LinearRegression()
TOPlinr.fit(X_train, y_train)

In [138]:
TOPllreg = linear_model.LassoLars(alpha=0.01)
TOPllreg.fit(X_train, y_train)

In [139]:
TOPeregr = ElasticNetCV(cv=5, random_state=17)
TOPeregr.fit(X_train, y_train)

In [140]:
TOPetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPetreg.fit(X_train, y_train)

##  Save these four best models

In [141]:
## optional save models
#joblib.dump(TOPlinr, ('../result_models/' + 'HARMneuro_harm_whole_top_linr.sav'))
#joblib.dump(TOPllreg, ('../result_models/'+ 'HARMneuro_harm_whole_top_llreg1.sav'))
#joblib.dump(TOPeregr, ('../result_models/'+ 'HARMneuro_harm_whole_top_eregr3.sav'))
#joblib.dump(TOPetreg, ('../result_models/'+ 'HARMneuro_harm_whole_top_etreg4.sav'))

# Running whole TOP model over MRI dataset

In [142]:
mri_ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X_mri = mri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = mri_ml_matrix['age'].values
y_mri= y_mri.astype('float')

In [143]:
X_mri_test = X_mri
y_mri_test = y_mri


In [144]:
y_mri_pred = TOPlinr.predict(X_mri_test)

In [145]:
data= [[
    'linear regression',
    'HARMneuro_harm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPlinr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [146]:
linr_y_test = y_mri_test
linr_y_pred = y_mri_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
#linr_compare = linr_compare.reset_index()
linr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,43.172603,44.197514
1,66.367123,65.468398
2,55.838356,62.248124
3,48.238356,55.176792
4,58.616438,60.046788
...,...,...
509,73.928767,80.942337
510,74.769863,77.577055
511,74.512329,69.828269
512,67.526027,62.122115


In [147]:
linr_compare.to_csv('HARMwhole_neuro-_harm_top_linr_compare_on_mti.csv')

In [148]:
y_mri_pred = TOPllreg.predict(X_mri_test)

In [149]:
data= [[
    'lasso regression',
    'HARMneuro_harm_whole_top_llreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPllreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,HARMneuro_harm_whole_top_llreg.sav,5.174376,0.783984,0.783984


In [150]:
llreg_y_test = y_mri_test
llreg_y_pred = y_mri_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,43.172603,44.206128
1,66.367123,64.939883
2,55.838356,61.978512
3,48.238356,55.795700
4,58.616438,59.848291
...,...,...
509,73.928767,80.954217
510,74.769863,77.456079
511,74.512329,69.560991
512,67.526027,61.504458


In [151]:
llreg_compare.to_csv('HARMwhole_neuro_harm_top_llreg_compare_on_mri.csv')

In [152]:
y_mri_pred = TOPeregr.predict(X_mri_test)

In [153]:
data= [[
    'elasticnetCV',
    'HARMneuro_harm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPeregr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [154]:
eregr_y_test = y_mri_test
eregr_y_pred = y_mri_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,43.172603,41.556157
1,66.367123,61.275219
2,55.838356,62.752545
3,48.238356,59.359469
4,58.616438,55.656439
...,...,...
509,73.928767,79.558195
510,74.769863,79.507272
511,74.512329,66.835422
512,67.526027,59.504567


In [155]:
eregr_compare.to_csv('HARMwhole_neuro_harm_top_eregr_compare_on_mri.csv')

In [156]:
y_mri_pred = TOPetreg.predict(X_mri_test)

In [157]:
data= [[
    'extra trees',
    'HARMneuro_harm_mri_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPetreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [158]:
etreg_y_test = y_mri_test
etreg_y_pred = y_mri_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

Unnamed: 0,y_test_real_age,linr_y_pred_age
0,43.172603,43.172603
1,66.367123,66.367123
2,55.838356,55.838356
3,48.238356,48.238356
4,58.616438,58.616438
...,...,...
509,73.928767,73.928767
510,74.769863,74.769863
511,74.512329,74.512329
512,67.526027,67.526027


In [159]:
etreg_compare.to_csv('HARMwhole_neuro_harm_top_etreg_compare_on_mri.csv')

compile csvs of results

In [160]:
top_based_neuro_harmonized_on_mri =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
top_based_neuro_harmonized_on_mri

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,HARMneuro_harm_whole_top_linr.sav,5.197234,0.786353,0.786353
0,lasso regression,HARMneuro_harm_whole_top_llreg.sav,5.174376,0.783984,0.783984
0,elasticnetCV,HARMneuro_harm_whole_top_linr.sav,6.707943,0.653699,0.653699
0,extra trees,HARMneuro_harm_mri_linr.sav,7.774499e-14,1.0,1.0


In [161]:
top_based_neuro_harmonized_on_mri.to_csv('HARMwhole_top_based_neuro_harmonized_on_mri.csv')