# ML testing: experiment #1- unharmonized

This notebook involves testing for the MRI conference abstract. This notebook shows unharmonized StrokeMRI and TOP based models, and how they perform on each other

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/' 
filename_mri = os.path.join(filepath_mri,'StrokeMRI_pvc2c.csv') 

filepath_top = '../open_work/internal_results/cleaned_pvc2s/' 
filename_top = os.path.join(filepath_top,'TOP_pvc2c.csv') 

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [4]:
TOP = TOP.drop(TOP.columns[0],axis=1)
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
#StrokeMRI

In [5]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.71736,0.52803,0.31812,0.45881,0.45881,1.743,24.0,8.762,2.2748,2.3205,2.6858,2.4316,22.1022,80.4744,68.3224,52.4614,60.6981
1,sub-0002_1_ses-1_run-1,38.3,0,0.72383,0.62394,0.25673,0.45112,0.45112,1.629,23.0,9.0749,1.7564,2.3989,1.5982,1.9738,23.5401,87.3972,78.0359,63.9932,71.6047
2,sub-0019_1_ses-1_run-1,32.3,1,0.71224,0.53295,0.33594,0.45046,0.45046,0.621,13.0,8.8791,1.985,1.8702,2.1648,2.1723,27.5573,94.0855,86.3816,62.6012,74.0588


In [6]:
StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.64754,0.49441,0.3132,0.445,0.445,1.249,24.0,8.0434,1.7273,1.9776,1.7141,1.914,26.8262,104.3155,90.5047,73.8758,84.4527
1,sub-59083_1_ses-1_run-1,66.367123,0,0.60517,0.48594,0.42304,0.39968,0.39968,14.597,25.0,3.7791,1.9669,1.7154,2.6394,1.8815,24.9597,75.7051,65.3032,65.6771,67.1158
2,sub-59085_1_ses-1_run-1,55.838356,0,0.61724,0.53779,0.33692,0.41371,0.41371,6.341,30.0,5.1248,1.8146,1.9845,2.0827,1.9501,27.7117,81.0966,72.998,60.8317,64.3183


## Build ML models based on StrokeMRI

In [7]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [8]:
linr_k_frame, linr_y_frame, models = sep.frame_a_model_sex_split('linear regression', 'unharm_mri_linr', LinearRegression(), ml_matrix, X, y)

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (514, 18), y (514,)
Classes: [0 1], percentages: [63.03501946 36.96498054]

Fold 0:
Train shapes: X (385, 18)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 18)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 1:
Train shapes: X (385, 18)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 18)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 2:
Train shapes: X (385, 18)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 18)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 3:
Train shapes: X (385, 18)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 18)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 4:
Trai

In [9]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression0,0,unharm_mri_linr0.sav,5.112978,0.76907,0.769094
0,linear regression1,1,unharm_mri_linr1.sav,4.6925,0.834524,0.836304
0,linear regression2,2,unharm_mri_linr2.sav,5.071694,0.740732,0.744496
0,linear regression3,3,unharm_mri_linr3.sav,4.542573,0.787515,0.791268
0,linear regression4,4,unharm_mri_linr4.sav,5.552243,0.747659,0.755504


In [10]:
linr_k_frame.to_csv('unharmonized_mri_linr_k_frame.csv')

In [11]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression0 0 linear regression...,0 unharm_mri_linr0.sav 0 unharm_mri_linr...,4.994398,0.7759,0.779334


In [12]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,55.957986
1,77.601093,80.329956
2,66.041096,53.765575
3,71.479452,67.149153
4,72.630137,76.146134
...,...,...
124,72.331507,73.324910
125,50.918819,46.175055
126,70.180328,52.660841
127,42.016393,47.277406


In [13]:
linr_y_frame.to_csv('unharmonized_mri_linr_y_frame.csv')

In [14]:
linr = models
linr[0]

In [15]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [16]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'unharm_mri_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'unharm_mri_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'unharm_mri_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'unharm_mri_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'unharm_mri_linr4.sav'))

In [17]:
llreg_k_frame, llreg_y_frame, models = sep.frame_a_model_sex_split('lasso regression', 'unharm_mri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X[:,1:], y)
llreg_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (514, 17), y (514,)
Classes: [0 1], percentages: [63.03501946 36.96498054]

Fold 0:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 1:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 2:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 3:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 4:
Trai



Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression0,0,unharm_mri_llreg0.sav,5.132188,0.767601,0.767606
0,lasso regression1,1,unharm_mri_llreg1.sav,4.743008,0.830759,0.832578
0,lasso regression2,2,unharm_mri_llreg2.sav,4.943246,0.749053,0.753528
0,lasso regression3,3,unharm_mri_llreg3.sav,4.548927,0.787735,0.792202
0,lasso regression4,4,unharm_mri_llreg4.sav,5.494137,0.754806,0.760875


In [18]:
llreg_k_frame.to_csv('unharmonized_mri_llreg_k_frame.csv')

In [19]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression0 0 lasso regression1 ...,0 unharm_mri_llreg0.sav 0 unharm_mri_llr...,4.972301,0.777991,0.781358


In [20]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,56.054100
1,77.601093,80.030308
2,66.041096,55.896355
3,71.479452,66.450805
4,72.630137,74.857489
...,...,...
124,72.331507,74.336598
125,50.918819,46.466040
126,70.180328,52.794557
127,42.016393,47.897359


In [21]:
llreg_y_frame.to_csv('unharmonized_mri_llreg_y_frame.csv')

In [22]:
llreg = models
llreg[0]

In [23]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'unharm_mri_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'unharm_mri_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'unharm_mri_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'unharm_mri_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'unharm_mri_llreg4.sav'))

In [24]:
dtree_k_frame, dtree_y_frame, models = sep.frame_a_model_sex_split('decision tree', 'unharm_mri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X[:,1:], y)
dtree_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (514, 17), y (514,)
Classes: [0 1], percentages: [63.03501946 36.96498054]

Fold 0:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 1:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 2:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 3:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 4:
Trai

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree0,0,unharm_mri_dtree0.sav,5.980131,0.676161,0.67779
0,decision tree1,1,unharm_mri_dtree1.sav,6.81302,0.621876,0.623492
0,decision tree2,2,unharm_mri_dtree2.sav,6.67226,0.524132,0.532882
0,decision tree3,3,unharm_mri_dtree3.sav,6.229564,0.601628,0.601629
0,decision tree4,4,unharm_mri_dtree4.sav,7.23579,0.573217,0.596041


In [25]:
dtree_k_frame.to_csv('unharmonized_mri_dtree_k_frame.csv')

In [26]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree0 0 decision tree1 0 d...,0 unharm_mri_dtree0.sav 0 unharm_mri_dtr...,6.586153,0.599403,0.606367


In [27]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,49.317808
1,77.601093,77.010959
2,66.041096,55.601093
3,71.479452,74.802740
4,72.630137,75.712329
...,...,...
124,72.331507,73.990658
125,50.918819,45.336986
126,70.180328,59.706849
127,42.016393,51.172603


In [28]:
dtree_y_frame.to_csv('unharmonized_mri_dtree_y_frame.csv')

In [29]:
dtree = models
dtree[0]

In [30]:
regr_k_frame, regr_y_frame, models = sep.frame_a_model_sex_split('MLP regression', 'unharm_mri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X[:,1:], y)
regr_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (514, 17), y (514,)
Classes: [0 1], percentages: [63.03501946 36.96498054]

Fold 0:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]





Fold 1:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]





Fold 2:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]





Fold 3:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 4:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression0,0,unharm_mri_regr0.sav,6.381926,0.657836,0.657879
0,MLP regression1,1,unharm_mri_regr1.sav,5.792897,0.735003,0.73891
0,MLP regression2,2,unharm_mri_regr2.sav,5.958718,0.682188,0.708242
0,MLP regression3,3,unharm_mri_regr3.sav,6.142291,0.638496,0.641934
0,MLP regression4,4,unharm_mri_regr4.sav,7.337195,0.492286,0.492573


In [31]:
regr_k_frame.to_csv('unharmonized_mri_regr_k_frame.csv')

In [32]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression0 0 MLP regression1 0 ...,0 unharm_mri_regr0.sav 0 unharm_mri_regr...,6.322605,0.641162,0.647908


In [33]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,45.434018
1,77.601093,71.316230
2,66.041096,60.763745
3,71.479452,71.457283
4,72.630137,85.428809
...,...,...
124,72.331507,73.005422
125,50.918819,42.097677
126,70.180328,53.241370
127,42.016393,46.167845


In [34]:
regr_y_frame.to_csv('unharmonized_mri_regr_y_frame.csv')

In [35]:
regr = models
regr[0]

### Note I'm not actually saving the mlp or svr models here because they are not the best performers. if neccesary this can be added

In [36]:
svrp2_k_frame, svrp2_y_frame, models = sep.frame_a_model_sex_split('support vector reg poly2', 'unharm_mri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X[:,1:], y)
svrp2_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (514, 17), y (514,)
Classes: [0 1], percentages: [63.03501946 36.96498054]

Fold 0:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 1:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 2:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 3:
Train shapes: X (385, 17)  y (385,)
Sex classes: [0 1] percentages: [63.11688312 36.88311688]

Test shapes: X (129, 17)   y (129,)
Sex classes: [0 1],percentages: [62.79069767 37.20930233]

Fold 4:
Trai

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly20,0,unharm_mri_svrp20.sav,9.534569,0.232333,0.238814
0,support vector reg poly21,1,unharm_mri_svrp21.sav,8.910734,0.332888,0.400148
0,support vector reg poly22,2,unharm_mri_svrp22.sav,8.605218,0.342992,0.346619
0,support vector reg poly23,3,unharm_mri_svrp23.sav,8.21343,0.355394,0.358342
0,support vector reg poly24,4,unharm_mri_svrp24.sav,8.809948,0.331537,0.348821


In [38]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly20 0 support ve...,0 unharm_mri_svrp20.sav 0 unharm_mri_svr...,8.81478,0.319029,0.338549


In [39]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,50.797260,64.363362
1,77.601093,64.765813
2,66.041096,57.462376
3,71.479452,63.606712
4,72.630137,69.075910
...,...,...
124,72.331507,63.701667
125,50.918819,58.652455
126,70.180328,51.848253
127,42.016393,62.679445


In [40]:
svrp2 = models
svrp2[0]

In [None]:
eregr_k_frame, eregr_y_frame, models = sep.frame_a_model_sex_split('elasticnetCV', 'unharm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X[:,1:], y)
eregr_k_frame

In [None]:
eregr_k_frame.to_csv('unharmonized_eregr_k_frame.csv')

In [None]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

In [None]:
eregr_y_frame

In [None]:
eregr = models
eregr[0]

In [37]:
data= [[
    'SVR poly2',
    'unharm_mri_svrp2.sav',
    mean_absolute_error(y_test, y_pred),
    svr_p2.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
svr_p2_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#svr_p2_results

NameError: name 'y_test' is not defined

In [None]:
svrp2_y_test = y_test
svrp2_y_pred = y_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'svrp2_y_pred_age': svrp2_y_pred,
    })
svrp2_compare = svrp2_compare.reset_index()
#svrp2_compare

In [None]:
eregr = ElasticNetCV(cv=5, random_state=12)
eregr.fit(X_train, y_train)

In [None]:
y_pred = eregr.predict(X_test)

In [None]:
print('R2 score elasticnetcv regression: %.3f' % eregr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [None]:
data= [[
    'Elastic_netCV',
    'unharm_mri_eregr.sav',
    mean_absolute_error(y_test, y_pred),
    eregr.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [None]:
eregr_y_test = y_test
eregr_y_pred = y_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'eregr_y_pred_age': eregr_y_pred,
    })
eregr_compare = eregr_compare.reset_index()
#eregr_compare

In [None]:
etreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
etreg.fit(X_train, y_train)

In [None]:
y_pred = etreg.predict(X_test)

In [None]:
print('R2 score extra trees regression: %.3f' % etreg.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [None]:
data= [[
    'Extra trees',
    'unharm_mri_etreg.sav',
    mean_absolute_error(y_test, y_pred),
    etreg.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [None]:
etreg_y_test = y_test
etreg_y_pred = y_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
etreg_compare = etreg_compare.reset_index()
#etreg_compare

Show results

In [None]:
mri_based_unharmonized_on_mri =pd.concat([linr_results,
                   llreg_results,
                   dtree_results,
                   regr_results,
                   svr_p2_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
mri_based_unharmonized_on_mri

In [None]:
data_frames1 = [linr_compare, llreg_compare, dtree_compare,]# regr_compare, ]#etreg_compare, svrp2_compare,]
real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames1)
#real_versus_projected_y1
data_frames2 = [eregr_compare, svrp2_compare, etreg_compare,]
real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames2)
#real_versus_projected_y2
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
#real_versus_projected_y1

In [None]:
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
real_versus_projected_y3_mri_on_mri = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames3)
real_versus_projected_y3_mri_on_mri.head(3)

## Save off models and informative csvs (optional, therefore uncomment)

In [None]:
## optionally save of csvs of algorithms and results
#mri_based_unharmonized_on_mri.to_csv('mri_based_unharmonized_on_mri.csv')
#real_versus_projected_y3_mri_on_mri.to_csv('real_versus_projected_y3_mri_on_mri.csv')

In [None]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [None]:
# joblib.dump(linr, ('../result_models/'+ 'unharm_mri_linr.sav'))
# joblib.dump(llreg, ('../result_models/'+ 'unharm_mri_lassor.sav'))
# joblib.dump(dtree, ('../result_models/'+ 'unharm_mri_dtree.sav'))
# joblib.dump(regr, ('../result_models/'+ 'unharm_mri_regr.sav'))
# joblib.dump(svr_p2, ('../result_models/'+ 'unharm_mri_svrp2.sav'))
# joblib.dump(eregr, ('../result_models/'+ 'unharm_mri_eregr.sav'))
# joblib.dump(etreg, ('../result_models/'+ 'unharm_mri_etreg.sav'))

## Run models on other dataset (TOP)

In [None]:
top_ml_matrix = TOP.drop('participant_id', axis=1)
X_top = top_ml_matrix.drop('age', axis =1)
X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [None]:
X_top_test = X_top
y_top_test = y_top


In [None]:
y_top_pred = linr.predict(X_top_test)

In [None]:
print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'linear regression',
    'unharm_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    linr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [None]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
#linr_compare

In [None]:
y_top_pred = llreg.predict(X_top_test)

In [None]:
data= [[
    'lasso regression',
    'unharm_mri_llreg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    llreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

In [None]:
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
llreg_compare = llreg_compare.reset_index()
#llreg_compare

In [None]:
y_top_pred = dtree.predict(X_top_test)

In [None]:
# print('R2 score decision tree regression: %.3f' % dtree.score(X_top_test,y_top_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
# print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'decision tree',
    'unharm_mri_dtree.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    dtree.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
dtree_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#dtree_results

In [None]:
dtree_y_test = y_top_test
dtree_y_pred = y_top_pred
dtree_compare = pd.DataFrame(
    {'y_test_real_age': dtree_y_test,
     'linr_y_pred_age': dtree_y_pred,
    })
dtree_compare = dtree_compare.reset_index()
#dtree_compare

In [None]:
y_top_pred = regr.predict(X_top_test)

In [None]:
print('R2 score MLP regression: %.3f' % dtree.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'multi-layer perceptron',
    'unharm_mri_regr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    regr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
regr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#regr_results

In [None]:
regr_y_test = y_top_test
regr_y_pred = y_top_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'linr_y_pred_age': regr_y_pred,
    })
regr_compare = regr_compare.reset_index()
#regr_compare

In [None]:
y_top_pred = svr_p2.predict(X_top_test)

In [None]:
print('R2 score SVR poly2 regression: %.3f' % svr_p2.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'svr poly2',
    'unharm_mri_svrp2.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    svr_p2.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
svrp2_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#svrp2_results

In [None]:
svrp2_y_test = y_top_test
svrp2_y_pred = y_top_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'linr_y_pred_age': svrp2_y_pred,
    })
svrp2_compare = svrp2_compare.reset_index()
#svrp2_compare

In [None]:
y_top_pred = eregr.predict(X_top_test)

In [None]:
print('R2 score elasticCV net : %.3f' % eregr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'elasticnetCV',
    'unharm_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    eregr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [None]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare = eregr_compare.reset_index()
#eregr_compare

In [None]:
y_top_pred = etreg.predict(X_top_test)

In [None]:
print('R2 score extra tree: %.3f' % etreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'extra trees',
    'unharm_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    etreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [None]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
etreg_compare = etreg_compare.reset_index()
#etreg_compare

compile csvs of results

In [None]:
mri_based_unharmonized_on_top =pd.concat([linr_results,
                   llreg_results,
                   dtree_results,
                   regr_results,
                   svr_p2_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
mri_based_unharmonized_on_top

In [None]:
data_frames1 = [linr_compare, llreg_compare, dtree_compare,]# regr_compare, ]#etreg_compare, svrp2_compare,]
real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames1)
#real_versus_projected_y1
data_frames2 = [eregr_compare, svrp2_compare, etreg_compare,]
real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames2)
#real_versus_projected_y2
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
#real_versus_projected_y1

In [None]:
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
real_versus_projected_y3_mri_on_top = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames3)
real_versus_projected_y3_mri_on_top.head(3)

optionally save off csvs

In [None]:
#mri_based_unharmonized_on_top.to_csv('mri_based_unharmonized_on_top.csv')

In [None]:
#real_versus_projected_y3_mri_on_top.top_csv('real_versus_projected_y3_mri_on_top.csv')

## Run models on other datasets : Sabre

In [None]:
sabre_ml_matrix = SABRE.drop('participant_id', axis=1)
X_sabre = sabre_ml_matrix.drop('age', axis =1)
X_sabre = X_sabre.values
X_sabre = X_sabre.astype('float')
y_sabre = sabre_ml_matrix['age'].values
y_sabre=y_sabre.astype('float')

In [None]:
X_sabre_test = X_sabre
y_sabre_test = y_sabre

In [None]:
y_sabre_pred = linr.predict(X_sabre_test)

In [None]:
print('R2 score Linear regression: %.3f' % linr.score(X_sabre_test,y_sabre_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_sabre_test, y_sabre_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_sabre_test, y_sabre_pred))

In [None]:
data= [[
    'Linear Reg',
    'unharm_mri_linr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    linr.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
linr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_sabre

In [None]:
y_sabre_pred = llreg.predict(X_sabre_test)

In [None]:
print('R2 score Lasso regression: %.3f' % llreg.score(X_sabre_test,y_sabre_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_sabre_test, y_sabre_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_sabre_test, y_sabre_pred))

In [None]:
data= [[
    'Lasso',
    'unharm_mri_lassor.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    llreg.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
llreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_sabre

In [None]:
y_sabre_pred = dtree.predict(X_sabre_test)

In [None]:
print('R2 score Decision tree: %.3f' % dtree.score(X_sabre_test,y_sabre_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_sabre_test, y_sabre_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_sabre_test, y_sabre_pred))

In [None]:
data= [[
    'Decision tree',
    'unharm_mri_svrp2.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    dtree.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
dtree_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_sabre

In [None]:
y_sabre_pred = regr.predict(X_sabre_test)

In [None]:
print('R2 score MLP regression: %.3f' % regr.score(X_sabre_test,y_sabre_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_sabre_test, y_sabre_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_sabre_test, y_sabre_pred))

In [None]:
data= [[
    'MLP regression',
    'unharm_mri_regr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    regr.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
regr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results_sabre

In [None]:
y_sabre_pred = svr_p2.predict(X_sabre_test)

In [None]:
print('R2 score SVR polynomial regression: %.3f' % svr_p2.score(X_sabre_test,y_sabre_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_sabre_test, y_sabre_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_sabre_test, y_sabre_pred))

In [None]:
data= [[
    'Svr P2',
    'unharm_mri_svrp2.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    svr_p2.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
svr_p2_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svr_p2_results_sabre

In [None]:
y_sabre_pred = eregr.predict(X_sabre_test)

In [None]:
print('R2 score ElasticNet CV : %.3f' % eregr.score(X_sabre_test,y_sabre_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_sabre_test, y_sabre_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_sabre_test, y_sabre_pred))

In [None]:
data= [[
    'ElasticnetCV',
    'unharm_mri_elasticregr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    eregr.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
eregr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_sabre

In [None]:
y_sabre_pred = etreg.predict(X_sabre_test)

In [None]:
print('R2 score extra tree regression: %.3f' % etreg.score(X_sabre_test,y_sabre_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_sabre_test, y_sabre_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_sabre_test, y_sabre_pred))

In [None]:
data= [[
    'Extra trees',
    'unharm_mri_etreg.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    etreg.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
etreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_sabre

In [None]:
mri_based_unharmonized_on_sabre =pd.concat([linr_results_sabre,
                   llreg_results_sabre,
                   dtree_results_sabre,
                   regr_results_sabre,
                   svr_p2_results_sabre,
                   eregr_results_sabre,
                  etreg_results_sabre],
                  axis=0)
mri_based_unharmonized_on_sabre

Optional save off csv (commented out)

In [None]:
#mri_based_unharmonized_on_sabre.to_csv('mri_based_unharmonized_on_sabre.csv')