# ML testing: experiment #3-requested harm (neurocomat)

This notebook involves testing for the MRI conference abstract. This notebook shows mixed_dataset (TOP + StrokeMRI) based models with neurocombat harmonization to Insight 46 and Sabre datasets (the strokeMRI and TOP become one dataset)

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri_for_ids = '../open_work/internal_results/cleaned_pvc2s' 
filename_mri_for_ids = os.path.join(filepath_mri_for_ids,'StrokeMRI_pvc2c.csv') 

filepath_top_for_ids = '../open_work/internal_results/cleaned_pvc2s/' 
filename_top_for_ids = os.path.join(filepath_top_for_ids,'TOP_pvc2c.csv') 

filepath_topmri = '../open_work/internal_results/harmonized_pvc2s/requested_harm/' 
filename_topmri = os.path.join(filepath_topmri,'Rneuro_harm3way_topmri.csv') 
filepath_sabre = '../open_work/internal_results/harmonized_pvc2s/requested_harm/' 
filename_sabre = os.path.join(filepath_topmri,'Rneuro_harm3way_sabre.csv') 
filepath_insight46 = '../open_work/internal_results/harmonized_pvc2s/requested_harm/' 
filename_insight46 =  os.path.join(filepath_topmri,'Rneuro_harm3way_insight.csv') 
# read in data
# TOP = pd.read_csv(filename_top)
# StrokeMRI = pd.read_csv(filename_mri)
TOPMRI = pd.read_csv(filename_topmri)
SABRE = pd.read_csv(filename_sabre)
Insight46 = pd.read_csv(filename_insight46)
# take extra column off
TOPMRI = TOPMRI.drop(TOPMRI.columns[0],axis=1)
SABRE = SABRE.drop(SABRE.columns[0],axis=1)
#StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
Insight46 = Insight46.drop(Insight46.columns[0],axis=1)
IDS_TOP =  pd.read_csv(filename_top_for_ids)
IDS_MRI =  pd.read_csv(filename_mri_for_ids)

In [3]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOPMRI = TOPMRI.assign(sex = TOPMRI.sex.map(sex_mapping))
TOPMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.689489,0.508614,0.277173,0.464584,0.464584,-2.815951,20.392622,21.711238,3.864851,3.899129,5.62034,4.367357,17.903946,75.351056,63.920141,49.334352,57.364434
1,sub-0002_1_ses-1_run-1,38.3,0,0.696103,0.60549,0.220125,0.453424,0.453424,-1.921805,20.37845,23.111355,2.908789,4.986781,2.169642,3.26527,19.353158,82.275813,73.601804,61.074299,68.184063
2,sub-0019_1_ses-1_run-1,32.3,1,0.684261,0.513501,0.304087,0.453051,0.453051,-3.306224,7.729002,15.244295,2.969291,1.960339,3.873684,3.389996,22.329981,88.908492,81.812966,59.787357,70.534912


In [4]:
Insight46 = Insight46.assign(sex = Insight46.sex.map(sex_mapping))
#Insight46.head(3)

In [5]:
coly = TOPMRI.columns
SABRE = SABRE.assign(sex = SABRE.sex.map(sex_mapping))
SABRE = SABRE[coly]
SABRE.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-100096_1_ses-1_run-1,78,1,0.610012,0.482679,0.468053,0.387788,0.387788,9.071137,23.835812,6.123046,2.646049,2.824262,3.364733,2.815808,24.110961,84.6646,75.440922,50.074196,63.48628
1,sub-100331_1_ses-1_run-1,71,1,0.587432,0.460086,0.521217,0.370808,0.370808,5.741298,25.789411,8.420838,3.567374,3.018827,3.007536,2.901729,17.390889,44.479919,37.446087,31.890621,35.017552
2,sub-102285_1_ses-1_run-1,72,1,0.61711,0.524304,0.40898,0.397435,0.397435,6.510194,27.461341,11.441511,3.52137,3.312764,3.073432,3.13133,22.45342,61.511213,50.596122,40.008961,45.73423


In [6]:
SABRE =SABRE.dropna()
#SABRE.isna().sum()

# Now we need to break up the top and MRI datasets as well, and format them like the others

In [7]:
set_top_ids = set(IDS_TOP.participant_id)
set_mri_ids = set(IDS_MRI.participant_id)
StrokeMRI = TOPMRI[TOPMRI['participant_id'].isin(list(set_mri_ids))]
TOP = TOPMRI[TOPMRI['participant_id'].isin(list(set_top_ids))]
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.689489,0.508614,0.277173,0.464584,0.464584,-2.815951,20.392622,21.711238,3.864851,3.899129,5.62034,4.367357,17.903946,75.351056,63.920141,49.334352,57.364434
1,sub-0002_1_ses-1_run-1,38.3,0,0.696103,0.60549,0.220125,0.453424,0.453424,-1.921805,20.37845,23.111355,2.908789,4.986781,2.169642,3.26527,19.353158,82.275813,73.601804,61.074299,68.184063
2,sub-0019_1_ses-1_run-1,32.3,1,0.684261,0.513501,0.304087,0.453051,0.453051,-3.306224,7.729002,15.244295,2.969291,1.960339,3.873684,3.389996,22.329981,88.908492,81.812966,59.787357,70.534912


In [8]:
StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
527,sub-59082_1_ses-1_run-1,43.172603,,0.619625,0.475143,0.282154,0.447639,0.447639,-2.768004,21.237884,21.007739,2.69443,3.116515,2.494894,3.00189,22.072651,99.092903,85.901007,71.422761,80.740415
528,sub-59083_1_ses-1_run-1,66.367123,,0.577321,0.466753,0.394898,0.402171,0.402171,11.695904,20.710557,14.234537,3.145519,1.8785,5.508759,2.75405,20.647118,70.595906,60.898762,63.114804,63.590018
529,sub-59085_1_ses-1_run-1,55.838356,,0.58934,0.51888,0.302086,0.415564,0.415564,2.449953,27.648636,14.348818,2.771326,3.100342,3.604489,3.081356,22.857563,75.977349,68.547695,57.958144,60.921869


In [9]:
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,,0.689489,0.508614,0.277173,0.464584,0.464584,-2.815951,20.392622,21.711238,3.864851,3.899129,5.62034,4.367357,17.903946,75.351056,63.920141,49.334352,57.364434
1,sub-0002_1_ses-1_run-1,38.3,,0.696103,0.60549,0.220125,0.453424,0.453424,-1.921805,20.37845,23.111355,2.908789,4.986781,2.169642,3.26527,19.353158,82.275813,73.601804,61.074299,68.184063
2,sub-0019_1_ses-1_run-1,32.3,,0.684261,0.513501,0.304087,0.453051,0.453051,-3.306224,7.729002,15.244295,2.969291,1.960339,3.873684,3.389996,22.329981,88.908492,81.812966,59.787357,70.534912


In [10]:
# check for any duplicated patients between stroke and mri
strokers = set(StrokeMRI.participant_id)
topers = set(TOP.participant_id)
z = strokers.intersection(topers)
print(z) 

set()


In [11]:
# make mixed StrokeMRI and TOP dataset
#mixed_data = pd.concat([TOP, StrokeMRI], sort=False)
mixed_data = TOPMRI

## Build ML models

# keeping patient ID until right when model is fed, then use patient ID as key to what went where

In [12]:
ml_matrix = mixed_data #.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
y = ml_matrix['age'].values
y=y.astype('float')


In [13]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [14]:
# X_train_cut = X_train[:,1:]
# X_train_cut = X_train_cut.astype('float')
# X_train_cut.shape

In [15]:
# X_test_cut = X_test[:,1:]
# X_test_cut = X_test_cut.astype('float')
# X_test_cut.shape

In [16]:
linr_k_frame, linr_y_frame, models = sep.frame_a_model_sex_split('linear regression', 'req_neuroharm_mix_linr', LinearRegression(), ml_matrix, X[:,1:], y)

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (1041, 18), y (1041,)
Classes: [0 1], percentages: [54.75504323 45.24495677]

Fold 0:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 1:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 2:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 3:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 4:
Tr

In [17]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression0,0,req_neuroharm_mix_linr0.sav,4.907224,0.862544,0.863021
0,linear regression1,1,req_neuroharm_mix_linr1.sav,5.66713,-0.344027,-0.342165
0,linear regression2,2,req_neuroharm_mix_linr2.sav,5.153548,0.852971,0.853906
0,linear regression3,3,req_neuroharm_mix_linr3.sav,4.625221,0.886533,0.88731
0,linear regression4,4,req_neuroharm_mix_linr4.sav,5.779041,-0.601456,-0.58774


In [18]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression0 0 linear regression...,0 req_neuroharm_mix_linr0.sav 0 req_neur...,5.226433,0.331313,0.334867


In [19]:
linr_y_frame

Unnamed: 0,y_test,y_pred
0,38.080000,50.662307
1,67.178082,66.197434
2,23.200000,31.688230
3,75.106849,91.367033
4,74.235616,66.210258
...,...,...
256,40.355191,39.324983
257,56.063014,58.133112
258,76.002740,73.307233
259,72.605479,57.862048


In [20]:
linr = models
linr[0]

In [21]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [22]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'unharm_mix_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'unharm_mix_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'unharm_mix_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'unharm_mix_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'unharm_mix_linr4.sav'))

In [23]:
llreg_k_frame, llreg_y_frame, models = sep.frame_a_model_sex_split('lasso regression', 'unharm_mix_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X[:,1:], y)
llreg_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (1041, 18), y (1041,)
Classes: [0 1], percentages: [54.75504323 45.24495677]

Fold 0:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 1:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 2:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 3:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 4:
Tr

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression0,0,unharm_mix_llreg0.sav,4.922952,0.861883,0.862269
0,lasso regression1,1,unharm_mix_llreg1.sav,5.701518,-0.396692,-0.394752
0,lasso regression2,2,unharm_mix_llreg2.sav,5.128703,0.855639,0.856553
0,lasso regression3,3,unharm_mix_llreg3.sav,4.609045,0.886746,0.887593
0,lasso regression4,4,unharm_mix_llreg4.sav,5.806644,-0.681163,-0.667155


In [24]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg.to_csv()

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression0 0 lasso regression1 ...,0 unharm_mix_llreg0.sav 0 unharm_mix_llr...,5.233773,0.305283,0.308902


In [25]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,38.080000,51.037285
1,67.178082,66.338822
2,23.200000,31.383782
3,75.106849,90.301189
4,74.235616,65.788601
...,...,...
256,40.355191,39.525150
257,56.063014,57.848191
258,76.002740,72.621692
259,72.605479,57.721708


In [26]:
llreg = models
llreg[0]

In [27]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'unharm_mix_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'unharm_mix_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'unharm_mix_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'unharm_mix_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'unharm_mix_linr4.sav'))

In [28]:
dtree_k_frame, dtree_y_frame, models = sep.frame_a_model_sex_split('decision tree', 'unharm_mix_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X[:,1:], y)
dtree_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (1041, 18), y (1041,)
Classes: [0 1], percentages: [54.75504323 45.24495677]

Fold 0:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 1:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 2:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 3:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 4:
Tr

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree0,0,unharm_mix_dtree0.sav,7.072803,0.661246,0.662258
0,decision tree1,1,unharm_mix_dtree1.sav,7.317756,0.696412,0.698877
0,decision tree2,2,unharm_mix_dtree2.sav,6.766366,0.712137,0.712137
0,decision tree3,3,unharm_mix_dtree3.sav,6.168171,0.774932,0.774977
0,decision tree4,4,unharm_mix_dtree4.sav,6.846643,0.698744,0.70095


In [29]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree0 0 decision tree1 0 d...,0 unharm_mix_dtree0.sav 0 unharm_mix_dtr...,6.834348,0.708694,0.70984


In [30]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,38.080000,46.200000
1,67.178082,66.216438
2,23.200000,28.050000
3,75.106849,80.624658
4,74.235616,50.660000
...,...,...
256,40.355191,38.740000
257,56.063014,54.471233
258,76.002740,78.800000
259,72.605479,62.777506


In [31]:
dtree = models
dtree[0]

In [32]:
regr_k_frame, regr_y_frame, models = sep.frame_a_model_sex_split('MLP regression', 'unharm_mix_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X[:,1:], y)
regr_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (1041, 18), y (1041,)
Classes: [0 1], percentages: [54.75504323 45.24495677]

Fold 0:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 1:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 2:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 3:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 4:
Tr

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression0,0,unharm_mix_regr0.sav,5.285466,0.840498,0.840876
0,MLP regression1,1,unharm_mix_regr1.sav,7.549306,-4.055033,-4.03622
0,MLP regression2,2,unharm_mix_regr2.sav,4.723158,0.864878,0.864923
0,MLP regression3,3,unharm_mix_regr3.sav,5.189101,0.851124,0.851503
0,MLP regression4,4,unharm_mix_regr4.sav,7.045555,-2.593745,-2.577774


In [33]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression0 0 MLP regression1 0 ...,0 unharm_mix_regr0.sav 0 unharm_mix_regr...,5.958517,-0.818456,-0.811339


In [34]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,38.080000,48.065042
1,67.178082,71.795376
2,23.200000,27.598261
3,75.106849,80.106396
4,74.235616,58.644881
...,...,...
256,40.355191,39.314347
257,56.063014,48.034392
258,76.002740,80.304902
259,72.605479,60.230711


In [35]:
regr = models
regr[0]

In [36]:
svrp2_k_frame, svrp2_y_frame, models = sep.frame_a_model_sex_split('support vector reg poly2', 'unharm_mix_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X[:,1:], y)
svrp2_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (1041, 18), y (1041,)
Classes: [0 1], percentages: [54.75504323 45.24495677]

Fold 0:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 1:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 2:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 3:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 4:
Tr

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly20,0,unharm_mix_svrp20.sav,10.023877,0.459398,0.461176
0,support vector reg poly21,1,unharm_mix_svrp21.sav,12.112316,-1.461124,-1.411434
0,support vector reg poly22,2,unharm_mix_svrp22.sav,10.12655,0.461354,0.464082
0,support vector reg poly23,3,unharm_mix_svrp23.sav,10.456102,0.478268,0.478678
0,support vector reg poly24,4,unharm_mix_svrp24.sav,12.296433,-2.016337,-2.016333


In [37]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly20 0 support ve...,0 unharm_mix_svrp20.sav 0 unharm_mix_svr...,11.003056,-0.415688,-0.404766


In [38]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,38.080000,49.619623
1,67.178082,54.321574
2,23.200000,33.592315
3,75.106849,66.519037
4,74.235616,53.390149
...,...,...
256,40.355191,53.015840
257,56.063014,56.213827
258,76.002740,62.142213
259,72.605479,41.745342


In [39]:
svrp2 = models
svrp2[0]

In [40]:
eregr_k_frame, eregr_y_frame, models = sep.frame_a_model_sex_split('elasticnetCV', 'unharm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X[:,1:], y)
eregr_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (1041, 18), y (1041,)
Classes: [0 1], percentages: [54.75504323 45.24495677]

Fold 0:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 1:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 2:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 3:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 4:
Tr

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV0,0,unharm_mix_eregr0.sav,8.717609,0.578495,0.579203
0,elasticnetCV1,1,unharm_mix_eregr1.sav,7.67832,-1.817407,-1.805408
0,elasticnetCV2,2,unharm_mix_eregr2.sav,8.978482,0.576246,0.576271
0,elasticnetCV3,3,unharm_mix_eregr3.sav,9.464145,0.579055,0.58004
0,elasticnetCV4,4,unharm_mix_eregr4.sav,7.841744,-2.150874,-2.130226


In [41]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV0 0 elasticnetCV1 0 ela...,0 unharm_mix_eregr0.sav 0 unharm_mix_ere...,8.53606,-0.446897,-0.440024


In [42]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,38.080000,55.138073
1,67.178082,59.017011
2,23.200000,29.489444
3,75.106849,78.031706
4,74.235616,61.596004
...,...,...
256,40.355191,42.527965
257,56.063014,54.481011
258,76.002740,62.114563
259,72.605479,56.817766


In [43]:
eregr = models
eregr[0]

In [44]:
etreg_k_frame, etreg_y_frame, models = sep.frame_a_model_sex_split('extra trees', 'unharm_mix_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X[:,1:], y)
etreg_k_frame

StratifiedShuffleSplit(n_splits=5, random_state=12, test_size=0.25,
            train_size=None)
Whole dataset shape: X (1041, 18), y (1041,)
Classes: [0 1], percentages: [54.75504323 45.24495677]

Fold 0:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 1:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 2:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 3:
Train shapes: X (780, 18)  y (780,)
Sex classes: [0 1] percentages: [54.74358974 45.25641026]

Test shapes: X (261, 18)   y (261,)
Sex classes: [0 1],percentages: [54.78927203 45.21072797]

Fold 4:
Tr

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees0,0,unharm_mix_etreg0.sav,4.92651,0.849962,0.850545
0,extra trees1,1,unharm_mix_etreg1.sav,4.551437,0.876696,0.878988
0,extra trees2,2,unharm_mix_etreg2.sav,4.877461,0.854953,0.855709
0,extra trees3,3,unharm_mix_etreg3.sav,4.60474,0.881923,0.882351
0,extra trees4,4,unharm_mix_etreg4.sav,4.460143,0.872858,0.874461


In [45]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees0 0 extra trees1 0 extra...,0 unharm_mix_etreg0.sav 0 unharm_mix_etr...,4.684058,0.867278,0.868411


In [46]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,38.080000,50.653862
1,67.178082,63.747678
2,23.200000,40.448400
3,75.106849,79.089953
4,74.235616,65.628041
...,...,...
256,40.355191,34.598201
257,56.063014,52.312300
258,76.002740,77.014414
259,72.605479,60.694474


In [47]:
etreg = models
etreg[0]

In [48]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'unharm_mix_etreg0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'unharm_mix_etreg1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'unharm_mix_etreg2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'unharm_mix_etreg3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'unharm_mix_etreg4.sav'))

Show results ON AVERAGE for each model

In [49]:
mixed_based_unharmonized_on_testmix =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
mixed_based_unharmonized_on_testmix.to_csv()

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression0 0 linear regression...,0 req_neuroharm_mix_linr0.sav 0 req_neur...,5.226433,0.331313,0.334867
0,0 lasso regression0 0 lasso regression1 ...,0 unharm_mix_llreg0.sav 0 unharm_mix_llr...,5.233773,0.305283,0.308902
0,0 decision tree0 0 decision tree1 0 d...,0 unharm_mix_dtree0.sav 0 unharm_mix_dtr...,6.834348,0.708694,0.70984
0,0 MLP regression0 0 MLP regression1 0 ...,0 unharm_mix_regr0.sav 0 unharm_mix_regr...,5.958517,-0.818456,-0.811339
0,0 support vector reg poly20 0 support ve...,0 unharm_mix_svrp20.sav 0 unharm_mix_svr...,11.003056,-0.415688,-0.404766
0,0 elasticnetCV0 0 elasticnetCV1 0 ela...,0 unharm_mix_eregr0.sav 0 unharm_mix_ere...,8.53606,-0.446897,-0.440024
0,0 extra trees0 0 extra trees1 0 extra...,0 unharm_mix_etreg0.sav 0 unharm_mix_etr...,4.684058,0.867278,0.868411


In [50]:
# data_frames1 = [linr_y_frame, llreg_y_frame, dtree_y_frame,]# regr_compare, ]#etreg_compare, svrp2_compare,]
# real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames1)
# #real_versus_projected_y1
# data_frames2 = [eregr_y_frame, svrp2_y_frame, etreg_y_frame,]
# real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames2)
# #real_versus_projected_y2
# real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
# real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
# #real_versus_projected_y1

In [51]:
# data_frames1 = [linr_compare, llreg_compare, dtree_compare,]# regr_compare, ]#etreg_compare, svrp2_compare,]
# real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames1)
# #real_versus_projected_y1
# data_frames2 = [eregr_compare, svrp2_compare, etreg_compare,]
# real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames2)
# #real_versus_projected_y2
# real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
# real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
# #real_versus_projected_y1

In [52]:
# data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
# real_versus_projected_y3_mixed_on_mixed = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames3)
# real_versus_projected_y3_mixed_on_mixed.head(3)

## Save off models and csv (optional, must uncomment)

In [53]:
# # optionally save of csvs of algorithms and results
# mixed_based_unharmonized_on_testmix.to_csv('mixed_based_unharmonized_on_testmix.csv')
# real_versus_projected_y3_mixed_on_mixed.to_csv('real_versus_projected_y3_mixed_on_mixed.csv')

In [None]:
# # check if model folder exists and if not , then create
# model_folder = '../result_models/'
# if not os.path.exists(model_folder):
#     os.makedirs(model_folder)

In [None]:
# make a model based on 80% of top and stroke together

## Run models on other datasets (TOP, StrokeMRI)
but without re-running the training data

In [None]:
# These cells must be rewritten, or the cells above, but skip to running over sabre

In [None]:
# def frame_a_model_sex_split_2(
#         model_name,
#         model_file_name,
#         scikit_model,
#         our_ml_matrix,
#         our_x,
#         our_y,
# ):
#     """
#     This takes a sci-kit learn coded model and
#     creates a dataframe based on k-folds of results on
#     our_ml_matrix, and it's X component
#     returns a dataframe of fold results
#     and raw y_test versus y_pred
#     """
#     y_split = our_ml_matrix['sex'].values
#     # 5 folds as example, you can change this
#     sss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=12)

#     X = our_x
#     y = our_y
#     sss.get_n_splits(X, y_split)

#     unique, counts = np.unique(y_split, return_counts=True)

#     y_frame = []
#     all_mod_results = []
#     models = []
#     for i, (train_index, test_index) in enumerate(sss.split(X, y_split)):
#         unique, counts = np.unique(y_split[train_index], return_counts=True)
#         unique, counts = np.unique(y_split[test_index], return_counts=True)
#         cols = [
#             'algorithm',
#             'fold',
#             'file_name',
#             'mae',
#             'r2',
#             'explained_variance',
#         ]
#         mod_results = pd.DataFrame(columns=cols)
#         current_fold_X_train = X[train_index][:, 1:]
#         current_fold_y_train = y[train_index]
#         current_fold_X_test = X[test_index][:, 1:]
#         current_fold_y_test = y[test_index]
#         scikit_model.fit(current_fold_X_train, current_fold_y_train)
#         current_fold_y_pred = scikit_model.predict(current_fold_X_test)

#         data = [[
#             f'{model_name}-{i}',
#             i,
#             f'{model_file_name}.{i}',
#             mean_absolute_error(current_fold_y_test, current_fold_y_pred),
#             scikit_model.score(current_fold_X_test, current_fold_y_test),
#             metrics.explained_variance_score(
#                 current_fold_y_test,
#                 current_fold_y_pred
#             )]]
#         mod_results_current_fold = pd.DataFrame(data, columns=cols)
#         mod_results = pd.concat([mod_results, mod_results_current_fold])
#         mod_results.reset_index(drop=True, inplace=True)
#         all_mod_results.append(mod_results)
#         y_frame_now = pd.DataFrame(
#             {
#                 'y_test': list(current_fold_y_test),
#                 'y_pred': list(current_fold_y_pred),
#             })

#         y_frame.append(y_frame_now)

#         models.append((scikit_model, X[train_index][:, 0]))

#     df = pd.concat(all_mod_results)
#     y_frame = pd.concat([
#         y_frame[0],
#         y_frame[1],
#         y_frame[2],
#         y_frame[3],
#         y_frame[4],
#     ], axis=0)

#     return df, y_frame, models

In [54]:
o_split,  oy_frame, omodels = sep.frame_a_model_sex_split_2('linear regression', 'req_neuroharm_mix_linr', LinearRegression(), ml_matrix, X, y)

now we need to make a dataframe of TOP minus what is in X_train

In [55]:
o_split

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,req_neuroharm_mix_linr.0,4.907224,0.862544,0.863021
0,linear regression-1,1,req_neuroharm_mix_linr.1,5.66713,-0.344027,-0.342165
0,linear regression-2,2,req_neuroharm_mix_linr.2,5.153548,0.852971,0.853906
0,linear regression-3,3,req_neuroharm_mix_linr.3,4.625221,0.886533,0.88731
0,linear regression-4,4,req_neuroharm_mix_linr.4,5.779041,-0.601456,-0.58774


In [56]:
oy_frame

Unnamed: 0,y_test,y_pred
0,38.080000,50.662307
1,67.178082,66.197434
2,23.200000,31.688230
3,75.106849,91.367033
4,74.235616,66.210258
...,...,...
256,40.355191,39.324983
257,56.063014,58.133112
258,76.002740,73.307233
259,72.605479,57.862048


In [58]:
olinr = omodels
## we can print off arrays of everything it was run on
#olinr[0][0] # that is the model itself
#olinr[0][1] # that is what it was trained on

In [None]:
X_train_pandas.head(3)

In [None]:
trained_subjects = set(X_train_pandas[0])
#trained_subjects 

In [None]:
TOP_subjects = set(TOP.participant_id)
#TOP_subjects

In [None]:
# take trained subjects out of top subjects
# we can use set math here
new_top=(trained_subjects^TOP_subjects)&TOP_subjects
print(len(new_top))
#print(new_top)

filter down to only top where they are in new_top set

In [None]:
TOP_new = TOP[TOP['participant_id'].isin(list(new_top))]
#TOP_new

In [None]:
top_ml_matrix = TOP_new.drop('participant_id', axis=1) 

X_top = top_ml_matrix.drop('age', axis =1)

X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [None]:
X_top_test = X_top
y_top_test = y_top

In [None]:
y_top_pred = linr.predict(X_top_test)

In [None]:
print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'linear regression',
    'unharm_mixed_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    linr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_top

In [None]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
#linr_compare

In [None]:
y_top_pred = llreg.predict(X_top_test)

In [None]:
print('R2 score Lasso linear regression: %.3f' % llreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'lasso regression',
    'unharm_mixed_llregr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    llreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_top

In [None]:
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
llreg_compare = llreg_compare.reset_index()
#llreg_compare

In [None]:
y_top_pred = dtree.predict(X_top_test)

In [None]:
print('R2 score decision tree regression: %.3f' % dtree.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'decision tree',
    'unharm_mixed_dtree.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    dtree.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
dtree_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_top

In [None]:
dtree_y_test = y_top_test
dtree_y_pred = y_top_pred
dtree_compare = pd.DataFrame(
    {'y_test_real_age': dtree_y_test,
     'linr_y_pred_age': dtree_y_pred,
    })
dtree_compare = linr_compare.reset_index()
#dtree_compare

In [None]:
y_top_pred = eregr.predict(X_top_test)

In [None]:
print('R2 score ElasticnetCV regression: %.3f' % eregr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'elasticnetCV',
    'unharm_mixed_eregr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    linr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_top

In [None]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare = linr_compare.reset_index()
#eregr_compare

In [None]:
y_top_pred = svr_p2.predict(X_top_test)

In [None]:
print('R2 score SVR poly 2 regression: %.3f' % svr_p2.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'SVR polynom degree 2',
    'unharm_mixed_svrp2.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    svr_p2.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
svrp2_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#svrp2_results_top

In [None]:
svrp2_y_test = y_top_test
svrp2_y_pred = y_top_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'linr_y_pred_age': svrp2_y_pred,
    })
svrp2_compare = linr_compare.reset_index()
#svrp2_compare

In [None]:
y_top_pred = etreg.predict(X_top_test)

In [None]:
print('R2 score Extra trees: %.3f' % etreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'extra trees',
    'unharm_mixed_ereg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    etreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_top

In [None]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
etreg_compare = linr_compare.reset_index()
#etreg_compare

In [None]:
y_top_pred = regr.predict(X_top_test)

In [None]:
print('R2 score MLP regression: %.3f' % regr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'multilayered percentron',
    'unharm_mixed_regr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    regr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
regr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#regr_results_top

In [None]:
regr_y_test = y_top_test
regr_y_pred = y_top_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'linr_y_pred_age': regr_y_pred,
    })
regr_compare = linr_compare.reset_index()
#regr_compare

In [None]:
# Compile results of mixed on TOP

In [None]:
mixed_based_unharmonized_on_top =pd.concat([linr_results_top,
                   llreg_results_top,
                   dtree_results_top,
                   regr_results_top,
                   svrp2_results_top,
                   eregr_results_top,
                  etreg_results_top],
                  axis=0)
mixed_based_unharmonized_on_top

In [None]:
data_frames1 = [linr_compare, llreg_compare, dtree_compare,]# regr_compare, ]#etreg_compare, svrp2_compare,]
real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames1)
#real_versus_projected_y1
data_frames2 = [eregr_compare, svrp2_compare, etreg_compare,]
real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames2)
#real_versus_projected_y2
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
#real_versus_projected_y1

In [None]:
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
real_versus_projected_y3_mixed_on_top = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames3)
real_versus_projected_y3_mixed_on_top.head(3)

## Save off csvs of results 
optional, you must uncomment

In [None]:
## optionally save of csvs of algorithms and results
#mixed_based_unharmonized_on_top.to_csv('mixed_based_unharmonized_on_top.csv')
#real_versus_projected_y3_mixed_on_top.to_csv('real_versus_projected_y3_mixed_on_top.csv')

# Mixed algorithm on stroke MRI subjects (not in training)

In [None]:
StrokeMRI_subjects = set(StrokeMRI.participant_id)
#StrokeMRI_subjects

In [None]:
# take trained subjects out of top subjects
# we can use set math here
new_mri=(trained_subjects^StrokeMRI_subjects)&StrokeMRI_subjects

In [None]:
StrokeMRI_new = StrokeMRI[StrokeMRI['participant_id'].isin(list(new_mri))]
#StrokeMRI_new

In [None]:
strokemri_ml_matrix = StrokeMRI_new.drop('participant_id', axis=1) 
X_mri = strokemri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = strokemri_ml_matrix['age'].values
y_mri=y_mri.astype('float')

In [None]:
X_mri_test = X_mri
y_mri_test = y_mri

In [None]:
y_mri_pred = linr.predict(X_mri_test)

In [None]:
print('R2 score Linear regression: %.3f' % linr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'linear regression',
    'unharm_mixed_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    linr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
linr_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_mri

In [None]:
linr_y_test = y_mri_test
linr_y_pred = y_mri_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare_mri = linr_compare.reset_index()
#linr_compare_mri

In [None]:
y_mri_pred = llreg.predict(X_mri_test)

In [None]:
print('R2 score Lasso-linear regression: %.3f' % llreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'lasso regression',
    'unharm_mixed_llreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    llreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
llreg_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#llreg_results_mri

In [None]:
llreg_y_test = y_mri_test
llreg_y_pred = y_mri_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'llreg_y_pred_age': llreg_y_pred,
    })
llreg_compare_mri = llreg_compare.reset_index()
#llreg_compare_mri

In [None]:
y_mri_pred = dtree.predict(X_mri_test)

In [None]:
print('R2 score decision tree regression: %.3f' % dtree.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'decision tree',
    'unharm_mixed_dtree.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    dtree.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
dtree_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_mri

In [None]:
dtree_y_test = y_mri_test
dtree_y_pred = y_mri_pred
dtree_compare = pd.DataFrame(
    {'y_test_real_age': dtree_y_test,
     'dtree_y_pred_age': dtree_y_pred,
    })
dtree_compare_mri = dtree_compare.reset_index()
#dtree_compare_mri

In [None]:
y_mri_pred = regr.predict(X_mri_test)

In [None]:
print('R2 score MLP regression: %.3f' % regr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'multilayered perceptron',
    'unharm_mixed_regr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    regr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
regr_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results_mri

In [None]:
regr_y_test = y_mri_test
regr_y_pred = y_mri_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'linr_y_pred_age': regr_y_pred,
    })
regr_compare_mri = regr_compare.reset_index()
#regr_compare_mri

In [None]:
y_mri_pred = svr_p2.predict(X_mri_test)

In [None]:
print('R2 score SVR poly2 regression: %.3f' % svr_p2.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'svr poly degree 2',
    'unharm_mixed_svrp2.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    svr_p2.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
svrp2_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svrp2_results_mri

In [None]:
svrp2_y_test = y_mri_test
svrp2_y_pred = y_mri_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'svrp2_y_pred_age': svrp2_y_pred,
    })
svrp2_compare_mri = svrp2_compare.reset_index()
#svrp2_compare_mri

In [None]:
y_mri_pred = etreg.predict(X_mri_test)

In [None]:
print('R2 score Extra tree regression: %.3f' % etreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'extra trees',
    'unharm_mixed_etreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    etreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
etreg_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_mri

In [None]:
etreg_y_test = y_mri_test
etreg_y_pred = y_mri_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
etreg_compare_mri = etreg_compare.reset_index()
#etreg_compare_mri

In [None]:
y_mri_pred = eregr.predict(X_mri_test)

In [None]:
print('R2 score elasticnetCV: %.3f' % eregr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'elasticnet CV',
    'unharm_mixed_eregr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    eregr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
eregr_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_mri

In [None]:
eregr_y_test = y_mri_test
eregr_y_pred = y_mri_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'etreg_y_pred_age': eregr_y_pred,
    })
eregr_compare_mri = eregr_compare.reset_index()
#etreg_compare_mri

In [None]:
# Agregate mixed results on MRI and optional save

In [None]:
mixed_based_unharmonized_on_mri =pd.concat([linr_results_mri,
                   llreg_results_mri,
                   dtree_results_mri,
                   regr_results_mri,
                   svrp2_results_mri,
                   eregr_results_mri,
                   etreg_results_mri],
                   axis=0)
mixed_based_unharmonized_on_mri

In [None]:
data_frames1 = [linr_compare_mri, llreg_compare_mri, dtree_compare_mri,]# regr_compare, ]#etreg_compare, svrp2_compare,]
real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames1)
#real_versus_projected_y1
data_frames2 = [eregr_compare_mri, svrp2_compare_mri, etreg_compare_mri,]
real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames2)
#real_versus_projected_y2
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
#real_versus_projected_y1

In [None]:
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
real_versus_projected_y3_mixed_on_mri = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames3)
real_versus_projected_y3_mixed_on_mri.head(3)

In [None]:
# # optionally save of csvs of algorithms and results
# mixed_based_unharmonized_on_mri.to_csv('mixed_based_unharmonized_on_mri.csv')
# real_versus_projected_y3_mixed_on_mri.to_csv('real_versus_projected_y3_mixed_on_mri.csv')

# Running mixed model over SABRE dataset

## Here we will do an example of running allthe [0] models

In [None]:
sabre_ml_matrix = SABRE.drop('participant_id', axis=1)
X_sabre = sabre_ml_matrix.drop('age', axis =1)
X_sabre = X_sabre.values
X_sabre = X_sabre.astype('float')
y_sabre = sabre_ml_matrix['age'].values
y_sabre=y_sabre.astype('float')

In [None]:
X_sabre_test = X_sabre
y_sabre_test = y_sabre

In [None]:
y_sabre_pred = linr[0].predict(X_sabre_test)

In [None]:
data= [[
    'Linear Reg',
    'unharm_mix_linr0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    linr[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
linr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_linr_sabre = pd.DataFrame(data)
y_frame_linr_sabre

In [None]:
y_sabre_pred = llreg[0].predict(X_sabre_test)

In [None]:
data= [[
    'Lasso',
    'unharm_mix_lassor0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    llreg[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
llreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_llreg_sabre = pd.DataFrame(data)
y_frame_llreg_sabre

In [None]:
y_sabre_pred = dtree[0].predict(X_sabre_test)

In [None]:
data= [[
    'Decision tree',
    'unharm_mix_dtree0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    dtree[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
dtree_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_dtree_sabre = pd.DataFrame(data)
y_frame_dtree_sabre

In [None]:
y_sabre_pred = regr[0].predict(X_sabre_test)

In [None]:
data= [[
    'MLP regression',
    'unharm_mix_regr0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    regr[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
regr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results_sabre

In [None]:
y_sabre_pred = svrp2[0].predict(X_sabre_test)

In [None]:
data= [[
    'Svr P2',
    'unharm_mri_svrp20.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    svrp2[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
svr_p2_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svr_p2_results_sabre

In [None]:
y_sabre_pred = eregr[0].predict(X_sabre_test)

In [None]:
data= [[
    'ElasticnetCV',
    'harm_mix_elasticregr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    eregr[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
eregr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_sabre

In [None]:
y_sabre_pred = etreg[0].predict(X_sabre_test)

In [None]:
data= [[
    'Extra trees',
    'harm_mix_etreg.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    etreg[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
etreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_etregr_sabre = pd.DataFrame(data)
y_frame_etregr_sabre

In [None]:
mix_based_neuroharmonized_on_sabre =pd.concat([linr_results_sabre,
                   llreg_results_sabre,
                   dtree_results_sabre,
                   regr_results_sabre,
                   svr_p2_results_sabre,
                   eregr_results_sabre,
                  etreg_results_sabre],
                  axis=0)
mix_based_neuroharmonized_on_sabre

# Running mixed model over Insight46 dataset

## Here we will do an example of running allthe [0] models

In [None]:
insight_ml_matrix = Insight46.drop('participant_id', axis=1)
X_insight = insight_ml_matrix.drop('age', axis =1)
X_insight = X_insight.values
X_insight = X_insight.astype('float')
y_insight = insight_ml_matrix['age'].values
y_insight= y_insight.astype('float')

In [None]:
X_insight_test = X_insight
y_insight_test = y_insight

In [None]:
y_insight_pred = linr[0].predict(X_insight_test)

In [None]:
data= [[
    'Linear Reg',
    'neuroharm_mix_linr0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    linr[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
linr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_linr_insight = pd.DataFrame(data)
y_frame_linr_insight

In [None]:
y_insight_pred = llreg[0].predict(X_insight_test)

In [None]:
data= [[
    'Lasso',
    'neuroharm_mix_lassor0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    llreg[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
llreg_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_llreg_insight = pd.DataFrame(data)
y_frame_llreg_insight

In [None]:
y_insight_pred = dtree[0].predict(X_insight_test)

In [None]:
data= [[
    'Decision tree',
    'neuroharm_mix_dtree0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    dtree[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
dtree_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_dtree_insight = pd.DataFrame(data)
y_frame_dtree_insight

In [None]:
y_insight_pred = regr[0].predict(X_insight_test)

In [None]:
data= [[
    'MLP regression',
    'neuroharm_mix_regr0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    regr[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
regr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results_insight

In [None]:
y_insight_pred = svrp2[0].predict(X_insight_test)

In [None]:
data= [[
    'Svr P2',
    'neuroharm_mix_svrp20.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    svrp2[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
svr_p2_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svr_p2_results_insight

In [None]:
y_insight_pred = eregr[0].predict(X_insight_test)

In [None]:
data= [[
    'ElasticnetCV',
    'neuroharm_mix_elasticregr.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    eregr[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
eregr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_insight

In [None]:
y_insight_pred = etreg[0].predict(X_insight_test)

In [None]:
data= [[
    'Extra trees',
    'neuroharm_mix_etreg.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    etreg[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
etreg_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_etregr_insight = pd.DataFrame(data)
y_frame_etregr_insight

In [None]:
mix_based_neuroharmonized_on_insight =pd.concat([linr_results_insight,
                   llreg_results_insight,
                   dtree_results_insight,
                   regr_results_insight,
                   svr_p2_results_insight,
                   eregr_results_insight,
                   etreg_results_insight],
                  axis=0)
mix_based_neuroharmonized_on_insight