# ML testing: experiment #3-reqd harm (OPNested ComBat)

This notebook contains tests for the clinical harmonization paper. It shows models based on the mixed dataset (TOP + StrokeMRI, combined into one dataset) with OPNested ComBat harmonization to the Insight46 and SABRE datasets.

Harmonisation: OPNested ComBat

Training data: NORMENT, which is StrokeMRI and TOP together

Testing data: test set from NORMENT

Further data applied to: SABRE, Insight46, EDIS, (HELIUS pending)

Validation method: K-fold, double-stratified

Brain-age algorithms: linear regression, lasso, extra trees and ElasticNetCV fully tested (but parameters not optimized); additional algorithms partly tested

Outputs: SubjectID, real age, predicted age of validation and testing sets

### import libraries

In [145]:
import os       # using operating system dependent functionality (folders)
import sys

import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep

### import data

In [209]:
# Cleaned (pre-harmonization) files: read only to recover which participant IDs
# belong to StrokeMRI and which to TOP.
filepath_mri_for_ids = '../open_work/internal_results/cleaned_pvc2s'
filename_mri_for_ids = os.path.join(filepath_mri_for_ids, 'StrokeMRI_pvc2c.csv')

filepath_top_for_ids = '../open_work/internal_results/cleaned_pvc2s/'
filename_top_for_ids = os.path.join(filepath_top_for_ids, 'TOP_pvc2c.csv')

# OPNested-ComBat harmonized files. Each file name is joined onto its OWN
# directory variable (the original joined the SABRE and Insight46 names onto
# filepath_topmri, which only worked because all three directories are equal).
filepath_topmri = '../open_work/internal_results/harmonized_pvc2s/requested_open_harm/'
# NOTE: the file should be called topmri to be consistent with notebook 3b
filename_topmri = os.path.join(filepath_topmri, 'Rtop_opn_harmonized.csv')
filepath_sabre = '../open_work/internal_results/harmonized_pvc2s/requested_open_harm/'
filename_sabre = os.path.join(filepath_sabre, 'Rsabre_opn_harmonized.csv')
filepath_insight46 = '../open_work/internal_results/harmonized_pvc2s/requested_open_harm/'
filename_insight46 = os.path.join(filepath_insight46, 'Rinsight_opn_harmonized.csv')

# read in data
TOPMRI = pd.read_csv(filename_topmri)
SABRE = pd.read_csv(filename_sabre)
Insight46 = pd.read_csv(filename_insight46)

# take extra column off: the first column of the SABRE and Insight46 files is
# dropped; TOPMRI is used as read.
# NOTE(review): presumably that first column is a saved index - confirm against the CSVs.
SABRE = SABRE.drop(SABRE.columns[0], axis=1)
Insight46 = Insight46.drop(Insight46.columns[0], axis=1)

# participant_id lookups used further down to split TOPMRI back into TOP / StrokeMRI
IDS_TOP = pd.read_csv(filename_top_for_ids)
IDS_MRI = pd.read_csv(filename_mri_for_ids)

In [147]:
# Preview the harmonized TOP + StrokeMRI frame.
# In the stored output `sex` is already numeric (0/1), so the former
# 'F'/'M' -> 0/1 mapping is kept below only for reference:
# sex_mapping = {'F':0,'M':1}
# TOPMRI = TOPMRI.assign(sex = TOPMRI.sex.map(sex_mapping))
TOPMRI.head(3)

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,sub-0001_1_ses-1_run-1,0.689185,0.509135,0.282154,0.462391,0.462391,-2.806177,20.422746,21.652564,4.033135,3.955039,5.5654,4.382002,17.914502,75.364629,63.919437,49.345816,57.364028,43.49,0
1,sub-0002_1_ses-1_run-1,0.695833,0.605874,0.222773,0.452668,0.452668,-1.917033,20.392667,23.087666,2.975182,5.00393,2.150061,3.270955,19.354915,82.289749,73.600269,61.084929,68.181466,38.3,1
2,sub-0019_1_ses-1_run-1,0.683954,0.513992,0.306674,0.452239,0.452239,-3.301527,7.744353,15.222875,3.036258,1.985136,3.852563,3.395662,22.330502,88.936089,81.809197,59.799743,70.530694,32.3,0


In [148]:
# Preview the harmonized Insight46 frame (sex mapping disabled, see the TOPMRI preview):
#Insight46 = Insight46.assign(sex = Insight46.sex.map(sex_mapping))
Insight46.head(3)

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,sub-10015124_1_ses-1_run-1,0.55116,0.467781,0.35262,0.401879,0.401879,10.848986,21.751675,5.977847,2.289577,2.462664,2.138066,2.255942,35.269473,88.315969,81.490231,69.650414,74.062388,69.733333,1
1,sub-10024822_1_ses-1_run-1,0.532271,0.40628,0.418355,0.393377,0.393377,31.830633,35.52839,1.674953,2.414836,2.427024,2.675412,2.397329,39.384624,123.926694,125.306297,92.419243,102.422754,70.288889,1
2,sub-10075012_1_ses-1_run-1,0.65146,0.517728,0.430636,0.408243,0.408243,8.526374,25.775231,10.140971,2.934346,2.778649,2.511607,2.58986,16.5122,66.559388,53.905683,50.643899,52.656046,69.883333,0


In [149]:
# Put SABRE's columns in exactly the same order as the TOPMRI (training) frame,
# so that positional feature arrays built from both frames line up.
# NOTE(review): `coly` is taken from TOPMRI as it is at this moment; if helper
# columns have been added to TOPMRI by a later cell, re-running this cell would
# presumably fail with a KeyError - run it on a freshly loaded TOPMRI.
coly = TOPMRI.columns
SABRE = SABRE.loc[:, coly]
SABRE.head(n=3)

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,sub-100096_1_ses-1_run-1,0.610463,0.482036,0.465766,0.389002,0.389002,9.067945,23.825923,6.12443,2.64478,2.823009,3.366722,2.815465,24.10553,84.656442,75.444075,50.064487,63.490676,78.0,0
1,sub-100331_1_ses-1_run-1,0.587717,0.459487,0.519845,0.370925,0.370925,5.740985,25.787982,8.420772,3.560863,3.018213,3.007307,2.901465,17.409639,44.445126,37.448646,31.882411,35.021619,71.0,0
2,sub-102285_1_ses-1_run-1,0.617482,0.523829,0.408515,0.398043,0.398043,6.509469,27.458724,11.441679,3.515536,3.311985,3.073524,3.13103,22.467699,61.482033,50.599385,39.999681,45.739123,72.0,0


In [150]:
# Keep only SABRE rows without any missing value
# (inspect the counts beforehand with SABRE.isna().sum() if needed).
SABRE = SABRE.dropna(axis=0, how='any')

# Now we need to break up the TOP and StrokeMRI datasets as well, and format them like the others

In [151]:
# Recover the two source cohorts from the combined harmonized frame by matching
# participant IDs against the cleaned per-cohort files.
set_top_ids = set(IDS_TOP['participant_id'])
set_mri_ids = set(IDS_MRI['participant_id'])
StrokeMRI = TOPMRI.loc[TOPMRI['participant_id'].isin(set_mri_ids)]
TOP = TOPMRI.loc[TOPMRI['participant_id'].isin(set_top_ids)]
TOP.head(n=3)

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,sub-0001_1_ses-1_run-1,0.689185,0.509135,0.282154,0.462391,0.462391,-2.806177,20.422746,21.652564,4.033135,3.955039,5.5654,4.382002,17.914502,75.364629,63.919437,49.345816,57.364028,43.49,0
1,sub-0002_1_ses-1_run-1,0.695833,0.605874,0.222773,0.452668,0.452668,-1.917033,20.392667,23.087666,2.975182,5.00393,2.150061,3.270955,19.354915,82.289749,73.600269,61.084929,68.181466,38.3,1
2,sub-0019_1_ses-1_run-1,0.683954,0.513992,0.306674,0.452239,0.452239,-3.301527,7.744353,15.222875,3.036258,1.985136,3.852563,3.395662,22.330502,88.936089,81.809197,59.799743,70.530694,32.3,0


In [152]:
# Preview the StrokeMRI subset (sex mapping disabled; sex is numeric in the stored output):
#StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
527,sub-59082_1_ses-1_run-1,0.619322,0.475629,0.28578,0.446516,0.446516,-2.761028,21.258967,20.968764,2.810085,3.15401,2.461138,3.012218,22.077681,99.1313,85.896789,71.44337,80.733629,43.172603,1
528,sub-59083_1_ses-1_run-1,0.577055,0.467156,0.393442,0.402537,0.402537,11.691849,20.698515,14.273664,3.023663,1.841114,5.540364,2.742972,20.633245,70.608335,60.897204,63.123811,63.584321,66.367123,1
529,sub-59085_1_ses-1_run-1,0.589078,0.519222,0.298482,0.416703,0.416703,2.441073,27.620937,14.42182,2.550152,3.026053,3.668246,3.060504,22.834709,75.993054,68.545096,57.957575,60.9174,55.838356,1


In [153]:
# Preview the TOP subset (sex mapping disabled; sex is numeric in the stored output):
#TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,sub-0001_1_ses-1_run-1,0.689185,0.509135,0.282154,0.462391,0.462391,-2.806177,20.422746,21.652564,4.033135,3.955039,5.5654,4.382002,17.914502,75.364629,63.919437,49.345816,57.364028,43.49,0
1,sub-0002_1_ses-1_run-1,0.695833,0.605874,0.222773,0.452668,0.452668,-1.917033,20.392667,23.087666,2.975182,5.00393,2.150061,3.270955,19.354915,82.289749,73.600269,61.084929,68.181466,38.3,1
2,sub-0019_1_ses-1_run-1,0.683954,0.513992,0.306674,0.452239,0.452239,-3.301527,7.744353,15.222875,3.036258,1.985136,3.852563,3.395662,22.330502,88.936089,81.809197,59.799743,70.530694,32.3,0


In [154]:
# Sanity check: no participant ID may appear in both the StrokeMRI and the TOP
# subset. An empty set printed below means the cohorts are disjoint.
strokers = set(StrokeMRI['participant_id'])
topers = set(TOP['participant_id'])
z = strokers & topers
print(z)

set()


In [155]:
# make mixed StrokeMRI and TOP dataset
# The harmonized file already contains both cohorts, so no concat is needed:
#mixed_data = pd.concat([TOP, StrokeMRI], sort=False)
# NOTE: this binds a second name to the SAME DataFrame (no copy). Any column
# added to mixed_data / ml_matrix later on is therefore also visible in TOPMRI.
mixed_data = TOPMRI

In [156]:
# Display the combined training frame (pandas truncates the middle rows).
TOPMRI

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,sub-0001_1_ses-1_run-1,0.689185,0.509135,0.282154,0.462391,0.462391,-2.806177,20.422746,21.652564,4.033135,3.955039,5.565400,4.382002,17.914502,75.364629,63.919437,49.345816,57.364028,43.490000,0
1,sub-0002_1_ses-1_run-1,0.695833,0.605874,0.222773,0.452668,0.452668,-1.917033,20.392667,23.087666,2.975182,5.003930,2.150061,3.270955,19.354915,82.289749,73.600269,61.084929,68.181466,38.300000,1
2,sub-0019_1_ses-1_run-1,0.683954,0.513992,0.306674,0.452239,0.452239,-3.301527,7.744353,15.222875,3.036258,1.985136,3.852563,3.395662,22.330502,88.936089,81.809197,59.799743,70.530694,32.300000,0
3,sub-0020_1_ses-1_run-1,0.642103,0.415276,0.232177,0.494535,0.494535,-1.731741,21.605267,-1.534289,3.573918,2.863531,2.793938,2.939377,22.254758,95.189132,86.216866,65.172481,75.650707,21.970000,1
4,sub-0022_1_ses-1_run-1,0.608357,0.450407,0.182573,0.483454,0.483454,-1.392525,19.222467,14.477935,2.399085,3.422763,2.941279,2.556149,20.934833,81.692662,73.131382,66.341565,69.895871,37.520000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,sub-59440_1_ses-1_run-1,0.544227,0.507823,0.492557,0.354290,0.354290,25.548484,67.221034,8.263924,2.301097,2.467848,1.817204,2.800027,22.813858,77.574330,68.050013,51.652869,55.856991,73.928767,0
1037,sub-59440_2_ses-2_run-1,0.546934,0.490178,0.502093,0.357678,0.357678,20.561162,27.858747,9.175623,4.421446,5.196750,3.161863,4.965876,23.463447,77.621301,72.106387,63.205920,58.301448,74.769863,0
1038,sub-59441_2_ses-2_run-1,0.558441,0.456762,0.388144,0.397381,0.397381,-0.972065,25.421631,16.561857,1.527717,1.813665,0.848389,1.467022,22.174164,61.001456,59.319177,49.938656,50.928927,74.512329,0
1039,sub-59442_1_ses-1_run-1,0.611103,0.533650,0.353544,0.406597,0.406597,3.676332,0.149892,17.655268,0.976837,2.308007,3.013636,1.974517,19.603298,66.769083,60.657521,53.780776,55.744016,67.526027,0


In [157]:
# Folder (relative to the working directory) that receives every CSV written below.
output_folder = '3_OPNested'

# exist_ok=True: do not fail when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

## Build ML models

In [158]:
# ml_matrix is the full mixed frame, participant_id included; the training cells
# below pass X[:, 1:] to skip that first (non-feature) column.
ml_matrix = mixed_data #.drop('participant_id', axis=1)

# Features = everything except the target. 'binned' / 'fuse_bin' are helper
# columns that appear in this frame once a training cell has run (they are
# visible in TOPMRI further down); they must never reach X, otherwise
# re-running this cell after training would feed them to the models.
# errors='ignore' keeps the first run on a fresh kernel unchanged.
# NOTE(review): presumably the helper columns are added by the sep split helper - confirm.
X = (
    ml_matrix
    .drop(columns=['age'])
    .drop(columns=['binned', 'fuse_bin'], errors='ignore')
    .values
)
y = ml_matrix['age'].values.astype('float')


In [159]:
# Train and evaluate plain linear regression with the project's split helper.
# X[:, 1:] skips the first column of X (participant_id), which is not a feature.
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'linear regression',
    'opnharm_mix_linr',
    LinearRegression(),
    ml_matrix,
    X[:, 1:],
    y,
)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [160]:
# Per-fold metrics (mae, r2, explained_variance) for linear regression.
# NOTE(review): in the stored output fold 2 has a negative r2 while its MAE is
# similar to the other folds - worth checking that fold for outliers.
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,opnharm_mix_linr.0,4.757304,0.862364,0.863292
0,linear regression-1,1,opnharm_mix_linr.1,4.835043,0.873652,0.873789
0,linear regression-2,2,opnharm_mix_linr.2,5.754875,-1.099319,-1.089204
0,linear regression-3,3,opnharm_mix_linr.3,5.102208,0.846943,0.852762
0,linear regression-4,4,opnharm_mix_linr.4,4.975696,0.855267,0.855631


In [161]:
# Persist the per-fold linear-regression metrics.
linr_k_frame.to_csv(f'{output_folder}/linr_k_frame_opn_harm.csv')

In [162]:
# Collapse the per-fold metrics into a single summary row (the stored mae/r2
# values equal the mean over the five folds).
# NOTE(review): in the stored output the `algorithm` and `file_name` columns of
# this row hold the concatenated per-fold labels instead of a clean name -
# presumably an artefact of sep.avg_k_folds; confirm there.
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 opnharm_mix_linr.0 0 opnharm_mix_linr....,5.085025,0.467781,0.471254


In [163]:
# True vs. predicted age pairs (y_test, y_pred) returned by the split helper.
linr_y_frame

Unnamed: 0,y_test,y_pred
0,60.249315,56.634466
1,45.540000,43.248840
2,23.930000,30.607042
3,56.632083,49.573745
4,32.910000,26.652318
...,...,...
256,55.000000,54.678820
257,67.435616,53.565553
258,69.202740,75.409747
259,30.610000,39.426876


In [164]:
# Persist the true/predicted ages for linear regression.
linr_y_frame.to_csv(f'{output_folder}/linr_y_frame_opn_harm.csv')

In [165]:
# Keep the fitted estimators under an algorithm-specific name, because `models`
# is overwritten by the next training cell; then display the first one.
# NOTE(review): presumably linr[i] is the estimator of fold i (the optional
# save cell below indexes 0..4) - confirm in sep.
linr = models[0]
linr[0]

In [211]:
# Make sure the folder for (optionally) saved models exists.
# exist_ok=True replaces the separate os.path.exists() check: it is a no-op when
# the folder is already there, cannot fail if the folder appears between check
# and creation, and matches how output_folder is created above.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [213]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'opnharm_mix_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'opnharm_mix_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'opnharm_mix_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'opnharm_mix_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'opnharm_mix_linr4.sav'))

In [168]:
# Train and evaluate LassoLars (alpha=0.01) with the project's split helper,
# then show the per-fold metrics. X[:, 1:] skips the participant_id column.
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'lasso regression',
    'opnharm_mix_llreg',
    linear_model.LassoLars(alpha=0.01),
    ml_matrix,
    X[:, 1:],
    y,
)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,opnharm_mix_llreg.0,4.811775,0.859273,0.860441
0,lasso regression-1,1,opnharm_mix_llreg.1,4.838992,0.871889,0.872034
0,lasso regression-2,2,opnharm_mix_llreg.2,5.91413,-1.253633,-1.242526
0,lasso regression-3,3,opnharm_mix_llreg.3,5.205826,0.838658,0.845357
0,lasso regression-4,4,opnharm_mix_llreg.4,5.317488,0.833445,0.833803


In [169]:
# Persist the per-fold lasso metrics.
llreg_k_frame.to_csv(f'{output_folder}/llreg_k_frame_opn_harm.csv')

In [170]:
# One-row summary of the lasso folds (see the note on label columns at avg_linr's output).
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 opnharm_mix_llreg.0 0 opnharm_mix_llre...,5.217642,0.429927,0.433822


In [171]:
# True vs. predicted age pairs (y_test, y_pred) for lasso.
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,60.249315,56.618422
1,45.540000,43.833415
2,23.930000,31.593107
3,56.632083,49.665950
4,32.910000,26.668264
...,...,...
256,55.000000,56.624426
257,67.435616,49.628623
258,69.202740,70.319357
259,30.610000,40.371041


In [172]:
# Persist the true/predicted ages for lasso.
llreg_y_frame.to_csv(f'{output_folder}/llreg_y_frame_opn_harm.csv')

In [173]:
# Keep the fitted lasso estimators before `models` is overwritten by the next
# training cell; then display the first one.
llreg = models[0]
llreg[0]

In [174]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'opnharm_mix_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'opnharm_mix_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'opnharm_mix_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'opnharm_mix_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'opnharm_mix_llreg4.sav'))

In [175]:
# Train and evaluate a decision-tree regressor with the project's split helper,
# then show the per-fold metrics. X[:, 1:] skips the participant_id column.
# random_state is fixed so that re-running the notebook reproduces the same
# trees; an unseeded DecisionTreeRegressor permutes features randomly at each
# split, and every other stochastic estimator in this notebook is seeded.
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'decision tree',
    'opnharm_mix_dtree',
    tree.DecisionTreeRegressor(random_state=0),
    ml_matrix,
    X[:, 1:],
    y,
)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,opnharm_mix_dtree.0,6.641311,0.722131,0.722356
0,decision tree-1,1,opnharm_mix_dtree.1,6.5761,0.750979,0.751956
0,decision tree-2,2,opnharm_mix_dtree.2,7.623492,0.641079,0.641652
0,decision tree-3,3,opnharm_mix_dtree.3,6.793733,0.713973,0.714724
0,decision tree-4,4,opnharm_mix_dtree.4,6.449711,0.725822,0.725991


In [176]:
# Persist the per-fold decision-tree metrics.
dtree_k_frame.to_csv(f'{output_folder}/dtree_k_frame_opn_harm.csv')

In [177]:
# One-row summary of the decision-tree folds.
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 opnharm_mix_dtree.0 0 opnharm_mix_dtre...,6.816869,0.710797,0.711336


In [178]:
# True vs. predicted age pairs (y_test, y_pred) for the decision tree.
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,60.249315,64.257534
1,45.540000,30.680000
2,23.930000,26.103825
3,56.632083,54.169863
4,32.910000,21.260000
...,...,...
256,55.000000,61.550685
257,67.435616,60.616012
258,69.202740,72.813699
259,30.610000,39.390000


In [179]:
# Persist the true/predicted ages for the decision tree.
dtree_y_frame.to_csv(f'{output_folder}/dtree_y_frame_opn_harm.csv')

In [180]:
# Keep the fitted decision-tree estimators before `models` is overwritten by the
# next training cell; then display the first one.
dtree = models[0]
dtree[0]

In [181]:
# Train and evaluate an MLP regressor (seeded, up to 700 iterations) with the
# project's split helper, then show the per-fold metrics.
# X[:, 1:] skips the participant_id column.
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'MLP regression',
    'opnharm_mix_regr',
    MLPRegressor(random_state=1, max_iter=700),
    ml_matrix,
    X[:, 1:],
    y,
)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,opnharm_mix_regr.0,4.997857,0.845078,0.847251
0,MLP regression-1,1,opnharm_mix_regr.1,4.881584,0.862183,0.862253
0,MLP regression-2,2,opnharm_mix_regr.2,8.323435,-6.697799,-6.646949
0,MLP regression-3,3,opnharm_mix_regr.3,5.150812,0.836366,0.836954
0,MLP regression-4,4,opnharm_mix_regr.4,5.102534,0.848412,0.849632


In [182]:
# Persist the per-fold MLP metrics.
regr_k_frame.to_csv(f'{output_folder}/regr_k_frame_opn_harm.csv')

In [183]:
# One-row summary of the MLP folds.
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 opnharm_mix_regr.0 0 opnharm_mix_regr....,5.691245,-0.661152,-0.650172


In [184]:
# True vs. predicted age pairs (y_test, y_pred) for the MLP.
regr_y_frame

Unnamed: 0,y_test,y_pred
0,60.249315,53.850723
1,45.540000,48.054751
2,23.930000,32.907793
3,56.632083,57.098918
4,32.910000,35.239626
...,...,...
256,55.000000,65.372431
257,67.435616,53.201963
258,69.202740,69.889609
259,30.610000,39.154163


In [185]:
# Persist the true/predicted ages for the MLP.
regr_y_frame.to_csv(f'{output_folder}/regr_y_frame_opn_harm.csv')

In [186]:
# Keep the fitted MLP estimators before `models` is overwritten by the next
# training cell; then display the first one.
regr = models[0]
regr[0]

In [187]:
# Train and evaluate a degree-2 polynomial-kernel SVR with the project's split
# helper, then show the per-fold metrics. X[:, 1:] skips the participant_id column.
# NOTE(review): the features are passed as-is; StandardScaler is imported but not
# applied in the visible cells - confirm whether sep scales internally.
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'support vector reg poly2',
    'opnharm_mix_svrp2',
    SVR(C=1.0, kernel='poly', degree=2, epsilon=0.2),
    ml_matrix,
    X[:, 1:],
    y,
)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,opnharm_mix_svrp2.0,10.005818,0.458943,0.463121
0,support vector reg poly2-1,1,opnharm_mix_svrp2.1,10.13213,0.466643,0.468938
0,support vector reg poly2-2,2,opnharm_mix_svrp2.2,12.280338,-1.971326,-1.951225
0,support vector reg poly2-3,3,opnharm_mix_svrp2.3,10.150055,0.433124,0.43374
0,support vector reg poly2-4,4,opnharm_mix_svrp2.4,10.3367,0.427867,0.430994


In [188]:
# Persist the per-fold SVR metrics.
svrp2_k_frame.to_csv(f'{output_folder}/svrp2_k_frame_opn_harm.csv')

In [189]:
# One-row summary of the SVR folds.
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 opnharm_mix_svrp2.0 0 opnharm_mix_svrp...,10.581008,-0.03695,-0.030886


In [190]:
# True vs. predicted age pairs (y_test, y_pred) for the SVR.
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,60.249315,52.102858
1,45.540000,47.947161
2,23.930000,44.704718
3,56.632083,48.147345
4,32.910000,36.206269
...,...,...
256,55.000000,53.544818
257,67.435616,50.741089
258,69.202740,41.750075
259,30.610000,50.351617


In [191]:
# Persist the true/predicted ages for the SVR.
svrp2_y_frame.to_csv(f'{output_folder}/svrp2_y_frame_opn_harm.csv')

In [192]:
# Keep the fitted SVR estimators before `models` is overwritten by the next
# training cell; then display the first one.
svrp2 = models[0]
svrp2[0]

In [193]:
# Train and evaluate ElasticNetCV (5 inner CV folds) with the project's split
# helper, then show the per-fold metrics. X[:, 1:] skips the participant_id column.
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'elasticnetCV',
    'opnharm_mix_eregr',
    ElasticNetCV(cv=5, random_state=12),
    ml_matrix,
    X[:, 1:],
    y,
)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,opnharm_mix_eregr.0,8.778375,0.586261,0.586981
0,elasticnetCV-1,1,opnharm_mix_eregr.1,8.994022,0.578548,0.579077
0,elasticnetCV-2,2,opnharm_mix_eregr.2,7.905633,-2.630606,-2.612491
0,elasticnetCV-3,3,opnharm_mix_eregr.3,9.048926,0.534651,0.538079
0,elasticnetCV-4,4,opnharm_mix_eregr.4,9.061342,0.563559,0.563561


In [194]:
# Persist the per-fold ElasticNetCV metrics.
eregr_k_frame.to_csv(f'{output_folder}/eregr_k_frame_opn_harm.csv')

In [195]:
# One-row summary of the ElasticNetCV folds.
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 opnharm_mix_eregr.0 0 opnharm_mix_ereg...,8.75766,-0.073517,-0.068959


In [196]:
# True vs. predicted age pairs (y_test, y_pred) for ElasticNetCV.
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,60.249315,51.193239
1,45.540000,47.828650
2,23.930000,43.825518
3,56.632083,48.977901
4,32.910000,35.311757
...,...,...
256,55.000000,58.037007
257,67.435616,54.772939
258,69.202740,45.178346
259,30.610000,46.855199


In [197]:
# Persist the true/predicted ages for ElasticNetCV.
eregr_y_frame.to_csv(f'{output_folder}/eregr_y_frame_opn_harm.csv')

In [198]:
# Keep the fitted ElasticNetCV estimators before `models` is overwritten by the
# next training cell; then display the first one.
eregr = models[0]
eregr[0]

In [199]:
# Train and evaluate an extra-trees regressor (100 trees, seeded) with the
# project's split helper, then show the per-fold metrics.
# X[:, 1:] skips the participant_id column.
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'extra trees',
    'opnharm_mix_etreg',
    ExtraTreesRegressor(n_estimators=100, random_state=0),
    ml_matrix,
    X[:, 1:],
    y,
)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,opnharm_mix_etreg.0,4.820598,0.851129,0.851736
0,extra trees-1,1,opnharm_mix_etreg.1,4.586745,0.875523,0.876316
0,extra trees-2,2,opnharm_mix_etreg.2,4.798038,0.859654,0.860144
0,extra trees-3,3,opnharm_mix_etreg.3,4.426621,0.868818,0.87007
0,extra trees-4,4,opnharm_mix_etreg.4,4.685479,0.869909,0.870716


In [200]:
# Persist the per-fold extra-trees metrics.
etreg_k_frame.to_csv(f'{output_folder}/etreg_k_frame_opn_harm.csv')

In [201]:
# One-row summary of the extra-trees folds.
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 opnharm_mix_etreg.0 0 opnharm_mix_etre...,4.663496,0.865007,0.865796


In [202]:
# True vs. predicted age pairs (y_test, y_pred) for extra trees.
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,60.249315,58.837980
1,45.540000,37.977162
2,23.930000,33.104347
3,56.632083,54.258397
4,32.910000,30.098240
...,...,...
256,55.000000,53.732785
257,67.435616,60.179686
258,69.202740,72.963805
259,30.610000,33.209223


In [203]:
# Persist the true/predicted ages for extra trees.
etreg_y_frame.to_csv(f'{output_folder}/etreg_y_frame_opn_harm.csv')

In [204]:
# Keep the fitted extra-trees estimators under their own name (`models` is
# reused by every training cell); then display the first one.
etreg = models[0]
etreg[0]

In [205]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'opnharm_mix_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'opnharm_mix_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'opnharm_mix_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'opnharm_mix_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'opnharm_mix_etreg4.sav'))

Show results ON AVERAGE for each model

In [206]:
# Stack the one-row averages of all seven algorithms into one comparison table
# (row order = order of the list; the original row labels are kept).
mixed_based_opnharmonized_on_testmix = pd.concat(
    [avg_linr, avg_llreg, avg_dtree, avg_regr, avg_svrp2, avg_eregr, avg_etreg],
    axis='index',
)
mixed_based_opnharmonized_on_testmix

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 opnharm_mix_linr.0 0 opnharm_mix_linr....,5.085025,0.467781,0.471254
0,0 lasso regression-0 0 lasso regression-...,0 opnharm_mix_llreg.0 0 opnharm_mix_llre...,5.217642,0.429927,0.433822
0,0 decision tree-0 0 decision tree-1 0 ...,0 opnharm_mix_dtree.0 0 opnharm_mix_dtre...,6.816869,0.710797,0.711336
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 opnharm_mix_regr.0 0 opnharm_mix_regr....,5.691245,-0.661152,-0.650172
0,0 support vector reg poly2-0 0 support v...,0 opnharm_mix_svrp2.0 0 opnharm_mix_svrp...,10.581008,-0.03695,-0.030886
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 opnharm_mix_eregr.0 0 opnharm_mix_ereg...,8.75766,-0.073517,-0.068959
0,0 extra trees-0 0 extra trees-1 0 ext...,0 opnharm_mix_etreg.0 0 opnharm_mix_etre...,4.663496,0.865007,0.865796


In [207]:
# Persist the cross-algorithm comparison table.
mixed_based_opnharmonized_on_testmix.to_csv(f'{output_folder}/mixed_based_opnharmonized_on_testmix.csv')

# Running mixed model over SABRE dataset

## Here we will do an example of running  models made of the entire StrokeMRI and TOP dataset mixed as one

#### Build new models

In [208]:
# Preview TOPMRI again. In the stored output it now carries two extra columns,
# 'binned' and 'fuse_bin', that were not in the file as loaded; they are
# removed again in the next cell before building the training matrix.
TOPMRI.head(3)

Unnamed: 0,participant_id,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,...,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex,binned,fuse_bin
0,sub-0001_1_ses-1_run-1,0.689185,0.509135,0.282154,0.462391,0.462391,-2.806177,20.422746,21.652564,4.033135,...,4.382002,17.914502,75.364629,63.919437,49.345816,57.364028,43.49,0,1,1
1,sub-0002_1_ses-1_run-1,0.695833,0.605874,0.222773,0.452668,0.452668,-1.917033,20.392667,23.087666,2.975182,...,3.270955,19.354915,82.289749,73.600269,61.084929,68.181466,38.3,1,1,5
2,sub-0019_1_ses-1_run-1,0.683954,0.513992,0.306674,0.452239,0.452239,-3.301527,7.744353,15.222875,3.036258,...,3.395662,22.330502,88.936089,81.809197,59.799743,70.530694,32.3,0,0,0


In [96]:
# Build the training matrix from ALL harmonized TOP + StrokeMRI rows.
# 'binned' / 'fuse_bin' are helper columns that are present in TOPMRI only after
# the cross-validation cells above have run (mixed_data is an alias of TOPMRI);
# they are not features. errors='ignore' lets this cell also run on a freshly
# loaded TOPMRI, where the unconditional drop raised a KeyError.
ml_matrix = TOPMRI.drop(columns=['participant_id'])
ml_matrix = ml_matrix.drop(columns=['binned', 'fuse_bin'], errors='ignore')

# Features are every remaining column except the target; both arrays are float.
X = ml_matrix.drop(columns=['age']).values.astype('float')
y = ml_matrix['age'].values.astype('float')


In [97]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [98]:
# Display the final training matrix (features plus the `age` target).
ml_matrix

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,0.689185,0.509135,0.282154,0.462391,0.462391,-2.806177,20.422746,21.652564,4.033135,3.955039,5.565400,4.382002,17.914502,75.364629,63.919437,49.345816,57.364028,43.490000,0
1,0.695833,0.605874,0.222773,0.452668,0.452668,-1.917033,20.392667,23.087666,2.975182,5.003930,2.150061,3.270955,19.354915,82.289749,73.600269,61.084929,68.181466,38.300000,1
2,0.683954,0.513992,0.306674,0.452239,0.452239,-3.301527,7.744353,15.222875,3.036258,1.985136,3.852563,3.395662,22.330502,88.936089,81.809197,59.799743,70.530694,32.300000,0
3,0.642103,0.415276,0.232177,0.494535,0.494535,-1.731741,21.605267,-1.534289,3.573918,2.863531,2.793938,2.939377,22.254758,95.189132,86.216866,65.172481,75.650707,21.970000,1
4,0.608357,0.450407,0.182573,0.483454,0.483454,-1.392525,19.222467,14.477935,2.399085,3.422763,2.941279,2.556149,20.934833,81.692662,73.131382,66.341565,69.895871,37.520000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,0.544227,0.507823,0.492557,0.354290,0.354290,25.548484,67.221034,8.263924,2.301097,2.467848,1.817204,2.800027,22.813858,77.574330,68.050013,51.652869,55.856991,73.928767,0
1037,0.546934,0.490178,0.502093,0.357678,0.357678,20.561162,27.858747,9.175623,4.421446,5.196750,3.161863,4.965876,23.463447,77.621301,72.106387,63.205920,58.301448,74.769863,0
1038,0.558441,0.456762,0.388144,0.397381,0.397381,-0.972065,25.421631,16.561857,1.527717,1.813665,0.848389,1.467022,22.174164,61.001456,59.319177,49.938656,50.928927,74.512329,0
1039,0.611103,0.533650,0.353544,0.406597,0.406597,3.676332,0.149892,17.655268,0.976837,2.308007,3.013636,1.974517,19.603298,66.769083,60.657521,53.780776,55.744016,67.526027,0


In [99]:
# Final linear-regression model, fitted on the complete mixed dataset.
# fit() returns the estimator itself, so construction and fitting are chained.
MIXlinr = LinearRegression().fit(X_train, y_train)
MIXlinr

In [100]:
# Final LassoLars model (same alpha as in the cross-validation run), fitted on
# the complete mixed dataset.
MIXllreg = linear_model.LassoLars(alpha=0.01).fit(X_train, y_train)
MIXllreg

In [101]:
# Final ElasticNetCV model, fitted on the complete mixed dataset.
# NOTE(review): random_state is 17 here but 12 in the cross-validation cell;
# per the scikit-learn docs the seed is only used when selection='random', so
# this presumably has no effect with the default settings - confirm and align.
MIXeregr = ElasticNetCV(cv=5, random_state=17).fit(X_train, y_train)
MIXeregr


In [102]:
# Final extra-trees model (same settings as in the cross-validation run),
# fitted on the complete mixed dataset.
MIXetreg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
MIXetreg

##  Save these four best models

In [103]:

## optional save models
#joblib.dump(MIXlinr, ('../result_models/'+  'opn_harm_mix_MIXlinr.sav'))
#joblib.dump(MIXllreg, ('../result_models/'+ 'opn_harm_mix_MIXllreg.sav'))

#joblib.dump(MIXeregr, ('../result_models/'+ 'opn_harm_mix_MIXeregr.sav'))
#joblib.dump(MIXetreg, ('../result_models/'+ 'opn_harm_mix_MIXetreg.sav'))

In [104]:
# Build the SABRE design matrix: drop the identifier column, then split the
# remaining columns into features (everything except 'age') and target ('age'),
# both as float NumPy arrays.
sabre_ml_matrix = SABRE.drop(columns='participant_id')
X_sabre = sabre_ml_matrix.drop(columns='age').values.astype('float')
y_sabre = sabre_ml_matrix['age'].values.astype('float')

In [105]:
# The whole SABRE set is used for evaluation, so the "test" names are plain
# aliases of the full feature/target arrays (no split is made here).
X_sabre_test, y_sabre_test = X_sabre, y_sabre

In [106]:
# Display the SABRE matrix (participant_id already dropped) for a visual check
# of the feature columns and the 'age' target.
sabre_ml_matrix

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex
0,0.610463,0.482036,0.465766,0.389002,0.389002,9.067945,23.825923,6.124430,2.644780,2.823009,3.366722,2.815465,24.105530,84.656442,75.444075,50.064487,63.490676,78.0,0
1,0.587717,0.459487,0.519845,0.370925,0.370925,5.740985,25.787982,8.420772,3.560863,3.018213,3.007307,2.901465,17.409639,44.445126,37.448646,31.882411,35.021619,71.0,0
2,0.617482,0.523829,0.408515,0.398043,0.398043,6.509469,27.458724,11.441679,3.515536,3.311985,3.073524,3.131030,22.467699,61.482033,50.599385,39.999681,45.739123,72.0,0
3,0.589631,0.482395,0.527265,0.364439,0.364439,5.746882,14.185508,15.207566,2.391070,2.564401,2.143719,2.320280,25.441606,67.401859,58.171038,52.863983,55.897284,78.0,0
4,0.658984,0.482019,0.395179,0.428913,0.428913,11.067411,23.576806,6.619101,2.230960,2.786570,2.926960,2.717244,18.543971,63.547711,56.707380,43.937463,52.420749,75.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,0.608996,0.496167,0.452338,0.388973,0.388973,25.814750,73.250695,0.413216,2.439106,3.509528,2.700832,2.826013,30.416872,80.373692,65.945538,61.214370,64.376333,72.0,0
690,0.688474,0.594745,0.450731,0.394805,0.394805,8.183551,17.882316,13.992773,2.544724,2.466323,2.048735,2.042430,19.709414,85.871124,70.830759,64.479638,68.878427,73.0,0
691,0.545023,0.463438,0.329556,0.411517,0.411517,6.608918,26.591350,17.381330,3.257431,2.825424,2.262128,2.570704,21.125635,63.811213,57.154575,46.162379,52.234033,71.0,0
692,0.630304,0.544326,0.443024,0.388048,0.388048,16.874166,27.458724,2.180060,2.476367,2.418067,2.852294,2.331594,30.066679,80.394890,67.658015,48.394171,58.434330,72.0,0


In [107]:
# Predict SABRE ages with the linear-regression model.
# NOTE: `y_sabre_pred` is reused for every model below, so the scoring cells
# that follow must be run before the next predict cell overwrites it.
y_sabre_pred = MIXlinr.predict(X_sabre_test)

In [108]:
# One-row summary of the linear-regression model on SABRE:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
data = [[
    'Linear Reg',
    'opn_harm_mix_MIXlinr.sav',
    metrics.mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXlinr.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
linr_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
linr_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,opn_harm_mix_MIXlinr.sav,9.016992,-2.049943,-0.987568


In [109]:
# Side-by-side table of true vs. linear-regression-predicted SABRE ages.
data = dict(real=y_sabre_test, predicted=y_sabre_pred)
y_frame_linr_sabre = pd.DataFrame(data)
y_frame_linr_sabre

Unnamed: 0,real,predicted
0,78.0,61.233343
1,71.0,74.557962
2,72.0,66.180610
3,78.0,76.104256
4,75.0,54.230283
...,...,...
689,72.0,70.772207
690,73.0,53.798959
691,71.0,61.221187
692,72.0,67.369894


In [110]:
# Persist the real/predicted table for the linear-regression model on SABRE.
y_frame_linr_sabre.to_csv(f'{output_folder}/y_frame_linr_sabre_opn_harm.csv')

In [111]:
# Predict SABRE ages with the LassoLars model (overwrites the previous
# model's predictions held in `y_sabre_pred`).
y_sabre_pred = MIXllreg.predict(X_sabre_test)

In [112]:
# One-row summary of the LassoLars model on SABRE:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
data = [[
    'Lasso',
    'opn_harmm_mix_MIXllreg.sav',
    metrics.mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXllreg.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
llreg_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
llreg_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Lasso,opn_harmm_mix_MIXllreg.sav,9.213432,-2.205769,-1.061817


In [113]:
# Side-by-side table of true vs. LassoLars-predicted SABRE ages.
data = dict(real=y_sabre_test, predicted=y_sabre_pred)
y_frame_llreg_sabre = pd.DataFrame(data)
y_frame_llreg_sabre

Unnamed: 0,real,predicted
0,78.0,60.380006
1,71.0,73.486837
2,72.0,66.507005
3,78.0,75.145213
4,75.0,53.780757
...,...,...
689,72.0,70.478715
690,73.0,54.127090
691,71.0,61.487925
692,72.0,67.880809


In [114]:
# Persist the real/predicted table for the LassoLars model on SABRE.
y_frame_llreg_sabre.to_csv(f'{output_folder}/y_frame_llreg_sabre_opn_harm.csv')

In [115]:
# Predict SABRE ages with the ElasticNetCV model (overwrites the previous
# model's predictions held in `y_sabre_pred`).
y_sabre_pred = MIXeregr.predict(X_sabre_test)

In [116]:
# One-row summary of the ElasticNetCV model on SABRE:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
# `y_sabre_pred` must currently hold MIXeregr's predictions.
# The file name is spelled as in the joblib.dump line for MIXeregr
# ('opn_harm_...'); it was previously recorded here with a doubled 'm'
# ('opn_harmm_...'), which did not match the saved model file.
data = [[
    'ElasticnetCV',
    'opn_harm_mix_MIXeregr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXeregr.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
eregr_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
eregr_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,ElasticnetCV,opn_harmm_mix_MIXeregr.sav,15.45131,-6.053759,-1.107362


In [117]:
# Side-by-side table of true vs. ElasticNetCV-predicted SABRE ages.
data = dict(real=y_sabre_test, predicted=y_sabre_pred)
y_frame_eregr_sabre = pd.DataFrame(data)
y_frame_eregr_sabre

Unnamed: 0,real,predicted
0,78.0,48.705323
1,71.0,60.936410
2,72.0,59.997369
3,78.0,57.241270
4,75.0,54.103336
...,...,...
689,72.0,64.143649
690,73.0,48.840952
691,71.0,59.341086
692,72.0,57.170710


In [118]:
# Persist the real/predicted table for the ElasticNetCV model on SABRE.
y_frame_eregr_sabre.to_csv(f'{output_folder}/y_frame_eregr_sabre_opn_harm.csv')

In [119]:
# Predict SABRE ages with the extra-trees model (overwrites the previous
# model's predictions held in `y_sabre_pred`).
y_sabre_pred = MIXetreg.predict(X_sabre_test)

In [120]:
# One-row summary of the extra-trees model on SABRE:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
# `y_sabre_pred` must currently hold MIXetreg's predictions.
# The file name is spelled as in the joblib.dump line for MIXetreg
# ('opn_harm_...'); it was previously recorded here with a doubled 'm'
# ('opn_harmm_...'), which did not match the saved model file.
data = [[
    'Extra trees',
    'opn_harm_mix_MIXetreg.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    MIXetreg.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
etreg_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
etreg_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Extra trees,opn_harmm_mix_MIXetreg.sav,6.331908,-0.535789,-0.052754


In [121]:
# Side-by-side table of true vs. extra-trees-predicted SABRE ages.
data = dict(real=y_sabre_test, predicted=y_sabre_pred)
y_frame_etregr_sabre = pd.DataFrame(data)
y_frame_etregr_sabre

Unnamed: 0,real,predicted
0,78.0,68.610850
1,71.0,70.005743
2,72.0,68.559932
3,78.0,73.432105
4,75.0,56.310849
...,...,...
689,72.0,67.793205
690,73.0,58.045806
691,71.0,68.696250
692,72.0,66.103851


In [122]:
# Persist the real/predicted table for the extra-trees model on SABRE.
y_frame_etregr_sabre.to_csv(f'{output_folder}/y_frame_etregr_sabre_opn_harm.csv')

In [123]:
# Stack the four per-model SABRE summaries row-wise into one comparison table.
# Each input frame has a single row labelled 0, so the index repeats 0.
mix_based_opnharmonized_on_sabre = pd.concat(
    [
        linr_results_sabre,
        llreg_results_sabre,
        eregr_results_sabre,
        etreg_results_sabre,
    ],
    axis='index',
)
mix_based_opnharmonized_on_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,opn_harm_mix_MIXlinr.sav,9.016992,-2.049943,-0.987568
0,Lasso,opn_harmm_mix_MIXllreg.sav,9.213432,-2.205769,-1.061817
0,ElasticnetCV,opn_harmm_mix_MIXeregr.sav,15.45131,-6.053759,-1.107362
0,Extra trees,opn_harmm_mix_MIXetreg.sav,6.331908,-0.535789,-0.052754


In [124]:
# Persist the SABRE model-comparison table.
mix_based_opnharmonized_on_sabre.to_csv(
    f'{output_folder}/mix_based_opnharmonized_on_sabre_opn_harm.csv'
)

# Running mixed model over Insight46 dataset

In [125]:
# Build the Insight46 design matrix: drop the identifier column, then split the
# remaining columns into features (everything except 'age') and target ('age'),
# both as float NumPy arrays.
insight_ml_matrix = Insight46.drop(columns='participant_id')
X_insight = insight_ml_matrix.drop(columns='age').values.astype('float')
y_insight = insight_ml_matrix['age'].values.astype('float')

In [126]:
# The whole Insight46 set is used for evaluation, so the "test" names are plain
# aliases of the full feature/target arrays (no split is made here).
X_insight_test, y_insight_test = X_insight, y_insight

In [127]:
# Predict Insight46 ages with the linear-regression model.
# NOTE: `y_insight_pred` is reused for every model below, so the scoring cells
# that follow must be run before the next predict cell overwrites it.
y_insight_pred = MIXlinr.predict(X_insight_test)

In [128]:
# One-row summary of the linear-regression model on Insight46:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
# `y_insight_pred` must currently hold MIXlinr's predictions.
# The file name matches the joblib.dump line for MIXlinr and the SABRE summary
# row; it was previously recorded here as 'opn_harmm_mix_MIXllinr.sav', which
# names no model in this notebook.
data = [[
    'Linear Reg',
    'opn_harm_mix_MIXlinr.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    MIXlinr.score(X_insight_test, y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred),
]]
linr_results_insight = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
linr_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,opn_harmm_mix_MIXllinr.sav,7.897529,-212.870024,-110.99904


In [129]:
# Side-by-side table of true vs. linear-regression-predicted Insight46 ages.
data = dict(real=y_insight_test, predicted=y_insight_pred)
y_frame_linr_insight = pd.DataFrame(data)
y_frame_linr_insight

Unnamed: 0,real,predicted
0,69.733333,63.480929
1,70.288889,58.968971
2,69.883333,58.566579
3,69.866667,74.430121
4,70.661111,60.995545
...,...,...
277,71.705556,72.120518
278,70.822222,61.765637
279,71.341667,54.350732
280,70.741667,68.028313


In [130]:
# Persist the real/predicted table for the linear-regression model on Insight46.
y_frame_linr_insight.to_csv(f'{output_folder}/y_frame_linr_insight_opn_harm.csv')

In [131]:
# Predict Insight46 ages with the LassoLars model (overwrites the previous
# model's predictions held in `y_insight_pred`).
y_insight_pred = MIXllreg.predict(X_insight_test)

In [132]:
# One-row summary of the LassoLars model on Insight46:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
data = [[
    'Lasso',
    'opn_harmm_mix_MIXllreg.sav',
    metrics.mean_absolute_error(y_insight_test, y_insight_pred),
    MIXllreg.score(X_insight_test, y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred),
]]
llreg_results_insight = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
llreg_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Lasso,opn_harmm_mix_MIXllreg.sav,8.039754,-216.789872,-106.281783


In [133]:
# Side-by-side table of true vs. LassoLars-predicted Insight46 ages.
data = dict(real=y_insight_test, predicted=y_insight_pred)
y_frame_llreg_insight = pd.DataFrame(data)
y_frame_llreg_insight

Unnamed: 0,real,predicted
0,69.733333,63.010457
1,70.288889,57.443384
2,69.883333,58.735239
3,69.866667,72.697487
4,70.661111,61.542117
...,...,...
277,71.705556,72.582742
278,70.822222,60.910990
279,71.341667,54.547624
280,70.741667,67.641034


In [134]:
# Persist the real/predicted table for the LassoLars model on Insight46.
y_frame_llreg_insight.to_csv(f'{output_folder}/y_frame_llreg_insight_opn_harm.csv')

In [135]:
# Predict Insight46 ages with the ElasticNetCV model (overwrites the previous
# model's predictions held in `y_insight_pred`).
y_insight_pred = MIXeregr.predict(X_insight_test)

In [136]:
# One-row summary of the ElasticNetCV model on Insight46:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
# `y_insight_pred` must currently hold MIXeregr's predictions.
# The file name is spelled as in the joblib.dump line for MIXeregr
# ('opn_harm_...'); it was previously recorded here with a doubled 'm'
# ('opn_harmm_...'), which did not match the saved model file.
data = [[
    'ElasticnetCV',
    'opn_harm_mix_MIXeregr.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    MIXeregr.score(X_insight_test, y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred),
]]
eregr_results_insight = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
eregr_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,ElasticnetCV,opn_harmm_mix_MIXeregr.sav,15.678359,-680.92011,-187.2199


In [137]:
# Side-by-side table of true vs. ElasticNetCV-predicted Insight46 ages.
data = dict(real=y_insight_test, predicted=y_insight_pred)
y_frame_eregr_insight = pd.DataFrame(data)
y_frame_eregr_insight

Unnamed: 0,real,predicted
0,69.733333,50.579252
1,70.288889,46.090634
2,69.883333,54.293244
3,69.866667,53.600889
4,70.661111,65.026074
...,...,...
277,71.705556,62.227221
278,70.822222,53.876991
279,71.341667,63.036645
280,70.741667,68.734600


In [138]:
# Persist the real/predicted table for the ElasticNetCV model on Insight46.
y_frame_eregr_insight.to_csv(f'{output_folder}/y_frame_eregr_insight_opn_harm.csv')

In [139]:
# Predict Insight46 ages with the extra-trees model (overwrites the previous
# model's predictions held in `y_insight_pred`).
y_insight_pred = MIXetreg.predict(X_insight_test)

In [140]:
# One-row summary of the extra-trees model on Insight46:
# label, model file name, MAE, R^2 (estimator.score) and explained variance.
# `y_insight_pred` must currently hold MIXetreg's predictions.
# The file name is spelled as in the joblib.dump line for MIXetreg; it was
# previously recorded here as 'opnharm_mix_etreg.sav', which matched neither
# the saved model file nor the naming used by the other summary rows.
data = [[
    'Extra trees',
    'opn_harm_mix_MIXetreg.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    MIXetreg.score(X_insight_test, y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred),
]]
etreg_results_insight = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
etreg_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Extra trees,opnharm_mix_etreg.sav,3.801428,-60.901982,-46.05094


In [141]:
# Side-by-side table of true vs. extra-trees-predicted Insight46 ages.
data = dict(real=y_insight_test, predicted=y_insight_pred)
y_frame_etreg_insight = pd.DataFrame(data)
y_frame_etreg_insight

Unnamed: 0,real,predicted
0,69.733333,67.180058
1,70.288889,68.470181
2,69.883333,64.522196
3,69.866667,72.446733
4,70.661111,64.755644
...,...,...
277,71.705556,71.280896
278,70.822222,66.735242
279,71.341667,73.603806
280,70.741667,78.267465


In [142]:
# Persist the real/predicted table for the extra-trees model on Insight46.
y_frame_etreg_insight.to_csv(f'{output_folder}/y_frame_etreg_insight_opn_harm.csv')

In [143]:
# Stack the four per-model Insight46 summaries row-wise into one comparison
# table. Each input frame has a single row labelled 0, so the index repeats 0.
mix_based_opnharmonized_on_insight = pd.concat(
    [
        linr_results_insight,
        llreg_results_insight,
        eregr_results_insight,
        etreg_results_insight,
    ],
    axis='index',
)
mix_based_opnharmonized_on_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,opn_harmm_mix_MIXllinr.sav,7.897529,-212.870024,-110.99904
0,Lasso,opn_harmm_mix_MIXllreg.sav,8.039754,-216.789872,-106.281783
0,ElasticnetCV,opn_harmm_mix_MIXeregr.sav,15.678359,-680.92011,-187.2199
0,Extra trees,opnharm_mix_etreg.sav,3.801428,-60.901982,-46.05094


In [144]:
# Persist the Insight46 model-comparison table.
mix_based_opnharmonized_on_insight.to_csv(
    f'{output_folder}/mix_based_opnharmonized_on_insight.csv'
)