# ML testing: experiment #3g- harmonized with ComBat++

This notebook contains tests for the clinical harmonization paper. It shows models based on the mixed dataset (TOP + StrokeMRI, which become one dataset) with ComBat++ harmonization to the Insight46 and SABRE datasets.

Data: StrokeMRI, TOP, SABRE, Insight46, EDIS, (HELIUS pending)

Harmonisation: ComBat++

Training data: Norment, which is StrokeMRI and TOP together

Testing data: test set from Norment

Further data applied to: SABRE, Insight46, EDIS, (HELIUS pending)

Validation method: K-fold, double-stratified

Brain-age algorithms: LR, lasso, extra trees and ElasticNetCV fully tested (but parameters not optimized); additional algorithms partly tested

Outputs: SubjectID, real age, predicted age of validation and testing sets

### import libraries

In [150]:
import os       # using operating system dependent functionality (folders)
import sys

import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [151]:
# TrainingDataComplete.csv for StrokeMRI and TOP: read further down only to look up
# which participant IDs belong to which cohort.
# NOTE(review): absolute, user-specific paths — the notebook only runs where this
# scratch directory exists; consider moving the root into one configurable variable.
filepath_mri_for_ids = os.path.join('/home/radv/mdijsselhof/my-scratch/CBAdatasets/eScience/','our_datasets/StrokeMRI/' )
filename_mri_for_ids = os.path.join(filepath_mri_for_ids,'TrainingDataComplete.csv')

filepath_top_for_ids = os.path.join('/home/radv/mdijsselhof/my-scratch/CBAdatasets/eScience/','our_datasets/TOP/')
filename_top_for_ids = os.path.join(filepath_top_for_ids,'TrainingDataComplete.csv')


# ComBat++-harmonized outputs, one CSV per cohort. Each file name is built from its
# own filepath_* variable, so relocating a single cohort only requires changing that
# cohort's folder (all folders currently point at the same directory).
filepath_topmri = 'harmonizations/harm_results/plus/'
filename_topmri = os.path.join(filepath_topmri,'plus_harmonized5_topmri.csv')

filepath_sabre = 'harmonizations/harm_results/plus/'
filename_sabre = os.path.join(filepath_sabre,'plus_harmonized5_sabre.csv')

filepath_edis = 'harmonizations/harm_results/plus/'
filename_edis = os.path.join(filepath_edis,'plus_harmonized5_edis.csv')

filepath_helius = 'harmonizations/harm_results/plus/'
filename_helius = os.path.join(filepath_helius,'plus_harmonized5_helius.csv')

filepath_insight46 = 'harmonizations/harm_results/plus/'
filename_insight46 = os.path.join(filepath_insight46,'plus_harmonized5_insight.csv')



# read in data
# index_col=0: the first CSV column is used as the row index for every cohort frame.

TOPMRI = pd.read_csv(filename_topmri, index_col=0)
SABRE = pd.read_csv(filename_sabre, index_col=0)
Insight46 = pd.read_csv(filename_insight46, index_col=0)
EDIS = pd.read_csv(filename_edis, index_col=0)
HELIUS = pd.read_csv(filename_helius, index_col=0)

In [152]:
# Lower-case the column names of every cohort frame. This mutates the frames in
# place, so it has to run before the drops below rebind the names.
datasets = [TOPMRI, SABRE, Insight46, EDIS, HELIUS]
for data in datasets:
    data.columns = data.columns.str.lower()

# Remove the 'id' and 'site' columns from each cohort; the models below do not use them.
TOPMRI = TOPMRI.drop(columns=['id', 'site'])
SABRE = SABRE.drop(columns=['id', 'site'])
Insight46 = Insight46.drop(columns=['id', 'site'])
EDIS = EDIS.drop(columns=['id', 'site'])
HELIUS = HELIUS.drop(columns=['id', 'site'])

In [153]:
TOPMRI

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
1,sub-0668_1,50.400000,1,0.641295,0.495326,0.267765,0.456277,0.812151,-0.003504,14.095155,1.307450,0.298414,0.383769,0.766132,0.626313,17.897126,84.305987,77.900374,53.169370,68.612307
2,sub-0532_1,37.020000,1,0.717819,0.604665,0.326666,0.432841,0.803615,-0.004214,20.980256,0.107729,0.391102,0.721035,0.505814,0.611645,17.180514,95.715073,87.690683,63.897942,76.855858
3,sub-0529_1,30.570000,1,0.664766,0.469301,0.297046,0.466218,0.788410,-0.000441,16.839277,-0.038462,0.675109,0.429992,0.418169,0.468490,19.038816,103.575974,97.644834,68.868748,84.178769
4,sub-0393_1,47.050000,1,0.652443,0.514877,0.221586,0.468642,0.845659,-0.000756,23.934273,0.889728,0.405073,0.598978,0.409987,0.645396,19.807745,86.763994,78.039526,57.132865,70.239474
5,sub-0593_1,44.630000,1,0.660608,0.498725,0.283818,0.457973,0.804583,-0.001428,32.663858,0.316437,0.487874,0.223292,0.392470,0.255708,17.036103,76.528153,74.119399,63.811988,67.523163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103,sub-5928501_1,67.178082,1,0.569080,0.459421,0.331738,0.417177,0.756094,0.005316,26.593239,2.450828,0.212169,0.216974,0.579333,0.597770,17.573698,64.541612,56.398009,36.994968,45.879171
1104,sub-5910502_1,67.794521,1,0.563406,0.455411,0.349609,0.410541,0.743386,-0.002700,28.739947,2.134161,0.344434,0.448625,0.662398,0.524295,15.803213,61.631364,56.238856,47.714656,49.731201
1105,sub-5931002_1,54.769863,0,0.667830,0.571669,0.426031,0.398485,0.742887,0.007069,37.467708,1.728236,1.151203,0.914687,0.770296,0.869043,17.149113,61.355881,54.057235,40.778592,49.074896
1106,sub-5925701_1,66.639344,1,0.582834,0.486575,0.352920,0.408098,0.752543,0.002577,26.608980,2.786829,0.163598,1.104606,1.123917,1.018356,16.129733,51.182443,53.649762,34.115713,41.587339


In [154]:

# Load the per-cohort TrainingDataComplete.csv files; only their participant_id
# columns are used below, to tell TOP and StrokeMRI subjects apart.
IDS_TOP =  pd.read_csv(filename_top_for_ids)
IDS_MRI =  pd.read_csv(filename_mri_for_ids)

In [155]:
# Select SABRE's columns in TOPMRI's column order so the feature matrices line up.
coly = TOPMRI.columns
# NOTE(review): the sex recoding below is disabled and `sex_mapping` is not defined in
# the visible cells. The displayed rows show sex as 0/1 in TOPMRI but 1/2 in SABRE —
# confirm the coding is consistent before applying TOPMRI-trained models to SABRE.
#SABRE = SABRE.assign(sex = SABRE.sex.map(sex_mapping))
SABRE = SABRE[coly]
SABRE.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
2071,sub-301662_1,79,1,0.556867,0.417756,0.419472,0.398236,0.69972,0.02287,27.955998,1.291436,0.625854,0.573227,0.691617,0.590015,19.31808,73.88985,64.460118,48.225298,59.542263
2072,sub-600024_1,70,2,0.509263,0.459559,0.330275,0.393774,0.745938,0.024958,32.532622,1.661724,0.341594,0.371011,0.421151,0.398835,15.001009,61.310531,57.554514,44.953459,50.293292
2073,sub-600134_1,62,1,0.655138,0.621189,0.430547,0.385792,0.743326,0.058942,82.43947,-1.402007,0.350801,0.383971,0.360651,0.410913,35.830425,118.723728,104.449677,84.418909,91.652377


In [156]:
# Drop SABRE rows containing any missing value. The name is rebound, so the
# unfiltered frame is no longer reachable after this cell.
SABRE =SABRE.dropna()
#SABRE.isna().sum()

# Now we need to break up the TOP and StrokeMRI datasets as well, and format them like the others

In [157]:
# Recover the two source cohorts from the pooled harmonized frame by matching
# participant IDs against the per-cohort ID files.
set_top_ids = set(IDS_TOP.participant_id)
set_mri_ids = set(IDS_MRI.participant_id)
StrokeMRI = TOPMRI.loc[TOPMRI['participant_id'].isin(set_mri_ids)]
TOP = TOPMRI.loc[TOPMRI['participant_id'].isin(set_top_ids)]
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
1,sub-0668_1,50.4,1,0.641295,0.495326,0.267765,0.456277,0.812151,-0.003504,14.095155,1.30745,0.298414,0.383769,0.766132,0.626313,17.897126,84.305987,77.900374,53.16937,68.612307
2,sub-0532_1,37.02,1,0.717819,0.604665,0.326666,0.432841,0.803615,-0.004214,20.980256,0.107729,0.391102,0.721035,0.505814,0.611645,17.180514,95.715073,87.690683,63.897942,76.855858
3,sub-0529_1,30.57,1,0.664766,0.469301,0.297046,0.466218,0.78841,-0.000441,16.839277,-0.038462,0.675109,0.429992,0.418169,0.46849,19.038816,103.575974,97.644834,68.868748,84.178769


In [158]:
StrokeMRI.head(3) 

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
530,sub-5917601_1,70.713706,1,0.644244,0.451751,0.467093,0.414706,0.698754,0.015742,14.583978,3.497514,1.116753,0.182737,1.229468,0.81153,35.244011,81.763867,74.529554,57.826735,64.238336
531,sub-5931802_1,47.583562,1,0.640465,0.509745,0.327423,0.432539,0.778008,-0.000574,23.918683,0.598289,0.186089,0.247395,0.258075,0.26707,21.268037,86.813536,80.142934,64.422286,71.10123
532,sub-5911901_1,74.10411,1,0.566288,0.499349,0.466122,0.367301,0.693324,0.040484,39.379186,2.065545,0.150218,0.496044,0.899257,0.340432,22.800846,68.624718,59.403358,56.218983,57.203324


In [159]:
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
1,sub-0668_1,50.4,1,0.641295,0.495326,0.267765,0.456277,0.812151,-0.003504,14.095155,1.30745,0.298414,0.383769,0.766132,0.626313,17.897126,84.305987,77.900374,53.16937,68.612307
2,sub-0532_1,37.02,1,0.717819,0.604665,0.326666,0.432841,0.803615,-0.004214,20.980256,0.107729,0.391102,0.721035,0.505814,0.611645,17.180514,95.715073,87.690683,63.897942,76.855858
3,sub-0529_1,30.57,1,0.664766,0.469301,0.297046,0.466218,0.78841,-0.000441,16.839277,-0.038462,0.675109,0.429992,0.418169,0.46849,19.038816,103.575974,97.644834,68.868748,84.178769


In [160]:
# check for any participant IDs shared between the StrokeMRI and TOP subsets
strokers = set(StrokeMRI.participant_id)
topers = set(TOP.participant_id)
z = strokers.intersection(topers)
# an empty set means no participant appears in both cohorts
print(z) 

set()


In [161]:
# make mixed StrokeMRI and TOP dataset
# The harmonized TOPMRI frame is used directly as the mixed dataset; the concat of the two subsets is left disabled.
#mixed_data = pd.concat([TOP, StrokeMRI], sort=False)
# Note: this binds a second name to the same DataFrame object; it is not a copy.
mixed_data = TOPMRI
mixed_data.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
1,sub-0668_1,50.4,1,0.641295,0.495326,0.267765,0.456277,0.812151,-0.003504,14.095155,1.30745,0.298414,0.383769,0.766132,0.626313,17.897126,84.305987,77.900374,53.16937,68.612307
2,sub-0532_1,37.02,1,0.717819,0.604665,0.326666,0.432841,0.803615,-0.004214,20.980256,0.107729,0.391102,0.721035,0.505814,0.611645,17.180514,95.715073,87.690683,63.897942,76.855858
3,sub-0529_1,30.57,1,0.664766,0.469301,0.297046,0.466218,0.78841,-0.000441,16.839277,-0.038462,0.675109,0.429992,0.418169,0.46849,19.038816,103.575974,97.644834,68.868748,84.178769


In [162]:
# Folder for the result CSVs of this experiment (relative to the working directory).
output_folder = '3_ComBatPlusPlus'

# exist_ok=True keeps the cell re-runnable when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

## Build ML models

In [163]:
# Feature/target setup for the cross-validation runs: everything except the
# participant ID goes into ml_matrix, 'age' is the regression target.
ml_matrix = mixed_data.drop(columns='participant_id')
X = ml_matrix.drop(columns='age').values
y = ml_matrix['age'].values.astype('float')

In [164]:
# Stratified shuffle-split evaluation of plain linear regression (the output shows folds 0-4).
# X[:,1:] drops the first feature column, which is 'sex' given mixed_data's column order;
# ml_matrix is passed separately (presumably for the stratification — confirm in cvasl).
# NOTE(review): the file-name prefix here ('req_neuroharm_mix_linr') differs from the
# 'unharm_mix_*' prefixes used for the other algorithms in this notebook.
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'req_neuroharm_mix_linr', LinearRegression(), ml_matrix, X[:,1:], y)

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


In [165]:
mixed_data.columns

Index(['participant_id', 'age', 'sex', 'gm_vol', 'wm_vol', 'csf_vol',
       'gm_icvratio', 'gmwm_icvratio', 'wmhvol_wmvol', 'wmh_count',
       'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
       'deepwm_b_cbf', 'aca_b_cbf', 'mca_b_cbf', 'pca_b_cbf', 'totalgm_b_cbf'],
      dtype='object')

In [166]:
linr_k_frame

Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,linear regression-0,0,req_neuroharm_mix_linr.0,2.912454,0.95372,0.953751
0,linear regression-1,1,req_neuroharm_mix_linr.1,2.852651,0.958185,0.958336
0,linear regression-2,2,req_neuroharm_mix_linr.2,2.80544,0.954104,0.95417
0,linear regression-3,3,req_neuroharm_mix_linr.3,2.981037,0.95181,0.952139
0,linear regression-4,4,req_neuroharm_mix_linr.4,2.808857,0.954923,0.954978


In [167]:
# Write the per-fold metrics inside output_folder. Plain string concatenation would
# glue the folder name onto the file name instead of producing a path into the folder.
linr_k_frame.to_csv(os.path.join(output_folder, 'linr_k_frame_combatplusplus_harm.csv'))

In [168]:
# Collapse the per-fold metrics into one summary row (mae/r2/explained_variance are the fold means).
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 req_neuroharm_mix_linr.0 0 req_neuroha...,2.872088,0.954548,0.954675


In [169]:
# Save the y_test / y_pred pairs inside output_folder (os.path.join supplies the
# path separator that string concatenation would omit), then display them.
linr_y_frame.to_csv(os.path.join(output_folder, 'linr_y_frame_combatplusplus_harm.csv'))
linr_y_frame

Unnamed: 0,y_test,y_pred
0,27.550000,33.328492
1,52.495890,55.177301
2,31.372603,29.813501
3,50.345205,51.703607
4,77.593735,72.553437
...,...,...
272,69.424658,76.551467
273,48.730000,45.529257
274,60.183060,65.210404
275,40.355191,36.026198


In [170]:
# Save the y_test / y_pred pairs inside output_folder; os.path.join supplies the
# path separator that string concatenation would omit.
linr_y_frame.to_csv(os.path.join(output_folder, 'linr_y_frame_combatplusplus_harm.csv'))

In [171]:
# Keep the first element of the returned `models` (presumably the fitted per-fold
# estimators — the save cell below indexes linr[0]..linr[4]) and display its first entry.
linr = models[0]
linr[0]

In [172]:
# Make sure the folder for saved models exists. exist_ok=True replaces the separate
# existence check (no check-then-create gap) and matches how output_folder is created.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [173]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'unharm_mix_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'unharm_mix_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'unharm_mix_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'unharm_mix_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'unharm_mix_linr4.sav'))

In [174]:
# Cross-validated evaluation of LassoLars (alpha=0.01); X[:,1:] leaves out the first feature column.
# NOTE(review): the 'unharm_mix_llreg' prefix labels these results as unharmonized although
# this notebook works on ComBat++-harmonized data — confirm the label is intended.
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'unharm_mix_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X[:,1:], y)
llreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,lasso regression-0,0,unharm_mix_llreg.0,3.167097,0.945834,0.945982
0,lasso regression-1,1,unharm_mix_llreg.1,3.005264,0.95174,0.951767
0,lasso regression-2,2,unharm_mix_llreg.2,2.917516,0.950897,0.951087
0,lasso regression-3,3,unharm_mix_llreg.3,3.313136,0.939982,0.940521
0,lasso regression-4,4,unharm_mix_llreg.4,3.012993,0.950917,0.951012


In [175]:
# Write the per-fold metrics inside output_folder; os.path.join supplies the path
# separator that string concatenation would omit.
llreg_k_frame.to_csv(os.path.join(output_folder, 'llreg_k_frame_combatplusplus_harm.csv'))

In [176]:
# Collapse the per-fold lasso metrics into a single summary row.
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 lasso regression-0 0 lasso regression-...,0 unharm_mix_llreg.0 0 unharm_mix_llreg....,3.083201,0.947874,0.948074


In [177]:
llreg_y_frame

Unnamed: 0,y_test,y_pred
0,27.550000,36.831496
1,52.495890,54.520112
2,31.372603,30.362388
3,50.345205,52.035684
4,77.593735,72.587939
...,...,...
272,69.424658,73.064995
273,48.730000,45.402442
274,60.183060,63.298349
275,40.355191,37.498809


In [178]:
# Save the y_test / y_pred pairs inside output_folder; os.path.join supplies the
# path separator that string concatenation would omit.
llreg_y_frame.to_csv(os.path.join(output_folder, 'llreg_y_frame_combatplusplus_harm.csv'))

In [179]:
# Keep the first element of the returned `models` (presumably the fitted per-fold
# lasso estimators — confirm in cvasl) and display its first entry.
llreg = models[0]
llreg[0]

In [180]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'unharm_mix_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'unharm_mix_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'unharm_mix_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'unharm_mix_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'unharm_mix_llreg4.sav'))

In [181]:
# Cross-validated evaluation of a decision-tree regressor; X[:,1:] leaves out the first feature column.
# NOTE(review): DecisionTreeRegressor() is created without random_state, so the scores
# may differ between runs unless the helper seeds it — confirm.
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'unharm_mix_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X[:,1:], y)
dtree_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,decision tree-0,0,unharm_mix_dtree.0,4.462672,0.891351,0.891528
0,decision tree-1,1,unharm_mix_dtree.1,4.083627,0.910416,0.910596
0,decision tree-2,2,unharm_mix_dtree.2,4.580262,0.871544,0.873685
0,decision tree-3,3,unharm_mix_dtree.3,4.410002,0.885069,0.885229
0,decision tree-4,4,unharm_mix_dtree.4,4.464044,0.883201,0.883409


In [182]:
# Write the per-fold metrics inside output_folder; os.path.join supplies the path
# separator that string concatenation would omit.
dtree_k_frame.to_csv(os.path.join(output_folder, 'dtree_k_frame_combatplusplus_harm.csv'))

In [183]:
# Collapse the per-fold decision-tree metrics into a single summary row.
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 decision tree-0 0 decision tree-1 0 ...,0 unharm_mix_dtree.0 0 unharm_mix_dtree....,4.400121,0.888316,0.888889


In [184]:
dtree_y_frame

Unnamed: 0,y_test,y_pred
0,27.550000,36.540000
1,52.495890,53.760289
2,31.372603,31.350000
3,50.345205,51.130000
4,77.593735,70.456808
...,...,...
272,69.424658,69.986301
273,48.730000,46.512329
274,60.183060,61.972176
275,40.355191,35.767760


In [185]:
# Save the y_test / y_pred pairs inside output_folder; os.path.join supplies the
# path separator that string concatenation would omit.
dtree_y_frame.to_csv(os.path.join(output_folder, 'dtree_y_frame_combatplusplus_harm.csv'))

In [186]:
# Keep the first element of the returned `models` (presumably the fitted per-fold
# decision trees — confirm in cvasl) and display its first entry.
dtree = models[0]
dtree[0]

In [187]:
# Cross-validated evaluation of an MLP regressor (seeded, at most 700 iterations);
# X[:,1:] leaves out the first feature column.
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'unharm_mix_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X[:,1:], y)
regr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,MLP regression-0,0,unharm_mix_regr.0,3.142912,0.93253,0.932689
0,MLP regression-1,1,unharm_mix_regr.1,2.946391,0.949682,0.951722
0,MLP regression-2,2,unharm_mix_regr.2,3.169527,0.942787,0.947939
0,MLP regression-3,3,unharm_mix_regr.3,3.377538,0.930903,0.938797
0,MLP regression-4,4,unharm_mix_regr.4,2.761971,0.953925,0.953977


In [188]:
# Write the per-fold metrics inside output_folder; os.path.join supplies the path
# separator that string concatenation would omit.
regr_k_frame.to_csv(os.path.join(output_folder, 'regr_k_frame_combatplusplus_harm.csv'))

In [189]:
# Collapse the per-fold MLP metrics into a single summary row.
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 unharm_mix_regr.0 0 unharm_mix_regr.1 ...,3.079668,0.941966,0.945025


In [190]:
regr_y_frame

Unnamed: 0,y_test,y_pred
0,27.550000,32.982025
1,52.495890,52.631526
2,31.372603,27.677244
3,50.345205,47.952510
4,77.593735,75.333054
...,...,...
272,69.424658,70.870590
273,48.730000,47.846619
274,60.183060,60.713790
275,40.355191,37.155308


In [191]:
# Save the y_test / y_pred pairs inside output_folder; os.path.join supplies the
# path separator that string concatenation would omit.
regr_y_frame.to_csv(os.path.join(output_folder, 'regr_y_frame_combatplusplus_harm.csv'))

In [192]:
# Keep the first element of the returned `models` (presumably the fitted per-fold
# MLP estimators — confirm in cvasl) and display its first entry.
regr = models[0]
regr[0]

In [193]:
# Cross-validated evaluation of a degree-2 polynomial-kernel SVR; X[:,1:] leaves out the first feature column.
# NOTE(review): no feature scaling is applied in this notebook before the SVR (StandardScaler
# is imported but unused here); unless the helper scales internally this may explain why its
# scores are far below the other models — confirm.
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'unharm_mix_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X[:,1:], y)
svrp2_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,support vector reg poly2-0,0,unharm_mix_svrp2.0,11.450985,0.350904,0.357836
0,support vector reg poly2-1,1,unharm_mix_svrp2.1,12.059076,0.230519,0.23498
0,support vector reg poly2-2,2,unharm_mix_svrp2.2,11.378245,0.315521,0.326349
0,support vector reg poly2-3,3,unharm_mix_svrp2.3,11.820058,0.307657,0.31164
0,support vector reg poly2-4,4,unharm_mix_svrp2.4,11.636808,0.313669,0.318306


In [194]:
# Write the per-fold metrics inside output_folder; os.path.join supplies the path
# separator that string concatenation would omit.
svrp2_k_frame.to_csv(os.path.join(output_folder, 'svrp2_k_frame_combatplusplus_harm.csv'))

In [195]:
# Collapse the per-fold SVR metrics into a single summary row.
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 support vector reg poly2-0 0 support v...,0 unharm_mix_svrp2.0 0 unharm_mix_svrp2....,11.669035,0.303654,0.309822


In [196]:
svrp2_y_frame

Unnamed: 0,y_test,y_pred
0,27.550000,46.098574
1,52.495890,50.973043
2,31.372603,43.203618
3,50.345205,52.561105
4,77.593735,55.215942
...,...,...
272,69.424658,62.551680
273,48.730000,48.099129
274,60.183060,48.128574
275,40.355191,52.571359


In [197]:
# Save the y_test / y_pred pairs inside output_folder; os.path.join supplies the
# path separator that string concatenation would omit.
svrp2_y_frame.to_csv(os.path.join(output_folder, 'svrp2_y_frame_combatplusplus_harm.csv'))

In [198]:
# Keep the first element of the returned `models` (presumably the fitted per-fold
# SVR estimators — confirm in cvasl) and display its first entry.
svrp2 = models[0]
svrp2[0]

In [199]:
# Cross-validated evaluation of ElasticNetCV (inner cv=5, random_state=12);
# X[:,1:] leaves out the first feature column.
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'unharm_mix_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X[:,1:], y)
eregr_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,elasticnetCV-0,0,unharm_mix_eregr.0,3.397519,0.939041,0.939056
0,elasticnetCV-1,1,unharm_mix_eregr.1,3.479249,0.937144,0.937156
0,elasticnetCV-2,2,unharm_mix_eregr.2,3.363955,0.93507,0.935333
0,elasticnetCV-3,3,unharm_mix_eregr.3,3.547136,0.933188,0.933377
0,elasticnetCV-4,4,unharm_mix_eregr.4,3.485565,0.936355,0.936407


In [200]:
# Write the per-fold metrics inside output_folder; os.path.join supplies the path
# separator that string concatenation would omit.
eregr_k_frame.to_csv(os.path.join(output_folder, 'eregr_k_frame_combatplusplus_harm.csv'))

In [201]:
# Collapse the per-fold ElasticNetCV metrics into a single summary row.
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 unharm_mix_eregr.0 0 unharm_mix_eregr....,3.454685,0.93616,0.936266


In [202]:
eregr_y_frame

Unnamed: 0,y_test,y_pred
0,27.550000,36.053251
1,52.495890,54.011852
2,31.372603,32.092267
3,50.345205,52.455351
4,77.593735,74.665858
...,...,...
272,69.424658,73.237986
273,48.730000,46.372930
274,60.183060,62.348250
275,40.355191,40.421878


In [203]:
# Save the y_test / y_pred pairs inside output_folder; os.path.join supplies the
# path separator that string concatenation would omit.
eregr_y_frame.to_csv(os.path.join(output_folder, 'eregr_y_frame_combatplusplus_harm.csv'))

In [204]:
# Keep the first element of the returned `models` (presumably the fitted per-fold
# ElasticNetCV estimators — confirm in cvasl) and display its first entry.
eregr = models[0]
eregr[0]

In [205]:
# Cross-validated evaluation of an extra-trees regressor (100 trees, seeded);
# X[:,1:] leaves out the first feature column.
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'unharm_mix_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X[:,1:], y)
etreg_k_frame

  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])
  mod_results = pd.concat([mod_results, mod_results_current_fold])


Unnamed: 0,algorithm,fold,file_name,mae,r2,explained_variance
0,extra trees-0,0,unharm_mix_etreg.0,3.058213,0.945463,0.945695
0,extra trees-1,1,unharm_mix_etreg.1,3.070389,0.950881,0.951022
0,extra trees-2,2,unharm_mix_etreg.2,2.878715,0.949451,0.949481
0,extra trees-3,3,unharm_mix_etreg.3,2.948716,0.949529,0.949532
0,extra trees-4,4,unharm_mix_etreg.4,2.960794,0.948077,0.948198


In [206]:
# Write the per-fold metrics inside output_folder; os.path.join supplies the path
# separator that string concatenation would omit.
etreg_k_frame.to_csv(os.path.join(output_folder, 'etreg_k_frame_combatplusplus_harm.csv'))

In [207]:
# Collapse the per-fold extra-trees metrics into a single summary row.
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 extra trees-0 0 extra trees-1 0 ext...,0 unharm_mix_etreg.0 0 unharm_mix_etreg....,2.983365,0.94868,0.948785


In [208]:
etreg_y_frame

Unnamed: 0,y_test,y_pred
0,27.550000,34.035759
1,52.495890,53.672680
2,31.372603,31.715975
3,50.345205,49.618281
4,77.593735,73.981688
...,...,...
272,69.424658,70.400182
273,48.730000,45.429596
274,60.183060,65.769061
275,40.355191,37.094277


In [209]:
# Save the y_test / y_pred pairs inside output_folder; os.path.join supplies the
# path separator that string concatenation would omit.
etreg_y_frame.to_csv(os.path.join(output_folder, 'etreg_y_frame_combatplusplus_harm.csv'))

In [210]:
# Keep the first element of the returned `models` (presumably the fitted per-fold
# extra-trees estimators — the save cell below indexes etreg[0]..etreg[4]) and display its first entry.
etreg = models[0]
etreg[0]

In [211]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'unharm_mix_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'unharm_mix_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'unharm_mix_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'unharm_mix_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'unharm_mix_etreg4.sav'))

Show results ON AVERAGE for each model

In [212]:
# Stack the one-row fold averages of every algorithm into a single comparison table.
mixed_based_combatplusplus_harmonized_on_testmix = pd.concat(
    [avg_linr, avg_llreg, avg_dtree, avg_regr, avg_svrp2, avg_eregr, avg_etreg],
    axis=0,
)
mixed_based_combatplusplus_harmonized_on_testmix

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,0 linear regression-0 0 linear regressio...,0 req_neuroharm_mix_linr.0 0 req_neuroha...,2.872088,0.954548,0.954675
0,0 lasso regression-0 0 lasso regression-...,0 unharm_mix_llreg.0 0 unharm_mix_llreg....,3.083201,0.947874,0.948074
0,0 decision tree-0 0 decision tree-1 0 ...,0 unharm_mix_dtree.0 0 unharm_mix_dtree....,4.400121,0.888316,0.888889
0,0 MLP regression-0 0 MLP regression-1 0 ...,0 unharm_mix_regr.0 0 unharm_mix_regr.1 ...,3.079668,0.941966,0.945025
0,0 support vector reg poly2-0 0 support v...,0 unharm_mix_svrp2.0 0 unharm_mix_svrp2....,11.669035,0.303654,0.309822
0,0 elasticnetCV-0 0 elasticnetCV-1 0 e...,0 unharm_mix_eregr.0 0 unharm_mix_eregr....,3.454685,0.93616,0.936266
0,0 extra trees-0 0 extra trees-1 0 ext...,0 unharm_mix_etreg.0 0 unharm_mix_etreg....,2.983365,0.94868,0.948785


In [213]:
# Save the comparison table inside output_folder; os.path.join supplies the path
# separator that string concatenation would omit.
mixed_based_combatplusplus_harmonized_on_testmix.to_csv(os.path.join(output_folder, 'mixed_based_combatplusplus_harmonized_on_testmix.csv'))

# Running mixed model over SABRE dataset

## Here we run, as an example, models trained on the entire StrokeMRI and TOP dataset mixed as one

#### Build new models

In [214]:
TOPMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_icvratio,gmwm_icvratio,wmhvol_wmvol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b_cbf,aca_b_cbf,mca_b_cbf,pca_b_cbf,totalgm_b_cbf
1,sub-0668_1,50.4,1,0.641295,0.495326,0.267765,0.456277,0.812151,-0.003504,14.095155,1.30745,0.298414,0.383769,0.766132,0.626313,17.897126,84.305987,77.900374,53.16937,68.612307
2,sub-0532_1,37.02,1,0.717819,0.604665,0.326666,0.432841,0.803615,-0.004214,20.980256,0.107729,0.391102,0.721035,0.505814,0.611645,17.180514,95.715073,87.690683,63.897942,76.855858
3,sub-0529_1,30.57,1,0.664766,0.469301,0.297046,0.466218,0.78841,-0.000441,16.839277,-0.038462,0.675109,0.429992,0.418169,0.46849,19.038816,103.575974,97.644834,68.868748,84.178769


In [215]:
# Rebuild the feature/target arrays from the full harmonized TOP+StrokeMRI frame,
# this time forcing both to float.
ml_matrix = TOPMRI.drop(columns='participant_id')
X = ml_matrix.drop(columns='age').values.astype('float')
y = ml_matrix['age'].values.astype('float')

In [216]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [217]:
# Linear regression fit on all rows and all feature columns of X.
# NOTE(review): the cross-validation runs passed X[:,1:] (first feature column, 'sex',
# left out) whereas this fit keeps it — confirm the difference is intended.
MIXlinr = LinearRegression()
MIXlinr.fit(X_train, y_train)

In [218]:
# LassoLars (alpha=0.01) fit on the full training matrix.
MIXllreg = linear_model.LassoLars(alpha=0.01)
MIXllreg.fit(X_train, y_train)

In [219]:
# ElasticNetCV (inner cv=5) fit on the full training matrix.
# NOTE(review): random_state=17 here, but 12 in the cross-validation run earlier in the notebook.
MIXeregr = ElasticNetCV(cv=5, random_state=17)
MIXeregr.fit(X_train, y_train)


In [220]:
# Extra-trees regressor (100 trees, seeded) fit on the full training matrix.
MIXetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
MIXetreg.fit(X_train, y_train)

In [221]:
##  Save these four best models

In [222]:
## optional save models
#joblib.dump(MIXlinr, ('../result_models/'+  'neuro_harm_mix_MIXlinr.sav'))
#joblib.dump(MIXllreg, ('../result_models/'+ 'neuro_harmm_mix_MIXllreg.sav'))
#joblib.dump(MIXeregr, ('../result_models/'+ 'neuro_harm_mix_MIXeregr.sav'))
#joblib.dump(MIXetreg, ('../result_models/'+ 'neuro_harm_mix_MIXetreg.sav'))

In [223]:
# Build the SABRE evaluation arrays the same way as the training arrays:
# participant_id is dropped, 'age' is the target, the remaining columns are features.
# NOTE(review): features are matched to the training matrix by column position only —
# confirm SABRE has the same column order as TOPMRI.
sabre_ml_matrix = SABRE.drop(columns='participant_id')
X_sabre = sabre_ml_matrix.drop(columns='age').values.astype('float')
y_sabre = sabre_ml_matrix['age'].values.astype('float')

In [224]:
# The whole SABRE set is used as the test set (no split).
X_sabre_test, y_sabre_test = X_sabre, y_sabre

In [225]:
# Predict SABRE ages with the linear model; y_sabre_pred is overwritten for each model in turn.
y_sabre_pred = MIXlinr.predict(X=X_sabre_test)

In [226]:
# One-row summary of the linear model on SABRE: MAE, R^2 (via the estimator's score)
# and explained variance, all against the true SABRE ages.
# NOTE(review): the file_name label says 'unharm' although this notebook's outputs are
# named '*_combatplusplus_harm' — confirm the intended label.
linr_results_sabre = pd.DataFrame(
    [{
        'algorithm': 'Linear Reg',
        'file_name': 'unharm_mix_linr0.sav',
        'mae': mean_absolute_error(y_sabre_test, y_sabre_pred),
        'r2': MIXlinr.score(X_sabre_test, y_sabre_test),
        'explained_variance': metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
linr_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,unharm_mix_linr0.sav,21.425112,-17.875008,-11.14368


In [227]:
# Side-by-side table of true SABRE ages and the predictions currently held in y_sabre_pred.
y_frame_linr_sabre = pd.DataFrame({'real': y_sabre_test, 'predicted': y_sabre_pred})
y_frame_linr_sabre

Unnamed: 0,real,predicted
0,79.0,58.665962
1,70.0,63.829092
2,62.0,23.017629
3,78.0,38.027621
4,68.0,111.214922
...,...,...
701,66.0,52.644945
702,72.0,54.407969
703,73.0,48.409064
704,65.0,50.098924


In [228]:
# Save the linear-model real/predicted table for SABRE; path = output_folder + file name.
y_frame_linr_sabre.to_csv(f"{output_folder}y_frame_linr_sabre_combatplusplus_harm.csv")

In [229]:
# Predict SABRE ages with the LassoLars model (replaces the previous y_sabre_pred).
y_sabre_pred = MIXllreg.predict(X=X_sabre_test)

In [230]:
# One-row summary of the LassoLars model on SABRE: MAE, R^2 (via the estimator's score)
# and explained variance.
# NOTE(review): the file_name label says 'unharm' although this notebook's outputs are
# named '*_combatplusplus_harm' — confirm the intended label.
llreg_results_sabre = pd.DataFrame(
    [{
        'algorithm': 'Lasso',
        'file_name': 'unharm_mix_lassor0.sav',
        'mae': mean_absolute_error(y_sabre_test, y_sabre_pred),
        'r2': MIXllreg.score(X_sabre_test, y_sabre_test),
        'explained_variance': metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
llreg_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Lasso,unharm_mix_lassor0.sav,24.976871,-25.207671,-16.368406


In [231]:
# Side-by-side table of true SABRE ages and the predictions currently held in y_sabre_pred.
y_frame_llreg_sabre = pd.DataFrame({'real': y_sabre_test, 'predicted': y_sabre_pred})
y_frame_llreg_sabre

Unnamed: 0,real,predicted
0,79.0,53.579122
1,70.0,63.923730
2,62.0,27.407571
3,78.0,29.656713
4,68.0,127.410981
...,...,...
701,66.0,50.771191
702,72.0,53.756473
703,73.0,45.854572
704,65.0,47.504042


In [232]:
# Save the LassoLars real/predicted table for SABRE; path = output_folder + file name.
y_frame_llreg_sabre.to_csv(f"{output_folder}y_frame_llreg_sabre_combatplusplus_harm.csv")

In [233]:
# Predict SABRE ages with the ElasticNetCV model (replaces the previous y_sabre_pred).
y_sabre_pred = MIXeregr.predict(X=X_sabre_test)

In [234]:
# One-row summary of the ElasticNetCV model on SABRE: MAE, R^2 (via the estimator's score)
# and explained variance.
eregr_results_sabre = pd.DataFrame(
    [{
        'algorithm': 'ElasticnetCV',
        'file_name': 'harm_mix_elasticregr.sav',
        'mae': mean_absolute_error(y_sabre_test, y_sabre_pred),
        'r2': MIXeregr.score(X_sabre_test, y_sabre_test),
        'explained_variance': metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
eregr_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,ElasticnetCV,harm_mix_elasticregr.sav,24.44974,-22.89107,-13.870428


In [235]:
# Side-by-side table of true SABRE ages and the predictions currently held in y_sabre_pred.
y_frame_eregr_sabre = pd.DataFrame({'real': y_sabre_test, 'predicted': y_sabre_pred})
y_frame_eregr_sabre

Unnamed: 0,real,predicted
0,79.0,52.751833
1,70.0,62.040210
2,62.0,31.558614
3,78.0,30.076063
4,68.0,121.989971
...,...,...
701,66.0,50.354234
702,72.0,50.457100
703,73.0,44.103966
704,65.0,47.938406


In [236]:
# Save the ElasticNetCV real/predicted table for SABRE; path = output_folder + file name.
y_frame_eregr_sabre.to_csv(f"{output_folder}y_frame_eregr_sabre_combatplusplus_harm.csv")

In [237]:
# Predict SABRE ages with the extra-trees model (replaces the previous y_sabre_pred).
y_sabre_pred = MIXetreg.predict(X=X_sabre_test)

In [238]:
# One-row summary of the extra-trees model on SABRE: MAE, R^2 (via the estimator's score)
# and explained variance.
etreg_results_sabre = pd.DataFrame(
    [{
        'algorithm': 'Extra trees',
        'file_name': 'harm_mix_etreg.sav',
        'mae': mean_absolute_error(y_sabre_test, y_sabre_pred),
        'r2': MIXetreg.score(X_sabre_test, y_sabre_test),
        'explained_variance': metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
etreg_results_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Extra trees,harm_mix_etreg.sav,22.817574,-15.435786,-3.99278


In [239]:
# Side-by-side table of true SABRE ages and the predictions currently held in y_sabre_pred.
y_frame_etregr_sabre = pd.DataFrame({'real': y_sabre_test, 'predicted': y_sabre_pred})
y_frame_etregr_sabre

Unnamed: 0,real,predicted
0,79.0,57.319292
1,70.0,62.472767
2,62.0,31.354030
3,78.0,32.915544
4,68.0,72.260113
...,...,...
701,66.0,46.372093
702,72.0,53.639005
703,73.0,49.048715
704,65.0,43.025937


In [240]:
# Save the extra-trees real/predicted table for SABRE; path = output_folder + file name.
y_frame_etregr_sabre.to_csv(f"{output_folder}y_frame_etregr_sabre_combatplusplus_harm.csv")

In [241]:
# Stack the four one-row SABRE summaries (linear, LassoLars, ElasticNetCV, extra trees)
# into a single comparison table.
# Each input frame carries the index label 0, so a plain concat would leave four rows
# all labelled 0; ignore_index=True renumbers them 0..3 so rows are uniquely addressable
# in the table and in the CSV written from it.
mix_based_combatplusplus_on_sabre = pd.concat(
    [
        linr_results_sabre,
        llreg_results_sabre,
        eregr_results_sabre,
        etreg_results_sabre,
    ],
    axis=0,
    ignore_index=True,
)
mix_based_combatplusplus_on_sabre

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,unharm_mix_linr0.sav,21.425112,-17.875008,-11.14368
0,Lasso,unharm_mix_lassor0.sav,24.976871,-25.207671,-16.368406
0,ElasticnetCV,harm_mix_elasticregr.sav,24.44974,-22.89107,-13.870428
0,Extra trees,harm_mix_etreg.sav,22.817574,-15.435786,-3.99278


In [242]:
# Save the combined SABRE summary table; path = output_folder + file name.
mix_based_combatplusplus_on_sabre.to_csv(f"{output_folder}mix_based_combatplusplus_on_sabre.csv")

# Running mixed model over Insight46 dataset

## Here we apply the four models trained on the entire mixed (StrokeMRI + TOP) dataset to Insight46

In [243]:
# Build the Insight46 evaluation arrays the same way as the training arrays:
# participant_id is dropped, 'age' is the target, the remaining columns are features.
# NOTE(review): features are matched to the training matrix by column position only —
# confirm Insight46 has the same column order as TOPMRI.
insight_ml_matrix = Insight46.drop(columns='participant_id')
X_insight = insight_ml_matrix.drop(columns='age').values.astype('float')
y_insight = insight_ml_matrix['age'].values.astype('float')

In [244]:
# The whole Insight46 set is used as the test set (no split).
X_insight_test, y_insight_test = X_insight, y_insight

In [245]:
# Predict Insight46 ages with the linear model; y_insight_pred is overwritten for each model in turn.
y_insight_pred = MIXlinr.predict(X=X_insight_test)

In [246]:
# One-row summary of the linear model on Insight46: MAE, R^2 (via the estimator's score)
# and explained variance.
# NOTE(review): the 'neuroharm' prefix in file_name does not match the
# '*_combatplusplus_harm' naming of this notebook's outputs — confirm the intended label.
linr_results_insight = pd.DataFrame(
    [{
        'algorithm': 'Linear Reg',
        'file_name': 'neuroharm_mix_linr0.sav',
        'mae': mean_absolute_error(y_insight_test, y_insight_pred),
        'r2': MIXlinr.score(X_insight_test, y_insight_test),
        'explained_variance': metrics.explained_variance_score(y_insight_test, y_insight_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
linr_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,neuroharm_mix_linr0.sav,19.910157,-702.018643,-490.031098


In [247]:
# Side-by-side table of true Insight46 ages and the predictions currently held in y_insight_pred.
y_frame_linr_insight = pd.DataFrame({'real': y_insight_test, 'predicted': y_insight_pred})
y_frame_linr_insight

Unnamed: 0,real,predicted
0,71.072222,55.594385
1,73.000000,61.132867
2,74.000000,51.512148
3,71.644444,54.374416
4,71.694444,52.162341
...,...,...
612,72.000000,54.085965
613,73.000000,163.091964
614,72.000000,52.648401
615,70.605556,50.694045


In [248]:
# Save the linear-model real/predicted table for Insight46; path = output_folder + file name.
y_frame_linr_insight.to_csv(f"{output_folder}y_frame_linr_insight_combatplusplus_harm.csv")

In [249]:
# Predict Insight46 ages with the LassoLars model (replaces the previous y_insight_pred).
y_insight_pred = MIXllreg.predict(X=X_insight_test)

In [250]:
# One-row summary of the LassoLars model on Insight46: MAE, R^2 (via the estimator's score)
# and explained variance.
# NOTE(review): the 'neuroharm' prefix in file_name does not match the
# '*_combatplusplus_harm' naming of this notebook's outputs — confirm the intended label.
llreg_results_insight = pd.DataFrame(
    [{
        'algorithm': 'Lasso',
        'file_name': 'neuroharm_mix_lassor0.sav',
        'mae': mean_absolute_error(y_insight_test, y_insight_pred),
        'r2': MIXllreg.score(X_insight_test, y_insight_test),
        'explained_variance': metrics.explained_variance_score(y_insight_test, y_insight_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
llreg_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Lasso,neuroharm_mix_lassor0.sav,22.777167,-947.405923,-670.978145


In [251]:
# Side-by-side table of true Insight46 ages and the predictions currently held in y_insight_pred.
y_frame_llreg_insight = pd.DataFrame({'real': y_insight_test, 'predicted': y_insight_pred})
y_frame_llreg_insight

Unnamed: 0,real,predicted
0,71.072222,50.076866
1,73.000000,48.735794
2,74.000000,49.777718
3,71.644444,52.662767
4,71.694444,48.847716
...,...,...
612,72.000000,46.947165
613,73.000000,181.496506
614,72.000000,47.881773
615,70.605556,50.248099


In [252]:
# Save the LassoLars real/predicted table for Insight46; path = output_folder + file name.
y_frame_llreg_insight.to_csv(f"{output_folder}y_frame_llreg_insight_combatplusplus_harm.csv")

In [253]:
# Predict Insight46 ages with the ElasticNetCV model (replaces the previous y_insight_pred).
y_insight_pred = MIXeregr.predict(X=X_insight_test)

In [254]:
# One-row summary of the ElasticNetCV model on Insight46: MAE, R^2 (via the estimator's score)
# and explained variance.
# NOTE(review): the 'neuroharm' prefix in file_name does not match the
# '*_combatplusplus_harm' naming of this notebook's outputs — confirm the intended label.
eregr_results_insight = pd.DataFrame(
    [{
        'algorithm': 'ElasticnetCV',
        'file_name': 'neuroharm_mix_elasticregr.sav',
        'mae': mean_absolute_error(y_insight_test, y_insight_pred),
        'r2': MIXeregr.score(X_insight_test, y_insight_test),
        'explained_variance': metrics.explained_variance_score(y_insight_test, y_insight_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
eregr_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,ElasticnetCV,neuroharm_mix_elasticregr.sav,22.798186,-895.989315,-614.255842


In [255]:
# Side-by-side table of true Insight46 ages and the predictions currently held in y_insight_pred.
y_frame_eregr_insight = pd.DataFrame({'real': y_insight_test, 'predicted': y_insight_pred})
y_frame_eregr_insight

Unnamed: 0,real,predicted
0,71.072222,47.956888
1,73.000000,48.963306
2,74.000000,50.274671
3,71.644444,52.992207
4,71.694444,46.627563
...,...,...
612,72.000000,48.031817
613,73.000000,171.946251
614,72.000000,46.914868
615,70.605556,50.358815


In [256]:
# Save the ElasticNetCV real/predicted table for Insight46; path = output_folder + file name.
y_frame_eregr_insight.to_csv(f"{output_folder}y_frame_eregr_insight_combatplusplus_harm.csv")

In [257]:
# Predict Insight46 ages with the extra-trees model (replaces the previous y_insight_pred).
y_insight_pred = MIXetreg.predict(X=X_insight_test)

In [258]:
# One-row summary of the extra-trees model on Insight46: MAE, R^2 (via the estimator's score)
# and explained variance.
# NOTE(review): the 'neuroharm' prefix in file_name does not match the
# '*_combatplusplus_harm' naming of this notebook's outputs — confirm the intended label.
etreg_results_insight = pd.DataFrame(
    [{
        'algorithm': 'Extra trees',
        'file_name': 'neuroharm_mix_etreg.sav',
        'mae': mean_absolute_error(y_insight_test, y_insight_pred),
        'r2': MIXetreg.score(X_insight_test, y_insight_test),
        'explained_variance': metrics.explained_variance_score(y_insight_test, y_insight_pred),
    }],
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
etreg_results_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Extra trees,neuroharm_mix_etreg.sav,22.552031,-380.305874,-27.849622


In [259]:
# Side-by-side table of true Insight46 ages and the predictions currently held in y_insight_pred.
y_frame_etreg_insight = pd.DataFrame({'real': y_insight_test, 'predicted': y_insight_pred})
y_frame_etreg_insight

Unnamed: 0,real,predicted
0,71.072222,47.756026
1,73.000000,49.942385
2,74.000000,48.712318
3,71.644444,51.073842
4,71.694444,46.477476
...,...,...
612,72.000000,43.917790
613,73.000000,79.009308
614,72.000000,44.255815
615,70.605556,44.055838


In [260]:
# Save the extra-trees real/predicted table for Insight46; path = output_folder + file name.
y_frame_etreg_insight.to_csv(f"{output_folder}y_frame_etreg_insight_combatplusplus_harm.csv")

In [261]:
# Stack the four one-row Insight46 summaries (linear, LassoLars, ElasticNetCV, extra trees)
# into a single comparison table.
# Each input frame carries the index label 0, so a plain concat would leave four rows
# all labelled 0; ignore_index=True renumbers them 0..3 so rows are uniquely addressable
# in the table and in the CSV written from it.
mix_based_combatplusplus_on_insight = pd.concat(
    [
        linr_results_insight,
        llreg_results_insight,
        eregr_results_insight,
        etreg_results_insight,
    ],
    axis=0,
    ignore_index=True,
)
mix_based_combatplusplus_on_insight

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Linear Reg,neuroharm_mix_linr0.sav,19.910157,-702.018643,-490.031098
0,Lasso,neuroharm_mix_lassor0.sav,22.777167,-947.405923,-670.978145
0,ElasticnetCV,neuroharm_mix_elasticregr.sav,22.798186,-895.989315,-614.255842
0,Extra trees,neuroharm_mix_etreg.sav,22.552031,-380.305874,-27.849622


In [262]:
# Save the combined Insight46 summary table; path = output_folder + file name.
mix_based_combatplusplus_on_insight.to_csv(f"{output_folder}mix_based_combatplusplus_on_insight.csv")