# ML testing: experiment #1g- harmonized with combat++

This notebook contains tests for the MRI conference abstract. It builds models based on the StrokeMRI and TOP datasets after harmonization with combat++, and shows how the models built on each dataset perform on the other dataset.

Data: StrokeMRI, TOP

Harmonisation: combat++

Training data: StrokeMRI and/or TOP

Testing data: StrokeMRI and/or TOP test subsets

Further data applied to: none

Validation method: K-fold double-stratified

Brain-age algorithms: linear regression, lasso, extra trees and ElasticNetCV fully tested (parameters not optimized); additional algorithms partly tested

Outputs: ? what do we want to have here?

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
 
# # demo stuff
import ipywidgets as widgets
from ipywidgets import interactive
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [None]:
# Folder holding the harmonized csv files (the same folder for both datasets).
filepath_mri = 'harmonizations/harm_results/plus/'
filepath_top = 'harmonizations/harm_results/plus/'

# Full paths of the harmonized StrokeMRI and TOP tables.
filename_mri = os.path.join(filepath_mri, 'plus_harmonized_mri_from_v1.csv')
filename_top = os.path.join(filepath_top, 'plus_harmonized_top_from_v1.csv')

In [None]:
# Load both harmonized tables, using the first csv column as the row index.
StrokeMRI = pd.read_csv(filename_mri, index_col=0)
TOP = pd.read_csv(filename_top, index_col=0)

In [None]:
TOP

In [None]:
# For each dataset, remove the first remaining column and then the
# 'ID' and 'Site' columns.
TOP = TOP.drop(columns=TOP.columns[0]).drop(columns=['ID', 'Site'])
StrokeMRI = StrokeMRI.drop(columns=StrokeMRI.columns[0]).drop(columns=['ID', 'Site'])
StrokeMRI

In [None]:
TOP

In [None]:
# Lower-case every column name in both tables (later cells use
# 'participant_id' and 'age' in lower case).
for _frame in (TOP, StrokeMRI):
    _frame.columns = _frame.columns.str.lower()

In [None]:
# # Now we need to flip the sex back to numbers for a correlation
# sex_mapping = {'F':0,'M':1}
# TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

In [None]:
# StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.tail(3)

In [None]:
# Toggle button used to mark whether the features were logged; its value
# decides which output folder the results are written to.
# NOTE(review): "logged" presumably means log-transformed features — confirm.
loged_feat = widgets.ToggleButton(
    value=False,  # starts unpressed
    description='Click me if features logged',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Description',
    icon='check' # (FontAwesome names without the `fa-` prefix)
)
loged_feat  # display the widget

In [None]:
loged_feat.value

In [None]:
# Choose the output folder from the toggle state: a pressed button means the
# features were flagged as logged.
output_folder = '1g_loged_outputs' if loged_feat.value else '1g_no_log_outputs'

# Create the folder when it is missing; an existing folder is left as it is.
os.makedirs(output_folder, exist_ok=True)

## Build ML models based on combat++ harmonized StrokeMRI

In [None]:
# Modelling table for StrokeMRI: everything except the participant id.
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
# Features are all remaining columns apart from age; the target is age.
# Both are converted to float arrays.
X = ml_matrix.drop('age', axis=1).values.astype('float')
y = ml_matrix['age'].values.astype('float')


In [None]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'combat_plusp_harm_mri_linr', LinearRegression(), ml_matrix, X, y)

In [None]:
linr_k_frame

In [None]:
linr_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_linr_k_frame.csv')

In [None]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

In [None]:
linr_y_frame

In [None]:
linr_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_linr_y_frame.csv')

In [None]:
linr = models[0]
linr[0]

In [None]:
# Make sure the folder for saved models exists. exist_ok=True removes the
# separate existence check (and its check-then-create race) and matches how
# output_folder is created.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'combat_plusp_harm_mri_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'combat_plusp_harm_mri_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'combat_plusp_harm_mri_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'combat_plusp_harm_mri_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'combat_plusp_harm_mri_linr4.sav'))

In [None]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'combat_plusp_harm_mri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

In [None]:
llreg_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_llreg_k_frame.csv')

In [None]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

In [None]:
llreg_y_frame

In [None]:
llreg_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_llreg_y_frame.csv')

In [None]:
llreg = models[0]
llreg[0]

In [None]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'combat_plusp_harm_mri_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'combat_plusp_harm_mri_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'combat_plusp_harm_mri_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'combat_plusp_harm_mri_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'combat_plusp_harm_mri_llreg4.sav'))

In [None]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'combat_plusp_harm_mri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

In [None]:
dtree_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_dtree_k_frame.csv')

In [None]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

In [None]:
dtree_y_frame

In [None]:
dtree_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_dtree_y_frame.csv')

In [None]:
dtree = models[0]
dtree[0]

In [None]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'combat_plusp_harm_mri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

In [None]:
regr_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_regr_k_frame.csv')

In [None]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

In [None]:
regr_y_frame

In [None]:
regr_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_regr_y_frame.csv')

In [None]:
regr = models[0]
regr[0]

### Note: I'm not actually saving the MLP or SVR models here because they are not the best performers. If necessary, this can be added.

In [None]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'combat_plusp_harm_mri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

In [None]:
svrp2_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_svrp2_k_frame.csv')

In [None]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

In [None]:
svrp2_y_frame

In [None]:
# Save the SVR (poly2) y frame; the file name follows the
# '..._mri_<model>_y_frame.csv' pattern of the other StrokeMRI models.
svrp2_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_svrp2_y_frame.csv')

In [None]:
svrp2 = models[0]
svrp2[0]

In [None]:
# ElasticNetCV k-fold run on the StrokeMRI matrix. The run label uses 'mri',
# like every other StrokeMRI run in this section.
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'elasticnetCV',
    'combat_plusp_harm_mri_eregr',
    ElasticNetCV(cv=5, random_state=12),
    ml_matrix,
    X,
    y,
)
eregr_k_frame

In [None]:
# Save the ElasticNetCV k-fold frame; 'mri' is part of the name so the file is
# named like the other StrokeMRI k-fold outputs.
eregr_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_eregr_k_frame.csv')

In [None]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

In [None]:
eregr_y_frame

In [None]:
eregr_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_eregr_y_frame.csv')

In [None]:
eregr = models[0]
eregr[0]

In [None]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'combat_plusp_harm_mri_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

In [None]:
# Save the extra-trees k-fold frame, named like the other StrokeMRI k-fold
# outputs ('harmonized' spelled out and 'mri' included).
etreg_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_etreg_k_frame.csv')

In [None]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

In [None]:
etreg_y_frame

In [None]:
# Save the extra-trees y frame; 'mri' is part of the name so the file is named
# like the other StrokeMRI y-frame outputs.
etreg_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_mri_etreg_y_frame.csv')

In [None]:
etreg = models[0]
etreg[0]

In [None]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'combat_plusp_harm_mri_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'combat_plusp_harm_mri_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'combat_plusp_harm_mri_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'combat_plusp_harm_mri_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'combat_plusp_harm_mri_etreg4.sav'))

Show results ON AVERAGE for each model

In [None]:
mri_based_combat_plusp_harmonized_on_testmri =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
mri_based_combat_plusp_harmonized_on_testmri

In [None]:
mri_based_combat_plusp_harmonized_on_testmri.to_csv(output_folder + '/mri_based_combat_plusp_harm_on_testmri_AVERAGES.csv')

## Now we will build models based on the whole combat++ harmonized StrokeMRI dataset, and apply them to TOP.

In [None]:
# Rebuild the StrokeMRI modelling table (participant id removed) for the
# whole-dataset models.
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
# X holds every column except age; y holds age. Both as float arrays.
X = ml_matrix.drop('age', axis=1).values.astype('float')
y = ml_matrix['age'].values.astype('float')


In [None]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [None]:
MRIlinr = LinearRegression()
MRIlinr.fit(X_train, y_train)

In [None]:
MRIllreg = linear_model.LassoLars(alpha=0.01)
MRIllreg.fit(X_train, y_train)

In [None]:
MRIeregr = ElasticNetCV(cv=5, random_state=17)
MRIeregr.fit(X_train, y_train)


In [None]:
MRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
MRIetreg.fit(X_train, y_train)

In [None]:
##  Save these four best models

In [None]:

## optional save models
#joblib.dump(MRIlinr, ('../result_models/' + 'combat_plusp_harm_whole_mri_linr.sav'))
#joblib.dump(MRIllreg, ('../result_models/'+ 'combat_plusp_harm_whole_mri_llreg1.sav'))
#joblib.dump(MRIeregr, ('../result_models/'+ 'combat_plusp_harm_whole_mri_eregr3.sav'))
#joblib.dump(MRIetreg, ('../result_models/'+ 'combat_plusp_harm_whole_mri_etreg4.sav'))

# Running whole MRI model over TOP dataset

In [None]:
# Prepare TOP for evaluation: drop the participant id, then split the table
# into the feature columns (all but age) and the age target, as float arrays.
top_ml_matrix = TOP.drop('participant_id', axis=1)
X_top = top_ml_matrix.drop('age', axis=1).values.astype('float')
y_top = top_ml_matrix['age'].values.astype('float')

In [None]:
X_top_test = X_top
y_top_test = y_top


In [None]:
y_top_pred = MRIlinr.predict(X_top_test)

In [None]:
# print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
# print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'linear regression',
    'combat_plusp_harm_whole_mri_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIlinr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [None]:
# Side-by-side table of the real TOP ages and the ages predicted by the
# whole-StrokeMRI linear regression model.
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
# NOTE(review): this is the only compare table in the notebook that still
# calls reset_index(), which adds an extra 'index' column to the table and its
# csv; the other compare cells have this line commented out — confirm which
# is intended.
linr_compare = linr_compare.reset_index()
linr_compare

In [None]:
linr_compare.to_csv(output_folder + '/whole_combat_plusp_harm_mri_linr_compare_on_top.csv')

In [None]:
y_top_pred = MRIllreg.predict(X_top_test)

In [None]:
# One-row summary for the whole-StrokeMRI lasso model evaluated on TOP:
# mean absolute error, the estimator's score() as r2, and explained variance.
# The file_name label follows the 'combat_plusp_harm_whole_<dataset>_<model>.sav'
# pattern used by the other result rows.
data = [[
    'lasso regression',
    'combat_plusp_harm_whole_mri_llreg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIllreg.score(X_top_test, y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'])
llreg_results

In [None]:
# Side-by-side table of the real TOP ages and the ages predicted by the
# whole-StrokeMRI lasso model.
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
# NOTE(review): the prediction column is keyed 'linr_y_pred_age' although it
# holds lasso predictions (the same key is reused in the other compare cells);
# presumably a copy-paste leftover — check downstream readers of the csv
# before renaming it.
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

In [None]:
llreg_compare.to_csv(output_folder + '/whole_combat_plusp_harm_mri_llreg_compare_on_top.csv')

In [None]:
y_top_pred = MRIeregr.predict(X_top_test)

In [None]:
# One-row summary for the whole-StrokeMRI ElasticNetCV model evaluated on TOP:
# mean absolute error, the estimator's score() as r2, and explained variance.
# The file_name label names the elastic net model ('eregr'), not 'linr'.
data = [[
    'elasticnetCV',
    'combat_plusp_harm_whole_mri_eregr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIeregr.score(X_top_test, y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'])
#eregr_results

In [None]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

In [None]:
eregr_compare.to_csv(output_folder + '/whole_combat_plusp_harm_mri_eregr_compare_on_top.csv')

In [None]:
y_top_pred = MRIetreg.predict(X_top_test)

In [None]:
# One-row summary for the whole-StrokeMRI extra-trees model evaluated on TOP:
# mean absolute error, the estimator's score() as r2, and explained variance.
# The file_name label names this notebook's combat++ extra-trees model.
data = [[
    'extra trees',
    'combat_plusp_harm_whole_mri_etreg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    MRIetreg.score(X_top_test, y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'])
#etreg_results

In [None]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

In [None]:
etreg_compare.to_csv(output_folder + '/whole_combat_plusp_harm_mri_etreg_compare_on_top.csv')

In [None]:
# Combine the result rows of the four whole-StrokeMRI models (evaluated on
# TOP) into one table.
mri_based_combat_plusp__harmonized_on_top = pd.concat(
    [linr_results, llreg_results, eregr_results, etreg_results],
    axis=0,
)
# Write the table to the output folder, mirroring the csv that is saved for
# the TOP-based models evaluated on StrokeMRI.
mri_based_combat_plusp__harmonized_on_top.to_csv(output_folder + '/whole_mri_based_combat_plus-harmonized_on_top.csv')
mri_based_combat_plusp__harmonized_on_top

## Now we will run the exact opposite process.
1. We will explore TOP based models via k-folded results, 
2. We will make a general harmonized TOP model (based on all of TOP)
3. We will apply the best of these models to the StrokeMRI dataset

### Build ML models based on combat++ harmonized TOP

In [None]:
# Modelling table for TOP: everything except the participant id.
ml_matrix = TOP.drop('participant_id', axis=1)
# Features are all remaining columns apart from age; the target is age.
# Both are converted to float arrays.
X = ml_matrix.drop('age', axis=1).values.astype('float')
y = ml_matrix['age'].values.astype('float')


In [None]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'combat_plusp_harm_top_linr', LinearRegression(), ml_matrix, X, y)

In [None]:
linr_k_frame

In [None]:
linr_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_linr_k_frame.csv')

In [None]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

In [None]:
linr_y_frame

In [None]:
linr_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_linr_y_frame.csv')

In [None]:
linr = models[0]
linr[0]

In [None]:
# Make sure the folder for saved models exists. exist_ok=True removes the
# separate existence check (and its check-then-create race) and matches how
# output_folder is created.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'combat_plusp_harm_top_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'combat_plusp_harm_top_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'combat_plusp_harm_top_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'combat_plusp_harm_top_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'combat_plusp_harm_top_linr4.sav'))

In [None]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'combat_plusp_harm_top_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

In [None]:
llreg_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_llreg_k_frame.csv')

In [None]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

In [None]:
llreg_y_frame

In [None]:
llreg_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_llreg_y_frame.csv')

In [None]:
llreg = models[0]
llreg[0]

In [None]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'combat_plusp_harm_top_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'combat_plusp_harm_top_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'combat_plusp_harm_top_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'combat_plusp_harm_top_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'combat_plusp_harm_top_llreg4.sav'))

In [None]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'combat_plusp_harm_top_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

In [None]:
dtree_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_dtree_k_frame.csv')

In [None]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

In [None]:
dtree_y_frame

In [None]:
dtree_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_dtree_y_frame.csv')

In [None]:
dtree = models[0]
dtree[0]

In [None]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'combat_plusp_harm_top_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

In [None]:
regr_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_regr_k_frame.csv')

In [None]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

In [None]:
regr_y_frame

In [None]:
regr_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_regr_y_frame.csv')

In [None]:
regr = models[0]
regr[0]

### Note: I'm not actually saving the MLP or SVR models here because they are not the best performers. If necessary, this can be added.

In [None]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'combat_plusp_harm_top_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

In [None]:
svrp2_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_svrp2_k_frame.csv')

In [None]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

In [None]:
svrp2_y_frame

In [None]:
svrp2_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_svrp2_y_frame.csv')

In [None]:
svrp2 = models[0]
svrp2[0]

In [None]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'combat_plusp_harm_top_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

In [None]:
eregr_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_eregr_k_frame.csv')

In [None]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

In [None]:
eregr_y_frame

In [None]:
eregr_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_eregr_y_frame.csv')

In [None]:
eregr = models[0]
eregr[0]

In [None]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'combat_plusp_harm_top_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

In [None]:
# Save the extra-trees k-fold frame for TOP ('harmonized' spelled like the
# other TOP k-fold outputs).
etreg_k_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_etreg_k_frame.csv')

In [None]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

In [None]:
etreg_y_frame

In [None]:
etreg_y_frame.to_csv(output_folder + '/combat_plusp_harmonized_top_etreg_y_frame.csv')

In [None]:
etreg = models[0]
etreg[0]

In [None]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'combat_plusp_harm_top_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'combat_plusp_harm_top_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'combat_plusp_harm_top_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'combat_plusp_harm_top_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'combat_plusp_harm_top_etreg4.sav'))

Show results ON AVERAGE for each model

In [None]:
top_based_combat_plusp_harmonized_on_testtop =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
top_based_combat_plusp_harmonized_on_testtop

In [None]:
# Save the per-model fold averages for the TOP-based models tested on TOP;
# 'testtop' in the name parallels the 'testmri' averages file.
top_based_combat_plusp_harmonized_on_testtop.to_csv(output_folder + '/top_based_combat_plusp_harmonized_on_testtop_AVERAGES.csv')

## Now we will build models based on the whole harmonized TOP dataset, and apply them to StrokeMRI.

In [None]:
# Rebuild the TOP modelling table (participant id removed) for the
# whole-dataset models.
ml_matrix = TOP.drop('participant_id', axis=1)
# X holds every column except age; y holds age. Both as float arrays.
X = ml_matrix.drop('age', axis=1).values.astype('float')
y = ml_matrix['age'].values.astype('float')

In [None]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [None]:
TOPlinr = LinearRegression()
TOPlinr.fit(X_train, y_train)

In [None]:
TOPllreg = linear_model.LassoLars(alpha=0.01)
TOPllreg.fit(X_train, y_train)

In [None]:
TOPeregr = ElasticNetCV(cv=5, random_state=17)
TOPeregr.fit(X_train, y_train)

In [None]:
TOPetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPetreg.fit(X_train, y_train)

##  Save these four best models

In [None]:
## optional save models
#joblib.dump(TOPlinr, ('../result_models/' + 'combat_plusp_harm_whole_top_linr.sav'))
#joblib.dump(TOPllreg, ('../result_models/'+ 'combat_plusp_harm_whole_top_llreg1.sav'))
#joblib.dump(TOPeregr, ('../result_models/'+ 'combat_plusp_harm_whole_top_eregr3.sav'))
#joblib.dump(TOPetreg, ('../result_models/'+ 'combat_plusp_harm_whole_top_etreg4.sav'))

# Running whole TOP model over MRI dataset

In [None]:
# Prepare StrokeMRI for evaluation: drop the participant id, then split the
# table into the feature columns (all but age) and the age target, as float
# arrays.
mri_ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X_mri = mri_ml_matrix.drop('age', axis=1).values.astype('float')
y_mri = mri_ml_matrix['age'].values.astype('float')

In [None]:
X_mri_test = X_mri
y_mri_test = y_mri


In [None]:
y_mri_pred = TOPlinr.predict(X_mri_test)

In [None]:
data= [[
    'linear regression',
    'combat_plusp_harm_whole_top_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPlinr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [None]:
linr_y_test = y_mri_test
linr_y_pred = y_mri_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
#linr_compare = linr_compare.reset_index()
linr_compare

In [None]:
# Save the real-vs-predicted age table for the whole-TOP linear regression
# model applied to StrokeMRI, named like the other '..._compare_on_mri.csv'
# files.
linr_compare.to_csv(output_folder + '/whole_combat_plusp_harm_top_linr_compare_on_mri.csv')

In [None]:
y_mri_pred = TOPllreg.predict(X_mri_test)

In [None]:
data= [[
    'lasso regression',
    'combat_plusp_harm_whole_top_llreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPllreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

In [None]:
llreg_y_test = y_mri_test
llreg_y_pred = y_mri_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
#llreg_compare = llreg_compare.reset_index()
llreg_compare

In [None]:
llreg_compare.to_csv(output_folder + '/whole_combat_plusp_harm_top_llreg_compare_on_mri.csv')

In [None]:
y_mri_pred = TOPeregr.predict(X_mri_test)

In [None]:
# One-row summary for the whole-TOP ElasticNetCV model evaluated on StrokeMRI:
# mean absolute error, the estimator's score() as r2, and explained variance.
# The file_name label names the elastic net model ('eregr'), not 'linr'.
data = [[
    'elasticnetCV',
    'combat_plusp_harm_whole_top_eregr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPeregr.score(X_mri_test, y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'])
#eregr_results

In [None]:
eregr_y_test = y_mri_test
eregr_y_pred = y_mri_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare

In [None]:
eregr_compare.to_csv(output_folder + '/whole_combat_plusp_harm_top_eregr_compare_on_mri.csv')

In [None]:
y_mri_pred = TOPetreg.predict(X_mri_test)

In [None]:
# One-row summary for the whole-TOP extra-trees model evaluated on StrokeMRI:
# mean absolute error, the estimator's score() as r2, and explained variance.
# The file_name label names the whole-TOP extra-trees model.
data = [[
    'extra trees',
    'combat_plusp_harm_whole_top_etreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    TOPetreg.score(X_mri_test, y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'])
#etreg_results

In [None]:
etreg_y_test = y_mri_test
etreg_y_pred = y_mri_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

In [None]:
# Save the real-vs-predicted age table for the whole-TOP extra-trees model
# applied to StrokeMRI, named like the other '..._compare_on_mri.csv' files.
etreg_compare.to_csv(output_folder + '/whole_combat_plusp_harm_top_etreg_compare_on_mri.csv')

compile csvs of results

In [None]:
top_based_combat_plusp_harmonized_on_mri =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
top_based_combat_plusp_harmonized_on_mri

In [None]:
top_based_combat_plusp_harmonized_on_mri.to_csv(output_folder + '/whole_top_based_combat_plus-harmonized_on_mri.csv')