# ML testing: experiment #2dc- harmonized with auto combat (TOPMRI on EDIS)

This notebook tests the autocombat algorithm and ML for age prediction. It shows models based on harmonized StrokeMRI+TOP data and how they perform on EDIS. The autocombat algorithm requires that samples are not unique, even in continuous variables. Therefore, before harmonization, an age-group variable was created, and it was removed again after harmonization. Age group can currently be split by tens or by twos (a more fine-grained split). A widget allows the choice.


Data: StrokeMRI, TOP, EDIS

Harmonisation: Auto-combat

Training data: StrokeMRI and TOP together

Testing data: StrokeMRI and TOP together, test subsets

Further data applied to: EDIS

Validation method: K-fold double-stratified

Brain-age algorithms: LR, lasso, extra trees, and ElasticNetCV fully tested (but without optimized parameters); additional algorithms partly tested

Outputs: ? what do we want to have here?

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [None]:
# Age-split schemes offered for the autocombat-harmonized inputs.
# The chosen entry is read later through `split.value`.
possibilities = [
    'split_into_tens',
    'split_into_twos',
]

split = widgets.Dropdown(
    description='split on age',
    options=possibilities,
    value='split_into_tens',  # preselected entry
    disabled=False,
)
split  # last expression of the cell: renders the dropdown

In [None]:
# Folder containing the autocombat harmonization results.
filepath = 'harmonizations/harm_results/autocombat/'

# 'split_into_tens' selects the un-prefixed files; any other dropdown value
# falls back to the 'fg_'-prefixed files.
filename_topmri, filename_edis = (
    os.path.join(filepath, csv_name)
    for csv_name in (
        ('autocom_harm_topmri_v_e.csv', 'autocom_harm_edis.csv')
        if split.value == 'split_into_tens'
        else ('fg_autocom_harm_topmri_v_e.csv', 'fg_autocom_harm_edis1.csv')
    )
)

In [None]:
TOPMRI = pd.read_csv(filename_topmri)
EDIS= pd.read_csv(filename_edis)

In [None]:
TOPMRI = TOPMRI.drop(TOPMRI.columns[0],axis=1)
EDIS = EDIS.drop(EDIS.columns[0],axis=1)
EDIS.head(3)

In [None]:
TOPMRI.head(3)

In [None]:
output_folder = '2dc_loged_outputs'
os.makedirs(output_folder, exist_ok=True)

## Build ML models based on auto combat StrokeMRI-TOP mixed set (TOPMRI)

In [None]:
# Feature table: everything except the participant identifier.
ml_matrix = TOPMRI.drop(columns='participant_id')
# X holds all remaining columns except 'age'; y is the 'age' column.
# Both are converted to float numpy arrays.
X = ml_matrix.drop(columns='age').values.astype('float')
y = ml_matrix['age'].values.astype('float')

In [None]:
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('linear regression', 'auto_harm_topmri_linr', LinearRegression(), ml_matrix, X, y)

In [None]:
linr_k_frame

In [None]:
linr_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_linr_k_frame.csv')

In [None]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

In [None]:
linr_y_frame

In [None]:
linr_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_linr_y_frame.csv')

In [None]:
linr = models[0]
linr[0]

In [None]:
# Make sure the folder for saved models exists. `exist_ok=True` replaces the
# separate exists-check, so there is no window between checking and creating,
# and it matches how the output folder is created earlier in this notebook.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'auto_harm_topmri_EDIS_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'auto_harm_topmri_EDIS_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'auto_harm_topmri_EDIS_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'auto_harm_topmri_EDIS_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'auto_harm_topmri_EDIS_linr4.sav'))

In [None]:
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('lasso regression', 'auto_harm_topmri_llreg',  linear_model.LassoLars(alpha=0.01), ml_matrix, X, y)
llreg_k_frame

In [None]:
llreg_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_llreg_k_frame.csv')

In [None]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

In [None]:
llreg_y_frame

In [None]:
llreg_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_llreg_y_frame.csv')

In [None]:
llreg = models[0]
llreg[0]

In [None]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'auto_harm_topmri_EDIS_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'auto_harm_topmri_EDIS_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'auto_harm_topmri_EDIS_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'auto_harm_topmri_EDIS_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'auto_harm_topmri_EDIS_llreg4.sav'))

In [None]:
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('decision tree', 'auto_harm_topmri_dtree',  tree.DecisionTreeRegressor(), ml_matrix, X, y)
dtree_k_frame

In [None]:
dtree_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_dtree_k_frame.csv')

In [None]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

In [None]:
dtree_y_frame

In [None]:
dtree_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_dtree_y_frame.csv')

In [None]:
dtree = models[0]
dtree[0]

In [None]:
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('MLP regression', 'auto_harm_topmri_regr',   MLPRegressor(random_state=1, max_iter=700), ml_matrix, X, y)
regr_k_frame

In [None]:
regr_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_regr_k_frame.csv')

In [None]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

In [None]:
regr_y_frame

In [None]:
regr_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_regr_y_frame.csv')

In [None]:
regr = models[0]
regr[0]

### Note: I'm not actually saving the MLP or SVR models here because they are not the best performers. If necessary, this can be added.

In [None]:
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('support vector reg poly2', 'auto_harm_topmri_svrp2',   SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2), ml_matrix, X, y)
svrp2_k_frame

In [None]:
svrp2_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_svrp2_k_frame.csv')

In [None]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

In [None]:
svrp2_y_frame

In [None]:
svrp2_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_svrp2_y_frame.csv')

In [None]:
svrp2 = models[0]
svrp2[0]

In [None]:
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('elasticnetCV', 'auto_harm_topmri_eregr',  ElasticNetCV(cv=5, random_state=12), ml_matrix, X, y)
eregr_k_frame

In [None]:
eregr_k_frame.to_csv(output_folder + '/auto_harmonized_topmri_eregr_k_frame.csv')

In [None]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

In [None]:
eregr_y_frame

In [None]:
eregr_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_EDIS_eregr_y_frame.csv')

In [None]:
eregr = models[0]
eregr[0]

In [None]:
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split('extra trees', 'auto_harm_topmri_etreg',  ExtraTreesRegressor(n_estimators=100, random_state=0), ml_matrix, X, y)
etreg_k_frame

In [None]:
etreg_k_frame.to_csv(output_folder + '/auto_haromized_topmri_EDIS_etreg_k_frame.csv')

In [None]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

In [None]:
etreg_y_frame

In [None]:
etreg_y_frame.to_csv(output_folder + '/auto_harmonized_topmri_EDIS_etreg_y_frame.csv')

In [None]:
etreg = models[0]
etreg[0]

In [None]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'auto_harm_topmri_EDIS_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'auto_harm_topmri_EDIS_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'auto_harm_topmri_EDIS_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'auto_harm_topmri_EDIS_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'auto_harm_topmri_EDIS_etreg4.sav'))

Show results ON AVERAGE for each model

In [None]:
topmri_based_auto_harmonized_on_testtopmri =pd.concat([avg_linr,
                   avg_llreg,
                   avg_dtree,
                   avg_regr,
                   avg_svrp2,
                   avg_eregr,
                   avg_etreg],
                  axis=0)
topmri_based_auto_harmonized_on_testtopmri

In [None]:
topmri_based_auto_harmonized_on_testtopmri.to_csv(output_folder + '/topmri_based_auto_harmonized_on_testtopmri_AVERAGES.csv')

## Now we will build models based on the whole autocombat-harmonized TOPMRI dataset and apply them to EDIS.

In [None]:
ml_matrix = TOPMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [None]:
# now we decide to do no test train split, rather take all
X_train = X
y_train = y

In [None]:
TOPMRIlinr = LinearRegression()
TOPMRIlinr.fit(X_train, y_train)

In [None]:
TOPMRIllreg = linear_model.LassoLars(alpha=0.01)
TOPMRIllreg.fit(X_train, y_train)

In [None]:
TOPMRIeregr = ElasticNetCV(cv=5, random_state=17)
TOPMRIeregr.fit(X_train, y_train)


In [None]:
TOPMRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPMRIetreg.fit(X_train, y_train)

In [None]:
##  Save these four best models

In [None]:

## optional save models
#joblib.dump(TOPMRIlinr, ('../result_models/' + 'auto_harm_topmri_EDIS_linr.sav'))
#joblib.dump(TOPMRIllreg, ('../result_models/'+ 'auto_harm_topmri_EDIS_llreg1.sav'))
#joblib.dump(TOPMRIeregr, ('../result_models/'+ 'auto_harm_topmri_EDIS_eregr3.sav'))
#joblib.dump(TOPMRIetreg, ('../result_models/'+ 'auto_harm_topmri_EDIS_etreg4.sav'))

# Running whole TOPMRI model over EDIS dataset

In [None]:
EDIS_ml_matrix = EDIS.drop('participant_id', axis=1)
X_EDIS = EDISml_matrix.drop('age', axis =1)
X_EDIS = X_EDISvalues
X_EDIS = X_EDIS.astype('float')
y_EDIS= EDIS_ml_matrix['age'].values
y_EDIS=y_EDIS.astype('float')

In [None]:
X_EDIS_test = X_EDIS
y_EDIS_test = y_EDIS


In [None]:
y_EDIS_pred = TOPMRIlinr.predict(X_EDIS_test)

In [None]:
data= [[
    'linear regression',
    'auto_harm_topmri_EDIS_linr.sav',
    mean_absolute_error(y_EDIS_test, y_EDIS_pred),
    TOPMRIlinr.score(X_EDIS_test,y_EDIS_test),
    metrics.explained_variance_score(y_EDIS_test, y_EDISpred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [None]:
linr_y_test = y_EDIStest
linr_y_pred = y_EDISpred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
#linr_compare = linr_compare.reset_index()
linr_compare

In [None]:
linr_compare.to_csv(output_folder + '/whole_auto_harm_topmri_EDIS_linr_compare_on_top.csv')

In [None]:
y_EDIS_pred = TOPMRIllreg.predict(X_EDIS_test)

In [None]:
data= [[
    'lasso regression',
    'auto_harm_whole_topmri_EDIS_llreg.sav',
    mean_absolute_error(y_EDIS_test, y_EDISpred),
    TOPMRIllreg.score(X_EDIS_test,y_EDIS_test),
    metrics.explained_variance_score(y_EDIS_test, y_EDIS_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

In [None]:
llreg_y_test = y_EDIS_test
llreg_y_pred = y_EDIS_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
llreg_compare

In [None]:
llreg_compare.to_csv(output_folder + '/whole_auto_harm_topmri_EDIS_llreg_compare_on_top.csv')

In [None]:
y_EDIS_pred = TOPMRIeregr.predict(X_EDIS_test)

In [None]:
data= [[
    'elasticnetCV',
    'auto_harm_whole_topmri_EDISlinr.sav',
    mean_absolute_error(y_EDIStest, y_sEDISpred),
    TOPMRIeregr.score (X_EDIStest, y_EDIS_test),
    metrics.explained_variance_score(y_EDIStest, y_EDIS_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [None]:
eregr_y_test = y_EDIS_test
eregr_y_pred = y_EDIS_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

In [None]:
eregr_compare.to_csv(output_folder + '/whole_auto_harm_topmri_EDIS_eregr_compare_on_top.csv')

In [None]:
y_EDIS_pred = TOPMRIetreg.predict(X_EDIStest)

In [None]:
data= [[
    'extra trees',
    'auto_harm_topmri_EDIS_linr.sav',
    mean_absolute_error(y_EDIS_test, y_EDIS_pred),
    TOPMRIetreg.score(X_EDIS_test,y_EDIS_test),
    metrics.explained_variance_score(y_EDIS_test, y_EDISpred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [None]:
etreg_y_test = y_EDIS_test
etreg_y_pred = y_EDIS_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
#etreg_compare = etreg_compare.reset_index()
etreg_compare

In [None]:
etreg_compare.to_csv(output_folder + '/whole_auto_harm_topmri_EDIS_etreg_compare_on_top.csv')

In [None]:
topmri_based_auto_combat_harmonized_on_EDIS =pd.concat([linr_results,
                   llreg_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
topmri_based_auto_combat_harmonized_on_EDIS

In [None]:
topmri_based_auto_combat_harmonized_on_EDIS.to_csv(output_folder + '/topmri_based_auto_combat_harmonized_on_EDIS.csv')