# ML testing: experiment #2b - harmonized with open nested ComBat (TOPMRI on SABRE)

This notebook contains testing for the MRI conference abstract. It builds models on the open nested ComBat harmonized StrokeMRI+TOP (TOPMRI) dataset and shows how they perform on SABRE.

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [None]:
# Both harmonized tables are read from the open nested ComBat results folder.
filepath_topmri = 'harmonizations/harm_results/open_nested_combat/'
filepath_sabre = 'harmonizations/harm_results/open_nested_combat/'

# Full CSV paths for the TOPMRI and SABRE tables.
filename_topmri = os.path.join(filepath_topmri, 'topmri_opn_harmonized_to_sab.csv')
filename_sabre = os.path.join(filepath_sabre, 'sabre_opn_harmonized.csv')


In [None]:
# Read both harmonized CSV files into DataFrames (TOPMRI first, then SABRE).
TOPMRI, SABRE = (
    pd.read_csv(csv_path) for csv_path in (filename_topmri, filename_sabre)
)

In [None]:
# Remove the first column of each table (selected by position), then preview
# the first three SABRE rows.
# NOTE(review): presumably the first column is a leftover index column written
# when the harmonized CSVs were saved — confirm against the harmonization step.
TOPMRI = TOPMRI.drop(TOPMRI.columns[0],axis=1)
SABRE = SABRE.drop(SABRE.columns[0],axis=1)
SABRE.head(3)

In [None]:
TOPMRI.head(3)

In [None]:
# # Now we need to flip the sex back to numbers for a correlation
# sex_mapping = {'F':0,'M':1}
# TOPMRI = TOPMRI.assign(sex = TOPMRI.sex.map(sex_mapping))
# TOPMRI.head(3)

In [None]:
# SABRE = SABRE.assign(sex = SABRE.sex.map(sex_mapping))
# SABRE.head(3)

## Build ML models based on open nested combat StrokeMRI-TOP mixed set (TOPMRI)

In [None]:
# Modelling table: everything in TOPMRI except the participant identifier.
ml_matrix = TOPMRI.drop('participant_id', axis=1)
# Features are all remaining columns but 'age'; the target is 'age'.
# Both are converted to float NumPy arrays.
X = ml_matrix.drop('age', axis=1).values.astype('float')
y = ml_matrix['age'].values.astype('float')

In [None]:
# Run linear regression through the project's shuffle-split helper. The three
# returned objects are bound as the k-fold frame, the y frame and the models
# (their exact contents are defined in cvasl.seperated, not in this notebook).
linr_k_frame, linr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'linear regression',
    'open_harm_topmri_linr',
    LinearRegression(),
    ml_matrix,
    X,
    y,
)

In [None]:
linr_k_frame

In [None]:
linr_k_frame.to_csv('open_harmonized_topmri_linr_k_frame.csv')

In [None]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

In [None]:
linr_y_frame

In [None]:
linr_y_frame.to_csv('open_harmonized_topmri_linr_y_frame.csv')

In [None]:
linr = models[0]
linr[0]

In [None]:
# Make sure the folder used for saved models exists. `exist_ok=True` turns the
# call into a no-op when the directory is already there and removes the gap
# between a separate existence check and the creation.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'open_harm_topmri_sabre_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'open_harm_topmri_sabre_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'open_harm_topmri_sabre_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'open_harm_topmri_sabre_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'open_harm_topmri_sabre_linr4.sav'))

In [None]:
# Run LassoLars (alpha=0.01) through the project's shuffle-split helper, then
# display the returned k-fold frame.
llreg_k_frame, llreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'lasso regression',
    'open_harm_topmri_llreg',
    linear_model.LassoLars(alpha=0.01),
    ml_matrix,
    X,
    y,
)
llreg_k_frame

In [None]:
llreg_k_frame.to_csv('open_harmonized_topmri_llreg_k_frame.csv')

In [None]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

In [None]:
llreg_y_frame

In [None]:
llreg_y_frame.to_csv('open_harmonized_topmri_llreg_y_frame.csv')

In [None]:
llreg = models[0]
llreg[0]

In [None]:
## optional save models
#joblib.dump(llreg[0], ('../result_models/'+ 'open_harm_topmri_sabre_llreg0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'open_harm_topmri_sabre_llreg1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'open_harm_topmri_sabre_llreg2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'open_harm_topmri_sabre_llreg3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'open_harm_topmri_sabre_llreg4.sav'))

In [None]:
# Run a default DecisionTreeRegressor through the project's shuffle-split
# helper, then display the returned k-fold frame.
dtree_k_frame, dtree_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'decision tree',
    'open_harm_topmri_dtree',
    tree.DecisionTreeRegressor(),
    ml_matrix,
    X,
    y,
)
dtree_k_frame

In [None]:
dtree_k_frame.to_csv('open_harmonized_topmri_dtree_k_frame.csv')

In [None]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

In [None]:
dtree_y_frame

In [None]:
dtree_y_frame.to_csv('open_harmonized_topmri_dtree_y_frame.csv')

In [None]:
dtree = models[0]
dtree[0]

In [None]:
# Run an MLPRegressor (random_state=1, max_iter=700) through the project's
# shuffle-split helper, then display the returned k-fold frame.
regr_k_frame, regr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'MLP regression',
    'open_harm_topmri_regr',
    MLPRegressor(random_state=1, max_iter=700),
    ml_matrix,
    X,
    y,
)
regr_k_frame

In [None]:
regr_k_frame.to_csv('open_harmonized_topmri_regr_k_frame.csv')

In [None]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

In [None]:
regr_y_frame

In [None]:
regr_y_frame.to_csv('open_harmonized_topmri_regr_y_frame.csv')

In [None]:
regr = models[0]
regr[0]

### Note: I'm not actually saving the MLP or SVR models here because they are not the best performers. If necessary, this can be added.

In [None]:
# Run a degree-2 polynomial-kernel SVR (C=1.0, epsilon=0.2) through the
# project's shuffle-split helper, then display the returned k-fold frame.
svrp2_k_frame, svrp2_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'support vector reg poly2',
    'open_harm_topmri_svrp2',
    SVR(C=1.0, kernel='poly', degree=2, epsilon=0.2),
    ml_matrix,
    X,
    y,
)
svrp2_k_frame

In [None]:
svrp2_k_frame.to_csv('open_harmonized_topmri_svrp2_k_frame.csv')

In [None]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

In [None]:
svrp2_y_frame

In [None]:
svrp2_y_frame.to_csv('open_harmonized_topmri_svrp2_y_frame.csv')

In [None]:
svrp2 = models[0]
svrp2[0]

In [None]:
# Run ElasticNetCV (cv=5, random_state=12) through the project's shuffle-split
# helper, then display the returned k-fold frame.
eregr_k_frame, eregr_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'elasticnetCV',
    'open_harm_topmri_eregr',
    ElasticNetCV(cv=5, random_state=12),
    ml_matrix,
    X,
    y,
)
eregr_k_frame

In [None]:
eregr_k_frame.to_csv('open_harmonized_topmri_eregr_k_frame.csv')

In [None]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

In [None]:
eregr_y_frame

In [None]:
# Write the ElasticNetCV y frame to CSV in the current working directory.
# NOTE(review): this file name contains '_sabre_' while the matching k_frame
# export ('open_harmonized_topmri_eregr_k_frame.csv') does not — confirm which
# name downstream readers expect before renaming.
eregr_y_frame.to_csv('open_harmonized_topmri_sabre_eregr_y_frame.csv')

In [None]:
eregr = models[0]
eregr[0]

In [None]:
# Run an ExtraTreesRegressor (100 estimators, random_state=0) through the
# project's shuffle-split helper, then display the returned k-fold frame.
etreg_k_frame, etreg_y_frame, models = sep.stratified_cat_and_cont_categories_shuffle_split(
    'extra trees',
    'open_harm_topmri_etreg',
    ExtraTreesRegressor(n_estimators=100, random_state=0),
    ml_matrix,
    X,
    y,
)
etreg_k_frame

In [None]:
# Write the extra trees k-fold frame to CSV in the current working directory.
# The file name is spelled 'harmonized' so it lines up with the y_frame export
# of the same model ('open_harmonized_topmri_sabre_etreg_y_frame.csv').
etreg_k_frame.to_csv('open_harmonized_topmri_sabre_etreg_k_frame.csv')

In [None]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

In [None]:
etreg_y_frame

In [None]:
etreg_y_frame.to_csv('open_harmonized_topmri_sabre_etreg_y_frame.csv')

In [None]:
etreg = models[0]
etreg[0]

In [None]:
## optional save models
#joblib.dump(etreg[0], ('../result_models/'+ 'open_harm_topmri_sabre_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'open_harm_topmri_sabre_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'open_harm_topmri_sabre_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'open_harm_topmri_sabre_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'open_harm_topmri_sabre_etreg4.sav'))

Show results ON AVERAGE for each model

In [None]:
# Stack the averaged result of every model row-wise into one summary table,
# in the order the models were run, and display it.
averaged_frames = [
    avg_linr,
    avg_llreg,
    avg_dtree,
    avg_regr,
    avg_svrp2,
    avg_eregr,
    avg_etreg,
]
topmri_based_open_harmonized_on_testtopmri = pd.concat(averaged_frames, axis=0)
topmri_based_open_harmonized_on_testtopmri

In [None]:
topmri_based_open_harmonized_on_testtopmri.to_csv('topmri_based_open_harmonized_on_testtopmri_AVERAGES.csv')

## Now we will build models based on the whole open nested ComBat harmonized TOPMRI dataset, and apply them to SABRE.

In [None]:
# Rebuild the modelling table from the full TOPMRI set (participant id removed).
ml_matrix = TOPMRI.drop('participant_id', axis=1)
# Features: every column except 'age'; target: 'age'. Both as float arrays.
X = ml_matrix.drop('age', axis=1).values.astype('float')
y = ml_matrix['age'].values.astype('float')


In [None]:
# No train/test split here: the complete TOPMRI arrays are used for training.
X_train, y_train = X, y

In [None]:
TOPMRIlinr = LinearRegression()
TOPMRIlinr.fit(X_train, y_train)

In [None]:
TOPMRIllreg = linear_model.LassoLars(alpha=0.01)
TOPMRIllreg.fit(X_train, y_train)

In [None]:
# ElasticNetCV (cv=5) fitted on the full TOPMRI training arrays.
# NOTE(review): random_state is 17 here but 12 for the ElasticNetCV used in the
# shuffle-split run earlier in this notebook — confirm this is intentional.
TOPMRIeregr = ElasticNetCV(cv=5, random_state=17)
TOPMRIeregr.fit(X_train, y_train)


In [None]:
TOPMRIetreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
TOPMRIetreg.fit(X_train, y_train)

In [None]:
##  Save these four best models

In [None]:

## optional save models
#joblib.dump(TOPMRIlinr, ('../result_models/' + 'open_harm_topmri_sabre_linr.sav'))
#joblib.dump(TOPMRIllreg, ('../result_models/'+ 'open_harm_topmri_sabre_llreg1.sav'))
#joblib.dump(TOPMRIeregr, ('../result_models/'+ 'open_harm_topmri_sabre_eregr3.sav'))
#joblib.dump(TOPMRIetreg, ('../result_models/'+ 'open_harm_topmri_sabre_etreg4.sav'))

# Running whole TOPMRI models over the SABRE dataset

In [None]:
# SABRE modelling table: all columns except the participant identifier.
sabre_ml_matrix = SABRE.drop('participant_id', axis=1)
# Features: every column except 'age'; target: 'age'. Both as float arrays.
X_sabre = sabre_ml_matrix.drop('age', axis=1).values.astype('float')
y_sabre = sabre_ml_matrix['age'].values.astype('float')

In [None]:
# The whole SABRE set serves as the test set.
X_sabre_test, y_sabre_test = X_sabre, y_sabre


In [None]:
y_sabre_pred = TOPMRIlinr.predict(X_sabre_test)

In [None]:
# One-row summary of the whole-TOPMRI linear regression evaluated on SABRE:
# MAE, the estimator's own score (stored as 'r2') and explained variance.
linr_mae = mean_absolute_error(y_sabre_test, y_sabre_pred)
linr_r2 = TOPMRIlinr.score(X_sabre_test, y_sabre_test)
linr_explained_variance = metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
data = [[
    'linear regression',
    'open_harm_topmri_sabre_linr.sav',
    linr_mae,
    linr_r2,
    linr_explained_variance,
]]
linr_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
#linr_results

In [None]:
# Side-by-side table of the real SABRE ages and the linear regression
# predictions, displayed at the end of the cell.
linr_y_test, linr_y_pred = y_sabre_test, y_sabre_pred
linr_compare = pd.DataFrame({
    'y_test_real_age': linr_y_test,
    'linr_y_pred_age': linr_y_pred,
})
#linr_compare = linr_compare.reset_index()
linr_compare

In [None]:
linr_compare.to_csv('whole_open_harm_topmri_sabre_linr_compare_on_top.csv')

In [None]:
y_sabre_pred = TOPMRIllreg.predict(X_sabre_test)

In [None]:
# One-row summary of the whole-TOPMRI lasso model evaluated on SABRE:
# MAE, the estimator's own score (stored as 'r2') and explained variance.
llreg_mae = mean_absolute_error(y_sabre_test, y_sabre_pred)
llreg_r2 = TOPMRIllreg.score(X_sabre_test, y_sabre_test)
llreg_explained_variance = metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
data = [[
    'lasso regression',
    'open_harm_whole_topmri_sabre_llreg.sav',
    llreg_mae,
    llreg_r2,
    llreg_explained_variance,
]]
llreg_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
llreg_results

In [None]:
# Side-by-side table of the real SABRE ages and the lasso predictions.
# The prediction column is named after the lasso model ('llreg_...') so it is
# not mistaken for linear regression output when the CSVs are compared.
llreg_y_test = y_sabre_test
llreg_y_pred = y_sabre_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'llreg_y_pred_age': llreg_y_pred,
    })
llreg_compare

In [None]:
llreg_compare.to_csv('whole_open_harm_topmri_sabre_llreg_compare_on_top.csv')

In [None]:
y_sabre_pred = TOPMRIeregr.predict(X_sabre_test)

In [None]:
# One-row summary of the whole-TOPMRI ElasticNetCV model evaluated on SABRE:
# MAE, the estimator's own score (stored as 'r2') and explained variance.
# The file_name label names the eregr model rather than the linr one, so each
# row of the combined results table points at its own algorithm.
data = [[
    'elasticnetCV',
    'open_harm_whole_topmri_sabre_eregr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    TOPMRIeregr.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'])
#eregr_results

In [None]:
# Side-by-side table of the real SABRE ages and the ElasticNetCV predictions.
# The prediction column is named after the ElasticNetCV model ('eregr_...') so
# it is not mistaken for linear regression output when the CSVs are compared.
eregr_y_test = y_sabre_test
eregr_y_pred = y_sabre_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'eregr_y_pred_age': eregr_y_pred,
    })
#eregr_compare = eregr_compare.reset_index()
eregr_compare

In [None]:
eregr_compare.to_csv('whole_open_harm_topmri_sabre_eregr_compare_on_top.csv')

In [None]:
y_sabre_pred = TOPMRIetreg.predict(X_sabre_test)

In [None]:
# One-row summary of the whole-TOPMRI extra trees model evaluated on SABRE:
# MAE, the estimator's own score (stored as 'r2') and explained variance.
# The file_name label names the etreg model rather than the linr one, so each
# row of the combined results table points at its own algorithm.
data = [[
    'extra trees',
    'open_harm_topmri_sabre_etreg.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    TOPMRIetreg.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'])
#etreg_results

In [None]:
# Side-by-side table of the real SABRE ages and the extra trees predictions,
# displayed at the end of the cell.
etreg_y_test, etreg_y_pred = y_sabre_test, y_sabre_pred
etreg_compare = pd.DataFrame({
    'y_test_real_age': etreg_y_test,
    'etreg_y_pred_age': etreg_y_pred,
})
#etreg_compare = etreg_compare.reset_index()
etreg_compare

In [None]:
etreg_compare.to_csv('whole_open_harm_topmri_sabre_etreg_compare_on_top.csv')

In [None]:
# Stack the four one-row result tables row-wise into a single SABRE summary
# and display it.
sabre_result_frames = [
    linr_results,
    llreg_results,
    eregr_results,
    etreg_results,
]
topmri_based_open_combat_harmonized_on_sabre = pd.concat(sabre_result_frames, axis=0)
topmri_based_open_combat_harmonized_on_sabre

In [None]:
topmri_based_open_combat_harmonized_on_sabre.to_csv('topmri_based_open_combat_harmonized_on_sabre.csv')