# ML testing: experiment #10

This notebook contains tests for the MRI conference abstract. It shows models based on the TOP dataset after harmonization. Harmonization was done with open nested ComBat, where TOP, StrokeMRI and SABRE are each treated as a different "batch".

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys

import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
#import ipywidgets as widgets
#import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [None]:
# Folder with the harmonized CSV exports, one file per cohort.
filepath = '../open_work/internal_results/harmonized_pvc2s/open3_harm/'
filename_top = os.path.join(filepath, 'top_opn_harmonized.csv')
filename_mri = os.path.join(filepath, 'mri_opn_harmonized.csv')
filename_sabre = os.path.join(filepath, 'sabre_opn_harmonized.csv')

In [None]:
# Load each harmonized cohort into its own DataFrame (TOP, StrokeMRI, SABRE).
TOP, StrokeMRI, SABRE = (
    pd.read_csv(csv_path)
    for csv_path in (filename_top, filename_mri, filename_sabre)
)

In [None]:
# Discard the first column (presumably a saved row index -- confirm against the CSV).
# NOTE: re-running this cell drops one more column each time.
TOP = TOP.drop(columns=TOP.columns[0])
# TOP.head(3)

In [None]:
# Discard the first column (presumably a saved row index -- confirm against the CSV).
# NOTE: re-running this cell drops one more column each time.
StrokeMRI = StrokeMRI.drop(columns=StrokeMRI.columns[0])
# StrokeMRI.head(3)

In [None]:
# Discard the first column (presumably a saved row index -- confirm against the CSV).
# NOTE: re-running this cell drops one more column each time.
SABRE = SABRE.drop(columns=SABRE.columns[0])
# SABRE.head(3)

## Build ML models

In [None]:
# Build the float feature matrix X (every column except the id and age)
# and the float target vector y (age) from the TOP table.
ml_matrix = TOP.drop(columns='participant_id')
X = ml_matrix.drop(columns='age').values.astype('float')
y = ml_matrix['age'].values.astype('float')


In [None]:
# Hold out 25% of TOP as the test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [None]:
# Linear regression fitted on the TOP training split.
linr = LinearRegression().fit(X_train, y_train)
linr

In [None]:
# Predict age on the held-out TOP test split with the linear model.
y_pred = linr.predict(X_test)

In [None]:
# Report R2, explained variance and MAE of the linear model on the TOP test split.
# The MAE format was '% .3f'; the space flag inserted an extra blank before the
# number, so it is now '%.3f' like the other metric lines.
print('R2 score Linear regression: %.3f' % linr.score(X_test, y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
# One-row summary of the linear model's metrics on the TOP test split.
data = [[
    'linear regression',
    'opn_harm3_top_linr.sav',
    mean_absolute_error(y_test, y_pred),
    linr.score(X_test, y_test),
    metrics.explained_variance_score(y_test, y_pred),
]]
linr_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
# linr_results

In [None]:
# True vs. predicted age for the linear model; reset_index() adds the
# 'index' column that the later merges join on.
linr_y_test, linr_y_pred = y_test, y_pred
linr_compare = pd.DataFrame({
    'y_test_real_age': linr_y_test,
    'linr_y_pred_age': linr_y_pred,
}).reset_index()
# linr_compare

In [None]:
# LassoLars model (alpha=0.01) fitted on the TOP training split.
llreg = linear_model.LassoLars(alpha=0.01).fit(X_train, y_train)
llreg

In [None]:
# Predict on the TOP test split with the LassoLars model and report its metrics.
y_pred = llreg.predict(X_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_test, y_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_test, y_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_test, y_pred)
)

In [None]:
# One-row summary of the LassoLars model's metrics on the TOP test split.
data = [[
    'lasso regression',
    'opn_harm3_top_llreg.sav',
    mean_absolute_error(y_test, y_pred),
    llreg.score(X_test, y_test),
    metrics.explained_variance_score(y_test, y_pred),
]]
llreg_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
# llreg_results

In [None]:
# True vs. predicted age for the LassoLars model.
# The prediction column was labelled 'linr_y_pred_age' (copy-paste from the
# linear-regression cell). With that label it collides with linr_compare in the
# later merge on 'index', pandas appends '_x'/'_y' suffixes, and the Lasso
# predictions end up in a '_y' column instead of under their own name.
llreg_y_test = y_test
llreg_y_pred = y_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'llreg_y_pred_age': llreg_y_pred,
    })
# reset_index() adds the 'index' column that the later merges join on.
llreg_compare = llreg_compare.reset_index()
#llreg_compare

In [None]:
# Decision-tree regressor fitted on the TOP training split.
# NOTE(review): no random_state is set here, unlike the other seeded models,
# so repeated runs may give different trees -- confirm this is intended.
dtree = tree.DecisionTreeRegressor().fit(X_train, y_train)
dtree

In [None]:
# Predict on the TOP test split with the decision tree and report its metrics.
y_pred = dtree.predict(X_test)
print('R2 score dtree regression: %.3f' % dtree.score(X_test, y_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_test, y_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_test, y_pred)
)

In [None]:
# One-row summary of the decision tree's metrics on the TOP test split.
data = [[
    'decision tree',
    'opn_harm3_top_dtree.sav',
    mean_absolute_error(y_test, y_pred),
    dtree.score(X_test, y_test),
    metrics.explained_variance_score(y_test, y_pred),
]]
dtree_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
dtree_results

In [None]:
# True vs. predicted age for the decision tree; reset_index() adds the
# 'index' column that the later merges join on.
dtree_y_test, dtree_y_pred = y_test, y_pred
dtree_compare = pd.DataFrame({
    'y_test_real_age': dtree_y_test,
    'dtree_y_pred_age': dtree_y_pred,
}).reset_index()
# dtree_compare

In [None]:
# Multi-layer perceptron regressor (seeded, up to 900 iterations) fitted on the TOP training split.
regr = MLPRegressor(random_state=1, max_iter=900).fit(X_train, y_train)
regr

In [None]:
# Predict age on the held-out TOP test split with the MLP regressor.
y_pred = regr.predict(X_test)

In [None]:
# Report R2, explained variance and MAE of the MLP regressor on the TOP test split.
# The MAE format was '% .3f'; the space flag inserted an extra blank before the
# number, so it is now '%.3f' like the other metric lines.
print('R2 score neural network mlp regression: %.3f' % regr.score(X_test, y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
# One-row summary of the MLP regressor's metrics on the TOP test split.
data = [[
    'multi-level perceptron',
    'opn_harm3_top_regr.sav',
    mean_absolute_error(y_test, y_pred),
    regr.score(X_test, y_test),
    metrics.explained_variance_score(y_test, y_pred),
]]
regr_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
regr_results

In [None]:
# True vs. predicted age for the MLP regressor.
# The prediction column was labelled 'dtree_y_pred_age' (copy-paste from the
# decision-tree cell), which mislabels the MLP predictions and would collide
# with dtree_compare if the two tables were merged on 'index'.
regr_y_test = y_test
regr_y_pred = y_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'regr_y_pred_age': regr_y_pred,
    })
# reset_index() adds the 'index' column that merges join on.
regr_compare = regr_compare.reset_index()
#regr_compare

In [None]:
# Support-vector regressor with a 2nd-degree polynomial kernel, fitted on the TOP training split.
svr_p2 = SVR(C=1.0, kernel='poly', degree=2, epsilon=0.2).fit(X_train, y_train)
svr_p2

In [None]:
# Predict age on the held-out TOP test split with the polynomial-kernel SVR.
y_pred = svr_p2.predict(X_test)

In [None]:
# Report R2, explained variance and MAE of the polynomial SVR on the TOP test split.
# The MAE format was '% .3f'; the space flag inserted an extra blank before the
# number, so it is now '%.3f' like the other metric lines.
print('R2 score SVR 2nd degree poly kernel regression: %.3f' % svr_p2.score(X_test, y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
# One-row summary of the polynomial SVR's metrics on the TOP test split.
data = [[
    'svr pol2',
    'opn_harm3_top_svrp2.sav',
    mean_absolute_error(y_test, y_pred),
    svr_p2.score(X_test, y_test),
    metrics.explained_variance_score(y_test, y_pred),
]]
svr_p2_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
svr_p2_results

In [None]:
# True vs. predicted age for the polynomial-kernel SVR.
# The prediction column was labelled 'dtree_y_pred_age' (copy-paste from the
# decision-tree cell). That mislabels the SVR predictions and makes the column
# collide with the other tables when they are merged on 'index'.
svrp2_y_test = y_test
svrp2_y_pred = y_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age':  svrp2_y_test,
     'svrp2_y_pred_age': svrp2_y_pred,
    })
# reset_index() adds the 'index' column that the later merges join on.
svrp2_compare = svrp2_compare.reset_index()
#svrp2_compare

In [None]:
# ElasticNet with 5-fold cross-validation (seeded), fitted on the TOP training split.
eregr = ElasticNetCV(cv=5, random_state=12).fit(X_train, y_train)
eregr

In [None]:
# Predict on the TOP test split with ElasticNetCV and report its metrics.
# The MAE format was '% .3f'; the space flag inserted an extra blank before the
# number, so it is now '%.3f' like the other metric lines.
y_pred = eregr.predict(X_test)
print('R2 score elasticnetcv regression: %.3f' % eregr.score(X_test, y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
# One-row summary of the ElasticNetCV model's metrics on the TOP test split.
data = [[
    'elasticnetCV',
    'opn_harm3_top_eregr.sav',
    mean_absolute_error(y_test, y_pred),
    eregr.score(X_test, y_test),
    metrics.explained_variance_score(y_test, y_pred),
]]
eregr_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
eregr_results

In [None]:
# True vs. predicted age for the ElasticNetCV model.
# The prediction column was labelled 'dtree_y_pred_age' (copy-paste from the
# decision-tree cell). That mislabels the ElasticNet predictions and makes the
# column collide with the other tables when they are merged on 'index'.
eregr_y_test = y_test
eregr_y_pred = y_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'eregr_y_pred_age': eregr_y_pred,
    })
# reset_index() adds the 'index' column that the later merges join on.
eregr_compare = eregr_compare.reset_index()
#eregr_compare

In [None]:
# Extra-trees ensemble (100 trees, seeded) fitted on the TOP training split.
etreg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
etreg

In [None]:
# Predict on the TOP test split with the extra-trees model and report its metrics.
# The MAE format was '% .3f'; the space flag inserted an extra blank before the
# number, so it is now '%.3f' like the other metric lines.
y_pred = etreg.predict(X_test)
print('R2 score extra trees regression: %.3f' % etreg.score(X_test, y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred))
print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
# One-row summary of the extra-trees model's metrics on the TOP test split.
data = [[
    'extra trees',
    'opn_harm3_top_etreg.sav',
    mean_absolute_error(y_test, y_pred),
    etreg.score(X_test, y_test),
    metrics.explained_variance_score(y_test, y_pred),
]]
etreg_results = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
etreg_results

In [None]:
# True vs. predicted age for the extra-trees model.
# The prediction column was labelled 'dtree_y_pred_age' (copy-paste from the
# decision-tree cell). That mislabels the extra-trees predictions and makes the
# column collide with the other tables when they are merged on 'index'.
etreg_y_test = y_test
etreg_y_pred = y_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
# reset_index() adds the 'index' column that the later merges join on.
etreg_compare = etreg_compare.reset_index()
#etreg_compare

In [None]:
# Stack the per-model metric rows into one table for the TOP test split.
top_based_opn3harmonized_on_top = pd.concat(
    [
        linr_results,
        llreg_results,
        dtree_results,
        regr_results,
        svr_p2_results,
        eregr_results,
        etreg_results,
    ],
    axis=0,
)
top_based_opn3harmonized_on_top

In [None]:
def _outer_join_on_index(left, right):
    """Outer-join two comparison tables on their shared 'index' column."""
    return pd.merge(left, right, on=["index"], how='outer')


# First group of comparison tables. regr_compare (MLP) is not part of either group.
data_frames1 = [linr_compare, llreg_compare, dtree_compare]
real_versus_projected_y1 = reduce(_outer_join_on_index, data_frames1)

# Second group of comparison tables.
data_frames2 = [eregr_compare, svrp2_compare, etreg_compare]
real_versus_projected_y2 = reduce(_outer_join_on_index, data_frames2)

# sep.drop_y presumably removes the '_y'-suffixed duplicate columns created
# by the merges -- confirm in cvasl.seperated.
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2)
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1)

In [None]:
# Join both groups of comparison tables into one wide table and preview it.
# NOTE(review): the name says 'mixed_on_mixed' although these are TOP-trained
# models scored on TOP -- possibly carried over from another notebook; confirm.
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2]
real_versus_projected_y3_mixed_on_mixed = pd.merge(
    real_versus_projected_y1,
    real_versus_projected_y2,
    on=["index"],
    how='outer',
)
real_versus_projected_y3_mixed_on_mixed.head(3)

In [None]:
## Save off models

In [None]:
# Make sure the output folder for saved models exists.
# exist_ok=True replaces the separate exists() check: the cell is safe to
# re-run and there is no gap between checking and creating.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Uncomment to persist the fitted models.
# joblib.dump(linr, ('../result_models/'+  'opn_harm3_top_linr.sav'))
# joblib.dump(llreg, ('../result_models/'+ 'opn_harm3_top_lassor.sav'))
# joblib.dump(dtree, ('../result_models/'+ 'opn_harm3_top_dtree.sav'))
# joblib.dump(regr, ('../result_models/'+ 'opn_harm3_top_regr.sav'))
# joblib.dump(svr_p2, ('../result_models/'+'opn_harm3_top_svrp2.sav'))
# joblib.dump(eregr, ('../result_models/'+ 'opn_harm3_top_elasticregr.sav'))
# joblib.dump(etreg, ('../result_models/'+ 'opn_harm3_top_extratree.sav'))

## Run models on another dataset: the MRI dataset (open nested harmonized as a different batch)

In [None]:
# Build the float feature matrix and age target from the StrokeMRI table.
mri_ml_matrix = StrokeMRI.drop(columns='participant_id')
X_mri = mri_ml_matrix.drop(columns='age').values.astype('float')
y_mri = mri_ml_matrix['age'].values.astype('float')

In [None]:
# The whole StrokeMRI table is used as the test set; no split is made.
X_mri_test, y_mri_test = X_mri, y_mri

In [None]:
# Score the TOP-trained linear model on StrokeMRI.
y_mri_pred = linr.predict(X_mri_test)
print('R2 score Linear regression: %.3f' % linr.score(X_mri_test, y_mri_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_mri_test, y_mri_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_mri_test, y_mri_pred)
)

In [None]:
# Score the TOP-trained LassoLars model on StrokeMRI.
y_mri_pred = llreg.predict(X_mri_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_mri_test, y_mri_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_mri_test, y_mri_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_mri_test, y_mri_pred)
)

In [None]:
# Score the TOP-trained decision tree on StrokeMRI.
y_mri_pred = dtree.predict(X_mri_test)
print('R2 score Decision tree: %.3f' % dtree.score(X_mri_test, y_mri_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_mri_test, y_mri_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_mri_test, y_mri_pred)
)

In [None]:
# Score the TOP-trained MLP regressor on StrokeMRI.
y_mri_pred = regr.predict(X_mri_test)
print('R2 score MLP regression: %.3f' % regr.score(X_mri_test, y_mri_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_mri_test, y_mri_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_mri_test, y_mri_pred)
)

In [None]:
# Score the TOP-trained polynomial-kernel SVR on StrokeMRI.
y_mri_pred = svr_p2.predict(X_mri_test)
print('R2 score SVR polynomial regression: %.3f' % svr_p2.score(X_mri_test, y_mri_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_mri_test, y_mri_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_mri_test, y_mri_pred)
)

In [None]:
# Score the TOP-trained ElasticNetCV model on StrokeMRI.
y_mri_pred = eregr.predict(X_mri_test)
print('R2 score ElasticNet CV : %.3f' % eregr.score(X_mri_test, y_mri_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_mri_test, y_mri_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_mri_test, y_mri_pred)
)

In [None]:
# Score the TOP-trained extra-trees model on StrokeMRI.
y_mri_pred = etreg.predict(X_mri_test)
print('R2 score extra tree regression: %.3f' % etreg.score(X_mri_test, y_mri_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_mri_test, y_mri_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_mri_test, y_mri_pred)
)

In [None]:
# plt.figure(figsize=(10,10))
# plt.scatter(y_test, y_pred, c='crimson')
# plt.yscale('log')
# plt.xscale('log')

# p1 = max(max(y_pred), max(y_test))
# p2 = min(min(y_pred), min(y_test))
# plt.plot([p1, p2], [p1, p2], 'b-')
# plt.xlabel('True Values', fontsize=15)
# plt.ylabel('Predictions', fontsize=15)
# plt.axis('equal')
# plt.show()

# Running TOP-harmonized models over the SABRE-harmonized dataset

In [None]:
# Build the float feature matrix and age target from the SABRE table.
sabre_ml_matrix = SABRE.drop(columns='participant_id')
X_sabre = sabre_ml_matrix.drop(columns='age').values.astype('float')
y_sabre = sabre_ml_matrix['age'].values.astype('float')

In [None]:
# The whole SABRE table is used as the test set; no split is made.
X_sabre_test, y_sabre_test = X_sabre, y_sabre

In [None]:
# Predict age for SABRE with the TOP-trained linear model.
y_sabre_pred = linr.predict(X_sabre_test)

In [None]:
# Report the linear model's metrics on SABRE.
print('R2 score Linear regression: %.3f' % linr.score(X_sabre_test, y_sabre_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_sabre_test, y_sabre_pred)
)

In [None]:
# One-row summary of the linear model's metrics on SABRE.
data = [[
    'Linear Reg',
    'opn_harm3_top_linr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    linr.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
linr_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
linr_results_sabre

In [None]:
# Predict age for SABRE with the TOP-trained LassoLars model.
y_sabre_pred = llreg.predict(X_sabre_test)

In [None]:
# Report the LassoLars model's metrics on SABRE.
print('R2 score Lasso regression: %.3f' % llreg.score(X_sabre_test, y_sabre_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_sabre_test, y_sabre_pred)
)

In [None]:
# One-row summary of the LassoLars model's metrics on SABRE.
# NOTE(review): the TOP results table records this model's file as
# 'opn_harm3_top_llreg.sav' -- confirm which file name is correct.
data = [[
    'Lasso',
    'opn_harm3_top_lassor.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    llreg.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
llreg_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
llreg_results_sabre

In [None]:
# Predict age for SABRE with the TOP-trained decision tree.
y_sabre_pred = dtree.predict(X_sabre_test)

In [None]:
# Report the decision tree's metrics on SABRE.
print('R2 score Decision tree: %.3f' % dtree.score(X_sabre_test, y_sabre_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_sabre_test, y_sabre_pred)
)

In [None]:
# One-row summary of the decision tree's metrics on SABRE.
# The file name was 'opn_harm3_top_svrp2.sav' (copy-paste from the SVR row);
# it now uses the decision-tree file name recorded everywhere else in the notebook.
data= [[
    'Decision tree',
    'opn_harm3_top_dtree.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    dtree.score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
dtree_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_sabre

In [None]:
# Predict age for SABRE with the TOP-trained MLP regressor.
y_sabre_pred = regr.predict(X_sabre_test)

In [None]:
# Report the MLP regressor's metrics on SABRE.
print('R2 score MLP regression: %.3f' % regr.score(X_sabre_test, y_sabre_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_sabre_test, y_sabre_pred)
)

In [None]:
# One-row summary of the MLP regressor's metrics on SABRE.
data = [[
    'MLP regression',
    'opn_harm3_top_regr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    regr.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
regr_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
regr_results_sabre

In [None]:
# Predict age for SABRE with the TOP-trained polynomial-kernel SVR.
y_sabre_pred = svr_p2.predict(X_sabre_test)

In [None]:
# Report the polynomial-kernel SVR's metrics on SABRE.
print('R2 score SVR polynomial regression: %.3f' % svr_p2.score(X_sabre_test, y_sabre_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_sabre_test, y_sabre_pred)
)

In [None]:
# One-row summary of the polynomial-kernel SVR's metrics on SABRE.
data = [[
    'Svr P2',
    'opn_harm3_top_svrp2.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    svr_p2.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
svr_p2_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
svr_p2_results_sabre

In [None]:
# Predict age for SABRE with the TOP-trained ElasticNetCV model.
y_sabre_pred = eregr.predict(X_sabre_test)

In [None]:
# Report the ElasticNetCV model's metrics on SABRE.
print('R2 score ElasticNet CV : %.3f' % eregr.score(X_sabre_test, y_sabre_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_sabre_test, y_sabre_pred)
)

In [None]:
# One-row summary of the ElasticNetCV model's metrics on SABRE.
# NOTE(review): the TOP results table records this model's file as
# 'opn_harm3_top_eregr.sav' -- confirm which file name is correct.
data = [[
    'ElasticnetCV',
    'opn_harm3_top_elasticregr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    eregr.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
eregr_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
eregr_results_sabre

In [None]:
# Predict age for SABRE with the TOP-trained extra-trees model.
y_sabre_pred = etreg.predict(X_sabre_test)

In [None]:
# Report the extra-trees model's metrics on SABRE.
print('R2 score extra tree regression: %.3f' % etreg.score(X_sabre_test, y_sabre_test))
print(
    'Explained variance score: %.3f'
    % metrics.explained_variance_score(y_sabre_test, y_sabre_pred)
)
print(
    'The mean absolute error: %.3f'
    % mean_absolute_error(y_sabre_test, y_sabre_pred)
)

In [None]:
# One-row summary of the extra-trees model's metrics on SABRE.
data = [[
    'Extra trees',
    'opn_harm3_top_etreg.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    etreg.score(X_sabre_test, y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred),
]]
etreg_results_sabre = pd.DataFrame(
    data,
    columns=['algorithm', 'file_name', 'mae', 'r2', 'explained_variance'],
)
etreg_results_sabre

In [None]:
# Stack the per-model metric rows into one table for the SABRE evaluation.
top_based_opn3harmonized_on_sabre = pd.concat(
    [
        linr_results_sabre,
        llreg_results_sabre,
        dtree_results_sabre,
        regr_results_sabre,
        svr_p2_results_sabre,
        eregr_results_sabre,
        etreg_results_sabre,
    ],
    axis=0,
)
top_based_opn3harmonized_on_sabre