# ML testing: experiment #3

This notebook contains the testing done for the MRI conference abstract. It shows models based on the mixed dataset (TOP + StrokeMRI) without harmonization, applied to the Insight46 and SABRE datasets (as well as to a test set of mixed TOP and StrokeMRI data).

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys

import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [None]:
# Locations of the cleaned CSV exports. Every dataset has its own folder
# variable so that a single dataset can be relocated without touching the
# others (all four currently point at the same folder).
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/'
filename_mri = os.path.join(filepath_mri, 'StrokeMRI_pvc2c.csv')

filepath_top = '../open_work/internal_results/cleaned_pvc2s/'
filename_top = os.path.join(filepath_top, 'TOP_pvc2c.csv')

# SABRE and Insight46 are joined to their *own* folder variables, so that
# changing filepath_sabre / filepath_insight46 actually takes effect.
filepath_sabre = '../open_work/internal_results/cleaned_pvc2s/'
filename_sabre = os.path.join(filepath_sabre, 'SABRE_pvc2_cleaned.csv')

filepath_insight46 = '../open_work/internal_results/cleaned_pvc2s/'
filename_insight46 = os.path.join(filepath_insight46, 'Insight46_pvc2c.csv')

# read in data
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)
SABRE = pd.read_csv(filename_sabre)
Insight46 = pd.read_csv(filename_insight46)

# take the extra first column off every frame
TOP = TOP.drop(TOP.columns[0], axis=1)
SABRE = SABRE.drop(SABRE.columns[0], axis=1)
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0], axis=1)
Insight46 = Insight46.drop(Insight46.columns[0], axis=1)

In [None]:
# TOP = pd.read_csv(filename_top)
# StrokeMRI = pd.read_csv(filename_mri)
# SABRE = pd.read_csv(filename_sabre)

In [None]:
# TOP = TOP.drop(TOP.columns[0],axis=1)
# SABRE = SABRE.drop(SABRE.columns[0],axis=1)
# #SABRE

In [None]:
# StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
# #StrokeMRI

In [None]:
# Turn the sex labels back into numbers (F -> 0, M -> 1) for a correlation.
# The mapping dict is defined once here and reused for the other datasets.
sex_mapping = {'F': 0, 'M': 1}
TOP = TOP.assign(sex=TOP['sex'].map(sex_mapping))
TOP.head(3)

In [None]:
# Same F/M -> 0/1 encoding as applied to TOP
StrokeMRI = StrokeMRI.assign(sex=StrokeMRI['sex'].map(sex_mapping))
StrokeMRI.head(3)

In [None]:
# Same F/M -> 0/1 encoding as applied to TOP
Insight46 = Insight46.assign(sex=Insight46['sex'].map(sex_mapping))
#Insight46.head(3)

In [None]:
# Encode sex as for the other datasets, then keep only the columns that TOP
# has, in TOP's column order.
coly = TOP.columns
SABRE = SABRE.assign(sex=SABRE['sex'].map(sex_mapping))
SABRE = SABRE.loc[:, coly]
SABRE.head(3)

In [None]:
# Remove the SABRE rows that contain missing values (dropna with its
# default arguments); the commented line below can be used to inspect the
# per-column NaN counts.
SABRE =SABRE.dropna()
#SABRE.isna().sum()

In [None]:
# Check whether any participant_id occurs in both StrokeMRI and TOP.
# The printed set holds the shared ids; an empty set means no overlap.
strokers = set(StrokeMRI['participant_id'])
topers = set(TOP['participant_id'])
z = strokers & topers
print(z)

In [None]:
# make mixed StrokeMRI and TOP dataset (TOP rows first, then StrokeMRI rows)
# NOTE(review): ignore_index is not passed, so each frame keeps its own row
# labels and the mixed frame may contain repeated index values — confirm
# that nothing downstream relies on a unique index.
mixed_data = pd.concat([TOP, StrokeMRI], sort=False)


## Build ML models

# keeping patient ID until right when model is fed, then use patient ID as key to what went where

In [None]:
# participant_id is intentionally NOT dropped here: it stays in ml_matrix
# (and therefore in X) so rows can still be traced back to subjects.
ml_matrix = mixed_data  # .drop('participant_id', axis=1)

# Features: every column except the target; target: age as float
X = ml_matrix.drop(columns='age').values
y = ml_matrix['age'].values.astype('float')


In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [None]:
# X_train_cut = X_train[:,1:]
# X_train_cut = X_train_cut.astype('float')
# X_train_cut.shape

In [None]:
# X_test_cut = X_test[:,1:]
# X_test_cut = X_test_cut.astype('float')
# X_test_cut.shape

In [None]:
# Fit the linear regression models on the mixed data.
# X[:, 1:] leaves out the first column of X (presumably participant_id — confirm).
linr_k_frame, linr_y_frame, models = sep.frame_a_model_sex_split(
    'linear regression',
    'unharm_mix_linr',
    LinearRegression(),
    ml_matrix,
    X[:, 1:],
    y,
)

In [None]:
linr_k_frame

In [None]:
avg_linr = sep.avg_k_folds(linr_k_frame)
avg_linr

In [None]:
linr_y_frame

In [None]:
linr = models
linr[0]

In [None]:
# Make sure the folder for saved models exists.
# exist_ok=True makes this a no-op when the folder is already there and
# avoids the gap between a separate existence check and the creation call.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
## optional save models
#joblib.dump(linr[0], ('../result_models/'+ 'unharm_mix_linr0.sav'))
#joblib.dump(linr[1], ('../result_models/'+ 'unharm_mix_linr1.sav'))
#joblib.dump(linr[2], ('../result_models/'+ 'unharm_mix_linr2.sav'))
#joblib.dump(linr[3], ('../result_models/'+ 'unharm_mix_linr3.sav'))
#joblib.dump(linr[4], ('../result_models/'+ 'unharm_mix_linr4.sav'))

In [None]:
# Fit the lasso (LassoLars, alpha=0.01) models on the mixed data.
# X[:, 1:] leaves out the first column of X (presumably participant_id — confirm).
llreg_k_frame, llreg_y_frame, models = sep.frame_a_model_sex_split(
    'lasso regression',
    'unharm_mix_llreg',
    linear_model.LassoLars(alpha=0.01),
    ml_matrix,
    X[:, 1:],
    y,
)
llreg_k_frame

In [None]:
avg_llreg = sep.avg_k_folds(llreg_k_frame)
avg_llreg

In [None]:
llreg_y_frame

In [None]:
llreg = models
llreg[0]

In [None]:
## optional save models (lasso models; file names follow the 'unharm_mix_lassor0.sav' label used in the SABRE results table)
#joblib.dump(llreg[0], ('../result_models/'+ 'unharm_mix_lassor0.sav'))
#joblib.dump(llreg[1], ('../result_models/'+ 'unharm_mix_lassor1.sav'))
#joblib.dump(llreg[2], ('../result_models/'+ 'unharm_mix_lassor2.sav'))
#joblib.dump(llreg[3], ('../result_models/'+ 'unharm_mix_lassor3.sav'))
#joblib.dump(llreg[4], ('../result_models/'+ 'unharm_mix_lassor4.sav'))

In [None]:
# Fit the decision tree regressors on the mixed data.
# X[:, 1:] leaves out the first column of X (presumably participant_id — confirm).
dtree_k_frame, dtree_y_frame, models = sep.frame_a_model_sex_split(
    'decision tree',
    'unharm_mix_dtree',
    tree.DecisionTreeRegressor(),
    ml_matrix,
    X[:, 1:],
    y,
)
dtree_k_frame

In [None]:
avg_dtree = sep.avg_k_folds(dtree_k_frame)
avg_dtree

In [None]:
dtree_y_frame

In [None]:
dtree = models
dtree[0]

In [None]:
# Fit the MLP regressors (random_state=1, max_iter=700) on the mixed data.
# X[:, 1:] leaves out the first column of X (presumably participant_id — confirm).
regr_k_frame, regr_y_frame, models = sep.frame_a_model_sex_split(
    'MLP regression',
    'unharm_mix_regr',
    MLPRegressor(random_state=1, max_iter=700),
    ml_matrix,
    X[:, 1:],
    y,
)
regr_k_frame

In [None]:
avg_regr = sep.avg_k_folds(regr_k_frame)
avg_regr

In [None]:
regr_y_frame

In [None]:
regr = models
regr[0]

In [None]:
# Fit the support vector regressors (polynomial kernel, degree 2) on the mixed data.
# X[:, 1:] leaves out the first column of X (presumably participant_id — confirm).
svrp2_k_frame, svrp2_y_frame, models = sep.frame_a_model_sex_split(
    'support vector reg poly2',
    'unharm_mix_svrp2',
    SVR(C=1.0, kernel='poly', degree=2, epsilon=0.2),
    ml_matrix,
    X[:, 1:],
    y,
)
svrp2_k_frame

In [None]:
avg_svrp2 = sep.avg_k_folds(svrp2_k_frame)
avg_svrp2

In [None]:
svrp2_y_frame

In [None]:
svrp2 = models
svrp2[0]

In [None]:
# Fit the ElasticNetCV models (cv=5, random_state=12) on the mixed data.
# X[:, 1:] leaves out the first column of X (presumably participant_id — confirm).
eregr_k_frame, eregr_y_frame, models = sep.frame_a_model_sex_split(
    'elasticnetCV',
    'unharm_mix_eregr',
    ElasticNetCV(cv=5, random_state=12),
    ml_matrix,
    X[:, 1:],
    y,
)
eregr_k_frame

In [None]:
avg_eregr = sep.avg_k_folds(eregr_k_frame)
avg_eregr

In [None]:
eregr_y_frame

In [None]:
eregr = models
eregr[0]

In [None]:
# Fit the extra-trees regressors (100 estimators, random_state=0) on the mixed data.
# X[:, 1:] leaves out the first column of X (presumably participant_id — confirm).
etreg_k_frame, etreg_y_frame, models = sep.frame_a_model_sex_split(
    'extra trees',
    'unharm_mix_etreg',
    ExtraTreesRegressor(n_estimators=100, random_state=0),
    ml_matrix,
    X[:, 1:],
    y,
)
etreg_k_frame

In [None]:
avg_etreg = sep.avg_k_folds(etreg_k_frame)
avg_etreg

In [None]:
etreg_y_frame

In [None]:
etreg = models
etreg[0]

In [None]:
## optional save models (extra-trees models)
#joblib.dump(etreg[0], ('../result_models/'+ 'unharm_mix_etreg0.sav'))
#joblib.dump(etreg[1], ('../result_models/'+ 'unharm_mix_etreg1.sav'))
#joblib.dump(etreg[2], ('../result_models/'+ 'unharm_mix_etreg2.sav'))
#joblib.dump(etreg[3], ('../result_models/'+ 'unharm_mix_etreg3.sav'))
#joblib.dump(etreg[4], ('../result_models/'+ 'unharm_mix_etreg4.sav'))

Show results ON AVERAGE for each model

In [None]:
# Stack the per-model averaged results row-wise into one summary table
mixed_based_unharmonized_on_testmix = pd.concat(
    [
        avg_linr,
        avg_llreg,
        avg_dtree,
        avg_regr,
        avg_svrp2,
        avg_eregr,
        avg_etreg,
    ],
    axis=0,
)
mixed_based_unharmonized_on_testmix

In [None]:
# data_frames1 = [linr_y_frame, llreg_y_frame, dtree_y_frame,]# regr_compare, ]#etreg_compare, svrp2_compare,]
# real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames1)
# #real_versus_projected_y1
# data_frames2 = [eregr_y_frame, svrp2_y_frame, etreg_y_frame,]
# real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames2)
# #real_versus_projected_y2
# real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
# real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
# #real_versus_projected_y1

In [None]:
# data_frames1 = [linr_compare, llreg_compare, dtree_compare,]# regr_compare, ]#etreg_compare, svrp2_compare,]
# real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames1)
# #real_versus_projected_y1
# data_frames2 = [eregr_compare, svrp2_compare, etreg_compare,]
# real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames2)
# #real_versus_projected_y2
# real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
# real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
# #real_versus_projected_y1

In [None]:
# data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
# real_versus_projected_y3_mixed_on_mixed = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
#                                             how='outer'), data_frames3)
# real_versus_projected_y3_mixed_on_mixed.head(3)

## Save off models and csv (optional, must uncomment)

In [None]:
# # optionally save of csvs of algorithms and results
# mixed_based_unharmonized_on_testmix.to_csv('mixed_based_unharmonized_on_testmix.csv')
# real_versus_projected_y3_mixed_on_mixed.to_csv('real_versus_projected_y3_mixed_on_mixed.csv')

In [None]:
# # check if model folder exists and if not , then create
# model_folder = '../result_models/'
# if not os.path.exists(model_folder):
#     os.makedirs(model_folder)

In [None]:
# joblib.dump(linr, ('../result_models/'+ 'unharm_mixed_linr.sav'))
# joblib.dump(llreg, ('../result_models/'+ 'unharm_mixed_lassor.sav'))
# joblib.dump(dtree, ('../result_models/'+ 'unharm_mixed_dtree.sav'))
# joblib.dump(regr, ('../result_models/'+'unharm_mixed_regr.sav'))
# joblib.dump(svrp2, ('../result_models/'+'unharm_mixed_svrp2.sav'))
# joblib.dump(eregr, ('../result_models/'+ 'unharm_mixed_elasticnet.sav'))
# joblib.dump(etreg, ('../result_models/'+ 'unharm_mixed_extratree.sav'))

## Run models on other datasets (TOP, StrokeMRI)
but without re-running the training data

In [None]:
# These cells must be rewritten, or the cells above, but skip to running over sabre

# # Here we check that no rows are duplicated once patient IDs were pulled 
(if not we can map them back)

In [None]:
# Wrap the training matrix in a DataFrame and count fully duplicated rows.
# NOTE(review): in the visible part of this notebook X_train is not assigned
# by any active cell (the only train_test_split call is commented out), so
# this cell raises NameError unless that split, or an equivalent, is restored.
X_train_pandas = pd.DataFrame(X_train)
X_train_pandas.duplicated().sum()

top_ml_matrix
needs to be mapped to top rows in X_train,
we will use MD5 hashes

now we need to make a dataframe of TOP minus what is in X_train

In [None]:
X_train_pandas.head(3)

In [None]:
# Unique values of column 0 of the training matrix.
# NOTE(review): presumably column 0 holds participant_id, since that column
# is kept in X and sliced off with X[:, 1:] before fitting — confirm.
trained_subjects = set(X_train_pandas[0])
#trained_subjects 

In [None]:
TOP_subjects = set(TOP.participant_id)
#TOP_subjects

In [None]:
# TOP subjects that are not among the trained subjects (plain set difference)
new_top = TOP_subjects - trained_subjects
print(len(new_top))
#print(new_top)

filter down to only top where they are in new_top set

In [None]:
# Keep only the TOP rows whose participant_id is in new_top
TOP_new = TOP.loc[TOP['participant_id'].isin(list(new_top))]
#TOP_new

In [None]:
# Build float feature/target arrays for the held-out TOP subjects:
# participant_id is removed first, then age is split off as the target.
top_ml_matrix = TOP_new.drop(columns='participant_id')

X_top = top_ml_matrix.drop(columns='age').values.astype('float')
y_top = top_ml_matrix['age'].values.astype('float')

In [None]:
X_top_test = X_top
y_top_test = y_top

In [None]:
y_top_pred = linr.predict(X_top_test)

In [None]:
print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'linear regression',
    'unharm_mixed_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    linr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_top

In [None]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
#linr_compare

In [None]:
y_top_pred = llreg.predict(X_top_test)

In [None]:
print('R2 score Lasso linear regression: %.3f' % llreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'lasso regression',
    'unharm_mixed_llregr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    llreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_top

In [None]:
# Pair the real TOP ages with the lasso predictions, row by row.
# The prediction column carries the model's own prefix (it was labelled
# 'linr_y_pred_age' by copy-paste) so it stays distinct from the linear
# regression column when the comparison frames are merged on "index".
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'llreg_y_pred_age': llreg_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
llreg_compare = llreg_compare.reset_index()
#llreg_compare

In [None]:
y_top_pred = dtree.predict(X_top_test)

In [None]:
print('R2 score decision tree regression: %.3f' % dtree.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'decision tree',
    'unharm_mixed_dtree.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    dtree.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
dtree_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_top

In [None]:
# Pair the real TOP ages with the decision tree predictions, row by row.
# Two copy-paste defects are fixed here: the prediction column is labelled
# for this model, and the final frame is derived from dtree_compare itself
# (it used to be overwritten with a re-indexed copy of linr_compare).
dtree_y_test = y_top_test
dtree_y_pred = y_top_pred
dtree_compare = pd.DataFrame(
    {'y_test_real_age': dtree_y_test,
     'dtree_y_pred_age': dtree_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
dtree_compare = dtree_compare.reset_index()
#dtree_compare

In [None]:
y_top_pred = eregr.predict(X_top_test)

In [None]:
print('R2 score ElasticnetCV regression: %.3f' % eregr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
# One-row summary of the ElasticNetCV model on the held-out TOP subjects.
# r2 is taken from eregr.score (it was computed with linr.score, which
# reported the linear regression's R2 under the elastic net label).
data= [[
    'elasticnetCV',
    'unharm_mixed_eregr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    eregr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_top

In [None]:
# Pair the real TOP ages with the ElasticNetCV predictions, row by row.
# Two copy-paste defects are fixed here: the prediction column is labelled
# for this model, and the final frame is derived from eregr_compare itself
# (it used to be overwritten with a re-indexed copy of linr_compare).
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'eregr_y_pred_age': eregr_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
eregr_compare = eregr_compare.reset_index()
#eregr_compare

In [None]:
# The SVR models are bound to `svrp2` in this notebook; `svr_p2` is never
# defined and raised NameError.
# NOTE(review): svrp2 is assigned from the `models` object returned by
# frame_a_model_sex_split and is indexed as svrp2[0] elsewhere; if it is a
# collection of models, select one (e.g. svrp2[0]) before predicting — confirm.
y_top_pred = svrp2.predict(X_top_test)

In [None]:
# Report the SVR (poly, degree 2) metrics on the held-out TOP subjects.
# Uses `svrp2`, the name the SVR models are bound to (`svr_p2` is undefined).
print('R2 score SVR poly 2 regression: %.3f' % svrp2.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
# One-row summary of the SVR (poly, degree 2) model on the held-out TOP
# subjects. Uses `svrp2`, the name the SVR models are bound to (`svr_p2`
# is undefined and raised NameError).
data= [[
    'SVR polynom degree 2',
    'unharm_mixed_svrp2.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    svrp2.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
svrp2_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#svrp2_results_top

In [None]:
# Pair the real TOP ages with the SVR (poly, degree 2) predictions.
# Two copy-paste defects are fixed here: the prediction column is labelled
# for this model, and the final frame is derived from svrp2_compare itself
# (it used to be overwritten with a re-indexed copy of linr_compare).
svrp2_y_test = y_top_test
svrp2_y_pred = y_top_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'svrp2_y_pred_age': svrp2_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
svrp2_compare = svrp2_compare.reset_index()
#svrp2_compare

In [None]:
y_top_pred = etreg.predict(X_top_test)

In [None]:
print('R2 score Extra trees: %.3f' % etreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
data= [[
    'extra trees',
    'unharm_mixed_ereg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    etreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_top

In [None]:
# Pair the real TOP ages with the extra-trees predictions, row by row.
# Two copy-paste defects are fixed here: the prediction column is labelled
# for this model, and the final frame is derived from etreg_compare itself
# (it used to be overwritten with a re-indexed copy of linr_compare).
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
etreg_compare = etreg_compare.reset_index()
#etreg_compare

In [None]:
y_top_pred = regr.predict(X_top_test)

In [None]:
print('R2 score MLP regression: %.3f' % regr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
# One-row summary of the MLP model on the held-out TOP subjects.
# The algorithm label is spelled 'multilayered perceptron', matching the
# label used for the same model in the StrokeMRI results table.
data= [[
    'multilayered perceptron',
    'unharm_mixed_regr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    regr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
regr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#regr_results_top

In [None]:
# Pair the real TOP ages with the MLP predictions, row by row.
# Two copy-paste defects are fixed here: the prediction column is labelled
# for this model, and the final frame is derived from regr_compare itself
# (it used to be overwritten with a re-indexed copy of linr_compare).
regr_y_test = y_top_test
regr_y_pred = y_top_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'regr_y_pred_age': regr_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
regr_compare = regr_compare.reset_index()
#regr_compare

In [None]:
# Compile results of mixed on TOP

In [None]:
mixed_based_unharmonized_on_top =pd.concat([linr_results_top,
                   llreg_results_top,
                   dtree_results_top,
                   regr_results_top,
                   svrp2_results_top,
                   eregr_results_top,
                  etreg_results_top],
                  axis=0)
mixed_based_unharmonized_on_top

In [None]:
# Outer-merge the per-model comparison frames on their shared "index"
# column, three frames at a time.
# NOTE: regr_compare (MLP) is in neither list, so the MLP predictions do not
# appear in the merged tables.
data_frames1 = [linr_compare, llreg_compare, dtree_compare]
real_versus_projected_y1 = reduce(
    lambda left, right: pd.merge(left, right, on=["index"], how='outer'),
    data_frames1,
)
#real_versus_projected_y1
data_frames2 = [eregr_compare, svrp2_compare, etreg_compare]
real_versus_projected_y2 = reduce(
    lambda left, right: pd.merge(left, right, on=["index"], how='outer'),
    data_frames2,
)
#real_versus_projected_y2
# sep.drop_y presumably removes the duplicated '_y' columns the merges
# create — confirm against cvasl.seperated.
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2)
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1)
#real_versus_projected_y1

In [None]:
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
real_versus_projected_y3_mixed_on_top = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames3)
real_versus_projected_y3_mixed_on_top.head(3)

## Save off csvs of results 
optional, you must uncomment

In [None]:
## optionally save of csvs of algorithms and results
#mixed_based_unharmonized_on_top.to_csv('mixed_based_unharmonized_on_top.csv')
#real_versus_projected_y3_mixed_on_top.to_csv('real_versus_projected_y3_mixed_on_top.csv')

# Mixed algorithm on stroke MRI subjects (not in training)

In [None]:
StrokeMRI_subjects = set(StrokeMRI.participant_id)
#StrokeMRI_subjects

In [None]:
# StrokeMRI subjects that are not among the trained subjects
# (plain set difference)
new_mri = StrokeMRI_subjects - trained_subjects

In [None]:
# Keep only the StrokeMRI rows whose participant_id is in new_mri
StrokeMRI_new = StrokeMRI.loc[StrokeMRI['participant_id'].isin(list(new_mri))]
#StrokeMRI_new

In [None]:
# Build float feature/target arrays for the held-out StrokeMRI subjects:
# participant_id is removed first, then age is split off as the target.
strokemri_ml_matrix = StrokeMRI_new.drop(columns='participant_id')
X_mri = strokemri_ml_matrix.drop(columns='age').values.astype('float')
y_mri = strokemri_ml_matrix['age'].values.astype('float')

In [None]:
X_mri_test = X_mri
y_mri_test = y_mri

In [None]:
y_mri_pred = linr.predict(X_mri_test)

In [None]:
print('R2 score Linear regression: %.3f' % linr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'linear regression',
    'unharm_mixed_linr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    linr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
linr_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_mri

In [None]:
linr_y_test = y_mri_test
linr_y_pred = y_mri_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare_mri = linr_compare.reset_index()
#linr_compare_mri

In [None]:
y_mri_pred = llreg.predict(X_mri_test)

In [None]:
print('R2 score Lasso-linear regression: %.3f' % llreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'lasso regression',
    'unharm_mixed_llreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    llreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
llreg_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#llreg_results_mri

In [None]:
llreg_y_test = y_mri_test
llreg_y_pred = y_mri_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'llreg_y_pred_age': llreg_y_pred,
    })
llreg_compare_mri = llreg_compare.reset_index()
#llreg_compare_mri

In [None]:
y_mri_pred = dtree.predict(X_mri_test)

In [None]:
print('R2 score decision tree regression: %.3f' % dtree.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'decision tree',
    'unharm_mixed_dtree.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    dtree.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
dtree_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_mri

In [None]:
dtree_y_test = y_mri_test
dtree_y_pred = y_mri_pred
dtree_compare = pd.DataFrame(
    {'y_test_real_age': dtree_y_test,
     'dtree_y_pred_age': dtree_y_pred,
    })
dtree_compare_mri = dtree_compare.reset_index()
#dtree_compare_mri

In [None]:
y_mri_pred = regr.predict(X_mri_test)

In [None]:
print('R2 score MLP regression: %.3f' % regr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'multilayered perceptron',
    'unharm_mixed_regr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    regr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
regr_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results_mri

In [None]:
# Pair the real StrokeMRI ages with the MLP predictions, row by row.
# The prediction column is labelled for this model (it was
# 'linr_y_pred_age' by copy-paste), in line with the other StrokeMRI
# comparison frames.
regr_y_test = y_mri_test
regr_y_pred = y_mri_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'regr_y_pred_age': regr_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
regr_compare_mri = regr_compare.reset_index()
#regr_compare_mri

In [None]:
# The SVR models are bound to `svrp2` in this notebook; `svr_p2` is never
# defined and raised NameError.
# NOTE(review): svrp2 is assigned from the `models` object returned by
# frame_a_model_sex_split and is indexed as svrp2[0] elsewhere; if it is a
# collection of models, select one (e.g. svrp2[0]) before predicting — confirm.
y_mri_pred = svrp2.predict(X_mri_test)

In [None]:
# Report the SVR (poly, degree 2) metrics on the held-out StrokeMRI subjects.
# Uses `svrp2`, the name the SVR models are bound to (`svr_p2` is undefined).
print('R2 score SVR poly2 regression: %.3f' % svrp2.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
# One-row summary of the SVR (poly, degree 2) model on the held-out
# StrokeMRI subjects. Uses `svrp2`, the name the SVR models are bound to
# (`svr_p2` is undefined and raised NameError).
data= [[
    'svr poly degree 2',
    'unharm_mixed_svrp2.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    svrp2.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
svrp2_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svrp2_results_mri

In [None]:
svrp2_y_test = y_mri_test
svrp2_y_pred = y_mri_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'svrp2_y_pred_age': svrp2_y_pred,
    })
svrp2_compare_mri = svrp2_compare.reset_index()
#svrp2_compare_mri

In [None]:
y_mri_pred = etreg.predict(X_mri_test)

In [None]:
print('R2 score Extra tree regression: %.3f' % etreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'extra trees',
    'unharm_mixed_etreg.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    etreg.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
etreg_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_mri

In [None]:
etreg_y_test = y_mri_test
etreg_y_pred = y_mri_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
etreg_compare_mri = etreg_compare.reset_index()
#etreg_compare_mri

In [None]:
y_mri_pred = eregr.predict(X_mri_test)

In [None]:
print('R2 score elasticnetCV: %.3f' % eregr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
data= [[
    'elasticnet CV',
    'unharm_mixed_eregr.sav',
    mean_absolute_error(y_mri_test, y_mri_pred),
    eregr.score(X_mri_test,y_mri_test),
    metrics.explained_variance_score(y_mri_test, y_mri_pred)]]
eregr_results_mri = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_mri

In [None]:
# Pair the real StrokeMRI ages with the ElasticNetCV predictions.
# The prediction column is labelled 'eregr_y_pred_age'; it used to be
# 'etreg_y_pred_age', the same name as the extra-trees column, so the two
# collided when the comparison frames were merged on "index".
eregr_y_test = y_mri_test
eregr_y_pred = y_mri_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'eregr_y_pred_age': eregr_y_pred,
    })
# reset_index() exposes the row position as an "index" column (the merge key)
eregr_compare_mri = eregr_compare.reset_index()
#eregr_compare_mri

In [None]:
# Aggregate mixed results on MRI and optionally save

In [None]:
# Stack the one-row StrokeMRI result tables of all models into one summary
mixed_based_unharmonized_on_mri = pd.concat(
    [
        linr_results_mri,
        llreg_results_mri,
        dtree_results_mri,
        regr_results_mri,
        svrp2_results_mri,
        eregr_results_mri,
        etreg_results_mri,
    ],
    axis=0,
)
mixed_based_unharmonized_on_mri

In [None]:
# Outer-merge the per-model StrokeMRI comparison frames on their shared
# "index" column, three frames at a time.
# NOTE: regr_compare_mri (MLP) is in neither list, so the MLP predictions do
# not appear in the merged tables.
data_frames1 = [linr_compare_mri, llreg_compare_mri, dtree_compare_mri]
real_versus_projected_y1 = reduce(
    lambda left, right: pd.merge(left, right, on=["index"], how='outer'),
    data_frames1,
)
#real_versus_projected_y1
data_frames2 = [eregr_compare_mri, svrp2_compare_mri, etreg_compare_mri]
real_versus_projected_y2 = reduce(
    lambda left, right: pd.merge(left, right, on=["index"], how='outer'),
    data_frames2,
)
#real_versus_projected_y2
# sep.drop_y presumably removes the duplicated '_y' columns the merges
# create — confirm against cvasl.seperated.
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2)
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1)
#real_versus_projected_y1

In [None]:
# Combine the two batch tables into one comparison table and preview three rows.
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2]
real_versus_projected_y3_mixed_on_mri = data_frames3[0].merge(
    data_frames3[1],
    on=["index"],
    how='outer',
)
real_versus_projected_y3_mixed_on_mri.head(3)

In [None]:
# # optionally save csvs of algorithms and results
# mixed_based_unharmonized_on_mri.to_csv('mixed_based_unharmonized_on_mri.csv')
# real_versus_projected_y3_mixed_on_mri.to_csv('real_versus_projected_y3_mixed_on_mri.csv')

# Running mixed model over SABRE dataset

## Here we will do an example of running all the [0] models

In [None]:
sabre_ml_matrix = SABRE.drop('participant_id', axis=1)
X_sabre = sabre_ml_matrix.drop('age', axis =1)
X_sabre = X_sabre.values
X_sabre = X_sabre.astype('float')
y_sabre = sabre_ml_matrix['age'].values
y_sabre=y_sabre.astype('float')

In [None]:
X_sabre_test = X_sabre
y_sabre_test = y_sabre

In [None]:
y_sabre_pred = linr[0].predict(X_sabre_test)

In [None]:
data= [[
    'Linear Reg',
    'unharm_mix_linr0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    linr[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
linr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_linr_sabre = pd.DataFrame(data)
y_frame_linr_sabre

In [None]:
y_sabre_pred = llreg[0].predict(X_sabre_test)

In [None]:
data= [[
    'Lasso',
    'unharm_mix_lassor0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    llreg[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
llreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_llreg_sabre = pd.DataFrame(data)
y_frame_llreg_sabre

In [None]:
y_sabre_pred = dtree[0].predict(X_sabre_test)

In [None]:
data= [[
    'Decision tree',
    'unharm_mix_dtree0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    dtree[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
dtree_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_dtree_sabre = pd.DataFrame(data)
y_frame_dtree_sabre

In [None]:
y_sabre_pred = regr[0].predict(X_sabre_test)

In [None]:
data= [[
    'MLP regression',
    'unharm_mix_regr0.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    regr[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
regr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results_sabre

In [None]:
y_sabre_pred = svrp2[0].predict(X_sabre_test)

In [None]:
data= [[
    'Svr P2',
    'unharm_mri_svrp20.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    svrp2[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
svr_p2_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svr_p2_results_sabre

In [None]:
y_sabre_pred = eregr[0].predict(X_sabre_test)

In [None]:
data= [[
    'ElasticnetCV',
    'unharm_mri_elasticregr.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    eregr[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
eregr_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_sabre

In [None]:
y_sabre_pred = etreg[0].predict(X_sabre_test)

In [None]:
data= [[
    'Extra trees',
    'unharm_mri_etreg.sav',
    mean_absolute_error(y_sabre_test, y_sabre_pred),
    etreg[0].score(X_sabre_test,y_sabre_test),
    metrics.explained_variance_score(y_sabre_test, y_sabre_pred)]]
etreg_results_sabre = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_sabre

In [None]:
data = {'real': y_sabre_test, 'predicted': y_sabre_pred}
y_frame_etregr_sabre = pd.DataFrame(data)
y_frame_etregr_sabre

In [None]:
# Stack the seven one-row SABRE result tables into one summary table.
mix_based_unharmonized_on_sabre = pd.concat(
    objs=[
        linr_results_sabre,
        llreg_results_sabre,
        dtree_results_sabre,
        regr_results_sabre,
        svr_p2_results_sabre,
        eregr_results_sabre,
        etreg_results_sabre,
    ],
    axis="index",
)
mix_based_unharmonized_on_sabre

# Running mixed model over Insight46 dataset

## Here we will do an example of running all the [0] models

In [None]:
insight_ml_matrix = Insight46.drop('participant_id', axis=1)
X_insight = insight_ml_matrix.drop('age', axis =1)
X_insight = X_insight.values
X_insight = X_insight.astype('float')
y_insight = insight_ml_matrix['age'].values
y_insight= y_insight.astype('float')

In [None]:
X_insight_test = X_insight
y_insight_test = y_insight

In [None]:
y_insight_pred = linr[0].predict(X_insight_test)

In [None]:
data= [[
    'Linear Reg',
    'unharm_mix_linr0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    linr[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
linr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_linr_insight = pd.DataFrame(data)
y_frame_linr_insight

In [None]:
y_insight_pred = llreg[0].predict(X_insight_test)

In [None]:
data= [[
    'Lasso',
    'unharm_mix_lassor0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    llreg[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
llreg_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_llreg_insight = pd.DataFrame(data)
y_frame_llreg_insight

In [None]:
y_insight_pred = dtree[0].predict(X_insight_test)

In [None]:
data= [[
    'Decision tree',
    'unharm_mix_dtree0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    dtree[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
dtree_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_dtree_insight = pd.DataFrame(data)
y_frame_dtree_insight

In [None]:
y_insight_pred = regr[0].predict(X_insight_test)

In [None]:
data= [[
    'MLP regression',
    'unharm_mix_regr0.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    regr[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
regr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results_insight

In [None]:
y_insight_pred = svrp2[0].predict(X_insight_test)

In [None]:
data= [[
    'Svr P2',
    'unharm_mri_svrp20.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    svrp2[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
svr_p2_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svr_p2_results_insight

In [None]:
y_insight_pred = eregr[0].predict(X_insight_test)

In [None]:
data= [[
    'ElasticnetCV',
    'unharm_mri_elasticregr.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    eregr[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
eregr_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_insight

In [None]:
y_insight_pred = etreg[0].predict(X_insight_test)

In [None]:
data= [[
    'Extra trees',
    'unharm_mri_etreg.sav',
    mean_absolute_error(y_insight_test, y_insight_pred),
    etreg[0].score(X_insight_test,y_insight_test),
    metrics.explained_variance_score(y_insight_test, y_insight_pred)]]
etreg_results_insight = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_insight

In [None]:
data = {'real': y_insight_test, 'predicted': y_insight_pred}
y_frame_etregr_insight = pd.DataFrame(data)
y_frame_etregr_insight

In [None]:
# Stack the seven one-row Insight46 result tables into one summary table.
mix_based_unharmonized_on_insight = pd.concat(
    objs=[
        linr_results_insight,
        llreg_results_insight,
        dtree_results_insight,
        regr_results_insight,
        svr_p2_results_insight,
        eregr_results_insight,
        etreg_results_insight,
    ],
    axis="index",
)
mix_based_unharmonized_on_insight