# ML testing: experiment #3

This notebook involves testing for the MRI conference abstract. This notebook shows mixed_dataset (TOP + StrokeMRI) based models

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/' 
filename_mri = os.path.join(filepath_mri,'StrokeMRI_pvc2c.csv') 

filepath_top = '../open_work/internal_results/cleaned_pvc2s/' 
filename_top = os.path.join(filepath_top,'TOP_pvc2c.csv') 

filepath_sabre = '../open_work/internal_results/cleaned_pvc2s/' 
filename_sabre = os.path.join(filepath_top,'SABRE_pvc2_cleaned.csv') 

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)
SABRE = pd.read_csv(filename_sabre)

In [4]:
TOP = TOP.drop(TOP.columns[0],axis=1)
SABRE = SABRE.drop(SABRE.columns[0],axis=1)
#SABRE

In [5]:
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
#StrokeMRI

In [6]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.71736,0.52803,0.31812,0.45881,0.45881,1.743,24.0,8.762,2.2748,2.3205,2.6858,2.4316,22.1022,80.4744,68.3224,52.4614,60.6981
1,sub-0002_1_ses-1_run-1,38.3,0,0.72383,0.62394,0.25673,0.45112,0.45112,1.629,23.0,9.0749,1.7564,2.3989,1.5982,1.9738,23.5401,87.3972,78.0359,63.9932,71.6047
2,sub-0019_1_ses-1_run-1,32.3,1,0.71224,0.53295,0.33594,0.45046,0.45046,0.621,13.0,8.8791,1.985,1.8702,2.1648,2.1723,27.5573,94.0855,86.3816,62.6012,74.0588


In [7]:
StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.64754,0.49441,0.3132,0.445,0.445,1.249,24.0,8.0434,1.7273,1.9776,1.7141,1.914,26.8262,104.3155,90.5047,73.8758,84.4527
1,sub-59083_1_ses-1_run-1,66.367123,0,0.60517,0.48594,0.42304,0.39968,0.39968,14.597,25.0,3.7791,1.9669,1.7154,2.6394,1.8815,24.9597,75.7051,65.3032,65.6771,67.1158
2,sub-59085_1_ses-1_run-1,55.838356,0,0.61724,0.53779,0.33692,0.41371,0.41371,6.341,30.0,5.1248,1.8146,1.9845,2.0827,1.9501,27.7117,81.0966,72.998,60.8317,64.3183


In [8]:
coly = TOP.columns
SABRE = SABRE.assign(sex = SABRE.sex.map(sex_mapping))
SABRE = SABRE[coly]
SABRE.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-100096_1_ses-1_run-1,78,1,0.56201,0.44945,0.42419,0.39147,0.39147,4.621,19.0,13.671,3.5865,3.8956,4.3534,3.669,15.3022,71.6047,62.0948,43.2926,54.7345
1,sub-100331_1_ses-1_run-1,71,1,0.53812,0.42593,0.49319,0.36927,0.36927,1.085,22.0,16.3956,4.3543,4.1134,3.8613,3.7593,11.3323,37.0396,30.0755,25.5827,29.7665
2,sub-102285_1_ses-1_run-1,72,1,0.56881,0.49224,0.35808,0.40082,0.40082,1.956,24.0,20.5471,4.3167,4.441,3.9533,3.9991,14.277,51.7767,41.2339,33.5018,39.2296


In [9]:
SABRE =SABRE.dropna()
#SABRE.isna().sum()

In [10]:
# check for any duplicated patients between stroke and mri
strokers = set(StrokeMRI.participant_id)
topers = set(TOP.participant_id)
z = strokers.intersection(topers)
print(z) 

set()


In [11]:
# make mixed StrokeMRI and TOP dataset
mixed_data = pd.concat([TOP, StrokeMRI], sort=False)


## Build ML models

# keeping patient ID until right when model is fed, then use patient ID as key to what went where

In [12]:
ml_matrix = mixed_data #.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
y = ml_matrix['age'].values
y=y.astype('float')


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [14]:
X_train_cut = X_train[:,1:]
X_train_cut = X_train_cut.astype('float')
X_train_cut.shape

(780, 18)

In [15]:
X_test_cut = X_test[:,1:]
X_test_cut = X_test_cut.astype('float')
X_test_cut.shape

(261, 18)

In [16]:

# regr = MLPRegressor(random_state=1, max_iter=700)
# regr.fit(X_train_cut, y_train)

In [17]:
# y_pred = regr.predict(X_test_cut)

In [18]:
# print('R2 score neural network mlp regression: %.3f' % regr.score(X_test_cut,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [19]:
linr = LinearRegression()
linr.fit(X_train_cut, y_train)

In [20]:
y_pred = linr.predict(X_test_cut)

In [21]:
print('R2 score Linear regression: %.3f' % linr.score(X_test_cut,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score Linear regression: 0.861
Explained variance score: 0.861
MAE:  5.253


In [22]:
data= [[
    'linear regression',
    'unharm_mixed_linr.sav',
    mean_absolute_error(y_test, y_pred),
    linr.score(X_test_cut,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#linr_results

In [23]:
linr_y_test = y_test
linr_y_pred = y_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
#linr_compare

In [24]:
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X_train_cut, y_train)



In [25]:
y_pred = llreg.predict(X_test_cut)
print('R2 score Lasso regression: %.3f' % llreg.score(X_test_cut,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score Lasso regression: 0.856
Explained variance score: 0.856
The mean absolute error: 5.361


In [26]:
data= [[
    'lasso linear regression',
    'unharm_mixed_lassor.sav',
    mean_absolute_error(y_test, y_pred),
    llreg.score(X_test_cut,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#llreg_results

In [27]:
llreg_y_test = y_test
llreg_y_pred = y_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'lasso_y_pred_age': llreg_y_pred,
    })
llreg_compare = llreg_compare.reset_index()
#llreg_compare

In [28]:
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train_cut, y_train)

In [29]:
y_pred = dtree.predict(X_test_cut)
print('R2 score dtree regression: %.3f' % dtree.score(X_test_cut,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score dtree regression: 0.763
Explained variance score: 0.764
The mean absolute error: 6.543


In [30]:
data= [[
    'decision tree',
    'unharm_mixed_dtree.sav',
    mean_absolute_error(y_test, y_pred),
    dtree.score(X_test_cut,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
dtree_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#dtree_results

In [31]:
dtree_y_test = y_test
dtree_y_pred = y_pred
dtree_compare = pd.DataFrame(
    {'y_test_real_age': dtree_y_test,
     'dtree_y_pred_age': dtree_y_pred,
    })
dtree_compare = dtree_compare.reset_index()
#dtree_compare

In [32]:
regr = MLPRegressor(random_state=1, max_iter=700)
regr.fit(X_train_cut, y_train)

In [33]:
y_pred = regr.predict(X_test_cut)

In [34]:
# print('R2 score neural network mlp regression: %.3f' % regr.score(X_test_cut,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [35]:
data= [[
    'MLP regression',
    'unharm_mixed_regr.sav',
    mean_absolute_error(y_test, y_pred),
    regr.score(X_test_cut,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
regr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,MLP regression,unharm_mixed_regr.sav,6.808033,0.773869,0.77388


In [36]:
regr_y_test = y_test
regr_y_pred = y_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'regr_y_pred_age': regr_y_pred,
    })
regr_compare = regr_compare.reset_index()
#regr_compare

In [37]:
svr_p2 = SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2)
svr_p2.fit(X_train_cut, y_train)

In [38]:
y_pred = svr_p2.predict(X_test_cut)

In [39]:
# print('R2 score SVR 2nd degree poly kernel regression: %.3f' % svr_p2.score(X_test_cut,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [40]:
data= [[
    'SVR poly2',
    'unharm_mixed_svrp2.sav',
    mean_absolute_error(y_test, y_pred),
    svr_p2.score(X_test_cut,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
svr_p2_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#svr_p2_results

In [41]:
svrp2_y_test = y_test
svrp2_y_pred = y_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'svrp2_y_pred_age': svrp2_y_pred,
    })
svrp2_compare = svrp2_compare.reset_index()
#svrp2_compare

In [42]:
eregr = ElasticNetCV(cv=5, random_state=12)
eregr.fit(X_train_cut, y_train)

In [43]:
y_pred = eregr.predict(X_test_cut)

In [44]:
# print('R2 score elasticnetcv regression: %.3f' % eregr.score(X_test_cut,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [45]:
data= [[
    'Elastic_netCV',
    'unharm_mixed_eregr.sav',
    mean_absolute_error(y_test, y_pred),
    eregr.score(X_test_cut,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#eregr_results

In [46]:
eregr_y_test = y_test
eregr_y_pred = y_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'eregr_y_pred_age': eregr_y_pred,
    })
eregr_compare = eregr_compare.reset_index()
#eregr_compare

In [47]:
etreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
etreg.fit(X_train_cut, y_train)

In [48]:
y_pred = etreg.predict(X_test_cut)

In [49]:
# print('R2 score extra trees regression: %.3f' % etreg.score(X_test_cut,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [50]:
data= [[
    'Extra trees',
    'unharm_mixed_etreg.sav',
    mean_absolute_error(y_test, y_pred),
    etreg.score(X_test_cut,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#etreg_results

In [51]:
etreg_y_test = y_test
etreg_y_pred = y_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'etreg_y_pred_age': etreg_y_pred,
    })
etreg_compare = etreg_compare.reset_index()
#etreg_compare

Show results

In [52]:
mixed_based_unharmonized_on_testmix =pd.concat([linr_results,
                   llreg_results,
                   dtree_results,
                   regr_results,
                   svr_p2_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
mixed_based_unharmonized_on_testmix

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,unharm_mixed_linr.sav,5.25307,0.861353,0.861373
0,lasso linear regression,unharm_mixed_lassor.sav,5.361406,0.856111,0.856167
0,decision tree,unharm_mixed_dtree.sav,6.543134,0.76344,0.763864
0,MLP regression,unharm_mixed_regr.sav,6.808033,0.773869,0.77388
0,SVR poly2,unharm_mixed_svrp2.sav,10.975952,0.437402,0.437957
0,Elastic_netCV,unharm_mixed_eregr.sav,7.369944,0.736649,0.737776
0,Extra trees,unharm_mixed_etreg.sav,4.743476,0.882034,0.882654


In [53]:
data_frames1 = [linr_compare, llreg_compare, dtree_compare,]# regr_compare, ]#etreg_compare, svrp2_compare,]
real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames1)
#real_versus_projected_y1
data_frames2 = [eregr_compare, svrp2_compare, etreg_compare,]
real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames2)
#real_versus_projected_y2
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
#real_versus_projected_y1

In [54]:
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
real_versus_projected_y3_mixed_on_mixed = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames3)
real_versus_projected_y3_mixed_on_mixed.head(3)

Unnamed: 0,index,y_test_real_age_x_x,linr_y_pred_age,lasso_y_pred_age,y_test_real_age_x,dtree_y_pred_age,y_test_real_age_x_y,eregr_y_pred_age,svrp2_y_pred_age,y_test_real_age_y,etreg_y_pred_age
0,0,29.42,25.628465,25.330146,29.42,38.89,29.42,27.742639,29.81664,29.42,29.844436
1,1,37.5,32.650346,32.147437,37.5,51.353425,37.5,31.115372,40.351748,37.5,33.906639
2,2,31.61,33.951288,34.248512,31.61,31.94,31.61,39.979275,49.406561,31.61,31.748603


## Save off models and csv (optional, must uncomment)

In [55]:
# # optionally save of csvs of algorithms and results
# mixed_based_unharmonized_on_testmix.to_csv('mixed_based_unharmonized_on_mri.csv')
# real_versus_projected_y3_mixed_on_mixed.to_csv('real_versus_projected_y3_mixed_on_mixed.csv')

In [56]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [57]:
# joblib.dump(linr, ('../result_models/'+ 'unharm_mixed_linr.sav'))
# joblib.dump(llreg, ('../result_models/'+ 'unharm_mixed_lassor.sav'))
# joblib.dump(dtree, ('../result_models/'+ 'unharm_mixed_dtree.sav'))
# joblib.dump(regr, ('../result_models/'+'unharm_mixed_regr.sav'))
# joblib.dump(svr_p2, ('../result_models/'+'unharm_mixed_svrp2.sav'))
# joblib.dump(eregr, ('../result_models/'+ 'unharm_mixed_elasticnet.sav'))
# joblib.dump(etreg, ('../result_models/'+ 'unharm_mixed_extratree.sav'))

## Run models on other datasets (TOP, StrokeMRI)
but without re-running the training data

# # Here we check that no rows are duplicated once patient IDs were pulled 
(if not we can map them back)

In [58]:
X_train_pandas = pd.DataFrame(X_train)
X_train_pandas.duplicated().sum()

0

top_ml_matrix
needs to be mapped to top rows in X_train,
we will use ese MD5 hashes

now we need to make a dataframe of TOP minus what is in X_train

In [59]:
X_train_pandas.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,sub-59168_2_ses-2_run-1,0,0.5728,0.44099,0.32872,0.42666,0.42666,2.694,27.0,5.7466,2.4044,1.9572,2.0226,2.0232,20.4581,62.7407,56.4464,44.0702,50.8079
1,sub-1109_1_ses-1_run-1,0,0.61626,0.4806,0.24631,0.45881,0.45881,2.364,17.0,7.0078,1.7119,1.6905,1.7671,1.6694,21.3399,64.97,56.4941,55.1801,56.3065
2,sub-59292_1_ses-1_run-1,0,0.59074,0.52605,0.34487,0.40416,0.40416,4.423,20.0,6.1642,1.6372,1.9432,1.8627,1.7752,31.6105,114.8866,98.6019,89.3701,92.8638


In [60]:
trained_subjects = set(X_train_pandas[0])
#trained_subjects 

In [61]:
TOP_subjects = set(TOP.participant_id)
#TOP_subjects

In [62]:
# take trained subjects out of top subjects
# we can use set math here
new_top=(trained_subjects^TOP_subjects)&TOP_subjects
print(len(new_top))
#print(new_top)

132


filter down to only top where they are in new_top set

In [63]:
TOP_new = TOP[TOP['participant_id'].isin(list(new_top))]
TOP_new

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
3,sub-0020_1_ses-1_run-1,21.97,0,0.67042,0.43458,0.25974,0.49124,0.49124,0.736,23.0,6.2080,1.8657,1.9092,1.6402,1.8734,27.1813,100.3259,90.7345,68.0812,79.1128
10,sub-0033_1_ses-1_run-1,29.21,1,0.59992,0.43360,0.24903,0.46776,0.46776,1.067,14.0,7.9519,1.7067,2.2134,1.9870,1.9942,23.9504,93.1921,86.2951,74.0285,78.9537
12,sub-0035_1_ses-1_run-1,31.25,1,0.70624,0.55290,0.27722,0.45968,0.45968,2.573,21.0,9.3348,1.6970,2.0326,1.8642,1.9049,24.3174,104.1033,95.0303,67.9324,79.9913
13,sub-0036_1_ses-1_run-1,44.57,1,0.78839,0.66709,0.41974,0.42043,0.42043,2.687,27.0,9.4991,1.6870,1.8951,1.9739,1.8552,27.5337,94.3162,89.4557,68.0983,79.6752
14,sub-0037_1_ses-1_run-1,46.06,0,0.65670,0.57055,0.37825,0.40903,0.40903,2.759,25.0,5.6076,2.1001,2.0220,2.5060,2.4160,23.8801,69.8487,60.6050,41.0306,50.2663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,sub-1139_1_ses-1_run-1,44.44,1,0.69288,0.61756,0.30975,0.42765,0.42765,6.074,23.0,8.2863,1.9019,1.8616,1.6780,1.9797,25.6606,94.1454,81.4243,57.0763,70.0432
512,sub-1149_1_ses-1_run-1,31.61,1,0.74811,0.57229,0.27552,0.46876,0.46876,1.642,19.0,9.0315,2.0425,2.0859,1.4591,1.9126,19.7793,70.8518,63.2505,45.3822,55.2508
513,sub-1152_1_ses-1_run-1,29.42,1,0.82354,0.59415,0.36183,0.46279,0.46279,0.867,15.0,8.1299,2.2361,2.3858,1.8466,2.3558,27.6770,118.4624,104.2936,75.2855,93.1837
519,sub-1159_1_ses-1_run-1,36.76,0,0.74380,0.52266,0.30226,0.47414,0.47414,1.144,18.0,7.6305,1.6827,1.9455,1.9445,1.9204,26.7737,90.4672,82.6515,72.8932,76.7546


In [64]:
top_ml_matrix = TOP_new.drop('participant_id', axis=1) 

X_top = top_ml_matrix.drop('age', axis =1)

X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [65]:
X_top_test = X_top
y_top_test = y_top

In [66]:
y_top_pred = linr.predict(X_top_test)

In [67]:
print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score Linear regression: 0.549
Explained variance score: 0.592
The mean absolute error: 5.005


In [68]:
data= [[
    'linear regression',
    'unharm_mixed_linr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    linr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
linr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,unharm_mixed_linr.sav,5.005188,0.549408,0.592446


In [69]:
linr_y_test = y_top_test
linr_y_pred = y_top_pred
linr_compare = pd.DataFrame(
    {'y_test_real_age': linr_y_test,
     'linr_y_pred_age': linr_y_pred,
    })
linr_compare = linr_compare.reset_index()
#linr_compare

In [70]:
y_top_pred = llreg.predict(X_top_test)

In [71]:
print('R2 score Lasso linear regression: %.3f' % llreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score Lasso linear regression: 0.530
Explained variance score: 0.575
The mean absolute error: 5.103


In [72]:
data= [[
    'lasso regression',
    'unharm_mixed_llregr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    llreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
llreg_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso regression,unharm_mixed_llregr.sav,5.102596,0.529979,0.574695


In [73]:
llreg_y_test = y_top_test
llreg_y_pred = y_top_pred
llreg_compare = pd.DataFrame(
    {'y_test_real_age': llreg_y_test,
     'linr_y_pred_age': llreg_y_pred,
    })
llreg_compare = llreg_compare.reset_index()
#llreg_compare

In [74]:
y_top_pred = dtree.predict(X_top_test)

In [75]:
print('R2 score decision tree regression: %.3f' % dtree.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score decision tree regression: 0.079
Explained variance score: 0.195
The mean absolute error: 6.821


In [76]:
data= [[
    'decision tree',
    'unharm_mixed_dtree.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    dtree.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
dtree_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,decision tree,unharm_mixed_dtree.sav,6.820605,0.079168,0.19454


In [77]:
dtree_y_test = y_top_test
dtree_y_pred = y_top_pred
dtree_compare = pd.DataFrame(
    {'y_test_real_age': dtree_y_test,
     'linr_y_pred_age': dtree_y_pred,
    })
dtree_compare = linr_compare.reset_index()
#dtree_compare

In [78]:
y_top_pred = eregr.predict(X_top_test)

In [79]:
print('R2 score ElasticnetCV regression: %.3f' % eregr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score ElasticnetCV regression: 0.085
Explained variance score: 0.171
The mean absolute error: 7.100


In [80]:
data= [[
    'elasticnetCV',
    'unharm_mixed_eregr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    linr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
eregr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,elasticnetCV,unharm_mixed_eregr.sav,7.099925,0.549408,0.170972


In [81]:
eregr_y_test = y_top_test
eregr_y_pred = y_top_pred
eregr_compare = pd.DataFrame(
    {'y_test_real_age': eregr_y_test,
     'linr_y_pred_age': eregr_y_pred,
    })
eregr_compare = linr_compare.reset_index()
#eregr_compare

In [82]:
y_top_pred = svr_p2.predict(X_top_test)

In [83]:
print('R2 score SVR poly 2 regression: %.3f' % svr_p2.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score SVR poly 2 regression: -0.589
Explained variance score: 0.146
The mean absolute error: 9.514


In [84]:
data= [[
    'SVR polynom degree 2',
    'unharm_mixed_svrp2.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    svr_p2.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
svrp2_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#svrp2_results_top

In [85]:
svrp2_y_test = y_top_test
svrp2_y_pred = y_top_pred
svrp2_compare = pd.DataFrame(
    {'y_test_real_age': svrp2_y_test,
     'linr_y_pred_age': svrp2_y_pred,
    })
svrp2_compare = linr_compare.reset_index()
#svrp2_compare

In [86]:
y_top_pred = etreg.predict(X_top_test)

In [87]:
print('R2 score Extra trees: %.3f' % etreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score Extra trees: 0.573
Explained variance score: 0.646
The mean absolute error: 4.709


In [88]:
data= [[
    'extra trees',
    'unharm_mixed_ereg.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    etreg.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
etreg_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,extra trees,unharm_mixed_ereg.sav,4.709006,0.57338,0.645582


In [89]:
etreg_y_test = y_top_test
etreg_y_pred = y_top_pred
etreg_compare = pd.DataFrame(
    {'y_test_real_age': etreg_y_test,
     'linr_y_pred_age': etreg_y_pred,
    })
etreg_compare = linr_compare.reset_index()
#etreg_compare

In [90]:
y_top_pred = regr.predict(X_top_test)

In [91]:
print('R2 score MLP regression: %.3f' % regr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score MLP regression: 0.252
Explained variance score: 0.317
The mean absolute error: 6.392


In [92]:
data= [[
    'multilayered percentron',
    'unharm_mixed_regr.sav',
    mean_absolute_error(y_top_test, y_top_pred),
    regr.score(X_top_test,y_top_test),
    metrics.explained_variance_score(y_top_test, y_top_pred)]]
regr_results_top = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
#regr_results_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,multilayered percentron,unharm_mixed_regr.sav,6.391926,0.252078,0.317296


In [93]:
regr_y_test = y_top_test
regr_y_pred = y_top_pred
regr_compare = pd.DataFrame(
    {'y_test_real_age': regr_y_test,
     'linr_y_pred_age': regr_y_pred,
    })
regr_compare = linr_compare.reset_index()
#regr_compare

In [94]:
# Compile results of mixed on TOP

In [95]:
mixed_based_unharmonized_on_top =pd.concat([linr_results_top,
                   llreg_results_top,
                   dtree_results_top,
                   regr_results_top,
                   svrp2_results_top,
                   eregr_results_top,
                  etreg_results_top],
                  axis=0)
mixed_based_unharmonized_on_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,unharm_mixed_linr.sav,5.005188,0.549408,0.592446
0,lasso regression,unharm_mixed_llregr.sav,5.102596,0.529979,0.574695
0,decision tree,unharm_mixed_dtree.sav,6.820605,0.079168,0.19454
0,multilayered percentron,unharm_mixed_regr.sav,6.391926,0.252078,0.317296
0,SVR polynom degree 2,unharm_mixed_svrp2.sav,9.513721,-0.588973,0.145511
0,elasticnetCV,unharm_mixed_eregr.sav,7.099925,0.549408,0.170972
0,extra trees,unharm_mixed_ereg.sav,4.709006,0.57338,0.645582


In [96]:
data_frames1 = [linr_compare, llreg_compare, dtree_compare,]# regr_compare, ]#etreg_compare, svrp2_compare,]
real_versus_projected_y1 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames1)
#real_versus_projected_y1
data_frames2 = [eregr_compare, svrp2_compare, etreg_compare,]
real_versus_projected_y2 = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames2)
#real_versus_projected_y2
real_versus_projected_y2 = sep.drop_y(real_versus_projected_y2 )
real_versus_projected_y1 = sep.drop_y(real_versus_projected_y1 )
#real_versus_projected_y1

In [97]:
data_frames3 = [real_versus_projected_y1, real_versus_projected_y2,]
real_versus_projected_y3_mixed_on_top = reduce(lambda  left,right: pd.merge(left,right,on=["index"],
                                            how='outer'), data_frames3)
real_versus_projected_y3_mixed_on_top.head(3)

Unnamed: 0,index,y_test_real_age_x_x,linr_y_pred_age_x_x,level_0_x,y_test_real_age_x,linr_y_pred_age_x,level_0_x.1,y_test_real_age_x_y,linr_y_pred_age_x_y,level_0_y,y_test_real_age_y,linr_y_pred_age_y
0,0,21.97,34.954143,0,21.97,34.954143,0,21.97,34.954143,0,21.97,34.954143
1,1,29.21,34.303296,1,29.21,34.303296,1,29.21,34.303296,1,29.21,34.303296
2,2,31.25,34.218767,2,31.25,34.218767,2,31.25,34.218767,2,31.25,34.218767


## Save off csvs of results 
optional, you must uncomment

In [98]:
## optionally save of csvs of algorithms and results
#mixed_based_unharmonized_on_top.to_csv('mixed_based_unharmonized_on_top.csv')
#real_versus_projected_y3_mixed_on_top.to_csv('real_versus_projected_y3_mixed_on_top.csv')

In [99]:
StrokeMRI_subjects = set(StrokeMRI.participant_id)
#StrokeMRI_subjects

In [100]:
# take trained subjects out of top subjects
# we can use set math here
new_mri=(trained_subjects^StrokeMRI_subjects)&StrokeMRI_subjects

In [110]:
StrokeMRI_new = StrokeMRI[StrokeMRI['participant_id'].isin(list(new_mri))]
#StrokeMRI_new

In [111]:
strokemri_ml_matrix = StrokeMRI_new.drop('participant_id', axis=1) 
X_mri = strokemri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = strokemri_ml_matrix['age'].values
y_mri=y_mri.astype('float')

In [112]:
X_mri_test = X_mri
y_mri_test = y_mri

In [104]:
y_mri_pred = linr.predict(X_mri_test)

In [104]:
print('R2 score Linear regression: %.3f' % linr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Linear regression: 0.792
Explained variance score: 0.812
The mean absolute error: 5.507


In [105]:
y_mri_pred = llreg.predict(X_mri_test)

In [105]:
print('R2 score Lasso-linear regression: %.3f' % llreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Lasso-linear regression: 0.785
Explained variance score: 0.808
The mean absolute error: 5.626


In [106]:
y_mri_pred = dtree.predict(X_mri_test)

In [106]:
print('R2 score decision tree regression: %.3f' % dtree.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score decision tree regression: 0.705
Explained variance score: 0.732
The mean absolute error: 6.259


In [107]:
y_mri_pred = regr.predict(X_mri_test)

In [107]:
print('R2 score Lasso linear regression: %.3f' % regr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Lasso linear regression: 0.666
Explained variance score: 0.695
The mean absolute error: 7.234


In [108]:
y_mri_pred = svr_p2.predict(X_mri_test)

In [108]:
print('R2 score SVR poly2 regression: %.3f' % svr_p2.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score SVR poly2 regression: 0.065
Explained variance score: 0.419
The mean absolute error: 12.472


In [109]:
y_mri_pred = etreg.predict(X_mri_test)

In [109]:
print('R2 score Extra tree regression: %.3f' % etreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Extra tree regression: 0.840
Explained variance score: 0.852
The mean absolute error: 4.779


# Running mixed model over SABRE dataset