# ML testing: experiment #6

This notebook involves testing for the MRI conference abstract. It shows models based on the mixed dataset (TOP + StrokeMRI) after harmonization with Neuroharmony.

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
import ipywidgets as widgets
import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
# Folders holding the harmonized tables (both datasets share one results folder).
filepath_mri = '../open_work/internal_results/harmonized_pvc2s/'
filepath_top = '../open_work/internal_results/harmonized_pvc2s/'

# Full paths to the harmonized StrokeMRI and TOP CSV files.
filename_mri = os.path.join(filepath_mri, 'neuro_harm_mri.csv')
filename_top = os.path.join(filepath_top, 'neuro_harm_top.csv')

In [3]:
# Load both harmonized tables into DataFrames (one per source dataset).
StrokeMRI = pd.read_csv(filepath_or_buffer=filename_mri)
TOP = pd.read_csv(filepath_or_buffer=filename_top)

In [4]:
# The subject identifier column is read in as "Unnamed: 0"; give it a meaningful name.
TOP = TOP.rename({"Unnamed: 0": "participant_id"}, axis="columns")
TOP.head(n=3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,M,0.686888,0.519514,0.357149,0.438598,0.438598,0.18643,28.325901,7.733488,2.446866,2.358709,3.021907,2.511524,23.045935,76.893372,65.19913,50.374181,57.699251
1,sub-0002_1_ses-1_run-1,38.3,F,0.696452,0.61874,0.297726,0.427895,0.427895,8.315378,29.731818,8.122519,1.830924,2.452607,1.609489,1.988648,24.215803,83.688537,74.211362,61.291006,67.849267
2,sub-0019_1_ses-1_run-1,32.3,M,0.681526,0.524535,0.379375,0.429206,0.429206,-3.405104,7.743664,7.874985,2.076195,1.857048,2.348879,2.207727,27.096383,88.954234,80.633545,59.213314,69.173143


In [5]:
# Same renaming for StrokeMRI so both tables share the participant_id column name.
StrokeMRI = StrokeMRI.rename({"Unnamed: 0": "participant_id"}, axis="columns")
StrokeMRI.head(n=3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,F,0.67657,0.502153,0.274844,0.464791,0.464791,-0.741153,21.019972,8.94017,1.67901,1.972876,1.677336,1.910629,28.288935,112.407756,99.71138,79.901797,93.408254
1,sub-59083_1_ses-1_run-1,66.367123,F,0.636007,0.493854,0.368881,0.423664,0.423664,8.89553,21.780841,5.2307,1.881776,1.733382,2.456991,1.88187,23.811992,78.760008,67.858413,69.958931,71.725258
2,sub-59085_1_ses-1_run-1,55.838356,F,0.647562,0.544655,0.295151,0.436396,0.436396,2.935053,25.585188,6.401311,1.75289,1.979178,1.987917,1.942573,30.412874,85.100774,77.584112,64.082736,68.226467


In [6]:
# Encode sex numerically (F -> 0, M -> 1) so it can be used as a numeric feature.
# NOTE: re-running this cell on already-mapped data turns the column into NaN,
# because 0/1 are not keys of the mapping.
sex_mapping = dict(F=0, M=1)
TOP = TOP.assign(sex=TOP['sex'].map(sex_mapping))
TOP.head(n=3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.686888,0.519514,0.357149,0.438598,0.438598,0.18643,28.325901,7.733488,2.446866,2.358709,3.021907,2.511524,23.045935,76.893372,65.19913,50.374181,57.699251
1,sub-0002_1_ses-1_run-1,38.3,0,0.696452,0.61874,0.297726,0.427895,0.427895,8.315378,29.731818,8.122519,1.830924,2.452607,1.609489,1.988648,24.215803,83.688537,74.211362,61.291006,67.849267
2,sub-0019_1_ses-1_run-1,32.3,1,0.681526,0.524535,0.379375,0.429206,0.429206,-3.405104,7.743664,7.874985,2.076195,1.857048,2.348879,2.207727,27.096383,88.954234,80.633545,59.213314,69.173143


In [7]:
# Apply the same F/M -> 0/1 encoding to StrokeMRI (not idempotent: run once per load).
StrokeMRI = StrokeMRI.assign(sex=StrokeMRI['sex'].map(sex_mapping))
StrokeMRI.head(n=3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.67657,0.502153,0.274844,0.464791,0.464791,-0.741153,21.019972,8.94017,1.67901,1.972876,1.677336,1.910629,28.288935,112.407756,99.71138,79.901797,93.408254
1,sub-59083_1_ses-1_run-1,66.367123,0,0.636007,0.493854,0.368881,0.423664,0.423664,8.89553,21.780841,5.2307,1.881776,1.733382,2.456991,1.88187,23.811992,78.760008,67.858413,69.958931,71.725258
2,sub-59085_1_ses-1_run-1,55.838356,0,0.647562,0.544655,0.295151,0.436396,0.436396,2.935053,25.585188,6.401311,1.75289,1.979178,1.987917,1.942573,30.412874,85.100774,77.584112,64.082736,68.226467


In [8]:
# Sanity check: no participant ID should occur in both datasets.
# An empty set printed below means the two cohorts do not overlap.
topers = set(TOP['participant_id'])
strokers = set(StrokeMRI['participant_id'])
z = strokers & topers
print(z)

set()


In [9]:
# Stack TOP on top of StrokeMRI to form the mixed dataset.
# Row labels from each source frame are kept as they are (no re-indexing).
mixed_data = pd.concat(objs=[TOP, StrokeMRI], sort=False)


## Build ML models

# Keep the patient ID until right before the model is fed, then use it as the key to track which subject went where

In [10]:
# participant_id is deliberately kept in the matrix so rows can be traced after the split.
ml_matrix = mixed_data

# Features: every column except the target, as a NumPy array.
X = ml_matrix.drop(columns='age').to_numpy()

# Target: age, cast to float.
y = ml_matrix['age'].to_numpy().astype('float')


In [11]:
# Hold out 25% of the mixed data for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [12]:
# Drop column 0 (the participant ID, see the X_train preview further down)
# and cast the remaining features to float for the estimators.
X_train_cut = X_train[:, 1:].astype('float')
X_train_cut.shape

(780, 18)

In [13]:
# Same treatment for the test split: strip the ID column, keep float features.
X_test_cut = X_test[:, 1:].astype('float')
X_test_cut.shape

(261, 18)

In [14]:
# svr_poly = SVR(kernel="poly", C=100, gamma="auto", degree=2, epsilon=0.1, coef0=1)
# svr_poly.fit(X_train_cut, y_train)

In [15]:
# y_pred = svr_poly.predict(X_test_cut)

In [16]:
# print('R2 score SV polynomial regression: %.3f' % svr_poly.score(X_test_cut,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [17]:

# Multi-layer perceptron regressor; seeded so the fit is repeatable.
regr = MLPRegressor(
    random_state=1,
    max_iter=700,
)
regr.fit(X=X_train_cut, y=y_train)

In [18]:
# MLP age predictions for the held-out mixed test split.
y_pred = regr.predict(X=X_test_cut)

In [19]:
# Report R2, explained variance and MAE for the MLP on the mixed test split.
print(
    'R2 score neural network mlp regression: %.3f' % regr.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred),
    'MAE: % .3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

R2 score neural network mlp regression: 0.295
Explained variance score: 0.298
MAE:  12.390


In [20]:
# Ordinary least-squares baseline on the mixed training split.
linr = LinearRegression()
linr.fit(X=X_train_cut, y=y_train)

In [21]:
# Linear-regression age predictions for the held-out mixed test split.
y_pred = linr.predict(X=X_test_cut)

In [22]:
# Report R2, explained variance and MAE for linear regression on the mixed test split.
print(
    'R2 score Linear regression: %.3f' % linr.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred),
    'MAE: % .3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

R2 score Linear regression: 0.430
Explained variance score: 0.431
MAE:  11.832


In [23]:
# Lasso fitted with LARS; alpha sets the strength of the L1 penalty.
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X=X_train_cut, y=y_train)

In [24]:
# Predict with the Lasso model and report its metrics on the mixed test split.
y_pred = llreg.predict(X=X_test_cut)
print(
    'R2 score Lasso regression: %.3f' % llreg.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

R2 score Lasso regression: 0.432
Explained variance score: 0.432
The mean absolute error: 11.810


In [25]:
# Decision-tree regressor on the mixed training split.
# scikit-learn randomly permutes the features at every split, so without a fixed
# random_state the fitted tree (and every score reported from it) can differ
# between runs. Seed it, as is already done for the MLP and the train/test split.
dtree = tree.DecisionTreeRegressor(random_state=1)
dtree.fit(X_train_cut, y_train)

In [26]:
# Predict with the decision tree and report its metrics on the mixed test split.
y_pred = dtree.predict(X=X_test_cut)
print(
    'R2 score dtree regression: %.3f' % dtree.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

R2 score dtree regression: 0.521
Explained variance score: 0.524
The mean absolute error: 8.877


## Save off models

In [27]:
# Make sure the output folder for saved models exists.
# exist_ok=True creates the folder when missing and is a no-op otherwise, which
# avoids the check-then-create race of a separate os.path.exists() test.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [28]:
# Persist the fitted linear, Lasso and decision-tree models into the model folder.
# (The MLP `regr` is not saved here.)
joblib.dump(linr, os.path.join(model_folder, 'neuroharm_mixed_linr.sav'))
joblib.dump(llreg, os.path.join(model_folder, 'neuroharm_mixed_lassor.sav'))
joblib.dump(dtree, os.path.join(model_folder, 'neuroharm_mixed_dtree.sav'))


['../result_models/neuroharm_mixed_dtree.sav']

## Run models on other datasets (TOP, StrokeMRI)
but without re-running the training data

### Here we check that no rows are duplicated once patient IDs were pulled
(if not we can map them back)

In [29]:
# Wrap the training array (ID column included) in a DataFrame and count
# fully duplicated rows; 0 means every training row is unique.
X_train_pandas = pd.DataFrame(data=X_train)
X_train_pandas.duplicated().sum()

0

top_ml_matrix
needs to be mapped to top rows in X_train,
we will use MD5 hashes

now we need to make a dataframe of TOP minus what is in X_train

In [30]:
# Preview the training rows; column 0 carries the participant ID.
X_train_pandas.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,sub-59168_2_ses-2_run-1,0,0.605017,0.449813,0.288131,0.448148,0.448148,0.302075,23.30258,6.942208,2.252019,1.954243,1.937277,2.007259,13.014561,63.513005,56.664025,43.755446,51.329217
1,sub-1109_1_ses-1_run-1,0,0.583783,0.472448,0.284729,0.436545,0.436545,10.668121,18.505143,5.624757,1.774006,1.663407,1.827674,1.632012,22.58214,63.815685,55.800519,53.608393,54.711491
2,sub-59292_1_ses-1_run-1,0,0.622192,0.533153,0.301958,0.427729,0.427729,1.550339,17.976494,7.305474,1.602761,1.941455,1.802546,1.787806,39.764443,124.840078,109.945685,98.692307,103.927888


In [31]:
#X_train_pandas[0]

In [32]:
# IDs of every subject used for training (column 0 of the training frame).
trained_subjects = set(X_train_pandas.loc[:, 0])

In [33]:
# IDs of every subject in the TOP dataset.
TOP_subjects = set(TOP['participant_id'])

In [34]:
# TOP subjects that were never seen during training.
# (A ^ B) & B is exactly the set difference B - A, so write it directly.
new_top = TOP_subjects - trained_subjects
print(len(new_top))

132


Filter TOP down to only the rows whose participant IDs are in the `new_top` set

In [35]:
# Keep only the TOP rows belonging to subjects outside the training set.
TOP_new = TOP.loc[TOP['participant_id'].isin(new_top)]
TOP_new

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
3,sub-0020_1_ses-1_run-1,21.97,0,0.640511,0.425481,0.301480,0.473023,0.473023,5.456875,29.731818,4.658326,1.970725,1.907052,1.663744,1.871019,26.919418,95.144721,85.064305,64.854624,74.297067
10,sub-0033_1_ses-1_run-1,29.21,1,0.563881,0.423140,0.270977,0.448665,0.448665,-1.977453,9.614776,6.754611,1.720233,2.239393,2.119197,1.999064,24.418236,88.162588,80.559617,69.174795,73.376781
12,sub-0035_1_ses-1_run-1,31.25,1,0.675241,0.544896,0.306137,0.439577,0.439577,2.843270,22.712563,8.425626,1.707826,2.037971,1.960564,1.894440,24.690736,97.831056,88.025214,63.860663,74.267851
13,sub-0036_1_ses-1_run-1,44.57,1,0.761285,0.661437,0.483895,0.395428,0.395428,3.208185,33.939238,8.624156,1.695036,1.884788,2.102275,1.836211,27.078860,89.158659,83.260845,64.005282,73.996391
14,sub-0037_1_ses-1_run-1,46.06,0,0.626140,0.564250,0.449292,0.380552,0.380552,11.932521,33.474043,3.932838,2.270536,2.032718,2.782184,2.506732,24.468255,68.138725,59.313927,41.273898,49.524292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,sub-1139_1_ses-1_run-1,44.44,1,0.661248,0.610887,0.346710,0.403549,0.403549,14.050010,26.454788,7.158680,1.969905,1.847467,1.720031,1.982076,25.688071,89.007312,76.396756,54.397112,65.724629
512,sub-1149_1_ses-1_run-1,31.61,1,0.719096,0.564685,0.304016,0.449790,0.449790,-0.136872,18.970339,8.059136,2.149741,2.097351,1.437256,1.903461,21.321166,68.366740,60.864397,44.203055,53.021223
513,sub-1152_1_ses-1_run-1,29.42,1,0.798102,0.586995,0.411667,0.443075,0.443075,-2.617655,11.485889,6.969696,2.397366,2.431457,1.937829,2.422716,27.185261,110.554725,95.942155,70.270555,85.597210
519,sub-1159_1_ses-1_run-1,36.76,0,0.717369,0.515374,0.354513,0.453789,0.453789,6.762887,20.376256,6.377191,1.736658,1.947493,2.056839,1.926085,26.616773,86.408879,78.156115,69.049372,72.271894


In [36]:
# Numeric matrix for the held-out TOP subjects (ID column removed).
top_ml_matrix = TOP_new.drop(columns='participant_id')

# Features: everything except age, as a float array.
X_top = top_ml_matrix.drop(columns='age').to_numpy().astype('float')

# Target: age as float.
y_top = top_ml_matrix['age'].to_numpy().astype('float')

In [37]:
# X_top_train, X_top_test, y_top_train, y_top_test = train_test_split(
#     X_top, y_top, test_size=0.99, random_state=42)

In [38]:
# All held-out TOP rows are used for testing; no further split is made.
X_top_test, y_top_test = X_top, y_top

In [39]:
# Linear regression evaluated on the held-out TOP subjects.
y_top_pred = linr.predict(X=X_top_test)
print(
    'R2 score Linear regression: %.3f' % linr.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_top_test, y_pred=y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_true=y_top_test, y_pred=y_top_pred),
    sep='\n',
)

R2 score Linear regression: -0.879
Explained variance score: 0.592
The mean absolute error: 11.343


In [40]:
# Lasso (LARS) model evaluated on the held-out TOP subjects.
y_top_pred = llreg.predict(X=X_top_test)
print(
    'R2 score Lasso linear regression: %.3f' % llreg.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_top_test, y_pred=y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_true=y_top_test, y_pred=y_top_pred),
    sep='\n',
)

R2 score Lasso linear regression: -0.865
Explained variance score: 0.579
The mean absolute error: 11.243


In [41]:
# Decision tree evaluated on the held-out TOP subjects.
# The R2 label names the model actually being scored (dtree), so the printed
# results cannot be mistaken for the Lasso results above.
y_top_pred = dtree.predict(X_top_test)
print('R2 score decision tree regression: %.3f' % dtree.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score Lasso linear regression: -0.865
Explained variance score: -0.619
The mean absolute error: 9.146


In [42]:
# MLP regressor evaluated on the held-out TOP subjects.
# The R2 label names the model actually being scored (regr, the MLP), matching
# the label used when the MLP was scored on the mixed test split.
y_top_pred = regr.predict(X_top_test)
print('R2 score neural network mlp regression: %.3f' % regr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score Lasso linear regression: -1.420
Explained variance score: -0.012
The mean absolute error: 12.163


In [43]:
# IDs of every subject in the StrokeMRI dataset.
StrokeMRI_subjects = set(StrokeMRI['participant_id'])

In [44]:
# StrokeMRI subjects that were never seen during training.
# (A ^ B) & B is exactly the set difference B - A, so write it directly.
new_mri = StrokeMRI_subjects - trained_subjects
print(len(new_mri))

129


In [45]:
# Keep only the StrokeMRI rows belonging to subjects outside the training set.
StrokeMRI_new = StrokeMRI.loc[StrokeMRI['participant_id'].isin(new_mri)]
StrokeMRI_new

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
1,sub-59083_1_ses-1_run-1,66.367123,0,0.636007,0.493854,0.368881,0.423664,0.423664,8.895530,21.780841,5.230700,1.881776,1.733382,2.456991,1.881870,23.811992,78.760008,67.858413,69.958931,71.725258
6,sub-59089_1_ses-1_run-1,58.419178,0,0.626644,0.435998,0.269895,0.467296,0.467296,5.581747,50.693879,4.139946,1.809928,1.821890,1.889586,1.853376,38.734255,91.486700,82.825130,70.367017,75.449422
9,sub-59090_2_ses-2_run-1,74.610959,0,0.563621,0.443640,0.384711,0.402810,0.402810,3.111932,24.063449,5.245837,1.716077,1.861623,1.783419,1.723828,58.026723,133.025386,112.393036,108.079982,111.291044
10,sub-59091_1_ses-1_run-1,74.063014,1,0.612484,0.494760,0.566308,0.366976,0.366976,36.358214,17.354338,3.402415,1.811599,1.778184,1.857674,1.584919,41.890708,74.569836,73.527304,68.132662,70.282666
11,sub-59092_1_ses-1_run-1,50.668493,0,0.662956,0.509168,0.284715,0.454709,0.454709,0.706371,18.737363,8.483565,1.751789,2.010600,2.068385,1.911779,15.725192,89.671440,79.734944,60.205009,68.324521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,sub-59421_1_ses-1_run-1,73.868493,1,0.603016,0.546355,0.331890,0.409083,0.409083,3.468017,21.158685,7.323267,1.982546,2.134685,1.793721,1.933302,19.306673,62.429296,55.348937,45.185982,50.503463
491,sub-59425_1_ses-1_run-1,79.624658,0,0.548935,0.427915,0.377477,0.402592,0.402592,22.713066,30.150405,3.636628,2.329791,1.906380,1.768084,1.664009,20.435037,60.320924,49.909828,47.560408,49.658677
499,sub-59430_2_ses-2_run-1,80.528767,1,0.639960,0.464681,0.576753,0.380334,0.380334,15.791823,56.919548,4.458982,2.414228,2.182364,2.353542,2.456893,34.790202,81.562730,78.162646,50.150927,60.305079
501,sub-59431_2_ses-2_run-1,69.221918,0,0.644384,0.507394,0.316144,0.438637,0.438637,1.342415,18.737363,7.244147,1.905387,1.982284,1.906185,2.075484,21.630727,80.526579,77.460373,56.040592,65.024965


In [46]:
# Numeric matrix for the held-out StrokeMRI subjects (ID column removed).
strokemri_ml_matrix = StrokeMRI_new.drop(columns='participant_id')

# Features: everything except age, as a float array.
X_mri = strokemri_ml_matrix.drop(columns='age').to_numpy().astype('float')

# Target: age as float.
y_mri = strokemri_ml_matrix['age'].to_numpy().astype('float')

In [47]:
# All held-out StrokeMRI rows are used for testing; no further split is made.
X_mri_test, y_mri_test = X_mri, y_mri

In [48]:
# Linear regression evaluated on the held-out StrokeMRI subjects.
y_mri_pred = linr.predict(X=X_mri_test)
print(
    'R2 score Linear regression: %.3f' % linr.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_mri_test, y_pred=y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_true=y_mri_test, y_pred=y_mri_pred),
    sep='\n',
)

R2 score Linear regression: 0.158
Explained variance score: 0.786
The mean absolute error: 12.332


In [49]:
# Lasso (LARS) model evaluated on the held-out StrokeMRI subjects.
y_mri_pred = llreg.predict(X=X_mri_test)
print(
    'R2 score Lasso-linear regression: %.3f' % llreg.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_mri_test, y_pred=y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_true=y_mri_test, y_pred=y_mri_pred),
    sep='\n',
)

R2 score Lasso-linear regression: 0.156
Explained variance score: 0.787
The mean absolute error: 12.390


In [50]:
# Decision tree evaluated on the held-out StrokeMRI subjects.
y_mri_pred = dtree.predict(X=X_mri_test)
print(
    'R2 score decision tree regression: %.3f' % dtree.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_mri_test, y_pred=y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_true=y_mri_test, y_pred=y_mri_pred),
    sep='\n',
)

R2 score decision tree regression: 0.403
Explained variance score: 0.444
The mean absolute error: 8.601


In [51]:
# MLP regressor evaluated on the held-out StrokeMRI subjects.
# The R2 label names the model actually being scored (regr, the MLP), matching
# the label used when the MLP was scored on the mixed test split.
y_mri_pred = regr.predict(X_mri_test)
print('R2 score neural network mlp regression: %.3f' % regr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Lasso linear regression: -0.006
Explained variance score: 0.371
The mean absolute error: 12.622
