# ML testing: experiment #5

This notebook involves testing for the MRI conference abstract. This notebook shows StrokeMRI based models, after harmonization with TOP with neuroharmony

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
import ipywidgets as widgets
import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
filepath_mri = '../open_work/internal_results/harmonized_pvc2s/' 
filename_mri = os.path.join(filepath_mri,'neuro_harm_mri.csv') 

filepath_top = '../open_work/internal_results/harmonized_pvc2s/' 
filename_top = os.path.join(filepath_top,'neuro_harm_top.csv') 

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [4]:
TOP = TOP.rename(columns={"Unnamed: 0": "participant_id"})
TOP.head(3)#TOP

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,M,0.686888,0.519514,0.357149,0.438598,0.438598,0.18643,28.325901,7.733488,2.446866,2.358709,3.021907,2.511524,23.045935,76.893372,65.19913,50.374181,57.699251
1,sub-0002_1_ses-1_run-1,38.3,F,0.696452,0.61874,0.297726,0.427895,0.427895,8.315378,29.731818,8.122519,1.830924,2.452607,1.609489,1.988648,24.215803,83.688537,74.211362,61.291006,67.849267
2,sub-0019_1_ses-1_run-1,32.3,M,0.681526,0.524535,0.379375,0.429206,0.429206,-3.405104,7.743664,7.874985,2.076195,1.857048,2.348879,2.207727,27.096383,88.954234,80.633545,59.213314,69.173143


In [6]:
StrokeMRI = StrokeMRI.rename(columns={"Unnamed: 0": "participant_id"})
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,F,0.67657,0.502153,0.274844,0.464791,0.464791,-0.741153,21.019972,8.94017,1.67901,1.972876,1.677336,1.910629,28.288935,112.407756,99.71138,79.901797,93.408254
1,sub-59083_1_ses-1_run-1,66.367123,F,0.636007,0.493854,0.368881,0.423664,0.423664,8.89553,21.780841,5.2307,1.881776,1.733382,2.456991,1.88187,23.811992,78.760008,67.858413,69.958931,71.725258
2,sub-59085_1_ses-1_run-1,55.838356,F,0.647562,0.544655,0.295151,0.436396,0.436396,2.935053,25.585188,6.401311,1.75289,1.979178,1.987917,1.942573,30.412874,85.100774,77.584112,64.082736,68.226467


In [7]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.686888,0.519514,0.357149,0.438598,0.438598,0.18643,28.325901,7.733488,2.446866,2.358709,3.021907,2.511524,23.045935,76.893372,65.19913,50.374181,57.699251
1,sub-0002_1_ses-1_run-1,38.3,0,0.696452,0.61874,0.297726,0.427895,0.427895,8.315378,29.731818,8.122519,1.830924,2.452607,1.609489,1.988648,24.215803,83.688537,74.211362,61.291006,67.849267
2,sub-0019_1_ses-1_run-1,32.3,1,0.681526,0.524535,0.379375,0.429206,0.429206,-3.405104,7.743664,7.874985,2.076195,1.857048,2.348879,2.207727,27.096383,88.954234,80.633545,59.213314,69.173143


In [8]:
StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.67657,0.502153,0.274844,0.464791,0.464791,-0.741153,21.019972,8.94017,1.67901,1.972876,1.677336,1.910629,28.288935,112.407756,99.71138,79.901797,93.408254
1,sub-59083_1_ses-1_run-1,66.367123,0,0.636007,0.493854,0.368881,0.423664,0.423664,8.89553,21.780841,5.2307,1.881776,1.733382,2.456991,1.88187,23.811992,78.760008,67.858413,69.958931,71.725258
2,sub-59085_1_ses-1_run-1,55.838356,0,0.647562,0.544655,0.295151,0.436396,0.436396,2.935053,25.585188,6.401311,1.75289,1.979178,1.987917,1.942573,30.412874,85.100774,77.584112,64.082736,68.226467


## Build ML models

In [9]:
ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [11]:
linr = LinearRegression()
linr.fit(X_train, y_train)

In [12]:
y_pred = linr.predict(X_test)

In [13]:
print('R2 score Linear regression: %.3f' % linr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score Linear regression: 0.824
Explained variance score: 0.824
MAE:  4.729


In [14]:
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X_train, y_train)

In [15]:
y_pred = llreg.predict(X_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score Lasso regression: 0.822
Explained variance score: 0.823
The mean absolute error: 4.706


In [16]:
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)

In [17]:
y_pred = dtree.predict(X_test)
print('R2 score dtree regression: %.3f' % dtree.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score dtree regression: 0.780
Explained variance score: 0.783
The mean absolute error: 5.043


In [18]:
regr = MLPRegressor(random_state=1, max_iter=900)
regr.fit(X_train, y_train)

In [19]:
y_pred = regr.predict(X_test)

In [20]:
print('R2 score neural network mlp regression: %.3f' % regr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score neural network mlp regression: 0.719
Explained variance score: 0.736
MAE:  5.942


In [21]:
svr_rbf = SVR(C=1.0, epsilon=0.2)
svr_rbf.fit(X_train, y_train)

In [22]:
y_pred = svr_rbf.predict(X_test)

In [23]:
print('R2 score SVR RBF regression: %.3f' % svr_rbf.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score SVR RBF regression: 0.297
Explained variance score: 0.355
MAE:  9.500


In [24]:
svr_p2 = SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2)
svr_p2.fit(X_train, y_train)

In [25]:
y_pred = svr_p2.predict(X_test)

In [26]:
print('R2 score SVR 2nd degree poly kernel regression: %.3f' % svr_p2.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score SVR 2nd degree poly kernel regression: 0.392
Explained variance score: 0.444
MAE:  8.739


## Save off models

In [27]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [38]:
joblib.dump(linr, ('../result_models/'+  'neuroharm_mri_linr.sav'))
joblib.dump(llreg, ('../result_models/'+ 'neuroharm_mri_lassor.sav'))
joblib.dump(dtree, ('../result_models/'+ 'neuroharm_mri_dtree.sav'))
joblib.dump(regr, ('../result_models/'+  'neuroharm_mri_regr.sav'))

['../result_models/neuroharm_mri_regr.sav']

In [39]:
joblib.dump(svr_p2, ('../result_models/'+ 'neuroharm_mri_svrp2.sav'))

['../result_models/neuroharm_mri_svrp2.sav']

## Run models on other dataset (TOP)

In [40]:
top_ml_matrix = TOP.drop('participant_id', axis=1)
X_top = top_ml_matrix.drop('age', axis =1)
X_top = X_top.values
X_top = X_top.astype('float')
y_top = top_ml_matrix['age'].values
y_top=y_top.astype('float')

In [41]:
X_top_test = X_top
y_top_test = y_top


In [42]:
y_top_pred = linr.predict(X_top_test)
print('R2 score Linear regression: %.3f' % linr.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score Linear regression: -5.130
Explained variance score: 0.582
The mean absolute error: 22.841


In [43]:
y_top_pred = llreg.predict(X_top_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score Lasso regression: -5.119
Explained variance score: 0.562
The mean absolute error: 22.771


In [44]:
y_top_pred = dtree.predict(X_top_test)
print('R2 score decision tree regression: %.3f' % dtree.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score decision tree regression: -6.562
Explained variance score: -0.036
The mean absolute error: 24.413


In [45]:
y_top_pred = regr.predict(X_top_test)
print('R2 score MLP regression: %.3f' % dtree.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score MLP regression: -6.562
Explained variance score: -0.630
The mean absolute error: 29.542


In [46]:
y_top_pred = svr_p2.predict(X_top_test)
print('R2 score SVR poly2 regression: %.3f' % svr_p2.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

R2 score SVR poly2 regression: -7.430
Explained variance score: -0.032
The mean absolute error: 26.284
