# ML testing: experiment #4

This notebook involves testing for the MRI conference abstract. This notebook shows TOP based models, but after harmonization with the StrokeMRI dataset on neuroharmony algorithm

### import libraries

In [2]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
#import ipywidgets as widgets
#import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

ModuleNotFoundError: No module named 'sklearn'

### import data

In [None]:
filepath_mri = '../open_work/internal_results/harmonized_pvc2s/' 
filename_mri = os.path.join(filepath_mri,'neuro_harm_mri.csv') 

filepath_top = '../open_work/internal_results/harmonized_pvc2s/' 
filename_top = os.path.join(filepath_top,'neuro_harm_top.csv') 

In [None]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [10]:
#TOP.head(3)

In [1]:
len(TOP)

NameError: name 'TOP' is not defined

In [9]:
#StrokeMRI.head(3)

In [7]:
TOP = TOP.rename(columns={"Unnamed: 0": "participant_id"})
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,M,0.686888,0.519514,0.357149,0.438598,0.438598,0.18643,28.325901,7.733488,2.446866,2.358709,3.021907,2.511524,23.045935,76.893372,65.19913,50.374181,57.699251
1,sub-0002_1_ses-1_run-1,38.3,F,0.696452,0.61874,0.297726,0.427895,0.427895,8.315378,29.731818,8.122519,1.830924,2.452607,1.609489,1.988648,24.215803,83.688537,74.211362,61.291006,67.849267
2,sub-0019_1_ses-1_run-1,32.3,M,0.681526,0.524535,0.379375,0.429206,0.429206,-3.405104,7.743664,7.874985,2.076195,1.857048,2.348879,2.207727,27.096383,88.954234,80.633545,59.213314,69.173143


In [8]:
StrokeMRI = StrokeMRI.rename(columns={"Unnamed: 0": "participant_id"})
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,F,0.67657,0.502153,0.274844,0.464791,0.464791,-0.741153,21.019972,8.94017,1.67901,1.972876,1.677336,1.910629,28.288935,112.407756,99.71138,79.901797,93.408254
1,sub-59083_1_ses-1_run-1,66.367123,F,0.636007,0.493854,0.368881,0.423664,0.423664,8.89553,21.780841,5.2307,1.881776,1.733382,2.456991,1.88187,23.811992,78.760008,67.858413,69.958931,71.725258
2,sub-59085_1_ses-1_run-1,55.838356,F,0.647562,0.544655,0.295151,0.436396,0.436396,2.935053,25.585188,6.401311,1.75289,1.979178,1.987917,1.942573,30.412874,85.100774,77.584112,64.082736,68.226467


In [11]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.686888,0.519514,0.357149,0.438598,0.438598,0.18643,28.325901,7.733488,2.446866,2.358709,3.021907,2.511524,23.045935,76.893372,65.19913,50.374181,57.699251
1,sub-0002_1_ses-1_run-1,38.3,0,0.696452,0.61874,0.297726,0.427895,0.427895,8.315378,29.731818,8.122519,1.830924,2.452607,1.609489,1.988648,24.215803,83.688537,74.211362,61.291006,67.849267
2,sub-0019_1_ses-1_run-1,32.3,1,0.681526,0.524535,0.379375,0.429206,0.429206,-3.405104,7.743664,7.874985,2.076195,1.857048,2.348879,2.207727,27.096383,88.954234,80.633545,59.213314,69.173143


In [12]:
StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.67657,0.502153,0.274844,0.464791,0.464791,-0.741153,21.019972,8.94017,1.67901,1.972876,1.677336,1.910629,28.288935,112.407756,99.71138,79.901797,93.408254
1,sub-59083_1_ses-1_run-1,66.367123,0,0.636007,0.493854,0.368881,0.423664,0.423664,8.89553,21.780841,5.2307,1.881776,1.733382,2.456991,1.88187,23.811992,78.760008,67.858413,69.958931,71.725258
2,sub-59085_1_ses-1_run-1,55.838356,0,0.647562,0.544655,0.295151,0.436396,0.436396,2.935053,25.585188,6.401311,1.75289,1.979178,1.987917,1.942573,30.412874,85.100774,77.584112,64.082736,68.226467


## Build ML models

In [13]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [15]:
linr = LinearRegression()
linr.fit(X_train, y_train)

In [16]:
y_pred = linr.predict(X_test)

In [17]:
print('R2 score Linear regression: %.3f' % linr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score Linear regression: 0.623
Explained variance score: 0.631
MAE:  4.675


In [18]:
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X_train, y_train)

In [19]:
y_pred = llreg.predict(X_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score Lasso regression: 0.611
Explained variance score: 0.620
The mean absolute error: 4.784


In [20]:
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)

In [21]:
y_pred = dtree.predict(X_test)
print('R2 score dtree regression: %.3f' % dtree.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score dtree regression: 0.298
Explained variance score: 0.300
The mean absolute error: 6.104


In [22]:
regr = MLPRegressor(random_state=1, max_iter=900)
regr.fit(X_train, y_train)

In [23]:
y_pred = regr.predict(X_test)

In [24]:
print('R2 score neural network mlp regression: %.3f' % regr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score neural network mlp regression: 0.394
Explained variance score: 0.399
MAE:  5.764


In [25]:
eregr = ElasticNetCV(cv=5, random_state=12)
eregr.fit(X_train, y_train)

In [26]:
y_pred = eregr.predict(X_test)
print('R2 score elasticnetcv regression: %.3f' % eregr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score elasticnetcv regression: 0.179
Explained variance score: 0.186
MAE:  7.009


In [27]:
etreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
etreg.fit(X_train, y_train)

In [28]:
y_pred = etreg.predict(X_test)
print('R2 score elasticnetcv regression: %.3f' % etreg.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score elasticnetcv regression: 0.657
Explained variance score: 0.662
MAE:  4.393


In [29]:
## SAve off models

In [30]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [31]:
joblib.dump(linr, ('../result_models/'+  'neuroharm_top_linr.sav'))
joblib.dump(llreg, ('../result_models/'+ 'neuroharm_top_lassor.sav'))
joblib.dump(dtree, ('../result_models/'+ 'neuroharm_top_dtree.sav'))
joblib.dump(regr, ('../result_models/'+  'neuroharm_top_regr.sav'))

['../result_models/neuroharm_top_regr.sav']

In [32]:
joblib.dump(eregr, ('../result_models/'+ 'neuroharm_top_extratree.sav'))
joblib.dump(etreg, ('../result_models/'+ 'neuroharm_top_elasticregr.sav'))

['../result_models/neuroharm_top_elasticregr.sav']

## Run models on other dataset

In [33]:
mri_ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X_mri = mri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = mri_ml_matrix['age'].values
y_mri=y_mri.astype('float')

In [34]:
X_mri_test = X_mri
y_mri_test = y_mri

In [35]:
y_mri_pred = linr.predict(X_mri_test)
print('R2 score Linear regression: %.3f' % linr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Linear regression: -1.751
Explained variance score: 0.762
The mean absolute error: 22.784


In [36]:
y_mri_pred = llreg.predict(X_mri_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Lasso regression: -1.790
Explained variance score: 0.747
The mean absolute error: 22.904


In [37]:
y_mri_pred = dtree.predict(X_mri_test)
print('R2 score Decision tree: %.3f' % dtree.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Decision tree: -1.913
Explained variance score: 0.476
The mean absolute error: 22.251


In [38]:
y_mri_pred = regr.predict(X_mri_test)
print('R2 score MLP regression: %.3f' % regr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score MLP regression: -3.003
Explained variance score: 0.026
The mean absolute error: 25.896


In [39]:
# plt.figure(figsize=(10,10))
# plt.scatter(y_test, y_pred, c='crimson')
# plt.yscale('log')
# plt.xscale('log')

# p1 = max(max(y_pred), max(y_test))
# p2 = min(min(y_pred), min(y_test))
# plt.plot([p1, p2], [p1, p2], 'b-')
# plt.xlabel('True Values', fontsize=15)
# plt.ylabel('Predictions', fontsize=15)
# plt.axis('equal')
# plt.show()