# ML testing: experiment #3

This notebook covers the model testing done for the MRI conference abstract. It shows models trained on the mixed dataset (TOP + StrokeMRI).

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# # demo stuff
# import ipywidgets as widgets
# import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [2]:
# Locations of the cleaned CSV exports. All three datasets currently sit in the
# same folder, but each keeps its own folder variable so they can be moved
# independently.
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/'
filename_mri = os.path.join(filepath_mri, 'StrokeMRI_pvc2c.csv')

filepath_top = '../open_work/internal_results/cleaned_pvc2s/'
filename_top = os.path.join(filepath_top, 'TOP_pvc2c.csv')

filepath_sabre = '../open_work/internal_results/cleaned_pvc2s/'
# Use SABRE's own folder variable here (previously filepath_top was used, which
# would silently point at the wrong folder if the two ever differed).
filename_sabre = os.path.join(filepath_sabre, 'SABRE_pvc2_cleaned.csv')

In [3]:
# Read the three cleaned tables from the CSV paths built in the previous cell.
TOP, StrokeMRI, SABRE = (
    pd.read_csv(csv_path)
    for csv_path in (filename_top, filename_mri, filename_sabre)
)

In [4]:
# Discard the first column of each table by position
# (presumably the index column written out with the CSV — TODO confirm).
# Not idempotent: re-running this cell removes one more column each time.
TOP = TOP.drop(columns=TOP.columns[0])
SABRE = SABRE.drop(columns=SABRE.columns[0])
#SABRE

In [5]:
# Discard the first column by position, as done for TOP and SABRE.
# Not idempotent: re-running this cell removes one more column each time.
StrokeMRI = StrokeMRI.drop(columns=StrokeMRI.columns[0])
#StrokeMRI

In [6]:
# Encode sex as a number (F -> 0, M -> 1) so it can take part in correlations.
# Series.map turns any value that is not a key of the mapping into NaN.
sex_mapping = dict(F=0, M=1)
TOP = TOP.assign(sex=TOP['sex'].map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.71736,0.52803,0.31812,0.45881,0.45881,1.743,24.0,8.762,2.2748,2.3205,2.6858,2.4316,22.1022,80.4744,68.3224,52.4614,60.6981
1,sub-0002_1_ses-1_run-1,38.3,0,0.72383,0.62394,0.25673,0.45112,0.45112,1.629,23.0,9.0749,1.7564,2.3989,1.5982,1.9738,23.5401,87.3972,78.0359,63.9932,71.6047
2,sub-0019_1_ses-1_run-1,32.3,1,0.71224,0.53295,0.33594,0.45046,0.45046,0.621,13.0,8.8791,1.985,1.8702,2.1648,2.1723,27.5573,94.0855,86.3816,62.6012,74.0588


In [7]:
# Apply the same F/M -> 0/1 encoding to StrokeMRI.
StrokeMRI = StrokeMRI.assign(sex=StrokeMRI['sex'].map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.64754,0.49441,0.3132,0.445,0.445,1.249,24.0,8.0434,1.7273,1.9776,1.7141,1.914,26.8262,104.3155,90.5047,73.8758,84.4527
1,sub-59083_1_ses-1_run-1,66.367123,0,0.60517,0.48594,0.42304,0.39968,0.39968,14.597,25.0,3.7791,1.9669,1.7154,2.6394,1.8815,24.9597,75.7051,65.3032,65.6771,67.1158
2,sub-59085_1_ses-1_run-1,55.838356,0,0.61724,0.53779,0.33692,0.41371,0.41371,6.341,30.0,5.1248,1.8146,1.9845,2.0827,1.9501,27.7117,81.0966,72.998,60.8317,64.3183


In [8]:
# Give SABRE the same sex encoding as the other cohorts, then restrict and
# reorder its columns to exactly those of TOP.
coly = TOP.columns
SABRE = SABRE.assign(sex=SABRE['sex'].map(sex_mapping)).loc[:, coly]
SABRE.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-100096_1_ses-1_run-1,78,1,0.56201,0.44945,0.42419,0.39147,0.39147,4.621,19.0,13.671,3.5865,3.8956,4.3534,3.669,15.3022,71.6047,62.0948,43.2926,54.7345
1,sub-100331_1_ses-1_run-1,71,1,0.53812,0.42593,0.49319,0.36927,0.36927,1.085,22.0,16.3956,4.3543,4.1134,3.8613,3.7593,11.3323,37.0396,30.0755,25.5827,29.7665
2,sub-102285_1_ses-1_run-1,72,1,0.56881,0.49224,0.35808,0.40082,0.40082,1.956,24.0,20.5471,4.3167,4.441,3.9533,3.9991,14.277,51.7767,41.2339,33.5018,39.2296


In [9]:
# Keep only SABRE rows that have no missing value in any column.
SABRE = SABRE.dropna(axis=0, how='any')
#SABRE.isna().sum()

In [10]:
# Check that no participant ID appears in both StrokeMRI and TOP;
# an empty set means the two cohorts do not overlap.
strokers = set(StrokeMRI['participant_id'])
topers = set(TOP['participant_id'])
z = strokers & topers
print(z)

set()


In [11]:
# Stack TOP on top of StrokeMRI to form the mixed dataset.
# The original row labels of each frame are kept (no index reset).
mixed_data = pd.concat(objs=(TOP, StrokeMRI), axis=0, sort=False)


## Build ML models

# Keep the patient ID until just before the model is fed, then use the patient ID as the key to trace which row went where

In [12]:
# participant_id is deliberately left in ml_matrix (and therefore in X) so that
# rows can still be traced back to subjects after the train/test split.
ml_matrix = mixed_data
# Features: every column except the target, as a NumPy array.
X = ml_matrix.drop(columns='age').values
# Target: age as floats.
y = ml_matrix['age'].values.astype('float')


In [13]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [14]:
# Slice off column 0 and cast the remaining training columns to float.
X_train_cut = X_train[:, 1:].astype('float')
X_train_cut.shape

(780, 18)

In [15]:
# Slice off column 0 and cast the remaining test columns to float.
X_test_cut = X_test[:, 1:].astype('float')
X_test_cut.shape

(261, 18)

In [16]:

# Multi-layer perceptron regressor with a fixed seed, fitted on the training features.
# NOTE(review): the features are passed in unscaled; MLPs are generally sensitive to
# feature scale and StandardScaler is imported but never applied — confirm this is intended.
regr = MLPRegressor(
    max_iter=700,
    random_state=1,
)
regr.fit(X=X_train_cut, y=y_train)

In [17]:
# Predicted ages for the held-out rows from the MLP regressor.
y_pred = regr.predict(X=X_test_cut)

In [18]:
# Held-out metrics for the MLP regressor; y_pred must hold its predictions.
print(
    'R2 score neural network mlp regression: %.3f' % regr.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'MAE: % .3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score neural network mlp regression: 0.774
Explained variance score: 0.774
MAE:  6.808


In [19]:
# Ordinary least-squares baseline with default settings.
linr = LinearRegression()
linr.fit(X=X_train_cut, y=y_train)

In [20]:
# Predicted ages for the held-out rows from the linear regression.
y_pred = linr.predict(X=X_test_cut)

In [21]:
# Held-out metrics for the linear regression; y_pred must hold its predictions.
print(
    'R2 score Linear regression: %.3f' % linr.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'MAE: % .3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score Linear regression: 0.861
Explained variance score: 0.861
MAE:  5.253


In [22]:
# Lasso model fitted with the LARS algorithm, regularisation strength alpha = 0.01.
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X=X_train_cut, y=y_train)



In [23]:
# Predict on the held-out rows with the LassoLars model and report its metrics.
y_pred = llreg.predict(X=X_test_cut)
print(
    'R2 score Lasso regression: %.3f' % llreg.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score Lasso regression: 0.856
Explained variance score: 0.856
The mean absolute error: 5.361


In [24]:
# Single decision tree with default settings.
# No random_state is passed, so repeated fits are not pinned to a seed.
dtree = tree.DecisionTreeRegressor()
dtree.fit(X=X_train_cut, y=y_train)

In [25]:
# Predict on the held-out rows with the decision tree and report its metrics.
y_pred = dtree.predict(X=X_test_cut)
print(
    'R2 score dtree regression: %.3f' % dtree.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score dtree regression: 0.773
Explained variance score: 0.773
The mean absolute error: 6.388


In [26]:
# Re-creates and re-fits the MLP regressor with the same seed, iteration cap and
# training data as the first MLP cell of this section, rebinding `regr`.
regr = MLPRegressor(
    max_iter=700,
    random_state=1,
)
regr.fit(X=X_train_cut, y=y_train)

In [27]:
# Predicted ages for the held-out rows from the re-fitted MLP regressor.
y_pred = regr.predict(X=X_test_cut)

In [28]:
# Held-out metrics for the re-fitted MLP regressor; y_pred must hold its predictions.
print(
    'R2 score neural network mlp regression: %.3f' % regr.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'MAE: % .3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score neural network mlp regression: 0.774
Explained variance score: 0.774
MAE:  6.808


In [29]:
# Support vector regression with a 2nd-degree polynomial kernel.
# NOTE(review): the features are passed in unscaled; SVMs are not scale-invariant,
# which may explain the weak score of this model — confirm whether scaling was intended.
svr_p2 = SVR(
    kernel='poly',
    degree=2,
    C=1.0,
    epsilon=0.2,
)
svr_p2.fit(X=X_train_cut, y=y_train)

In [30]:
# Predict on the held-out rows with the polynomial SVR and report its metrics.
y_pred = svr_p2.predict(X=X_test_cut)
print(
    'R2 score SVR 2nd degree poly kernel regression: %.3f' % svr_p2.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'MAE: % .3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score SVR 2nd degree poly kernel regression: 0.437
Explained variance score: 0.438
MAE:  10.976


In [31]:
# Elastic net whose regularisation is chosen by 5-fold cross-validation (fixed seed).
eregr = ElasticNetCV(
    cv=5,
    random_state=12,
)
eregr.fit(X=X_train_cut, y=y_train)

In [32]:
# Predict on the held-out rows with the elastic net and report its metrics.
y_pred = eregr.predict(X=X_test_cut)
print(
    'R2 score elasticnetcv regression: %.3f' % eregr.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'MAE: % .3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score elasticnetcv regression: 0.737
Explained variance score: 0.738
MAE:  7.370


In [33]:
# Extra-trees ensemble of 100 trees with a fixed seed.
etreg = ExtraTreesRegressor(
    n_estimators=100,
    random_state=0,
)
etreg.fit(X=X_train_cut, y=y_train)

In [34]:
# Predict on the held-out rows with the extra-trees ensemble and report its metrics.
y_pred = etreg.predict(X=X_test_cut)
print(
    'R2 score extra trees regression: %.3f' % etreg.score(X_test_cut, y_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred),
    'MAE: % .3f' % mean_absolute_error(y_test, y_pred),
    sep='\n',
)

R2 score extra trees regression: 0.882
Explained variance score: 0.883
MAE:  4.743


## Save off models

In [35]:
# Make sure the folder that receives the saved models exists.
# exist_ok=True makes the call a no-op when the folder is already there, so no
# separate existence check is needed (and nothing can create the folder between
# a check and the makedirs call).
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [36]:
# joblib.dump(linr, ('../result_models/'+ 'unharm_mixed_linr.sav'))
# joblib.dump(llreg, ('../result_models/'+ 'unharm_mixed_lassor.sav'))
# joblib.dump(dtree, ('../result_models/'+ 'unharm_mixed_dtree.sav'))
# joblib.dump(regr, ('../result_models/'+'unharm_mixed_regr.sav'))
# joblib.dump(svr_p2, ('../result_models/'+'unharm_mixed_svrp2.sav'))
# joblib.dump(eregr, ('../result_models/'+ 'unharm_mixed_elasticnet.sav'))
# joblib.dump(etreg, ('../result_models/'+ 'unharm_mixed_extratree.sav'))

## Run models on the individual datasets (TOP, StrokeMRI)
but only on the rows that were not part of the training data, and without re-training the models

# # Here we check that no rows are duplicated once the patient IDs were pulled 
(if there are none, we can map the rows back to their patients)

In [37]:
# Wrap the training array in a DataFrame and count fully duplicated rows.
# X_train is the uncut array, so its first column is part of the comparison.
X_train_pandas = pd.DataFrame(data=X_train)
X_train_pandas.duplicated(keep='first').sum()

0

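top_ml_matrix
needs to be mapped to the TOP rows in X_train;
the plan was to use MD5 hashes for this (the cells below match on the patient ID kept in the first column of X_train instead)

Now we need to make a dataframe of TOP minus what is in X_train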

In [None]:
# Peek at the first three training rows.
X_train_pandas.head(n=3)

In [None]:
#X_train_pandas[0]

In [None]:
# Unique values of the first training column, i.e. the subjects the models were trained on.
trained_subjects = set(X_train_pandas.loc[:, 0])
#trained_subjects 

In [None]:
# All participant IDs present in TOP.
TOP_subjects = set(TOP['participant_id'])
#TOP_subjects

In [None]:
# TOP subjects that were not used for training: a plain set difference.

new_top = TOP_subjects - trained_subjects
print(len(new_top))
#print(new_top)

Filter TOP down to only the rows whose participant is in the new_top set

In [None]:
# Keep only the TOP rows whose participant was not seen during training.
is_unseen_top = TOP['participant_id'].isin(new_top)
TOP_new = TOP.loc[is_unseen_top]
TOP_new

In [None]:
# Build the float feature matrix and age target for the unseen TOP rows.
top_ml_matrix = TOP_new.drop(columns='participant_id')

X_top = top_ml_matrix.drop(columns='age').values.astype('float')
y_top = top_ml_matrix['age'].values.astype('float')

In [None]:
# X_top_train, X_top_test, y_top_train, y_top_test = train_test_split(
#     X_top, y_top, test_size=0.99, random_state=42)

In [None]:
# Every unseen TOP row is used for testing; no further split is made.
X_top_test, y_top_test = X_top, y_top

In [None]:
# Linear regression on the unseen TOP rows.
y_top_pred = linr.predict(X=X_top_test)
print(
    'R2 score Linear regression: %.3f' % linr.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred),
    sep='\n',
)

In [None]:
# LassoLars model on the unseen TOP rows.
y_top_pred = llreg.predict(X=X_top_test)
print(
    'R2 score Lasso linear regression: %.3f' % llreg.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred),
    sep='\n',
)

In [None]:
# Decision tree on the unseen TOP rows.
y_top_pred = dtree.predict(X=X_top_test)
print(
    'R2 score decision tree regression: %.3f' % dtree.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred),
    sep='\n',
)

In [None]:
# Cross-validated elastic net on the unseen TOP rows.
y_top_pred = eregr.predict(X=X_top_test)
print(
    'R2 score ElasticnetCV regression: %.3f' % eregr.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred),
    sep='\n',
)

In [None]:
# Polynomial-kernel SVR on the unseen TOP rows.
y_top_pred = svr_p2.predict(X=X_top_test)
print(
    'R2 score SVR poly 2 regression: %.3f' % svr_p2.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred),
    sep='\n',
)

In [None]:
# Extra-trees ensemble on the unseen TOP rows.
# The R2 label previously read "Extra treen"; it now names the model properly.
y_top_pred = etreg.predict(X_top_test)
print('R2 score Extra trees regression: %.3f' % etreg.score(X_top_test,y_top_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
# MLP regressor on the unseen TOP rows.
y_top_pred = regr.predict(X=X_top_test)
print(
    'R2 score MLP regression: %.3f' % regr.score(X_top_test, y_top_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred),
    sep='\n',
)

In [None]:
# All participant IDs present in StrokeMRI.
StrokeMRI_subjects = set(StrokeMRI['participant_id'])
#StrokeMRI_subjects

In [None]:
# StrokeMRI subjects that were not used for training: a plain set difference.

new_mri = StrokeMRI_subjects - trained_subjects
print(len(new_mri))
#print(new_mri)

In [None]:
# Keep only the StrokeMRI rows whose participant was not seen during training.
is_unseen_mri = StrokeMRI['participant_id'].isin(new_mri)
StrokeMRI_new = StrokeMRI.loc[is_unseen_mri]
StrokeMRI_new

In [None]:
# Build the float feature matrix and age target for the unseen StrokeMRI rows.
strokemri_ml_matrix = StrokeMRI_new.drop(columns='participant_id')

X_mri = strokemri_ml_matrix.drop(columns='age').values.astype('float')
y_mri = strokemri_ml_matrix['age'].values.astype('float')

In [None]:
# Every unseen StrokeMRI row is used for testing; no further split is made.
X_mri_test, y_mri_test = X_mri, y_mri

In [None]:
# Linear regression on the unseen StrokeMRI rows.
y_mri_pred = linr.predict(X=X_mri_test)
print(
    'R2 score Linear regression: %.3f' % linr.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_mri_test, y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred),
    sep='\n',
)

In [None]:
# LassoLars model on the unseen StrokeMRI rows.
y_mri_pred = llreg.predict(X=X_mri_test)
print(
    'R2 score Lasso-linear regression: %.3f' % llreg.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_mri_test, y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred),
    sep='\n',
)

In [None]:
# Decision tree on the unseen StrokeMRI rows.
y_mri_pred = dtree.predict(X=X_mri_test)
print(
    'R2 score decision tree regression: %.3f' % dtree.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_mri_test, y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred),
    sep='\n',
)

In [None]:
# MLP regressor on the unseen StrokeMRI rows.
# `regr` is the MLPRegressor, so the R2 line is labelled as MLP; it previously
# carried the Lasso label and misattributed the score.
y_mri_pred = regr.predict(X_mri_test)
print('R2 score MLP regression: %.3f' % regr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

In [None]:
# Polynomial-kernel SVR on the unseen StrokeMRI rows.
y_mri_pred = svr_p2.predict(X=X_mri_test)
print(
    'R2 score SVR poly2 regression: %.3f' % svr_p2.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_mri_test, y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred),
    sep='\n',
)

In [None]:
# Extra-trees ensemble on the unseen StrokeMRI rows.
y_mri_pred = etreg.predict(X=X_mri_test)
print(
    'R2 score Extra tree regression: %.3f' % etreg.score(X_mri_test, y_mri_test),
    'Explained variance score: %.3f' % metrics.explained_variance_score(y_mri_test, y_mri_pred),
    'The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred),
    sep='\n',
)