# ML testing: experiment #1- homemade harmonization

This notebook involves testing for the MRI conference abstract. This notebook shows TOP based models with TOP having in this case been harmonized to the MRI dataset by polynomial displacement

### import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
#import ipywidgets as widgets
#import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep

### import data

In [2]:
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/' 
filename_mri = os.path.join(filepath_mri,'StrokeMRI_pvc2c.csv') 

#filepath_top = '../open_work/internal_results/cleaned_pvc2s/' 
filename_top = 'TOP_mri_homemade_harmonized.csv'

In [3]:
TOP = pd.read_csv(filename_top)
StrokeMRI = pd.read_csv(filename_mri)

In [4]:
TOP = TOP.drop(TOP.columns[0],axis=1)
#TOP.columns

In [5]:
#TOP = TOP.rename(columns={'totalgm_b.1':'totalgm_b'})

In [6]:
StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
#StrokeMRI

In [7]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,1,0.709245,0.515052,0.304668,0.46299,0.46299,-0.056836,22.738964,8.314287,2.242372,2.357394,2.686952,2.395859,22.451843,83.081729,70.57998,55.413006,63.444328
1,sub-0002_1_ses-1_run-1,38.3,0,0.714606,0.608793,0.244718,0.455314,0.455314,0.332596,22.554506,8.688955,1.725914,2.446933,1.599201,1.944629,23.505761,88.835929,79.342156,66.109891,73.53385
2,sub-0019_1_ses-1_run-1,32.3,1,0.702714,0.518369,0.328677,0.453353,0.453353,0.872945,14.27658,8.610053,1.971527,1.928939,2.170919,2.156176,26.528029,93.034792,85.343097,62.796665,73.983271


In [8]:
StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.172603,0,0.64754,0.49441,0.3132,0.445,0.445,1.249,24.0,8.0434,1.7273,1.9776,1.7141,1.914,26.8262,104.3155,90.5047,73.8758,84.4527
1,sub-59083_1_ses-1_run-1,66.367123,0,0.60517,0.48594,0.42304,0.39968,0.39968,14.597,25.0,3.7791,1.9669,1.7154,2.6394,1.8815,24.9597,75.7051,65.3032,65.6771,67.1158
2,sub-59085_1_ses-1_run-1,55.838356,0,0.61724,0.53779,0.33692,0.41371,0.41371,6.341,30.0,5.1248,1.8146,1.9845,2.0827,1.9501,27.7117,81.0966,72.998,60.8317,64.3183


## Build ML models

In [9]:
ml_matrix = TOP.drop('participant_id', axis=1)
X = ml_matrix.drop('age', axis =1)
X = X.values
X = X.astype('float')
y = ml_matrix['age'].values
y=y.astype('float')


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [11]:
linr = LinearRegression()
linr.fit(X_train, y_train)

In [12]:
y_pred = linr.predict(X_test)

In [13]:
print('R2 score Linear regression: %.3f' % linr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score Linear regression: 0.691
Explained variance score: 0.692
MAE:  3.934


In [14]:
data= [[
    'linear regression',
    'unharm_top_linr.sav',
    mean_absolute_error(y_test, y_pred),
    linr.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
linr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
linr_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,unharm_top_linr.sav,3.933941,0.691045,0.691502


In [15]:
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X_train, y_train)

In [16]:
y_pred = llreg.predict(X_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score Lasso regression: 0.663
Explained variance score: 0.663
The mean absolute error: 4.082


In [17]:
data= [[
    'lasso linear regression',
    'unharm_top_lassor.sav',
    mean_absolute_error(y_test, y_pred),
    llreg.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
llreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
llreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,lasso linear regression,unharm_top_lassor.sav,4.082385,0.662781,0.662842


In [18]:
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)

In [19]:
y_pred = dtree.predict(X_test)
print('R2 score dtree regression: %.3f' % dtree.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

R2 score dtree regression: 0.559
Explained variance score: 0.562
The mean absolute error: 4.810


In [20]:
data= [[
    'decision tree',
    'unharm_top_dtree.sav',
    mean_absolute_error(y_test, y_pred),
    dtree.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
dtree_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
dtree_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,decision tree,unharm_top_dtree.sav,4.809621,0.559321,0.562286


In [21]:
regr = MLPRegressor(random_state=1, max_iter=900)
regr.fit(X_train, y_train)

In [22]:
y_pred = regr.predict(X_test)

In [23]:
# print('R2 score neural network mlp regression: %.3f' % regr.score(X_test,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [24]:
data= [[
    'MLP regression',
    'unharm_top_regr.sav',
    mean_absolute_error(y_test, y_pred),
    regr.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
regr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
regr_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,MLP regression,unharm_top_regr.sav,4.656663,0.594601,0.594683


In [25]:
svr_p2 = SVR(C=1.0, kernel='poly', degree =2, epsilon=0.2)
svr_p2.fit(X_train, y_train)

In [26]:
y_pred = svr_p2.predict(X_test)

In [27]:
print('R2 score SVR 2nd degree poly kernel regression: %.3f' % svr_p2.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score SVR 2nd degree poly kernel regression: 0.025
Explained variance score: 0.026
MAE:  7.707


In [28]:
data= [[
    'SVR poly2',
    'unharm_top_svrp2.sav',
    mean_absolute_error(y_test, y_pred),
    svr_p2.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
svr_p2_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
svr_p2_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,SVR poly2,unharm_top_svrp2.sav,7.707322,0.025149,0.025905


In [29]:
eregr = ElasticNetCV(cv=5, random_state=12)
eregr.fit(X_train, y_train)

In [30]:
y_pred = eregr.predict(X_test)
print('R2 score elasticnetcv regression: %.3f' % eregr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score elasticnetcv regression: 0.082
Explained variance score: 0.082
MAE:  7.418


In [31]:
data= [[
    'Elastic_netCV',
    'unharm_top_eregr.sav',
    mean_absolute_error(y_test, y_pred),
    eregr.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
eregr_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
eregr_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Elastic_netCV,unharm_top_eregr.sav,7.417658,0.081927,0.08206


In [32]:
etreg = ExtraTreesRegressor(n_estimators=100, random_state=0)
etreg.fit(X_train, y_train)

In [33]:
y_pred = etreg.predict(X_test)
print('R2 score extra trees regression: %.3f' % etreg.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

R2 score extra trees regression: 0.775
Explained variance score: 0.775
MAE:  3.519


In [34]:
data= [[
    'Extra trees',
    'unharm_top_etreg.sav',
    mean_absolute_error(y_test, y_pred),
    etreg.score(X_test,y_test),
    metrics.explained_variance_score(y_test, y_pred)]]
etreg_results = pd.DataFrame(data, columns=['algorithm','file_name','mae', 'r2', 'explained_variance'])
etreg_results

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,Extra trees,unharm_top_etreg.sav,3.518966,0.774794,0.774795


In [35]:
top_based_hm_harmonized_on_top =pd.concat([linr_results,
                   llreg_results,
                   dtree_results,
                   regr_results,
                   svr_p2_results,
                   eregr_results,
                  etreg_results],
                  axis=0)
top_based_hm_harmonized_on_top

Unnamed: 0,algorithm,file_name,mae,r2,explained_variance
0,linear regression,unharm_top_linr.sav,3.933941,0.691045,0.691502
0,lasso linear regression,unharm_top_lassor.sav,4.082385,0.662781,0.662842
0,decision tree,unharm_top_dtree.sav,4.809621,0.559321,0.562286
0,MLP regression,unharm_top_regr.sav,4.656663,0.594601,0.594683
0,SVR poly2,unharm_top_svrp2.sav,7.707322,0.025149,0.025905
0,Elastic_netCV,unharm_top_eregr.sav,7.417658,0.081927,0.08206
0,Extra trees,unharm_top_etreg.sav,3.518966,0.774794,0.774795


In [36]:
top_based_hm_harmonized_on_top.to_csv('top_based_unharmonized_on_top.csv')

In [37]:
## SAve off models

In [38]:
# check if model folder exists and if not , then create
model_folder = '../result_models/'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [39]:
# joblib.dump(linr, ('../result_models/'+  'homemade_harm1_top_linr.sav'))
# joblib.dump(llreg, ('../result_models/'+ 'homemade_harm1_top_lassor.sav'))
# joblib.dump(dtree, ('../result_models/'+ 'homemade_harm1_top_dtree.sav'))
# joblib.dump(regr, ('../result_models/'+  'homemade_harm1_top_regr.sav'))
# joblib.dump(svr_p2, ('../result_models/'+'homemade_harm1_top_svrp2.sav'))
# joblib.dump(etreg, ('../result_models/'+ 'homemade_harm1_top_extratree.sav'))
# joblib.dump(eregr, ('../result_models/'+ 'homemade_harm1_top_elasticregr.sav'))

## Run models on other dataset

In [40]:
mri_ml_matrix = StrokeMRI.drop('participant_id', axis=1)
X_mri = mri_ml_matrix.drop('age', axis =1)
X_mri = X_mri.values
X_mri = X_mri.astype('float')
y_mri = mri_ml_matrix['age'].values
y_mri=y_mri.astype('float')

In [41]:
X_mri_test = X_mri
y_mri_test = y_mri

In [42]:
y_mri_pred = linr.predict(X_mri_test)
print('R2 score Linear regression: %.3f' % linr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Linear regression: -2.252
Explained variance score: -0.612
The mean absolute error: 19.374


In [43]:
y_mri_pred = llreg.predict(X_mri_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Lasso regression: -2.644
Explained variance score: -0.782
The mean absolute error: 20.593


In [44]:
y_mri_pred = dtree.predict(X_mri_test)
print('R2 score Decision tree: %.3f' % dtree.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score Decision tree: -0.169
Explained variance score: 0.474
The mean absolute error: 13.127


In [45]:
y_mri_pred = regr.predict(X_mri_test)
print('R2 score MLP regression: %.3f' % regr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score MLP regression: -2.193
Explained variance score: -0.358
The mean absolute error: 20.723


In [46]:
y_mri_pred = svr_p2.predict(X_mri_test)
print('R2 score SVR polynomial regression: %.3f' % svr_p2.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score SVR polynomial regression: -2.405
Explained variance score: -0.006
The mean absolute error: 23.592


In [47]:
y_mri_pred = eregr.predict(X_mri_test)
print('R2 score ElasticNet CV : %.3f' % eregr.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score ElasticNet CV : -3.366
Explained variance score: -0.386
The mean absolute error: 26.182


In [48]:
y_mri_pred = etreg.predict(X_mri_test)
print('R2 score extra tree regression: %.3f' % etreg.score(X_mri_test,y_mri_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))

R2 score extra tree regression: -0.353
Explained variance score: 0.480
The mean absolute error: 14.253


In [49]:
# plt.figure(figsize=(10,10))
# plt.scatter(y_test, y_pred, c='crimson')
# plt.yscale('log')
# plt.xscale('log')

# p1 = max(max(y_pred), max(y_test))
# p2 = min(min(y_pred), min(y_test))
# plt.plot([p1, p2], [p1, p2], 'b-')
# plt.xlabel('True Values', fontsize=15)
# plt.ylabel('Predictions', fontsize=15)
# plt.axis('equal')
# plt.show()