# ML testing: experiment #9 
This notebook contains testing for the MRI conference abstract. It shows mixed_dataset (TOP + StrokeMRI) based models after harmonization with neuroharmony, using a log base 10 revision of the white matter hyperintensity count as well as the white matter hyperintensity volume.

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
import ipywidgets as widgets
import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [None]:
# Input CSV file names for the StrokeMRI and TOP cohorts
# (resolved relative to the current working directory).
filename_mri, filename_top = (
    'log_neuro_harm_mri.csv',
    'log_neuro_harm_top.csv',
)

In [None]:
# Load each cohort table into its own DataFrame (TOP first, then StrokeMRI).
TOP = pd.read_csv(filepath_or_buffer=filename_top)
StrokeMRI = pd.read_csv(filepath_or_buffer=filename_mri)

In [None]:
# The header-less first CSV column arrives as "Unnamed: 0"; it is treated as
# the participant identifier from here on.
TOP = TOP.rename({"Unnamed: 0": "participant_id"}, axis="columns")
TOP.head(n=3)  # quick preview of the renamed table

In [None]:
# Same renaming for the StrokeMRI cohort: "Unnamed: 0" becomes participant_id.
StrokeMRI = StrokeMRI.rename({"Unnamed: 0": "participant_id"}, axis="columns")
StrokeMRI.head(n=3)  # quick preview of the renamed table

In [None]:
# Encode sex as a number (F -> 0, M -> 1) so it can take part in numeric
# computations such as correlations. Values missing from the mapping
# come out as NaN.
sex_mapping = dict(F=0, M=1)
TOP = TOP.assign(sex=TOP['sex'].map(sex_mapping))
TOP.head(n=3)

In [None]:
# Apply the same numeric sex encoding to the StrokeMRI cohort.
StrokeMRI = StrokeMRI.assign(sex=StrokeMRI['sex'].map(sex_mapping))
StrokeMRI.head(n=3)

In [None]:
# Look for participants that appear in both cohorts; the printed set is
# empty when there is no overlap.
strokers = set(StrokeMRI['participant_id'])
topers = set(TOP['participant_id'])
z = strokers & topers
print(z)

In [None]:
# Build the mixed dataset by stacking the TOP rows on top of the StrokeMRI
# rows; columns are not re-sorted.
mixed_data = pd.concat(objs=[TOP, StrokeMRI], sort=False)


## Build ML models

# keeping patient ID until right when model is fed, then use patient ID as key to what went where

In [None]:
# participant_id is intentionally left in the feature matrix at this point so
# that the train/test split can later be traced back to participants; the
# column is removed only right before the models are fitted.
ml_matrix = mixed_data
X = ml_matrix.drop(columns='age').values
y = ml_matrix['age'].values.astype('float')


In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split the
# same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [None]:
# Strip the first column and cast the remaining features to float.
# NOTE(review): assumes column 0 is participant_id -- confirm the CSV column order.
X_train_cut = X_train[:, 1:].astype('float')
X_train_cut.shape

In [None]:
# Same treatment for the held-out rows: drop the first column, cast to float.
# NOTE(review): assumes column 0 is participant_id -- confirm the CSV column order.
X_test_cut = X_test[:, 1:].astype('float')
X_test_cut.shape

In [None]:
# svr_poly = SVR(kernel="poly", C=100, gamma="auto", degree=2, epsilon=0.1, coef0=1)
# svr_poly.fit(X_train_cut, y_train)

In [None]:
# y_pred = svr_poly.predict(X_test_cut)

In [None]:
# print('R2 score SV polynomial regression: %.3f' % svr_poly.score(X_test_cut,y_test))
# print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
# print('MAE: % .3f' % mean_absolute_error(y_test, y_pred))

In [None]:

# Multi-layer perceptron regressor with a fixed seed and at most 700
# training iterations, fitted on the ID-free training features.
# NOTE(review): features are fed unscaled (StandardScaler is imported but
# not used) -- confirm this is intended.
regr = MLPRegressor(max_iter=700, random_state=1)
regr.fit(X=X_train_cut, y=y_train)

In [None]:
# Age predictions of the MLP for the held-out rows.
y_pred = regr.predict(X=X_test_cut)

In [None]:
# Report R2, explained variance and mean absolute error of the MLP on the
# held-out rows.
print('R2 score neural network mlp regression: %.3f' % regr.score(X=X_test_cut, y=y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred))
print('MAE: % .3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred))

In [None]:
# Ordinary least-squares linear regression on the ID-free training features.
linr = LinearRegression()
linr.fit(X=X_train_cut, y=y_train)

In [None]:
# Age predictions of the linear model for the held-out rows.
y_pred = linr.predict(X=X_test_cut)

In [None]:
# Report R2, explained variance and mean absolute error of the linear model
# on the held-out rows.
print('R2 score Linear regression: %.3f' % linr.score(X=X_test_cut, y=y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred))
print('MAE: % .3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred))

In [None]:
# Lasso fitted with the LARS algorithm (regularisation strength alpha=0.01)
# on the ID-free training features.
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X=X_train_cut, y=y_train)

In [None]:
# Predict with the Lasso-LARS model on the held-out rows and report R2,
# explained variance and mean absolute error.
y_pred = llreg.predict(X=X_test_cut)
print('R2 score Lasso regression: %.3f' % llreg.score(X=X_test_cut, y=y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred))

In [None]:
# Decision-tree regressor on the ID-free training features.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a fixed seed the fitted tree (and therefore the
# reported metrics and the saved model) can differ between runs, unlike the
# seeded train/test split and MLP in this notebook.
dtree = tree.DecisionTreeRegressor(random_state=1)
dtree.fit(X_train_cut, y_train)

In [None]:
# Predict with the decision tree on the held-out rows and report R2,
# explained variance and mean absolute error.
y_pred = dtree.predict(X=X_test_cut)
print('R2 score dtree regression: %.3f' % dtree.score(X=X_test_cut, y=y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_test, y_pred=y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_true=y_test, y_pred=y_pred))

## Save off models

In [None]:
# Make sure the folder that receives the serialized models exists.
# exist_ok=True creates the folder (and any missing parents) when absent and
# does nothing when it is already there, which avoids the gap between a
# separate existence check and the creation call.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Serialize the fitted linear, Lasso-LARS and decision-tree models with joblib.
# The target paths are derived from model_folder (the folder created earlier
# in this notebook) instead of repeating the literal path, so the save
# location cannot drift from the folder that was actually created.
# NOTE(review): the MLP model (regr) is not saved here -- confirm whether
# that is intended.
joblib.dump(linr, os.path.join(model_folder, 'log_neuroharm_mixed_linr.sav'))
joblib.dump(llreg, os.path.join(model_folder, 'log_neuroharm_mixed_lassor.sav'))
joblib.dump(dtree, os.path.join(model_folder, 'log_neuroharm_mixed_dtree.sav'))


## Run models on other datasets (TOP, StrokeMRI)
but without re-running the training data

# # Here we check that no rows are duplicated once patient IDs were pulled
(if there are none, we can map them back)

In [None]:
# Wrap the training split (first column still included) in a DataFrame and
# count the rows that are exact duplicates of an earlier row.
X_train_pandas = pd.DataFrame(data=X_train)
X_train_pandas.duplicated().sum()

top_ml_matrix
needs to be mapped to top rows in X_train,
we will use these MD5 hashes

now we need to make a dataframe of TOP minus what is in X_train

In [None]:
# Preview the first three rows of the training frame.
X_train_pandas.head(n=3)

In [None]:
#X_train_pandas[0]

In [None]:
# Unique values of column 0 of the training frame, i.e. the rows that were
# used for fitting (presumably participant IDs -- verify column order).
trained_subjects = set(X_train_pandas.loc[:, 0])

In [None]:
# Unique participant IDs of the TOP cohort.
TOP_subjects = set(TOP['participant_id'])

In [None]:
# TOP participants that were NOT part of the training split:
# plain set difference (in TOP_subjects but not in trained_subjects).

new_top = TOP_subjects - trained_subjects
print(len(new_top))

filter down to only top where they are in new_top set

In [None]:
# Keep only the TOP rows whose participant was not used for training.
TOP_new = TOP.loc[TOP['participant_id'].isin(new_top)]
TOP_new

In [None]:
# Turn the unseen TOP rows into float feature/target arrays: the ID column is
# dropped first, then age is separated out as the target.
top_ml_matrix = TOP_new.drop(columns='participant_id')

X_top = top_ml_matrix.drop(columns='age').values.astype('float')
y_top = top_ml_matrix['age'].values.astype('float')

In [None]:
# X_top_train, X_top_test, y_top_train, y_top_test = train_test_split(
#     X_top, y_top, test_size=0.99, random_state=42)

In [None]:
# All unseen TOP rows are used for evaluation (no further split).
X_top_test, y_top_test = X_top, y_top

In [None]:
# Evaluate the linear model on the unseen TOP rows.
y_top_pred = linr.predict(X=X_top_test)
print('R2 score Linear regression: %.3f' % linr.score(X=X_top_test, y=y_top_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_top_test, y_pred=y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_true=y_top_test, y_pred=y_top_pred))

In [None]:
# Evaluate the Lasso-LARS model on the unseen TOP rows.
y_top_pred = llreg.predict(X=X_top_test)
print('R2 score Lasso linear regression: %.3f' % llreg.score(X=X_top_test, y=y_top_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_top_test, y_pred=y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_true=y_top_test, y_pred=y_top_pred))

In [None]:
# Evaluate the decision-tree model on the unseen TOP rows.
# The R2 label names the decision tree; it previously read
# "Lasso linear regression", which mislabelled the printed result.
y_top_pred = dtree.predict(X_top_test)
print('R2 score decision tree regression: %.3f' % dtree.score(X_top_test, y_top_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
# Evaluate the MLP model on the unseen TOP rows.
# The R2 label names the MLP; it previously read "Lasso linear regression",
# which mislabelled the printed result.
y_top_pred = regr.predict(X_top_test)
print('R2 score neural network mlp regression: %.3f' % regr.score(X_top_test, y_top_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_top_test, y_top_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_top_test, y_top_pred))

In [None]:
# Unique participant IDs of the StrokeMRI cohort.
StrokeMRI_subjects = set(StrokeMRI['participant_id'])

In [None]:
# StrokeMRI participants that were NOT part of the training split:
# plain set difference (in StrokeMRI_subjects but not in trained_subjects).

new_mri = StrokeMRI_subjects - trained_subjects
print(len(new_mri))

In [None]:
# Keep only the StrokeMRI rows whose participant was not used for training.
StrokeMRI_new = StrokeMRI.loc[StrokeMRI['participant_id'].isin(new_mri)]
StrokeMRI_new

In [None]:
# Turn the unseen StrokeMRI rows into float feature/target arrays: the ID
# column is dropped first, then age is separated out as the target.
strokemri_ml_matrix = StrokeMRI_new.drop(columns='participant_id')

X_mri = strokemri_ml_matrix.drop(columns='age').values.astype('float')
y_mri = strokemri_ml_matrix['age'].values.astype('float')

In [None]:
# All unseen StrokeMRI rows are used for evaluation (no further split).
X_mri_test, y_mri_test = X_mri, y_mri

In [None]:
# Evaluate the linear model on the unseen StrokeMRI rows.
y_mri_pred = linr.predict(X=X_mri_test)
print('R2 score Linear regression: %.3f' % linr.score(X=X_mri_test, y=y_mri_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_mri_test, y_pred=y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_true=y_mri_test, y_pred=y_mri_pred))

In [None]:
# Evaluate the Lasso-LARS model on the unseen StrokeMRI rows.
y_mri_pred = llreg.predict(X=X_mri_test)
print('R2 score Lasso-linear regression: %.3f' % llreg.score(X=X_mri_test, y=y_mri_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_mri_test, y_pred=y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_true=y_mri_test, y_pred=y_mri_pred))

In [None]:
# Evaluate the decision-tree model on the unseen StrokeMRI rows.
y_mri_pred = dtree.predict(X=X_mri_test)
print('R2 score decision tree regression: %.3f' % dtree.score(X=X_mri_test, y=y_mri_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_true=y_mri_test, y_pred=y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_true=y_mri_test, y_pred=y_mri_pred))

In [None]:
# Evaluate the MLP model on the unseen StrokeMRI rows.
# The R2 label names the MLP; it previously read "Lasso linear regression",
# which mislabelled the printed result.
y_mri_pred = regr.predict(X_mri_test)
print('R2 score neural network mlp regression: %.3f' % regr.score(X_mri_test, y_mri_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_mri_test, y_mri_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_mri_test, y_mri_pred))