# ML testing: experiment #3

This notebook contains the model testing done for the MRI conference abstract. It builds and evaluates models trained on the mixed dataset (TOP + StrokeMRI).

### import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
import ipywidgets as widgets
import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier

import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

### import data

In [None]:
# Both input CSVs currently live in the same results folder; the two
# folder variables are kept separate so each can be repointed on its own.
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/'
filepath_top = '../open_work/internal_results/cleaned_pvc2s/'

filename_mri = os.path.join(filepath_mri, 'StrokeMRI_pvc2c.csv')
filename_top = os.path.join(filepath_top, 'TOP_pvc2c.csv')

In [None]:
# Load both cohorts from their CSV exports.
StrokeMRI = pd.read_csv(filename_mri)
TOP = pd.read_csv(filename_top)

In [None]:
# Discard the first column of TOP (presumably a saved row index — verify
# against the CSV export).
TOP = TOP.drop(columns=TOP.columns[0])

In [None]:
# Discard the first column of StrokeMRI (presumably a saved row index —
# verify against the CSV export).
StrokeMRI = StrokeMRI.drop(columns=StrokeMRI.columns[0])

In [None]:
# Now we need to flip the sex back to numbers for a correlation
sex_mapping = {'F':0,'M':1}
TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP.head(3)

In [None]:
# Same numeric sex encoding for the StrokeMRI cohort
# (uses the sex_mapping dict defined for TOP).
StrokeMRI = StrokeMRI.assign(sex=StrokeMRI['sex'].map(sex_mapping))
StrokeMRI.head(3)

In [None]:
# check for any duplicated patients
strokers = set(StrokeMRI.participant_id)
topers = set(TOP.participant_id)
z = strokers.intersection(topers)
print(z) 

In [None]:
# make mixed dataset
# Stack the two cohorts row-wise into one table. The original row labels
# are kept, so index values may repeat across cohorts.
mixed_data = pd.concat((TOP, StrokeMRI), axis=0, sort=False)


## Build ML models

In [None]:
# Model input: every column except the participant id and the target.
# Target: age. Both are converted to float numpy arrays.
ml_matrix = mixed_data.drop(columns='participant_id')
y = ml_matrix['age'].values.astype('float')
X = ml_matrix.drop(columns='age').values.astype('float')


In [None]:
X_train

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [None]:
# Linear regression baseline, fitted on the training split of the mixed dataset.
linr = LinearRegression()
linr.fit(X_train, y_train)

In [None]:
# Predicted ages for the held-out test split (y_pred is overwritten by later models).
y_pred = linr.predict(X_test)

In [None]:
# Held-out performance of the linear regression on the mixed test split.
# The MAE line uses the same format spec and label as the other metric cells
# ('% .3f' carried a stray space flag that padded the number).
print('R2 score Linear regression: %.3f' % linr.score(X_test, y_test))
print('Explained variance score: %.3f' % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
# LassoLars model with regularisation strength alpha=0.01, fitted on the same training split.
llreg = linear_model.LassoLars(alpha=0.01)
llreg.fit(X_train, y_train)

In [None]:
# Held-out performance of the LassoLars model on the mixed test split.
y_pred = llreg.predict(X_test)
lasso_r2 = llreg.score(X_test, y_test)
lasso_explained_var = metrics.explained_variance_score(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)

print('R2 score Lasso regression: %.3f' % lasso_r2)
print('Explained variance score: %.3f' % lasso_explained_var)
print('The mean absolute error: %.3f' % lasso_mae)

In [None]:
# Decision-tree regressor with default settings, fitted on the same training split.
# NOTE(review): no random_state is passed, so the fitted tree (and its scores)
# may differ between runs — confirm whether that is acceptable for the abstract.
dtree = tree.DecisionTreeRegressor()
dtree.fit(X_train, y_train)

In [None]:
# Held-out performance of the decision tree on the mixed test split.
y_pred = dtree.predict(X_test)
dtree_r2 = dtree.score(X_test, y_test)
dtree_explained_var = metrics.explained_variance_score(y_test, y_pred)
dtree_mae = mean_absolute_error(y_test, y_pred)

print('R2 score dtree regression: %.3f' % dtree_r2)
print('Explained variance score: %.3f' % dtree_explained_var)
print('The mean absolute error: %.3f' % dtree_mae)

## Save off models

In [None]:
# check if model folder exists and if not , then create
# Make sure the output folder for the fitted models exists.
# exist_ok=True replaces the separate exists() check: it is a no-op when the
# folder is already there and avoids the check-then-create race.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Persist the three fitted models. The target folder comes from model_folder
# (the folder created in the previous cell) instead of repeating the literal,
# so changing the output location only needs one edit.
joblib.dump(linr, os.path.join(model_folder, 'unharm_mixed_linr.sav'))
joblib.dump(llreg, os.path.join(model_folder, 'unharm_mixed_lassor.sav'))
joblib.dump(dtree, os.path.join(model_folder, 'unharm_mixed_dtree.sav'))


## Run models on other datasets (TOP, StrokeMRI)
but without re-using the rows that were already used for training (not yet implemented; see the open question in the first cell below)

In [None]:
# TODO: remove the rows that were part of the training split before evaluating
# on the single datasets — the models have already seen them. How to do this is
# still an open question.

In [None]:
# Build the feature matrix / target vector for the TOP cohort alone, in the
# same way as for the mixed dataset (drop the id, predict age from the rest).
# X_top and y_top are required by the split in the next cell; they were
# previously commented out, which made that cell fail with a NameError.
#
# NOTE: rows of TOP that ended up in the mixed training split are NOT removed
# here yet. Earlier idea, kept for reference: concatenate with ml_matrix and
# drop_duplicates(keep=False) to retain only unseen rows.
top_ml_matrix = TOP.drop('participant_id', axis=1)

X_top = top_ml_matrix.drop('age', axis=1).values.astype('float')
y_top = top_ml_matrix['age'].values.astype('float')

In [None]:
# Put 99% of TOP into the evaluation part; in the cells shown here the
# remaining 1% "train" part is not used to fit anything.
top_split = train_test_split(X_top, y_top, test_size=0.99, random_state=42)
X_top_train, X_top_test, y_top_train, y_top_test = top_split

In [None]:
# Apply the already-fitted linear regression (trained on the mixed data)
# to the TOP evaluation part and report the same three metrics.
y_top_pred = linr.predict(X_top_test)
top_r2 = linr.score(X_top_test, y_top_test)
top_explained_var = metrics.explained_variance_score(y_top_test, y_top_pred)
top_mae = mean_absolute_error(y_top_test, y_top_pred)

print('R2 score Linear regression: %.3f' % top_r2)
print('Explained variance score: %.3f' % top_explained_var)
print('The mean absolute error: %.3f' % top_mae)