# Clean PVC2 datasets

This notebook takes the TOP, Stroke MRI, Insight 46 and SABRE datasets, and cleans them down to the relevant parameters for an ML model using only corrected ASL values.

Then we show some preliminary correlations and ML

In [None]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
import ipywidgets as widgets
import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

In [None]:
# Load the stitched, conformed CSV of each cohort (paths are relative to this notebook).
# The TOP path used to end in a stray space inside the quotes, which points at a
# different, non-existent file name.
unclean_TOP = pd.read_csv('../open_work/internal_results/top_stitched_conformed.csv')
unclean_StrokeMRI = pd.read_csv('../open_work/internal_results/mri_stitched_conformed.csv')
# NOTE(review): 'inisight46' looks like a typo for 'insight46' - confirm against the file on disk before renaming.
unclean_Insight46 = pd.read_csv('../open_work/internal_results/inisight46_all_stitched_conformed.csv')
unclean_SABRE = pd.read_csv('../open_work/internal_results/SABRE_pvc2_stitched_conformed.csv')

In [None]:
# Peek at the last 50 of the first 130 rows of the raw TOP table.
unclean_TOP.head(130).iloc[-50:]

In [None]:
# Columns kept for modelling, grouped by kind. The concatenation order below
# defines the column order of every cleaned dataframe.
id_and_demographics = ['participant_id', 'age', 'sex']
volume_columns = [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio',
    'wmh_vol', 'wmh_count',
]
cov_columns = ['deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov']
# we presume the plain "_b" columns are cbf, needs a check
presumed_cbf_columns = ['deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']

list_of_parameters = (
    id_and_demographics
    + volume_columns
    + cov_columns
    + presumed_cbf_columns
)

In [None]:
# Keep only the modelling columns of each cohort, in the order given by list_of_parameters.
TOP = unclean_TOP.loc[:, list_of_parameters]
StrokeMRI = unclean_StrokeMRI.loc[:, list_of_parameters]
Insight46 = unclean_Insight46.loc[:, list_of_parameters]
SABRE = unclean_SABRE.loc[:, list_of_parameters]

In [None]:
# Remove every row that has a missing value in any of the kept columns.
TOP = TOP.dropna(axis=0, how='any')
StrokeMRI = StrokeMRI.dropna(axis=0, how='any')
Insight46 = Insight46.dropna(axis=0, how='any')
SABRE = SABRE.dropna(axis=0, how='any')

In [None]:
# Locate the two TOP recordings we know are problematic (subjects 239 and 1038)
# and print them, so their row labels can be checked before they are dropped.
filtered_bad1 = TOP[TOP["participant_id"].str.contains("sub-0239_1_ses-1_run-1")]
filtered_bad2 = TOP[TOP["participant_id"].str.contains("1038")]
print(filtered_bad1, filtered_bad2)

In [None]:
# Index labels of the two problematic TOP recordings.
# NOTE(review): 87 and 442 are hard-coded labels - confirm they still match the
# problematic participants whenever the input CSV changes.
bad_top_rows = [87, 442]
# errors='ignore' keeps this cell idempotent: re-running it after the rows are
# already gone no longer raises KeyError.
TOP = TOP.drop(index=bad_top_rows, errors='ignore')


In [None]:
# Now we have a clean TOP with sex mapped correctly, we can now look at our datasets for correlations

In [None]:
## pick and visualize correlations

In [None]:
#our_data = input()

In [None]:
# Multi-select box listing every TOP column; 'gm_vol' is pre-selected.
features = widgets.SelectMultiple(
    description='Features',
    options=TOP.columns.tolist(),
    value=['gm_vol'],
    disabled=False,
)
features

In [None]:
# Snapshot the current widget selection as a plain list of column names.
features_list = [*features.value]
features_list


## Note you can also just hard-code in your picked features

['gm_vol', 'wm_vol', 'csf_vol', 'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol']
gives a pretty good result

In [None]:
# Dropdown for the column to predict; defaults to 'age'.
label = widgets.Dropdown(
    description='label',
    options=TOP.columns.tolist(),
    value='age',
    disabled=False,
)
label

In [None]:
# x_column holds the chosen label (target) column; it is appended after the
# selected features to form the full column list.
x_column = [label.value]
full_matrix = [*features_list, *x_column]

In [None]:
# Show the selected feature columns together with the label column.
TOP.loc[:, full_matrix]

Note: pandas defaults the correlation method to `'pearson'`. We need to discuss with the scientists whether another correlation is better; the `kendall` and `spearman` methods are also available.



In [None]:
# Encode sex as a number (F -> 0, M -> 1) so it can take part in the correlation.
sex_mapping = {'F':0,'M':1}
# Values that are not keys of the mapping (for example an already encoded 0/1)
# are passed through unchanged, so re-running this cell no longer turns the
# whole column into NaN.
TOP = TOP.assign(sex = TOP.sex.map(lambda value: sex_mapping.get(value, value)))
TOP

In [None]:
%matplotlib inline
seaborn.heatmap(TOP[full_matrix].corr(), annot = True)

# Now a lot of exciting correlations with everything.. 
Not everything correlates well, but we see that age correlates negatively with GM volume.
This is what we would expect

In [None]:
# Modelling table: the selected features plus the label column.
ml_matrix = TOP.loc[:, full_matrix]

In [None]:
# Display the modelling table (selected features plus label) for a visual check.
ml_matrix

In [None]:
# Split the modelling table into features (X) and target (y).
# The target is the column picked in the `label` dropdown. It used to be
# hard-coded as 'age', which raised KeyError or left the real target inside X
# whenever a different label was selected.
target_column = label.value

X = ml_matrix.drop(columns=target_column)
X = X.values
X = X.astype('float')

y = ml_matrix[target_column].values
y = y.astype('float')


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the shape of the training feature matrix.
X_train.shape

In [None]:
# Fit the scaler on the training split only, then apply that same transform to
# both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Maybe we want a drop down to pick the algorithm?

In [None]:
# Fit an ordinary linear regression on the scaled training data and show the estimator.
linr = LinearRegression().fit(X_train, y_train)
linr

In [None]:
# Predict the target for the held-out test rows.
y_pred = linr.predict(X_test)

In [None]:
# Score the fitted model on the held-out test split.
r2_linear = linr.score(X_test, y_test)
print('R2 score Linear regression: %.3f' % r2_linear)
explained_variance = metrics.explained_variance_score(y_test, y_pred)
print('Explained variance score: %.3f'  % explained_variance)

In [None]:
# Display the predicted values for the test split.
y_pred

In [None]:
# Display the true values of the test split, for comparison with y_pred.
y_test

In [None]:
# Mean absolute error between the true and predicted test values.
mean_absolute_error(y_test, y_pred)

## So why not just train on all features and see if it is better

In [None]:
# Use every TOP column except the identifier; 'age' is removed from the features.
ml_matrix = TOP.drop(columns='participant_id')
X = ml_matrix.drop(columns='age').values.astype('float')

In [None]:
# Target vector: the 'age' column as a float array.
y = ml_matrix['age'].values.astype('float')


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, then apply that same transform to
# both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Fit an ordinary linear regression on the scaled training data and show the estimator.
linr = LinearRegression().fit(X_train, y_train)
linr

In [None]:
# Predict the target for the held-out test rows.
y_pred = linr.predict(X_test)

In [None]:
# Score the all-features model on the held-out test split.
r2_linear = linr.score(X_test, y_test)
print('R2 score Linear regression: %.3f' % r2_linear)
explained_variance = metrics.explained_variance_score(y_test, y_pred)
print('Explained variance score: %.3f'  % explained_variance)
mae = mean_absolute_error(y_test, y_pred)
print('The mean absolute error: %.3f' % mae)

# So now we have our simple baseline model, and we can save it and apply to the other datasets

In [None]:
# Toggle button the user switches on to request that the model be saved.
saving = widgets.ToggleButton(
    description='Click me to save model',
    value=False,
    disabled=False,
    button_style='success',
    tooltip='Description',
    icon='check',
)

saving

In [None]:
# Only ask for a file name when the save toggle is switched on.
# NOTE(review): when the toggle is off, file_given_name is never assigned here.
if saving.value:
    print('You need to name your file, then hit enter')
    file_given_name = input()
    

In [None]:
# Make sure the folder for saved models exists. exist_ok=True makes this a
# no-op when the folder is already there, replacing the separate
# check-then-create steps.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Save the fitted model, but only when the user asked for it via the toggle.
# This used to run unconditionally and raised NameError whenever the toggle was
# off, because file_given_name is only assigned when saving.value is True.
if saving.value:
    joblib.dump(linr, ('../result_models/'+file_given_name+ '.sav'))
else:
    print('Model not saved: switch on the save toggle and name the file first.')
    

Baseline model was at '../result_models/TOP_based_lr.sav'
R2 score Linear regression: 0.576
Explained variance score: 0.577
The mean absolute error: 5.181

In [None]:
# Logistic regression means we need to map to ints at a minimum, but this is a bad algorithm choice