# Clean PVC2 datasets

This notebook takes the TOP, Stroke MRI, Insight 46 and SABRE datasets, and cleans them down to the relevant parameters for an ML model using only corrected ASL values.

Then we show some preliminary correlations and ML

In [None]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

# demo stuff
import ipywidgets as widgets
import seaborn 

# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
import joblib

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

In [None]:
# Load the stitched, conformed (not yet cleaned) table for each cohort.
# The paths must match the files on disk exactly: a trailing space inside the
# TOP filename literal would make the lookup fail.
unclean_TOP = pd.read_csv('../open_work/internal_results/top_stitched_conformed.csv')
unclean_StrokeMRI = pd.read_csv('../open_work/internal_results/mri_stitched_conformed.csv')
# NOTE(review): 'inisight46' looks misspelled -- confirm it matches the file on disk before renaming.
unclean_Insight46 = pd.read_csv('../open_work/internal_results/inisight46_all_stitched_conformed.csv')
unclean_SABRE = pd.read_csv('../open_work/internal_results/SABRE_pvc2_stitched_conformed.csv')

In [None]:
# Row count of the raw Insight46 table, before any column selection or NaN filtering.
len(unclean_Insight46)

In [None]:
# Peek at the last 50 of the first 130 raw TOP rows
# (positions 80-129 when the table has at least 130 rows).
unclean_TOP.head(130).tail(50)

In [None]:
# Columns kept for modelling, written as small groups and concatenated in the
# same order as before.
_subject_columns = ['participant_id', 'age', 'sex']
_structural_columns = [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio',
    'wmh_vol', 'wmh_count',
]
_regional_cov_columns = [
    'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
]
# we presume each of these '_b' columns is cbf, needs a check
_regional_b_columns = ['deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']

list_of_parameters = (
    _subject_columns
    + _structural_columns
    + _regional_cov_columns
    + _regional_b_columns
)

In [None]:
# Restrict every cohort to the shared modelling columns.
TOP, StrokeMRI, Insight46, SABRE = (
    raw_frame[list_of_parameters]
    for raw_frame in (unclean_TOP, unclean_StrokeMRI, unclean_Insight46, unclean_SABRE)
)

In [None]:
# Discard every row that has a missing value in any of the selected columns.
TOP = TOP.dropna(axis=0, how='any')
StrokeMRI = StrokeMRI.dropna(axis=0, how='any')
Insight46 = Insight46.dropna(axis=0, how='any')
SABRE = SABRE.dropna(axis=0, how='any')

In [None]:
# Locate the two TOP entries we know are problematic (subject 239 and 1038)
# and print them so their row labels can be inspected.
top_ids = TOP["participant_id"]
filtered_bad1 = TOP.loc[top_ids.str.contains("sub-0239_1_ses-1_run-1")]
filtered_bad2 = TOP.loc[top_ids.str.contains("1038")]
print(filtered_bad1, filtered_bad2)

In [None]:
# Remove the problematic TOP rows by matching participant_id rather than by
# hard-coded row labels, so the cell keeps working if the labels shift and can
# be re-run without raising KeyError (same approach as the SABRE cleanup).
# NOTE(review): confirm these patterns select exactly the rows labelled 87 and 442.
bad_top_mask = TOP["participant_id"].str.contains("sub-0239_1_ses-1_run-1|1038")
TOP = TOP[~bad_top_mask]


In [None]:
# Cleaning the StrokeMRI dataset: locate and print the rows whose
# participant_id contains "59365".
mri_ids = StrokeMRI["participant_id"]
filtered_bad_mri = StrokeMRI.loc[mri_ids.str.contains("59365")]
print(filtered_bad_mri)

In [None]:
# Remove the problematic StrokeMRI rows by matching participant_id rather than
# by hard-coded row labels, so the cell is re-runnable and robust to label shifts.
# NOTE(review): confirm this pattern selects exactly the rows labelled 470 and 471.
bad_mri_mask = StrokeMRI["participant_id"].str.contains("59365")
StrokeMRI = StrokeMRI[~bad_mri_mask]

In [None]:
# TOP = TOP.drop(TOP.columns[0],axis=1)
# StrokeMRI =StrokeMRI.drop(StrokeMRI.columns[0],axis=1)

In [None]:
# Display TOP after the NaN filtering and row removal done so far.
TOP

In [None]:
# for f in SABRE.participant_id:
#     print(f)

In [None]:
# Locate the SABRE rows whose participant_id matches any of the listed IDs
# (regex alternation) and print them.
# NOTE(review): the last alternative "2341" is shorter than the others and may
# match more participants than intended -- confirm.
sabre_bad_pattern = "180106|164058|24646|501418|600137|502441|265542|225223|95329|68503|34935|229151|501636|500904|373519|256870|24328|234940|2341"
sabre_ids = SABRE["participant_id"]
filtered_bad_sabre = SABRE.loc[sabre_ids.str.contains(sabre_bad_pattern)]
print(filtered_bad_sabre)

In [None]:
# Drop the rows found by the participant_id filter, addressed by their row labels.
SABRE = SABRE.drop(index=filtered_bad_sabre.index)

## Now that we have a clean TOP and StrokeMRI with sex mapped correctly, we can look at our datasets for correlations

let's save off the PVC2 files 

In [None]:
# filepath = '../open_work/internal_results/cleaned_pvc2s/' 
# filename = os.path.join(filepath,'SABRE_pvc2_cleaned.csv') 
# if not os.path.exists(filepath):
#     # if filder doesn't exist, create it
#     os.makedirs(filepath)
# SABRE.to_csv(filename)  

In [None]:
# filepath = '../open_work/internal_results/cleaned_pvc2s/' 
# filename = os.path.join(filepath,'Insight46_pvc2c.csv') 
# if not os.path.exists(filepath):
#     # if filder doesn't exist, create it
#     os.makedirs(filepath)
# Insight46.to_csv(filename)  

In [None]:
# filepath = '../open_work/internal_results/cleaned_pvc2s/' 
# filename = os.path.join(filepath,'TOP_pvc2c.csv') 
# if not os.path.exists(filepath):
#     # if filder doesn't exist, create it
#     os.makedirs(filepath)
# TOP.to_csv(filename)  

In [None]:
# filepath = '../open_work/internal_results/cleaned_pvc2s/' 
# filename = os.path.join(filepath,'StrokeMRI_pvc2c.csv') 
# if not os.path.exists(filepath):
#     # if filder doesn't exist, create it
#     os.makedirs(filepath)
# StrokeMRI.to_csv(filename)  

## pick and visualize correlations

In [None]:
#our_data = input()

In [None]:
# Multi-select widget for choosing which TOP columns act as model features;
# 'gm_vol' is pre-selected.
feature_choices = list(TOP.columns)
features = widgets.SelectMultiple(
    description='Features',
    options=feature_choices,
    value=['gm_vol'],
    disabled=False,
)
features

In [None]:
# Snapshot the widget's current selection as a plain list and show it.
features_list = [*features.value]
features_list


## Note you can also just hard-code in your picked features

['gm_vol', 'wm_vol', 'csf_vol', 'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol']
gives a pretty good result

In [None]:
# Dropdown for choosing the label (target) column; 'age' is pre-selected.
label_choices = list(TOP.columns)
label = widgets.Dropdown(
    description='label',
    options=label_choices,
    value='age',
    disabled=False,
)
label

In [None]:
# Column list for the working matrix: the selected features followed by the
# selected label column.
x_column = [label.value]
full_matrix = [*features_list, *x_column]

In [None]:
# Display the TOP columns chosen as features plus the label.
TOP[full_matrix]

Note: pandas defaults the correlation method to `method='pearson'`. We need to discuss with the scientists whether another correlation is better; the `kendall` and `spearman` types are also available.



In [None]:
# Flip sex back to numbers ('F' -> 0, 'M' -> 1) so it can enter a correlation.
# The mapping is applied only while the column is still non-numeric: mapping an
# already-numeric column would turn every value into NaN, so this guard makes
# the cell safe to re-run.
sex_mapping = {'F':0,'M':1}
if not pd.api.types.is_numeric_dtype(TOP.sex):
    TOP = TOP.assign(sex = TOP.sex.map(sex_mapping))
TOP

In [None]:
%matplotlib inline
seaborn.heatmap(TOP[full_matrix].corr(), annot = True)

# Now a lot of exciting correlations with everything...
Not everything correlates well, but we see that age correlates with GM volume, and negatively.
This is what we would expect.

In [None]:
# Working table for the first model: selected features plus the label column.
ml_matrix = TOP.loc[:, full_matrix]

In [None]:
# Display the working table before it is split into features and target.
ml_matrix

In [None]:
# Feature matrix: every column of ml_matrix except the chosen label, as floats.
# The label comes from the dropdown selection (x_column) rather than a
# hard-coded 'age', so the cell keeps working when another label is picked.
X = ml_matrix.drop(columns=x_column)
X = X.to_numpy(dtype=float)

In [None]:
# Target vector: the column chosen in the label dropdown (defaults to 'age'),
# as floats. Reading label.value instead of a hard-coded 'age' keeps the target
# in step with the dropdown selection.
y = ml_matrix[label.value].to_numpy(dtype=float)


In [None]:
# Hold out 25% of the TOP rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [None]:
# (rows, features) of the training partition.
X_train.shape

In [None]:
# # scale
# sc = StandardScaler()
# sc.fit(X_train)
# X_train = sc.transform(X_train)
# X_test = sc.transform(X_test)

In [None]:
# Maybe we want a drop down to pick the algorithm?

In [None]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, and the bare name on the last line displays it as the cell output.
linr = LinearRegression().fit(X_train, y_train)
linr

In [None]:
# Predict the target for the held-out test rows with the fitted linear model.
y_pred = linr.predict(X_test)

In [None]:
# Report test-set metrics for the linear model.
# The MAE line uses the same label and '%.3f' format as the other metric cells
# in this notebook (the '% .3f' space flag inserted a stray leading blank).
print('R2 score Linear regression: %.3f' % linr.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
#y_pred

In [None]:
#y_test

In [None]:
# Predicted vs. true values on log-log axes, with the identity line for reference.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_test, y_pred, c='crimson')
ax.set_yscale('log')
ax.set_xscale('log')

# End points of the identity line: overall max and min across both arrays.
upper = max(y_pred.max(), y_test.max())
lower = min(y_pred.min(), y_test.min())
ax.plot([upper, lower], [upper, lower], 'b-')
ax.set_xlabel('True Values', fontsize=15)
ax.set_ylabel('Predictions', fontsize=15)
ax.axis('equal')
plt.show()

## So why not just train on all features and see if it is better

In [None]:
# Use every TOP column except the identifier; 'age' is the target, the rest
# become the float feature matrix.
ml_matrix = TOP.drop(columns='participant_id')
X = ml_matrix.drop(columns='age').to_numpy().astype('float')

In [None]:
# Target vector: age as floats.
y = ml_matrix['age'].to_numpy().astype('float')


In [None]:
# Same 75/25 split and seed as the first model, now on the all-feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [None]:
# # scale
# sc = StandardScaler()
# sc.fit(X_train)
# X_train = sc.transform(X_train)
# X_test = sc.transform(X_test)

In [None]:
# Refit the linear regression on the all-feature training split (this rebinds
# linr); the bare name on the last line displays the fitted estimator.
linr = LinearRegression().fit(X_train, y_train)
linr

In [None]:
# Predict the held-out test rows with the all-feature linear model.
y_pred = linr.predict(X_test)

In [None]:
# Test-set metrics for the all-feature linear model.
r2_all = linr.score(X_test, y_test)
explained_all = metrics.explained_variance_score(y_test, y_pred)
mae_all = mean_absolute_error(y_test, y_pred)
print('R2 score Linear regression: %.3f' % r2_all)
print('Explained variance score: %.3f' % explained_all)
print('The mean absolute error: %.3f' % mae_all)

In [None]:
# Predicted vs. true values (linear axes) with the identity line for reference.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_test, y_pred, c='crimson')

# End points of the identity line: overall max and min across both arrays.
upper = max(y_pred.max(), y_test.max())
lower = min(y_pred.min(), y_test.min())
ax.plot([upper, lower], [upper, lower], 'b-')
ax.set_xlabel('True Values', fontsize=15)
ax.set_ylabel('Predictions', fontsize=15)
ax.axis('equal')
plt.show()

In [None]:
# Fit a LassoLars model (alpha=0.01) on the same training split; the bare name
# on the last line displays the fitted estimator.
llreg = linear_model.LassoLars(alpha=0.01).fit(X_train, y_train)
llreg

In [None]:
# Test-set metrics for the LassoLars model.
# Note that y_pred now holds the LassoLars predictions, not the linear ones.
# The R2 line is labelled "Lasso regression" because it scores llreg, matching
# the label used for the StrokeMRI Lasso metrics later in the notebook.
y_pred = llreg.predict(X_test)
print('R2 score Lasso regression: %.3f' % llreg.score(X_test,y_test))
print('Explained variance score: %.3f'  % metrics.explained_variance_score(y_test, y_pred))
print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

# So now we have our simple baseline model, and we can save it and apply to the other datasets

In [None]:
# Toggle button that lets the reader opt in to saving the model; off by default.
saving = widgets.ToggleButton(
    description='Click me to save model',
    tooltip='Description',
    button_style='success',
    icon='check',
    value=False,
    disabled=False,
)

saving

In [None]:
# If the save toggle is on, ask for a file name for the model (the save cell
# appends the '.sav' extension).
# NOTE: file_given_name is only defined when the toggle is on.
if saving.value:
    print('You need to name your file, then hit enter')
    # Blocks until the user types a name and presses enter.
    file_given_name = input()
    

In [None]:
# Make sure the model folder exists. exist_ok=True makes this a no-op when the
# folder is already there and avoids the check-then-create race.
model_folder = '../result_models/'
os.makedirs(model_folder, exist_ok=True)

# optional model saving below

In [None]:
# Save the fitted linear model, but only when the save toggle is on.
# file_given_name is only defined in that case, so an unconditional dump would
# raise NameError on a plain "Run All".
if saving.value:
    joblib.dump(linr, ('../result_models/'+file_given_name+ '.sav'))
    

# Baseline model is at '../result_models/TOP_based_lr.sav'

Simplest Linear Regression

R2 score Linear regression: 0.576

Explained variance score: 0.577

The mean absolute error: 5.181

now we can ask how this does with our other dataset

In [None]:
# Display the cleaned StrokeMRI table before it is used for evaluation.
StrokeMRI

In [None]:
# Encode sex numerically ('F' -> 0, 'M' -> 1), as was done for TOP.
# The mapping is applied only while the column is still non-numeric: mapping an
# already-numeric column would turn every value into NaN, so this guard makes
# the cell safe to re-run.
sex_mapping = {'F':0,'M':1}
if not pd.api.types.is_numeric_dtype(StrokeMRI.sex):
    StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI

In [None]:
# Build the StrokeMRI feature matrix and target the same way as for TOP:
# drop the identifier, use 'age' as the target, cast everything to float.
mri_ml_matrix = StrokeMRI.drop(columns='participant_id')
X_mri = mri_ml_matrix.drop(columns='age').to_numpy().astype('float')
y_mri = mri_ml_matrix['age'].to_numpy().astype('float')

In [None]:
# Split StrokeMRI with 80% of the rows on the test side and a fixed seed.
X_mri_train, X_mri_test, y_mri_train, y_mri_test = train_test_split(
    X_mri,
    y_mri,
    test_size=0.8,
    random_state=42,
)

In [None]:
# # scale
# sc = StandardScaler()
# sc.fit(X_mri_train)
# X_mri_train = sc.transform(X_mri_train)
# X_mri_test = sc.transform(X_mri_test)

In [None]:
# Apply the linear model fitted on TOP to the StrokeMRI test split (no refitting).
y_mri_pred = linr.predict(X_mri_test)

In [None]:
# Metrics of the TOP-fitted linear model on the StrokeMRI test split.
r2_mri = linr.score(X_mri_test, y_mri_test)
explained_mri = metrics.explained_variance_score(y_mri_test, y_mri_pred)
mae_mri = mean_absolute_error(y_mri_test, y_mri_pred)
print('R2 score Linear regression: %.3f' % r2_mri)
print('Explained variance score: %.3f' % explained_mri)
print('The mean absolute error: %.3f' % mae_mri)

In [None]:
#y_mri_test

In [None]:
# Compare the TOP-fitted linear model on StrokeMRI (purple) with the same model
# on its own TOP hold-out split (yellow).
# y_pred was last overwritten with the LassoLars predictions, so the TOP
# predictions of the linear model are recomputed here instead of reusing it;
# otherwise the figure would mix two different models.
y_top_pred = linr.predict(X_test)

plt.figure(figsize=(10,10))
plt.scatter(y_mri_test, y_mri_pred, c='purple', label='StrokeMRI')
plt.scatter(y_test, y_top_pred, c='yellow', label='TOP (hold-out)')
#plt.yscale('log')
#plt.xscale('log')

# Identity line spanning the range of the StrokeMRI values.
p1 = max(max(y_mri_pred), max(y_mri_test))
p2 = min(min(y_mri_pred), min(y_mri_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.title('TOP based model')
plt.legend()
plt.show()

In [None]:
#print('The mean absolute error: %.3f' % mean_absolute_error(y_test, y_pred))

In [None]:
## This is bad... we need to understand where this went off. In the end we will probably make a mixed model, but ...

In [None]:
# Apply the TOP-fitted LassoLars model to the StrokeMRI test split and report
# its metrics. This rebinds y_mri_pred to the Lasso predictions.
y_mri_pred = llreg.predict(X_mri_test)
r2_mri_lasso = llreg.score(X_mri_test, y_mri_test)
explained_mri_lasso = metrics.explained_variance_score(y_mri_test, y_mri_pred)
mae_mri_lasso = mean_absolute_error(y_mri_test, y_mri_pred)
print('R2 score Lasso regression: %.3f' % r2_mri_lasso)
print('Explained variance score: %.3f' % explained_mri_lasso)
print('The mean absolute error: %.3f' % mae_mri_lasso)

In [None]:
# Predicted vs. true StrokeMRI values for the current y_mri_pred, with the
# identity line for reference.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_mri_test, y_mri_pred, c='crimson')

# End points of the identity line: overall max and min across both arrays.
upper = max(y_mri_pred.max(), y_mri_test.max())
lower = min(y_mri_pred.min(), y_mri_test.min())
ax.plot([upper, lower], [upper, lower], 'b-')
ax.set_xlabel('True Values', fontsize=15)
ax.set_ylabel('Predictions', fontsize=15)
ax.axis('equal')
plt.show()

In [None]:
# List the StrokeMRI column names (used to build the correlation column lists).
StrokeMRI.columns

In [None]:
%matplotlib inline
seaborn.heatmap(StrokeMRI[['age', 'sex', 'gm_vol', 'wm_vol', 'csf_vol',
       'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol', 'wmh_count',
       'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
       'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']].corr(), annot = True)

In [None]:
%matplotlib inline
seaborn.heatmap(TOP[['age', 'sex', 'gm_vol', 'wm_vol', 'csf_vol',
       'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol', 'wmh_count',
       'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
       'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']].corr(), annot = True)

In [None]:
# Element-wise difference between the TOP and StrokeMRI correlation matrices
# over the same set of columns.
shared_corr_columns = [
    'age', 'sex', 'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol', 'wmh_count',
    'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
    'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
]
top_corr = TOP[shared_corr_columns].corr()
mri_corr = StrokeMRI[shared_corr_columns].corr()
top_corr - mri_corr

In [None]:
# Age range (min, max) covered by TOP.
TOP.age.min(), TOP.age.max()

In [None]:
# Age range (min, max) covered by StrokeMRI, for comparison with TOP.
StrokeMRI.age.min(), StrokeMRI.age.max()

let's make a miniStrokeMRI cut down to ages close to TOP

In [None]:
# Logistic regression means we need to map to ints at a minimum, but this is a bad algorithm choice

In [None]:
# Keep only the StrokeMRI rows with age above 59.78 and display them.
# NOTE(review): 59.78 is presumably the minimum age in TOP -- confirm, and
# consider whether an upper bound is wanted as well.
older_than_cutoff = StrokeMRI['age'] > 59.78
ministroke = StrokeMRI.loc[older_than_cutoff]
ministroke

So about 60% of our data matches the TOP age range

In [None]:
# Feature matrix and target for the age-restricted StrokeMRI subset, built the
# same way as for the full cohort.
mini_ml_matrix = ministroke.drop(columns='participant_id')
X_mini = mini_ml_matrix.drop(columns='age').to_numpy().astype('float')
y_mini = mini_ml_matrix['age'].to_numpy().astype('float')

In [None]:
# Split the subset with 80% of the rows on the test side and a fixed seed.
X_mini_train, X_mini_test, y_mini_train, y_mini_test = train_test_split(
    X_mini,
    y_mini,
    test_size=0.8,
    random_state=42,
)

In [None]:
# Apply the TOP-fitted linear model to the age-restricted StrokeMRI test split.
y_mini_pred = linr.predict(X_mini_test)

In [None]:
# Metrics of the TOP-fitted linear model on the age-restricted StrokeMRI split.
r2_mini = linr.score(X_mini_test, y_mini_test)
explained_mini = metrics.explained_variance_score(y_mini_test, y_mini_pred)
mae_mini = mean_absolute_error(y_mini_test, y_mini_pred)
print('R2 score Linear regression: %.3f' % r2_mini)
print('Explained variance score: %.3f' % explained_mini)
print('The mean absolute error: %.3f' % mae_mini)

# The model gets better, but is very off, I suspect we have coded the values differently on some columns

In [None]:
# Summary statistics per TOP column, to look for differently coded or outlying values.
TOP.describe()

TOP still has some outlying data that needs to be cleaned out before the model is made... judging from the maxes

In [None]:
# Summary statistics per StrokeMRI column, for comparison with TOP.
StrokeMRI.describe()

In [None]:
# Element-wise difference of the two summary tables (TOP minus StrokeMRI).
TOP.describe() - StrokeMRI.describe()

In [None]:
# TOP rows whose 'deepwm_b' value exceeds 120.
# NOTE(review): the variable name mentions "totalgm" but the filter uses
# 'deepwm_b' -- confirm which column was intended.
deepwm_above_120 = TOP['deepwm_b'] > 120
top_too_high_by_totalgm = TOP.loc[deepwm_above_120]
top_too_high_by_totalgm

In [None]:
# TOP rows whose 'deepwm_b' value is below 30.
# The column name must be 'deepwm_b': 'deepw_b' is not one of the selected
# columns, so indexing with it raises KeyError.
# NOTE(review): the variable name mentions "totalgm" but this filter and the
# matching "too high" filter use 'deepwm_b' -- confirm which column was intended.
top_too_low_by_totalgm =TOP[TOP['deepwm_b'] <30]
top_too_low_by_totalgm

In [None]:
# The ten TOP rows with the highest wmh_count, highest first.
TOP.sort_values(by='wmh_count', ascending=False).head(10)