# Clean PVC2 datasets

This notebook takes the TOP, Stroke MRI, Insight 46 and SABRE datasets and cleans them down to the relevant parameters for an ML model that uses only corrected ASL values.

In [1]:
import os       # using operating system dependent functionality (folders)
import sys

import glob

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt


sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

In [2]:
# Load the stitched, conformed CSV of each cohort into its own dataframe.
# The TOP path used to end with a stray space inside the quotes; that only opens
# on platforms that strip trailing whitespace from filenames and raises
# FileNotFoundError elsewhere, so the space has been removed.
unclean_TOP = pd.read_csv('../open_work/internal_results/top_stitched_conformed.csv')
unclean_StrokeMRI = pd.read_csv('../open_work/internal_results/mri_stitched_conformed.csv')
# NOTE(review): 'inisight46' looks like a misspelling of 'insight46'; it is kept
# unchanged because it has to match the file name on disk -- confirm.
unclean_Insight46 = pd.read_csv('../open_work/internal_results/inisight46_all_stitched_conformed.csv')
unclean_SABRE = pd.read_csv('../open_work/internal_results/SABRE_pvc2_stitched_conformed.csv')

In [3]:
# Preview the raw TOP table: keep the first 130 rows, then show the last 50 of
# those (positions 80-129 when the table has at least 130 rows).
unclean_TOP.iloc[:130].iloc[-50:]

Unnamed: 0.1,Unnamed: 0,participant_id,session_id,run_id,age,sex,site,gm_vol,wm_vol,csf_vol,...,pca_intermediate_b_cov,pca_intermediate_l_cov,pca_intermediate_r_cov,pca_distal_b_cov,pca_distal_l_cov,pca_distal_r_cov,totalgm_b_cov,totalgm_l_cov,totalgm_r_cov,renumber
80,80,sub-0219_1_ses-1_run-1,ASL_1,1,27.21,F,TOP,0.60395,0.45576,0.22873,...,1.9693,1.8785,2.042,1.9936,1.7839,2.1741,1.6201,1.6912,1.5466,
81,81,sub-0221_1_ses-1_run-1,ASL_1,1,26.98,F,TOP,0.65392,0.46054,0.26372,...,2.0197,1.9611,1.9968,2.988,3.373,2.2858,1.9319,1.9108,1.9323,
82,82,sub-0230_1_ses-1_run-1,ASL_1,1,45.15,F,TOP,0.57708,0.42668,0.32352,...,2.3624,2.3747,2.2534,2.2654,2.2324,2.2808,1.8179,1.9151,1.7084,
83,83,sub-0231_1_ses-1_run-1,ASL_1,1,28.02,F,TOP,0.65691,0.4626,0.22239,...,2.5105,2.519,2.3222,2.2754,2.0331,2.3832,1.9072,1.894,1.8943,
84,84,sub-0234_1_ses-1_run-1,ASL_1,1,20.51,M,TOP,0.736,0.56234,0.22262,...,1.814,1.8199,1.7952,1.9907,1.7572,2.2393,1.8449,1.8866,1.8007,
85,85,sub-0235_1_ses-1_run-1,ASL_1,1,43.46,M,TOP,0.72385,0.53274,0.41613,...,2.1164,2.0991,1.7894,2.9824,2.2663,2.5009,2.0793,2.0472,1.8199,
86,86,sub-0236_1_ses-1_run-1,ASL_1,1,44.88,M,TOP,0.73357,0.64468,0.31959,...,2.1582,2.2932,2.0178,2.7378,2.5273,2.9345,2.551,2.4612,2.638,
87,87,sub-0239_1_ses-1_run-1,ASL_1,1,18.4,M,TOP,0.68888,0.46799,0.23301,...,1.8373,1.9449,1.6423,2.0671,2.1486,1.8111,2.0641,2.1161,1.9857,
88,88,sub-0241_1_ses-1_run-1,ASL_1,1,21.01,F,TOP,0.73427,0.46421,0.24198,...,1.8301,1.7795,1.8575,1.6696,1.5956,1.7522,1.8278,1.7943,1.8579,
89,89,sub-0245_1_ses-1_run-1,ASL_1,1,28.71,M,TOP,0.66657,0.46286,0.30565,...,2.0966,2.2837,1.7657,2.7704,2.4903,2.7318,2.2092,2.2433,2.0239,


In [4]:
# Columns kept for the ML model, written as small groups that are concatenated
# in order; the resulting list order is the column order used downstream.
# NOTE(review): in the rows displayed in this notebook 'gm_ivc_ratio' and
# 'gmwm_ivc_ratio' hold identical values -- confirm upstream that this is intended.
list_of_parameters = (
    # subject identifier, age and sex
    ['participant_id', 'age', 'sex']
    # volume and volume-ratio columns
    + ['gm_vol', 'wm_vol', 'csf_vol', 'gm_ivc_ratio', 'gmwm_ivc_ratio']
    # wmh columns
    + ['wmh_vol', 'wmh_count']
    # regional '*_b_cov' columns
    + ['deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov']
    # regional '*_b' columns: presumed to be CBF, this still needs a check
    + ['deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']
)

In [5]:
# Cut every cohort down to the same columns, in the order of list_of_parameters.
# The cohorts on the right-hand side are in the same order as the names on the left.
TOP, StrokeMRI, Insight46, SABRE = (
    raw_frame[list_of_parameters]
    for raw_frame in (unclean_TOP, unclean_StrokeMRI, unclean_Insight46, unclean_SABRE)
)

In [6]:
# Discard every row that has a missing value in any of the selected columns,
# separately for each cohort (axis=0 -> rows, how='any' -> a single NaN is enough).
TOP = TOP.dropna(axis=0, how='any')
StrokeMRI = StrokeMRI.dropna(axis=0, how='any')
Insight46 = Insight46.dropna(axis=0, how='any')
SABRE = SABRE.dropna(axis=0, how='any')

In [7]:
# Locate the two TOP subjects known to be problematic (subject 0239 and 1038)
# so their rows can be inspected before removal. Nothing is dropped in this cell.
# regex=False: both patterns are plain substrings, not regular expressions.
filtered_bad1 = TOP[TOP["participant_id"].str.contains("sub-0239_1_ses-1_run-1", regex=False)]
filtered_bad2 = TOP[TOP["participant_id"].str.contains("1038", regex=False)]
# Show both matches as one table. Being the last expression of the cell, it is
# rendered by the notebook; print() of two frames ran them together on one line.
pd.concat([filtered_bad1, filtered_bad2])

            participant_id   age sex   gm_vol   wm_vol  csf_vol  gm_ivc_ratio  \
87  sub-0239_1_ses-1_run-1  18.4   M  0.68888  0.46799  0.23301       0.49564   

    gmwm_ivc_ratio  wmh_vol  wmh_count  deepwm_b_cov  aca_b_cov  mca_b_cov  \
87         0.49564   26.247      179.0        5.0432     1.7255     2.1306   

    pca_b_cov  totalgm_b_cov  deepwm_b     aca_b     mca_b    pca_b  totalgm_b  
87     1.5415         2.0641   26.0392  109.7616  104.4269  76.0153    90.4899                participant_id    age sex   gm_vol   wm_vol  csf_vol  \
442  sub-1038_1_ses-1_run-1  41.29   M  0.65458  0.57621  0.32367   

     gm_ivc_ratio  gmwm_ivc_ratio  wmh_vol  wmh_count  deepwm_b_cov  \
442        0.4211          0.4211   42.471      570.0        3.0126   

     aca_b_cov  mca_b_cov  pca_b_cov  totalgm_b_cov  deepwm_b     aca_b  \
442     3.5598     5.2816     3.6628         6.6893  368.1125  465.6356   

        mca_b   pca_b  totalgm_b  
442  663.0069  680.34   710.2923  


In [8]:
# Remove the two problematic TOP subjects. They are selected by participant ID
# instead of the hard-coded row labels 87 and 442: those labels depend on the
# row order of the CSV, and dropping fixed labels raises KeyError when the cell
# is run a second time. Dropping an empty selection is a no-op, so this cell can
# be re-run safely.
problematic_ids = ["sub-0239_1_ses-1_run-1", "sub-1038_1_ses-1_run-1"]
rows_to_drop = TOP.index[TOP["participant_id"].isin(problematic_ids)]
TOP = TOP.drop(index=rows_to_drop)


Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
403,sub-0962_1_ses-1_run-1,30.47,F,0.63721,0.43223,0.3157,0.46003,0.46003,1.396,10.0,7.2988,1.778,2.0274,1.654,1.7946,23.081,91.8373,80.5793,68.8472,76.0572
404,sub-0964_1_ses-1_run-1,55.0,F,0.62234,0.48814,0.27195,0.45018,0.45018,4.93,25.0,7.1638,2.0123,2.034,1.7909,1.8663,24.1996,73.0225,62.7561,53.3612,58.3759
405,sub-0966_1_ses-1_run-1,23.83,F,0.73345,0.51915,0.25679,0.48592,0.48592,1.981,21.0,7.3056,1.8127,2.0022,1.7101,1.9955,24.5471,90.4523,80.462,69.2514,77.0501
406,sub-0967_1_ses-1_run-1,50.91,F,0.58201,0.51598,0.33722,0.40552,0.40552,1.879,21.0,6.9361,1.5684,1.7632,1.5063,1.5408,24.2406,81.0158,69.6362,57.1701,64.7177
407,sub-0975_1_ses-1_run-1,23.93,F,0.79253,0.56059,0.28257,0.48452,0.48452,1.006,20.0,9.4133,1.6692,1.9968,2.1563,1.9926,21.879,81.3658,78.1643,59.5295,69.9513
408,sub-0976_1_ses-1_run-1,31.65,F,0.58085,0.41619,0.19815,0.48599,0.48599,1.613,22.0,5.8813,1.5867,1.9286,1.9164,1.8646,22.9463,78.996,72.3454,55.1956,62.9081
409,sub-0977_1_ses-1_run-1,50.52,M,0.69628,0.54836,0.31907,0.44527,0.44527,3.665,30.0,9.8259,1.8615,2.2284,1.9601,2.1344,23.6663,94.5134,83.901,56.0943,72.1671
410,sub-0979_1_ses-1_run-1,52.14,M,0.63016,0.60017,0.44213,0.37679,0.37679,1.429,24.0,6.8,1.938,2.0138,1.6873,1.9435,23.4684,83.9399,71.5861,55.101,61.1905
411,sub-0981_1_ses-1_run-1,45.2,F,0.60983,0.48338,0.27011,0.44731,0.44731,2.077,22.0,7.339,1.7251,1.8934,1.6695,1.9139,25.9152,91.6647,85.3341,60.6711,74.9387
412,sub-0983_1_ses-1_run-1,36.7,F,0.61775,0.43839,0.35335,0.43828,0.43828,1.962,14.0,5.7383,2.015,1.8588,1.9695,1.8399,25.3795,92.1275,82.4924,63.0298,76.0271


In [None]:
# Now we have a clean TOP with sex mapped correctly, we can now look