### Select patients for CFR model: Split patients in train, val and test sets ###

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [2]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_data_dir = os.path.normpath('/mnt/obi0/phi/pet/pet_cfr')
cfr_meta_date = '200425'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200425


In [3]:
match_view_filename = 'pet_match365_diff_files_'+cfr_meta_date+'.parquet'
files_cfr = pd.read_parquet(os.path.join(meta_dir, match_view_filename))

print('Total number of patients      {}'.format(len(files_cfr.mrn.unique())))
print('Total number of echo studies  {}'.format(len(files_cfr.study.unique())))
print('Total number of PET studies   {}'.format(len(files_cfr.petmrn_identifier.unique())))
print('Total number of echos         {}'.format(len(files_cfr.filename.unique())))

files_cfr.head(2)

Total number of patients      2719
Total number of echo studies  4143
Total number of PET studies   2830
Total number of echos         196338


Unnamed: 0,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views
0,35169325,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113,2017-05-22,2017-06-21,35169325_2017-05-22,30,30.0,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b090131...,2017-06-21 09:04:50,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,BWH,iE33,Philips Medical Systems,0.0,33.333,77.0,71.0,0.044038,0.044038,8.544426e-11,1.134038e-13,5.271967e-14,1.336331e-13,1.708275e-15,4.455052e-13,1.0,1.499724e-11,5.666772e-14,6.379312e-14,6.884242e-12,1.625494e-14,1.684185e-12,1.107935e-14,2.403445e-12,6.817977e-16,1.88851e-14,5.909166e-11,1.349804e-14,8.245356e-14,4.105062e-13,4.618445e-11,4.207604e-15,4.772874e-14,9.679026e-17,8.89431e-18,5.028371e-13,a4c,1.0
1,35169325,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113,2017-05-22,2017-06-21,35169325_2017-05-22,30,30.0,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b090131...,2017-06-21 09:04:50,48b09013185d7d6b_4903a582ec77f16c8e735c3b6113_...,BWH,iE33,Philips Medical Systems,0.0,49.49,52.0,71.0,0.04404,0.04404,2.206956e-11,2.73909e-11,1.041237e-11,1.219875e-10,7.501796e-11,1.225508e-11,2.75189e-09,4.431975e-12,7.776415e-12,1.314174e-11,6.367319e-12,5.739965e-12,6.897379e-10,1.617736e-11,0.9999999,3.547227e-12,8.979309e-12,3.594578e-11,9.016331e-11,9.644979e-11,2.262643e-12,1.0363e-09,4.691505e-11,7.552487e-08,8.309125e-13,1.391146e-14,1.978723e-10,other,1.0


### Filter data sets: GLOBAL and NON-DEFECT variables ###

#### GLOBAL VARIABLES ####

Notebook 3/17/2020: global_pet_cfr
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

* As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data

File used: post_2018_pets_with_clinical_cfr_all.csv

* After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion

Merge:

After combining, 3038 PETs
File saved as pets_with_echos_global_all.parquet
Use notes:

* Variables to use: rest_global_mbf, stress_global_mbf, global_cfr_calc
* Other variables- myocardial_perfusion, segmental data, perfusion data, TID, gated SPECT results, calcium score, height, weight
* For "cleaner" data, would exclude those with post-2018==1

In [4]:
global_pet_file = 'pets_with_echos_global_all.parquet'
global_pet = pd.read_parquet(os.path.join(pet_data_dir, global_pet_file))
global_pet = global_pet.astype({'pet_date': 'datetime64[ns]'})
print(f'PET studies: {len(global_pet.petmrn_identifier.unique())}')

PET studies: 3031


In [5]:
# Add echo data to the pet studies (inner join, to keep only keys in both dataframes)
global_pet_echo = global_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='inner')
print(f'PET studies:  {len(global_pet_echo.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(global_pet_echo.study.unique())}')

PET studies:  2192
ECHO studies: 2988


In [6]:
# PET studies without echo data
global_pet_echo_all = global_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='left' )
global_pet_noecho = global_pet_echo_all.loc[global_pet_echo_all.study.isnull()]
print(f'PET studies total:        {len(global_pet_echo_all.petmrn_identifier.unique())}')
print(f'PET studies without echo: {len(global_pet_noecho.petmrn_identifier.unique())}')

PET studies total:        3031
PET studies without echo: 839


In [7]:
# Global_pet_echo table with the variables to use (drop rows with na in any of those variables)
global_pet_echo.head(2)
global_pet_variables_target = ['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018']
global_pet_variables = global_pet_variables_target.copy()
global_pet_variables.extend(list(files_cfr.columns))
global_pet_echo_variables = global_pet_echo[global_pet_variables].dropna(axis=0)
print(f'PET target variables: {global_pet_variables_target}')
print(f'PET studies:          {len(global_pet_echo_variables.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(global_pet_echo_variables.study.unique())}')
print(f'global shape:         {global_pet_echo_variables.shape}')

PET target variables: ['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018']
PET studies:          2175
ECHO studies:         2914
global shape:         (136380, 53)


#### Unaffected (non-defect) variables ####
Notebook 2/26/2020: clean_pet_cfr_petdata4_addingseg7
File used: pet_cfr_petdata_02_26_2020_withperfandseg7.xlsx

* All segments used (segment 7 data now available)
* MBF and CFR calculated for unaffected segments
Perfusion defect segments are those with a perfusion score >0 at rest or stress
Unaffected segments are those not in a perfusion defect, and not adjacent to a perfusion defect
* Of note, PETs with no unaffected segments were excluded from the dataset
* As described above, 2871 after excluding CABG, transplant, and those with missing perfusion data
* After excluding 295 for having no "unaffected segments", 2576 remaining
File used: post_2018_pets_with_clinical_cfr_all.csv

* Clinical global MBF and CFR values for PETs after 10/2018 were obtained from EPIC
These values were also used for unaffected MBF and CFR
After excluding CABG, transplant, and missing CFR values, 167 PETs with any perfusion
After keeping only studies with normal/probably normal perfusion, 83 PETs remaining

Merge:

Total 2659 PETs
File created: non_defect_plus_p2018.parquet
Use notes:

* Variables to use: rest_mbf_unaff, stress_mbf_unaff, unaffected_cfr
* Also available: perfusion data, segmental data, gated SPECT results, calcium score, height, weight
* For clean data, would exclude PETs with post-2018==1, and would limit to PETs with myocardial_perfusion of 'normal' or 'probably normal'- if this is done, would use global_cfr_calc instead of unaffected_cfr because it is slightly more accurate in PETs with normal/prob normal perfusion (could consider looking at whether normal/probably normal PETs are still scored as having small perfusion defects)

In [8]:
nondefect_pet_file = 'non_defect_plus_p2018.parquet'
nondefect_pet = pd.read_parquet(os.path.join(pet_data_dir, nondefect_pet_file))
nondefect_pet = nondefect_pet.astype({'pet_date': 'datetime64[ns]'})
print(f'PET studies: {len(nondefect_pet.petmrn_identifier.unique())}')

PET studies: 2655


In [9]:
# Add echo data to the pet studies (inner join)
nondefect_pet_echo = nondefect_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='inner')
print(f'PET studies:  {len(nondefect_pet_echo.petmrn_identifier.unique())}')
print(f'ECHO studies: {len(nondefect_pet_echo.study.unique())}')

PET studies:  1941
ECHO studies: 2606


In [10]:
# Disqualified PET studies
nondefect_pet_echo_all = nondefect_pet.merge(files_cfr, on = ['mrn', 'pet_date', 'petmrn_identifier'], how='left')
nondefect_pet_noecho = nondefect_pet_echo_all.loc[nondefect_pet_echo_all.study.isnull()]
print(f'PET studies total:        {len(nondefect_pet_echo_all.petmrn_identifier.unique())}')
print(f'PET studies without echo: {len(nondefect_pet_noecho.petmrn_identifier.unique())}')

PET studies total:        2655
PET studies without echo: 714


In [11]:
nondefect_variables_target = ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
nondefect_variables = nondefect_variables_target.copy()
nondefect_variables.extend(list(files_cfr.columns))
nondefect_pet_echo_variables = nondefect_pet_echo[nondefect_variables].dropna(axis=0)
print(f'PET target variables: {nondefect_variables_target}')
print(f'PET studies:          {len(nondefect_pet_echo_variables.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(nondefect_pet_echo_variables.study.unique())}')
print(f'nondefect shape:      {nondefect_pet_echo_variables.shape}')

PET target variables: ['rest_mbf_unaff', 'stress_mbf_unaff', 'unaffected_cfr', 'post-2018']
PET studies:          1926
ECHO studies:         2544
nondefect shape:      (117630, 53)


### Split the patients in train, validate and test sets ###
Although each view might have a little different patient pupulation distribution, because not all views are in each study. However, we want the same MRNS in each data set and for all views so that we can directly compare the performance of the algorithm for the same patients. We can expand the data frame above to add the splits.

In [12]:
def patientsplit(patient_list):

    train_test_split = 0.87
    train_eval_split = 0.90

    # Take a test set from all patients
    patient_list_train = np.random.choice(patient_list,
                                          size = int(np.floor(train_test_split*len(patient_list))),
                                          replace = False)
    patient_list_test = list(set(patient_list).difference(patient_list_train))
    train_test_intersection = set(patient_list_train).intersection(set(patient_list_test)) # This should be empty
    print('Intersection of patient_list_train and patient_list_test:', train_test_intersection)

    # Further separate some patients for evaluation from the training list
    patient_list_eval = np.random.choice(patient_list_train,
                                         size = int(np.ceil((1-train_eval_split)*len(patient_list_train))),
                                         replace = False)

    patient_list_train = set(patient_list_train).difference(patient_list_eval)
    train_eval_intersection = set(patient_list_train).intersection(set(patient_list_eval))
    print('Intersection of patient_list_train and patient_list_eval:', train_eval_intersection)

    # Show the numbers
    print('total patients:', len(patient_list))
    print()
    print('patients in set:', np.sum([len(patient_list_train),
                                     len(patient_list_eval),
                                     len(patient_list_test)]))
    print()
    print('patients in train:', len(patient_list_train))
    print('patients in eval:', len(patient_list_eval))
    print('patients in test:', len(patient_list_test))

    return patient_list_train, patient_list_eval, patient_list_test

In [13]:
dataset= nondefect_pet_echo_variables
dataset_filename = 'nondefect_pet_echo_dataset_'+cfr_meta_date+'.parquet'

#dataset = global_pet_echo_variables
#dataset_filename = 'global_pet_echo_dataset_'+cfr_meta_date+'.parquet'

# Get a patient list
patient_list = list(dataset.sample(frac=1).mrn.unique())
patient_list_train, patient_list_eval, patient_list_test = patientsplit(patient_list)

patient_split = {'train': patient_list_train,
                 'eval': patient_list_eval,
                 'test': patient_list_test}

print('Patient IDs in train:', len(patient_split['train']))
print('Patient IDs in eval:', len(patient_split['eval']))
print('Patient IDs in test:', len(patient_split['test']))

print()

print('Intersection train-test:', set(patient_split['train']).intersection(set(patient_split['test'])))
print('Intersection train-eval:', set(patient_split['train']).intersection(set(patient_split['eval'])))
print('Intersection eval-test:', set(patient_split['eval']).intersection(set(patient_split['test'])))

Intersection of patient_list_train and patient_list_test: set()
Intersection of patient_list_train and patient_list_eval: set()
total patients: 1881

patients in set: 1881

patients in train: 1472
patients in eval: 164
patients in test: 245
Patient IDs in train: 1472
Patient IDs in eval: 164
Patient IDs in test: 245

Intersection train-test: set()
Intersection train-eval: set()
Intersection eval-test: set()


In [14]:
# Add mode column to files_cfr
split_list = []
for dset in patient_split.keys():
    dset_mrn_list = list(patient_split[dset])
    split_list.append(pd.DataFrame({'mrn': dset_mrn_list,
                                    'mode': [dset]*len(dset_mrn_list)}))

split_df = pd.concat(split_list, ignore_index = True)

dataset_split = dataset.merge(right = split_df, on = 'mrn', how = 'left').\
                    sample(frac = 1).\
                    reset_index(drop = True)

In [15]:
print(f'PET studies:          {len(dataset_split.petmrn_identifier.unique())}')
print(f'ECHO studies:         {len(dataset_split.study.unique())}')

PET studies:          1926
ECHO studies:         2544


In [16]:
# Prepare the final set that we will use for writing TFR files. We don't want any rows with NAs in some columns.
dataset_split_tfr = dataset_split.dropna(axis=0)

print('Dropped {} rows.'.format(dataset_split.shape[0]-dataset_split_tfr.shape[0]))
# Add some other information that we need and shuffle the whole thing
dataset_split_tfr = dataset_split_tfr.assign(rate = np.round(1/dataset_split_tfr.frame_time*1e3, decimals = 1))
dataset_split_tfr = dataset_split_tfr.sample(frac = 1)

Dropped 0 rows.


In [17]:
dataset_split_tfr.to_parquet(os.path.join(meta_dir, dataset_filename))
print('Saved to file:', dataset_filename)
print(dataset_split_tfr.shape)
dataset_split_tfr.head()

Saved to file: nondefect_pet_echo_dataset_200425.parquet
(117630, 55)


Unnamed: 0,rest_mbf_unaff,stress_mbf_unaff,unaffected_cfr,post-2018,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,difference(days),filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views,mode,rate
18674,1.059389,0.776528,0.732996,0,11304847,4a1750e8dc0280fd_4903a582ec70ba13ff844c41e609,2017-01-13,2017-01-17,11304847_2017-01-13,4,4.0,4a1750e8dc0280fd_4903a582ec70ba13ff844c41e609_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a1750e8d...,2017-01-17 14:23:41,4a1750e8dc0280fd_4903a582ec70ba13ff844c41e609_...,BWH,iE33,Philips Medical Systems,0.0,33.333,109.0,67.0,0.044038,0.044038,0.01573065,...,1.666434e-05,4.202194e-06,1.389856e-06,7.079621e-07,0.0002001586,7.394475e-06,0.9624209,6.459973e-08,9.710724e-05,2.030896e-06,0.0005103556,5.935002e-06,1.814322e-07,4.043357e-08,6.228231e-07,7.717082e-05,1.948081e-05,0.01013003,0.003366953,6.11714e-05,0.00553306,a5c,1.0,test,30.0
28543,0.644353,0.707412,1.097864,0,10720274,4a16b95a47fc158c_4903a585842f401366fc302dff4d,2010-02-01,2010-01-11,10720274_2010-02-01,-21,21.0,4a16b95a47fc158c_4903a585842f401366fc302dff4d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a16/4a16b95a4...,2010-01-11 14:54:29,4a16b95a47fc158c_4903a585842f401366fc302dff4d_...,BWH,iE33,Philips Medical Systems,0.0,33.333,36.0,93.0,0.028615,0.028615,7.555331e-09,...,1.540241e-08,0.0001018568,8.471653e-11,1.276934e-06,3.278268e-09,0.0006015286,0.0001118803,0.9844056,0.008370587,2.023102e-06,8.783142e-10,2.985569e-05,1.015667e-09,0.0002887829,3.453614e-07,0.0002122665,1.427987e-06,4.718483e-05,0.002616921,0.0002128458,4.511215e-07,apex,1.0,test,30.0
75737,1.51325,2.70475,1.787378,0,22537617,490170812e170e6e_4903a44b32ee56a552ba756edab1,2007-12-31,2008-02-01,22537617_2007-12-31,32,32.0,490170812e170e6e_4903a44b32ee56a552ba756edab1_...,/mnt/obi0/phi/echo/npyFiles/BWH/4901/490170812...,2008-02-01 08:53:52,490170812e170e6e_4903a44b32ee56a552ba756edab1_...,BWH,Vivid i,GEMS Ultrasound,0.0,40.461144,44.0,146.0,0.040921,0.040921,5.014281e-10,...,9.987834e-13,6.087255e-13,9.17286e-12,4.245338e-12,1.967342e-11,1.850203e-11,8.511962e-10,1.078135e-09,4.463536e-11,1.452259e-13,2.315353e-06,3.617304e-10,6.556838e-11,4.250919e-07,7.198787e-05,0.9999237,2.286756e-09,1.556465e-07,3.916849e-12,7.137592e-12,6.43149e-11,psax_az,1.0,train,24.7
19482,0.587932,1.089962,1.853892,0,28944601,490b13fe1869fe6f_4903a58dd5465400b623f4792e3c,2018-10-11,2018-05-25,28944601_2018-10-11,-139,139.0,490b13fe1869fe6f_4903a58dd5465400b623f4792e3c_...,/mnt/obi0/phi/echo/npyFiles/BWH/490b/490b13fe1...,2018-05-25 10:37:44,490b13fe1869fe6f_4903a58dd5465400b623f4792e3c_...,BWH,iE33,Philips Medical Systems,0.0,33.333,88.0,56.0,0.044038,0.044038,7.845179e-08,...,6.51506e-07,2.971888e-07,6.921654e-05,2.129774e-06,0.0001006591,1.9934e-07,4.207442e-07,1.915554e-07,1.123143e-06,0.0009466812,0.0002816789,9.776259e-06,4.159056e-08,0.3594388,1.885697e-06,0.0001061307,1.79908e-08,0.0009106238,0.1041057,5.176221e-10,2.493195e-09,a3c,1.0,test,30.0
72888,0.61363,1.787859,2.913577,0,15106081,4a134274127a86a9_4903a582ec73c1ce2d1d1b54e9bb,2017-11-06,2017-02-14,15106081_2017-11-06,-265,265.0,4a134274127a86a9_4903a582ec73c1ce2d1d1b54e9bb_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a13/4a1342741...,2017-02-14 13:06:21,4a134274127a86a9_4903a582ec73c1ce2d1d1b54e9bb_...,BWH,iE33,Philips Medical Systems,0.0,49.967,61.0,83.0,0.037484,0.037484,5.081746e-10,...,3.285212e-09,2.856128e-08,4.236119e-11,3.319493e-11,1.280459e-09,9.824478e-09,3.020438e-08,1.764973e-13,1.0,1.605269e-11,1.024106e-10,1.789865e-11,7.282268e-09,5.621912e-10,1.050436e-15,2.279305e-10,3.402834e-09,3.213294e-09,7.67635e-13,3.443551e-12,1.171578e-09,other,1.0,train,20.0


In [18]:
minrate = 20
minframes = 40
minduration = minframes/minrate
print('Minmum rate:      {}'.format(minrate))
print('Minimum frames:   {}'.format(minframes))
print('Minimum duration: {}'.format(minduration))
#minduration = 1.5
#minframes = int(np.ceil(minrate*minduration))

maxrows = dataset_split_tfr.shape[0]
rows_disqualified = dataset_split_tfr[(dataset_split_tfr.rate < minrate) | (dataset_split_tfr.number_of_frames<minframes)].shape[0]
rows_qualified = dataset_split_tfr[(dataset_split_tfr.rate >= minrate) & (dataset_split_tfr.number_of_frames>=minframes)].shape[0]

print('Disqualified Rows {}, fraction:{:.1f}'.format(rows_disqualified, 
                                                     np.round(rows_disqualified/maxrows*100), decimals=1))

print('Qualified Rows {}, fraction:{:.1f}'.format(rows_qualified, 
                                                     np.round(rows_qualified/maxrows*100), decimals=1))

Minmum rate:      20
Minimum frames:   40
Minimum duration: 2.0
Disqualified Rows 31440, fraction:27.0
Qualified Rows 86190, fraction:73.0
