### Select patients for CFR model: Split patients in train, val and test sets ###

In [2]:
import os
import numpy as np
import pandas as pd

# Raise the pandas display limits (rows, columns, line width) used when
# rendering the data frames shown in this notebook.
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 50),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

In [3]:
# The metadata directory is '<data root>/metadata_<date tag>'. The date tag is
# also reused when building the parquet file names further down.
cfr_meta_date = '200304'
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')

meta_dir = os.path.join(cfr_data_root,
                        'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200304


In [4]:
# Read the table of matched files from the metadata directory.
match_view_filename = 'pet_match365_diff_files_'+cfr_meta_date+'.parquet'
match_view_path = os.path.join(meta_dir, match_view_filename)
files_cfr = pd.read_parquet(match_view_path)

# Report the number of distinct values in the patient, study and file columns.
# nunique(dropna=False) counts a missing value as its own entry, exactly like
# len(<column>.unique()).
for count_message, count_column in (('Total number of patients {}', 'mrn'),
                                    ('Total number of studies  {}', 'study'),
                                    ('Total number of echos    {}', 'filename')):
    print(count_message.format(files_cfr[count_column].nunique(dropna=False)))

files_cfr.head()

Total number of patients 1577
Total number of studies  2291
Total number of echos    103271


Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,...,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view
0,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09016b...,2018-11-02 09:49:36,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,33.5,...,2.565391e-08,7.528357e-13,4.774701e-10,2.70874e-12,3.670912e-09,1.567517e-11,1.610227e-10,1.509277e-07,2.56671e-06,1.36324e-13,2.349999e-08,7.267388e-09,2.949915e-12,9.663496e-15,0.0001294855,9.732323e-06,1.612119e-12,0.999858,4.653681e-10,5.331582e-11,3.434301e-11,4.860822e-09,5.479821e-14,2.98651e-08,psax_avz
1,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09016b...,2018-11-02 09:49:36,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,33.371233,...,1.450411e-07,1.366147e-09,3.045583e-11,2.590847e-12,3.514123e-09,1.119247e-12,1.174291e-11,1.653268e-09,6.647998e-09,7.400897e-07,4.554047e-08,2.202479e-12,1.033245e-11,1.524424e-10,1.03291e-10,1.392449e-09,8.828201e-10,5.749271e-07,0.9999073,7.699048e-07,6.217265e-08,8.135313e-09,1.209222e-09,9.03969e-05,psax_az
2,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09016b...,2018-11-02 09:49:36,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,33.533333,...,2.660267e-07,3.19628e-10,2.934359e-09,3.283927e-07,3.254719e-09,1.172012e-07,4.847203e-10,3.966162e-09,1.291069e-09,1.924162e-08,6.683372e-07,0.9999976,1.350526e-09,4.16507e-10,1.1771e-10,1.866018e-09,3.144208e-09,3.432988e-10,9.421681e-10,8.57389e-07,4.335133e-08,1.434178e-10,4.363453e-12,2.205031e-09,other
3,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09016b...,2018-11-02 09:49:36,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,33.70892,...,1.941936e-14,2.563091e-18,2.31556e-14,1.126485e-08,5.550321e-15,1.119701e-19,4.69512e-16,9.429649e-14,1.808776e-17,1.098487e-13,1.432877e-15,2.962554e-15,2.005398e-15,3.412526e-12,1.698335e-14,8.875975e-14,8.122525e-18,1.691228e-16,5.1492060000000005e-17,7.832482e-14,1.353168e-15,1.3699309999999998e-19,3.447312e-22,2.979074e-17,a2c
4,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f,35133560,2018-11-02,2618,191,2018-04-25,35133560_2018-04-25,0,normal,1.716143,1.699871,0.934766,1.588982,17,0.923,1.584,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09016b...,2018-11-02 09:49:36,48b09016b25a7f36_4903a58dd44b7b81d524809aba5f_...,BWH,Vivid E95,GE Vingmed Ultrasound,0.0,33.403756,...,4.836035e-07,0.9999956,4.835767e-11,1.211003e-13,1.254464e-09,3.868239e-12,6.760433e-14,1.656497e-10,1.586072e-09,2.450811e-11,3.36056e-12,2.185993e-14,4.238282e-11,6.147999e-12,9.961027e-12,3.035175e-11,3.153882e-10,3.031236e-14,4.888586e-13,3.554045e-12,1.007157e-11,7.237515e-15,2.361346e-18,9.15012e-12,a3c_laocc


### Split the patients in train, validate and test sets ###
Each view might have a slightly different patient population distribution, because not all views are present in each study. However, we want the same MRNs in each data set for all views, so that we can directly compare the performance of the algorithm on the same patients. We can expand the data frame above to add the splits.

In [10]:
def patientsplit(patient_list, train_test_split=0.87, train_eval_split=0.90, seed=None):
    """Randomly split patient IDs into disjoint train, eval and test sets.

    The IDs are first divided into a train+eval pool and a test set; the eval
    set is then drawn from that pool and the remainder becomes the train set.
    A summary of the resulting set sizes is printed.

    Parameters
    ----------
    patient_list : sequence
        Patient IDs (hashable values) to split.
    train_test_split : float, default 0.87
        Fraction of all patients that goes into the train+eval pool; the pool
        size is floor(train_test_split * len(patient_list)).
    train_eval_split : float, default 0.90
        Fraction of the pool that stays in train; the eval size is
        ceil((1 - train_eval_split) * pool size).
    seed : int or None, default None
        Seed for a private random generator, making the split reproducible.
        With None the draws come from numpy's global random state.

    Returns
    -------
    patient_list_train : set
    patient_list_eval : numpy.ndarray
    patient_list_test : list

    Raises
    ------
    ValueError
        If one of the split fractions is outside the interval [0, 1].
    """
    for fraction_name, fraction in (('train_test_split', train_test_split),
                                    ('train_eval_split', train_eval_split)):
        if not 0 <= fraction <= 1:
            raise ValueError('{} must be between 0 and 1, got {}'.format(fraction_name, fraction))

    # seed=None keeps drawing from the global numpy random state; a given seed
    # uses a private generator so the result does not depend on earlier draws.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Take a test set from all patients
    n_train_eval = int(np.floor(train_test_split*len(patient_list)))
    patient_list_train = rng.choice(patient_list, size=n_train_eval, replace=False)
    patient_list_test = list(set(patient_list).difference(patient_list_train))
    train_test_intersection = set(patient_list_train).intersection(set(patient_list_test)) # This should be empty
    print('Intersection of patient_list_train and patient_list_test:', train_test_intersection)

    # Further separate some patients for evaluation from the training list
    n_eval = int(np.ceil((1-train_eval_split)*len(patient_list_train)))
    patient_list_eval = rng.choice(patient_list_train, size=n_eval, replace=False)

    # Whatever was not drawn for evaluation remains in the training set
    patient_list_train = set(patient_list_train).difference(patient_list_eval)
    train_eval_intersection = set(patient_list_train).intersection(set(patient_list_eval))
    print('Intersection of patient_list_train and patient_list_eval:', train_eval_intersection)

    # Show the numbers
    print('total patients:', len(patient_list))
    print()
    print('patients in set:', sum([len(patient_list_train),
                                   len(patient_list_eval),
                                   len(patient_list_test)]))
    print()
    print('patients in train:', len(patient_list_train))
    print('patients in eval:', len(patient_list_eval))
    print('patients in test:', len(patient_list_test))

    return patient_list_train, patient_list_eval, patient_list_test

In [12]:
# Fix the random seed so that re-running the notebook reproduces the same
# patient split. patientsplit draws from numpy's global random state, and the
# row shuffle below gets the same seed explicitly.
split_seed = 42
np.random.seed(split_seed)

# Get a patient list (unique MRNs, taken from the shuffled rows)
patient_list = list(files_cfr.sample(frac=1, random_state=split_seed).mrn.unique())
patient_list_train, patient_list_eval, patient_list_test = patientsplit(patient_list)

patient_split = {'train': patient_list_train,
                 'eval': patient_list_eval,
                 'test': patient_list_test}

print('Patient IDs in train:', len(patient_split['train']))
print('Patient IDs in eval:', len(patient_split['eval']))
print('Patient IDs in test:', len(patient_split['test']))

print()

# No patient may appear in more than one set: print the overlaps (expected to
# be empty) and fail loudly if any of them is not.
train_test_overlap = set(patient_split['train']).intersection(set(patient_split['test']))
train_eval_overlap = set(patient_split['train']).intersection(set(patient_split['eval']))
eval_test_overlap = set(patient_split['eval']).intersection(set(patient_split['test']))

print('Intersection train-test:', train_test_overlap)
print('Intersection train-eval:', train_eval_overlap)
print('Intersection eval-test:', eval_test_overlap)

assert not train_test_overlap, 'patients shared between train and test'
assert not train_eval_overlap, 'patients shared between train and eval'
assert not eval_test_overlap, 'patients shared between eval and test'

Intersection of patient_list_train and patient_list_test: set()
Intersection of patient_list_train and patient_list_eval: set()
total patients: 1577

patients in set: 1577

patients in train: 1233
patients in eval: 138
patients in test: 206
Patient IDs in train: 1233
Patient IDs in eval: 138
Patient IDs in test: 206

Intersection train-test: set()
Intersection train-eval: set()
Intersection eval-test: set()


In [30]:
# Add mode column to files_cfr
# Build a lookup table with one row per patient: mrn -> name of its split.
split_list = []
for dset, dset_mrns in patient_split.items():
    dset_mrn_list = list(dset_mrns)
    split_list.append(pd.DataFrame({'mrn': dset_mrn_list,
                                    'mode': [dset]*len(dset_mrn_list)}))

split_df = pd.concat(split_list, ignore_index = True)

# validate='many_to_one' raises if an MRN occurs more than once in split_df
# (i.e. a patient was assigned to several splits), which would otherwise
# silently duplicate rows of files_cfr in the merge result.
files_cfr_merged = files_cfr.merge(right = split_df, on = 'mrn', how = 'left',
                                   validate = 'many_to_one')

# With a left merge, rows whose MRN is missing from split_df would end up with
# an empty mode. Use ['mode'] here: the attribute .mode is a DataFrame method.
assert files_cfr_merged['mode'].notna().all(), 'some rows were not assigned to a split'

# Shuffle the rows and renumber the index
files_cfr_dset = files_cfr_merged.sample(frac = 1).reset_index(drop = True)

In [31]:
# Preview the first five rows (the default of head()), now including the mode column.
files_cfr_dset.head()

Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,...,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode
0,4906a124ed0d0dce_4903a58584298228583bf8226717,25003559,2010-07-28,2284,-2,2010-07-30,25003559_2010-07-30,0,normal,1.261163,1.227413,1.311941,1.610294,17,1.187,1.497,4906a124ed0d0dce_4903a58584298228583bf8226717_...,/mnt/obi0/phi/echo/npyFiles/BWH/4906/4906a124e...,2010-07-28 10:32:05,4906a124ed0d0dce_4903a58584298228583bf8226717_...,BWH,iE33,Philips Medical Systems,0.0,33.333,...,9.388647e-12,6.49974e-09,1.580688e-12,3.786818e-10,2.568283e-10,1.570374e-09,2.452145e-05,6.417338e-10,1.268554e-09,7.550944e-08,9.125862999999999e-19,4.00085e-11,6.93753e-09,4.381331e-10,1.032149e-10,8.479908e-06,6.580638e-08,1.110766e-10,0.9987854,0.001181237,5.327792e-08,7.091428e-15,3.272365e-10,psax_mv,train
1,4b7f0ac7ce0eb576_4903a44b32ee56a6429cfd4c15c8,4670790,2008-02-02,449,-307,2008-12-05,4670790_2008-12-05,0,normal,1.316832,1.308031,2.513882,3.288235,17,2.525,3.325,4b7f0ac7ce0eb576_4903a44b32ee56a6429cfd4c15c8_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7f/4b7f0ac7c...,2008-02-02 10:32:59,4b7f0ac7ce0eb576_4903a44b32ee56a6429cfd4c15c8_...,BWH,Vivid i,GEMS Ultrasound,0.0,46.715471,...,3.230762e-11,3.042493e-14,1.98537e-17,1.065918e-13,7.302222e-14,2.357833e-14,4.066218e-16,3.235265e-14,1.860593e-15,6.676245e-15,1.0,1.385712e-15,4.961606e-13,1.338586e-16,1.641447e-12,6.877948e-14,9.477457e-16,1.227924e-13,6.64193e-14,1.14259e-13,1.188788e-17,2.31726e-15,1.03861e-15,other,test
2,4a12b7aba8c603f6_4903a585842a253e69d2fc63f34e,14684252,2010-04-30,1233,0,2010-04-30,14684252_2010-04-30,0,normal,1.016969,1.074092,2.443706,2.624765,17,2.534,2.577,4a12b7aba8c603f6_4903a585842a253e69d2fc63f34e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a12/4a12b7aba...,2010-04-30 11:14:34,4a12b7aba8c603f6_4903a585842a253e69d2fc63f34e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.147134,...,3.9887e-05,7.350071e-06,7.908297e-09,1.125901e-05,1.283584e-09,2.599064e-06,9.306795e-05,2.097071e-07,1.528883e-08,3.094548e-07,5.34563e-07,0.000639104,7.49295e-06,2.560201e-09,8.957485e-08,0.0009314755,1.213045e-05,3.483761e-06,0.8679969,0.130137,2.727836e-06,1.067082e-08,8.148727e-05,psax_mv,train
3,4a11381a56e57f93_4903a44b32ee56a041ca4e8c15c3,17635343,2008-02-04,1483,0,2008-02-04,17635343_2008-02-04,0,normal,1.79617,1.79836,1.606824,2.889647,17,1.619,2.908,4a11381a56e57f93_4903a44b32ee56a041ca4e8c15c3_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a11/4a11381a5...,2008-02-04 08:16:48,4a11381a56e57f93_4903a44b32ee56a041ca4e8c15c3_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.628933,...,4.915686e-12,2.185127e-13,1.488997e-13,2.213529e-14,1.700033e-13,3.097473e-14,2.45444e-13,1.677871e-12,7.030694000000001e-17,2.388882e-07,8.927103e-12,1.052461e-12,7.575542e-15,0.9999996,2.349485e-11,1.055219e-09,9.432121e-08,4.276389e-16,9.709612e-12,2.005225e-12,2.234949e-14,4.575504e-16,1.31494e-16,plax_laz,train
4,4a175452bcd0ee68_4903a58584283859142b614b3001,11718897,2010-06-29,999,1,2010-06-28,11718897_2010-06-28,0,normal,4.508552,4.300807,0.918706,3.951176,17,0.877,3.954,4a175452bcd0ee68_4903a58584283859142b614b3001_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175452b...,2010-06-29 09:24:10,4a175452bcd0ee68_4903a58584283859142b614b3001_...,BWH,iE33,Philips Medical Systems,0.0,43.348,...,5.090393e-12,1.161359e-12,6.993663e-13,3.05247e-12,5.095012e-14,1.436033e-09,1.104003e-13,2.629955e-11,2.829583e-10,3.416125e-17,1.0,5.763017e-15,4.402358e-10,7.959901e-16,1.778235e-10,1.768193e-11,2.485257e-18,2.279241e-12,1.752349e-11,2.706917e-10,2.942441e-14,2.250821e-14,3.740708e-11,other,train


In [32]:
# Build the final table used for writing the TFR files. Rows with a missing
# value in any of the columns listed below are removed.
no_na_columns = ['study', 'mrn', 'echo_study_date', 'mrnstudyid', 'unaffected_cfr']
files_tfr = files_cfr_dset.dropna(subset=no_na_columns)

print('Dropped {} rows.'.format(len(files_cfr_dset) - len(files_tfr)))

# Add the rate column: 1000/frame_time, rounded to one decimal.
# NOTE(review): presumably frame_time is in milliseconds, which would make
# rate a frame rate in frames per second - confirm.
frame_rate = np.round(1/files_tfr.frame_time*1e3, decimals = 1)

# Attach the new column and shuffle all rows.
files_tfr = files_tfr.assign(rate = frame_rate).sample(frac = 1)

Dropped 0 rows.


In [33]:
# Write the prepared table to the metadata directory, then show its shape and
# the first rows.
dset_filename = 'tfr_files_dset_BWH_'+cfr_meta_date+'.parquet'
dset_path = os.path.join(meta_dir, dset_filename)
files_tfr.to_parquet(dset_path)

print(files_tfr.shape)
files_tfr.head()

(101409, 59)


Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,...,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode,rate
91752,49028a1fb2ca702e_4903a58793b43aa1aff31b524260,21052774,2012-03-17,1796,12,2012-03-05,21052774_2012-03-05,0,normal,1.2237,1.216658,0.823471,1.001882,17,0.827,1.012,49028a1fb2ca702e_4903a58793b43aa1aff31b524260_...,/mnt/obi0/phi/echo/npyFiles/BWH/4902/49028a1fb...,2012-03-17 11:46:17,49028a1fb2ca702e_4903a58793b43aa1aff31b524260_...,BWH,Vivid S6,GEMS Ultrasound,0.0,33.504903,...,1.001125e-09,3.501491e-15,3.000134e-12,1.007713e-14,5.500488e-13,1.237732e-09,1.859025e-11,2.019787e-14,1.146686e-10,3.473322e-11,1.29218e-11,2.384686e-09,5.052117e-09,3.909834e-11,0.9999999,8.003702e-08,3.890277e-11,4.797572e-10,7.064245e-10,2.26923e-08,3.9156449999999996e-20,1.200919e-09,plax_plax,train,29.8
44246,49004dc39796d03c_4903a44ab12a9646c21c25df4bed,23356231,2009-01-02,2058,74,2008-10-20,23356231_2008-10-20,0,abnormal,1.560336,1.857667,0.7082,1.3156,5,0.953,1.487,49004dc39796d03c_4903a44ab12a9646c21c25df4bed_...,/mnt/obi0/phi/echo/npyFiles/BWH/4900/49004dc39...,2009-01-02 08:17:19,49004dc39796d03c_4903a44ab12a9646c21c25df4bed_...,BWH,iE33,Philips Medical Systems,0.0,59.36,...,1.193256e-14,3.882084e-09,1.996547e-12,4.74991e-13,8.378746e-11,4.390234e-13,1.680682e-14,7.335824e-14,5.029179e-16,1.0,1.362494e-14,1.085192e-14,3.921255e-14,7.56941e-14,2.555645e-12,4.70485e-18,5.643799e-13,6.246489e-12,2.789734e-10,3.5195000000000004e-17,1.678291e-16,3.478501e-14,other,train,16.8
27726,4a175353c273de20_4903a44b32e4622fe4d1b7bc8426,11051976,2008-08-14,958,-50,2008-10-03,11051976_2008-10-03,0,probably normal,1.925373,1.923164,1.534545,2.951182,11,1.474,2.838,4a175353c273de20_4903a44b32e4622fe4d1b7bc8426_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175353c...,2008-08-14 07:15:15,4a175353c273de20_4903a44b32e4622fe4d1b7bc8426_...,BWH,iE33,Philips Medical Systems,0.0,93.55,...,5.357701e-16,4.908052e-12,1.449995e-13,9.592719000000001e-17,1.625817e-13,3.157377e-16,5.674968e-15,2.297781e-14,2.261094e-16,1.0,9.236566e-17,1.180632e-15,9.317318e-18,7.624617e-14,9.094248e-14,8.175866e-19,2.310466e-13,6.339116e-14,1.781203e-12,1.156225e-16,6.224031000000001e-17,3.601097e-15,other,test,10.7
78540,49004a04801557c4_4903a44b330891872df331673630,23458060,2008-12-04,2068,-1,2008-12-05,23458060_2008-12-05,0,abnormal,2.417808,3.046386,0.3708,1.1296,5,0.584,1.412,49004a04801557c4_4903a44b330891872df331673630_...,/mnt/obi0/phi/echo/npyFiles/BWH/4900/49004a048...,2008-12-04 15:11:38,49004a04801557c4_4903a44b330891872df331673630_...,BWH,SEQUOIA,ACUSON,0.0,33.333,...,1.774926e-10,8.373244e-10,5.152446e-08,6.58406e-08,1.559555e-09,1.063635e-11,5.292048e-11,8.141809e-11,1.319268e-06,0.999953,1.325032e-10,2.146804e-10,1.776688e-09,3.916178e-07,1.586996e-06,2.393213e-05,1.505128e-13,2.696799e-09,1.828527e-05,1.303273e-09,3.06816e-10,6.534714e-13,other,eval,30.0
45791,4b79cbcb4706d24c_4903a5805095fff898d84001ff7c,2636207,2015-02-05,234,-1,2015-02-06,2636207_2015-02-06,0,abnormal,1.831979,1.767319,0.332,0.58675,4,0.369,0.676,4b79cbcb4706d24c_4903a5805095fff898d84001ff7c_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b79/4b79cbcb4...,2015-02-05 10:03:53,4b79cbcb4706d24c_4903a5805095fff898d84001ff7c_...,BWH,CX50,Philips Medical Systems,0.0,25.003,...,7.262886e-11,2.704623e-13,2.575664e-10,2.608807e-12,1.232007e-10,1.019369e-14,2.902633e-13,7.079634e-12,3.192791e-14,2.001189e-15,1.404179e-13,1.395372e-11,0.3177727,0.000403237,2.772404e-15,0.6818241,4.785466e-14,3.531493e-12,4.270111e-11,7.453136999999999e-19,1.049685e-13,2.11593e-13,psax_avz,train,40.0


In [34]:
# Thresholds for counting qualifying rows. The minimum duration is derived from
# the other two. (Alternative: fix minduration, e.g. 1.5, and derive
# minframes = int(np.ceil(minrate*minduration)).)
minrate = 20
minframes = 40
minduration = minframes/minrate
print('Minmum rate:      {}'.format(minrate))
print('Minimum frames:   {}'.format(minframes))
print('Minimum duration: {}'.format(minduration))

maxrows = files_tfr.shape[0]

# A row is disqualified if it fails either threshold and qualified if it meets
# both. Comparisons with a missing value are False, so rows with a missing rate
# or number_of_frames may be counted in neither group.
disqualified_mask = (files_tfr.rate < minrate) | (files_tfr.number_of_frames < minframes)
qualified_mask = (files_tfr.rate >= minrate) & (files_tfr.number_of_frames >= minframes)
rows_disqualified = int(disqualified_mask.sum())
rows_qualified = int(qualified_mask.sum())

# Share of all rows in percent, rounded to one decimal. `decimals` has to be an
# argument of np.round (it used to be passed to str.format by mistake, which
# rounded the value to a whole number). An empty table reports 0.0 instead of
# dividing by zero.
pct_disqualified = np.round(rows_disqualified/maxrows*100, decimals=1) if maxrows else 0.0
pct_qualified = np.round(rows_qualified/maxrows*100, decimals=1) if maxrows else 0.0

print('Disqualified Rows {}, fraction:{:.1f}'.format(rows_disqualified, pct_disqualified))

print('Qualified Rows {}, fraction:{:.1f}'.format(rows_qualified, pct_qualified))

Minmum rate:      20
Minimum frames:   40
Minimum duration: 2.0
Disqualified Rows 27360, fraction:27.0
Qualified Rows 74002, fraction:73.0
