### Select patients for CFR model: Split patients in train, val and test sets ###

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [2]:
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200202'
meta_dir = os.path.join(cfr_data_root, 'metadata_'+cfr_meta_date)
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200202


In [3]:
match_view_filename = '210_getStressTest_match365_files_BWH_'+cfr_meta_date+'.parquet'
files_cfr = pd.read_parquet(os.path.join(meta_dir, match_view_filename))

print('Total number of patients {}'.format(len(files_cfr.mrn.unique())))
print('Total number of studies  {}'.format(len(files_cfr.study.unique())))
print('Total number of files    {}'.format(len(files_cfr.filename.unique())))

files_cfr.head()

Total number of patients 1792
Total number of studies  2921
Total number of files    131290


Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,view_a2c,view_a3c,view_a4c,view_plax,view_psax,view_other,sum_views,max_view
0,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.089256,56.0,85.0,0.027944,0.027944,0.9762369,0.0002330583,0.003340996,0.0001106907,0.0007710441,0.01930706,1.0,view_a2c
1,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.064101,53.0,88.0,0.017671,0.017671,1.080687e-06,2.221534e-07,1.34489e-06,0.9993535,6.972564e-06,0.0006367801,1.0,view_plax
2,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,48.333332,46.0,107.0,0.076726,0.076726,2.892548e-11,2.382341e-09,3.241086e-09,1.0,2.436432e-10,4.737633e-10,1.0,view_plax
3,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.450451,38.0,84.0,0.043312,0.043312,3.644072e-10,1.350347e-10,1.540048e-07,1.887208e-10,1.837416e-07,0.9999996,1.0,view_other
4,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.119048,57.0,85.0,0.020879,0.020879,1.801199e-13,9.366453e-13,6.398218e-12,3.151624e-12,6.481914e-12,1.0,1.0,view_other


### Split the patients in train, validate and test sets ###
Although each view might have a little different patient pupulation distribution, because not all views are in each study. However, we want the same MRNS in each data set and for all views so that we can directly compare the performance of the algorithm for the same patients. We can expand the data frame above to add the splits.

In [4]:
def patientsplit(patient_list):

    train_test_split = 0.86
    train_eval_split = 0.90

    # Split train/test sets
    patient_list_train = np.random.choice(patient_list,
                                          size = int(np.floor(train_test_split*len(patient_list))),
                                          replace = False)
    patient_list_test = list(set(patient_list).difference(patient_list_train))
    train_test_intersection = set(patient_list_train).intersection(set(patient_list_test)) # This should be empty
    print('Intersection of patient_list_train and patient_list_test:', train_test_intersection)

    # Further separate some patients for evaluation
    patient_list_eval = np.random.choice(patient_list_train,
                                         size = int(np.ceil((1-train_eval_split)*len(patient_list_train))),
                                         replace = False)

    patient_list_train = set(patient_list_train).difference(patient_list_eval)
    train_eval_intersection = set(patient_list_train).intersection(set(patient_list_eval))
    print('Intersection of patient_list_train and patient_list_eval:', train_eval_intersection)

    # Show the numbers
    print('total patients:', len(patient_list))
    print()
    print('patients in set:', np.sum([len(patient_list_train),
                                     len(patient_list_eval),
                                     len(patient_list_test)]))
    print()
    print('patients in train:', len(patient_list_train))
    print('patients in eval:', len(patient_list_eval))
    print('patients in test:', len(patient_list_test))

    return patient_list_train, patient_list_eval, patient_list_test

In [5]:
# Get a patient list
patient_list = list(files_cfr.sample(frac=1).mrn.unique())
patient_list_train, patient_list_eval, patient_list_test = patientsplit(patient_list)

patient_split = {'train': patient_list_train,
                 'eval': patient_list_eval,
                 'test': patient_list_test}

print('Patient IDs in train:', len(patient_split['train']))
print('Patient IDs in eval:', len(patient_split['eval']))
print('Patient IDs in test:', len(patient_split['test']))

print()

print('contamination train-test:', set(patient_split['train']).intersection(set(patient_split['test'])))
print('contamination train-eval:', set(patient_split['train']).intersection(set(patient_split['eval'])))
print('contamination eval-test:', set(patient_split['eval']).intersection(set(patient_split['test'])))

Intersection of patient_list_train and patient_list_test: set()
Intersection of patient_list_train and patient_list_eval: set()
total patients: 1792

patients in set: 1792

patients in train: 1386
patients in eval: 155
patients in test: 251
Patient IDs in train: 1386
Patient IDs in eval: 155
Patient IDs in test: 251

contamination train-test: set()
contamination train-eval: set()
contamination eval-test: set()


In [6]:
# Add dset column to files_cfr
split_list = []
for dset in patient_split.keys():
    dset_mrn_list = list(patient_split[dset])
    split_list.append(pd.DataFrame({'mrn': dset_mrn_list,
                                    'dset': [dset]*len(dset_mrn_list)}))
split_df = pd.concat(split_list, ignore_index = True)

files_cfr_dset = files_cfr.merge(right = split_df, on = 'mrn', how = 'left').\
                    sample(frac = 1).\
                    reset_index(drop = True)

In [9]:
files_cfr_dset.head(5)

Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,view_a2c,view_a3c,view_a4c,view_plax,view_psax,view_other,sum_views,max_view,dset
0,15807142,4a134bddf3e7bda5_4903a585842d912471d51857d418,2010-03-08,101234,19,1486,EVS0259159,2010-02-17,2010-08-04,1.557312,4a134bddf3e7bda5_4903a585842d912471d51857d418_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a13/4a134bddf...,2010-03-08 16:14:25,4a134bddf3e7bda5_4903a585842d912471d51857d418_...,BWH,Vivid i,GEMS Ultrasound,0.0,63.531296,45.0,62.0,0.046036,0.046036,3.898935e-10,5.720248e-09,5.932444e-09,8.931723e-09,2.915045e-09,1.0,1.0,view_other,train
1,30951032,48b5f0657cba806b_4903a582ec746c0bbfc0974fe76b,2017-05-02,154266,-6,7296,E4895329,2017-05-08,2017-05-02,1.782776,48b5f0657cba806b_4903a582ec746c0bbfc0974fe76b_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b5/48b5f0657...,2017-05-02 09:51:57,48b5f0657cba806b_4903a582ec746c0bbfc0974fe76b_...,BWH,iE33,Philips Medical Systems,0.0,33.333,52.0,105.0,0.035325,0.035325,1.314419e-06,7.985801e-09,8.749991e-09,1.114371e-06,0.999994,4e-06,1.0,view_psax,train
2,12945283,4a14f3d279677334_4903a5864f00cec2d7c9f2c058bc,2013-12-31,128929,-10,1375,EVS0456945,2014-01-10,2013-12-31,2.975839,4a14f3d279677334_4903a5864f00cec2d7c9f2c058bc_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f3d27...,2013-12-31 13:51:06,4a14f3d279677334_4903a5864f00cec2d7c9f2c058bc_...,BWH,iE33,Philips Medical Systems,0.0,33.333,59.0,63.0,0.052627,0.052627,5.65921e-12,5.165135e-11,2.588519e-11,9.892238e-13,6.386083e-08,1.0,1.0,view_other,train
3,6338578,4b7d772493cf8511_4903a44ab12fce6e71204d32264c,2009-04-07,94351,1,3058,EVS0203456,2009-04-06,2009-09-04,1.637666,4b7d772493cf8511_4903a44ab12fce6e71204d32264c_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7d/4b7d77249...,2009-04-07 08:38:14,4b7d772493cf8511_4903a44ab12fce6e71204d32264c_...,BWH,iE33,Philips Medical Systems,0.0,33.333,91.0,61.0,0.039452,0.039452,7.783201e-05,2.392791e-08,1.932503e-06,3.033627e-07,5.360696e-05,0.999866,1.0,view_other,train
4,8170888,4b7358d860949f55_4903a58792a368790e6f1cb8f0a4,2012-12-04,121094,-7,5169,EVS0393455,2012-12-11,2012-12-04,2.773672,4b7358d860949f55_4903a58792a368790e6f1cb8f0a4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7358d86...,2012-12-04 15:36:07,4b7358d860949f55_4903a58792a368790e6f1cb8f0a4_...,BWH,iE33,Philips Medical Systems,0.0,66.0,27.0,182.0,0.025754,0.025754,3.539501e-11,2.29578e-09,2.044834e-09,5.589552e-11,6.525645e-09,1.0,1.0,view_other,train


In [45]:
# Prepare the final set that we will use for writing TFR files
files_tfr = files_cfr_dset.dropna(subset = ['filename', 'frame_time', 'cfr', 'mrn']).\
            drop(columns = ['sum_views', 'index'])

print('Dropped {} rows.'.format(files_cfr_dset.shape[0]-files_tfr.shape[0]))
# Add some other information that we need and shuffle the whole thing
files_tfr = files_tfr.assign(rate = np.round(1/files_tfr.frame_time*1e3, decimals = 1))
files_tfr = files_tfr.sample(frac = 1)

Dropped 9 rows.


In [46]:
dset_filename = 'tfr_files_dset_BWH_'+cfr_meta_date+'.parquet'
files_tfr.to_parquet(os.path.join(meta_dir, dset_filename))
print(files_tfr.shape)
files_tfr.head()

(131282, 31)


Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,filename,dir,datetime,fileid,institution,model,manufacturer,frame_time,number_of_frames,heart_rate,deltaX,deltaY,view_a2c,view_a3c,view_a4c,view_plax,view_psax,view_other,max_view,dset,rate
63145,2523025,4b79c80061112008_4903a584a1b88b12c7ef5ed0ac87,2011-02-17,107083,78,3724,EVS0284554,2010-12-01,2011-02-17,1.832814,4b79c80061112008_4903a584a1b88b12c7ef5ed0ac87_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b79/4b79c8006...,2011-02-17 14:29:35,4b79c80061112008_4903a584a1b88b12c7ef5ed0ac87_...,BWH,iE33,Philips Medical Systems,33.333,47.0,78.0,0.033177,0.033177,5.550229e-07,1.214206e-05,4.393036e-05,3.881712e-05,0.9996817,0.0002228351,view_psax,train,30.0
109853,13410485,4a153253670792d1_4903a5864e6f0b9e90675e01b4bd,2013-09-24,126208,32,1242,EVS0436560,2013-08-23,2013-08-20,1.333333,4a153253670792d1_4903a5864e6f0b9e90675e01b4bd_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a15/4a1532536...,2013-09-24 14:13:23,4a153253670792d1_4903a5864e6f0b9e90675e01b4bd_...,BWH,SEQUOIA,ACUSON,50.0,8.0,89.0,1.0,1.0,0.0001005139,0.003361952,3.504304e-07,2.105851e-05,4.823518e-07,0.9965157,view_other,train,20.0
6604,11051976,4a175353c273de20_4903a44b32edbb6759e2ac6da82b,2008-01-30,91068,-247,2397,0802970P,2008-10-03,2008-01-30,1.925373,4a175353c273de20_4903a44b32edbb6759e2ac6da82b_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a175353c...,2008-01-30 15:20:26,4a175353c273de20_4903a44b32edbb6759e2ac6da82b_...,BWH,iE33,Philips Medical Systems,33.333,91.0,133.0,0.038987,0.038987,0.001581763,1.311611e-07,2.6458e-07,0.001313987,0.9368781,0.06022581,view_psax,train,30.0
16589,22268882,490177d647fb1d91_4903a580509069a4f66253a99498,2015-07-20,133209,361,5606,E1117952,2014-07-24,2015-07-20,1.809524,490177d647fb1d91_4903a580509069a4f66253a99498_...,/mnt/obi0/phi/echo/npyFiles/BWH/4901/490177d64...,2015-07-20 10:19:03,490177d647fb1d91_4903a580509069a4f66253a99498_...,BWH,iE33,Philips Medical Systems,33.333,65.0,84.0,0.041843,0.041843,1.515666e-13,3.476256e-14,1.0,8.354233e-13,1.426983e-10,8.950727e-12,view_a4c,train,30.0
8977,20942504,4903addc0e43ef32_4903a44b32ea00fb1ca50ebb1b82,2008-06-17,73586,-1,2572,0817899R,2008-06-18,2008-06-17,1.119613,4903addc0e43ef32_4903a44b32ea00fb1ca50ebb1b82_...,/mnt/obi0/phi/echo/npyFiles/BWH/4903/4903addc0...,2008-06-17 09:13:12,4903addc0e43ef32_4903a44b32ea00fb1ca50ebb1b82_...,BWH,Vivid7,GE Vingmed Ultrasound,33.431373,86.0,59.0,0.038363,0.038363,5.581668e-11,1.42525e-09,2.570663e-08,1.973289e-08,1.0,1.619786e-09,view_psax,train,29.9


In [47]:
minrate = 20
minduration = 1.5
minframes = int(np.ceil(minrate*minduration))

maxrows = files_tfr.shape[0]
rows_disqualified = files_tfr[(files_tfr.rate < minrate) | (files_tfr.number_of_frames<minframes)].shape[0]
rows_qualified = files_tfr[(files_tfr.rate >= minrate) & (files_tfr.number_of_frames>=minframes)].shape[0]

print('Disqualified Rows {}, fraction:{:.1f}'.format(rows_disqualified, 
                                                     np.round(rows_disqualified/maxrows*100), decimals=1))

print('Qualified Rows {}, fraction:{:.1f}'.format(rows_qualified, 
                                                     np.round(rows_qualified/maxrows*100), decimals=1))

Disqualified Rows 32492, fraction:25.0
Qualified Rows 98790, fraction:75.0


In [48]:
minframes

30