### Summary statistics for the training, evaluation and testing data sets ###

In [2]:
import os
import glob
import numpy as np
import pandas as pd

# Raise the pandas display limits so long and wide tables can be inspected in the notebook.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50
pd.options.display.width = 1000

In [3]:
# Dataset name and metadata date stamp; both are embedded in directory and file names.
dset = 'cfr'
meta_date = '200617'

# Root of the data tree. Defaults to the original location; set the CFR_DATA_ROOT
# environment variable to run the notebook against a copy stored elsewhere.
cfr_data_root = os.path.normpath(os.environ.get('CFR_DATA_ROOT', '/mnt/obi0/andreas/data/cfr'))

# Directory that is searched for the per-split TFR .parquet data files.
tfr_dir = os.path.join(cfr_data_root, f'tfr_{meta_date}A', dset)

# Metadata directory for this date stamp.
meta_dir = os.path.join(cfr_data_root, f'metadata_{meta_date}')
print(tfr_dir)

/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr


In [8]:
def _tfr_parquet_files(mode):
    """Return the sorted .parquet file paths in tfr_dir for one split (e.g. 'train')."""
    pattern = os.path.join(tfr_dir, f'{dset}_a4c_{mode}_{meta_date}_*.parquet')
    return sorted(glob.glob(pattern))


def _failed_file_for(parquet_file):
    """Return the '.failed' companion path of a .parquet file.

    Only the trailing extension is swapped, so a '.parquet' substring elsewhere
    in the path (for example in a directory name) is left untouched.
    """
    stem, _ = os.path.splitext(parquet_file)
    return stem + '.failed'


# TFR .parquet data files
train_files = _tfr_parquet_files('train')
eval_files = _tfr_parquet_files('eval')
test_files = _tfr_parquet_files('test')

# List of files that failed TFR conversion
train_failed_files = [_failed_file_for(file) for file in train_files]
eval_failed_files = [_failed_file_for(file) for file in eval_files]
test_failed_files = [_failed_file_for(file) for file in test_files]

print(train_failed_files)

['/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_0.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_1.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_2.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_3.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_4.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_5.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_6.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617A/cfr/cfr_a4c_train_200617_7.failed']


In [35]:
def _load_parquet_files(files):
    """Read every parquet file in `files` and stack the rows into one DataFrame."""
    return pd.concat([pd.read_parquet(path) for path in files])


def _with_duration(frame):
    """Return a copy of `frame` with a `dur` column: frame_time * 1e-3 * number_of_frames.

    NOTE(review): presumably the video duration in seconds, assuming frame_time
    is given in milliseconds -- confirm.
    """
    return frame.assign(dur=frame.frame_time * 1e-3 * frame.number_of_frames)


# Tables of successfully converted videos, one per split
train_df = _load_parquet_files(train_files)
eval_df = _load_parquet_files(eval_files)
test_df = _load_parquet_files(test_files)

# Tables of videos that failed the conversion, one per split
train_failed_df = _load_parquet_files(train_failed_files)
eval_failed_df = _load_parquet_files(eval_failed_files)
test_failed_df = _load_parquet_files(test_failed_files)

# Combined tables over all splits; ignore_index=True already yields a fresh 0..n-1 index
df = pd.concat([train_df, eval_df, test_df], ignore_index=True)
tf_failed_data = _with_duration(
    pd.concat([train_failed_df, eval_failed_df, test_failed_df], ignore_index=True)
)
tf_data = _with_duration(df)

In [36]:
# Distinct videos (by filename) in the converted and in the failed table.
# dropna=False keeps the count identical to len(unique()), which also counts a missing value.
n_videos_success = tf_data['filename'].nunique(dropna=False)
n_videos_failed = tf_failed_data['filename'].nunique(dropna=False)
n_videos = n_videos_success + n_videos_failed

# Share of each group in the total, rounded to two decimals for the report
n_videos_success_frac, n_videos_failed_frac = (
    np.around(count / n_videos, decimals=2)
    for count in (n_videos_success, n_videos_failed)
)
print(f'Successful conversions: {n_videos_success} of {n_videos}, {n_videos_success_frac}')
print(f'Failed conversions:     {n_videos_failed}  of {n_videos}, {n_videos_failed_frac}')

Successful conversions: 7445 of 11077, 0.67
Failed conversions:     3632  of 11077, 0.33


In [37]:
# Reason for failure: one row per distinct `err` value with the number of rows carrying it.
# The columns are named explicitly via rename_axis / reset_index(name=...), so the result
# does not depend on the default names that value_counts() assigns (these differ between
# pandas versions, which would leave the columns mislabeled after a plain rename).
failed_stats = (
    tf_failed_data['err']
    .value_counts()
    .rename_axis('reason')
    .reset_index(name='n')
)
failed_stats

Unnamed: 0,reason,n
0,frame_rate,2418
1,video_len,1050
2,deltaXY,164


In [47]:
# Preview the first 20 rows of the failed-conversion table.
# NOTE(review): the rendered table includes the `mrn` column and study dates --
# consider clearing this output before sharing the notebook.
tf_failed_data.head(20)

Unnamed: 0,rest_global_mbf,stress_global_mbf,global_cfr_calc,post-2018,tracer_obi,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,pet_measurement,difference(days),filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,...,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,sum_views,dset_mode,rate,dur,err
0,0.671,1.27,1.892697,0,rubidium,1258813,4b7a857e1589ed70_4903a44ab12fcc8ede06afa6231e,2009-02-06,2009-04-20,1258813_2009-02-06,73,1.0,73.0,4b7a857e1589ed70_4903a44ab12fcc8ede06afa6231e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7a/4b7a857e1...,2009-04-20 10:50:46,4b7a857e1589ed70_4903a44ab12fcc8ede06afa6231e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.609755,42.0,81.0,0.061381,...,4.006848e-14,1.08693e-11,1.591278e-09,1.841748e-13,1.391018e-12,2.993983e-09,1.959036e-08,9.034701e-11,3.359011e-12,1.157823e-11,9.674532e-15,1.147529e-09,9.054673e-12,4.092726e-13,9.533064e-12,6.801878e-10,1.773107e-10,1.720243e-14,1.111507e-11,a4c,1.0,train,24.6,1.70561,frame_rate
1,0.717,1.071,1.493724,0,rubidium,24776189,4907be21c5742dc3_4903a58584260936d072e8eeb5d7,2010-04-28,2010-08-24,24776189_2010-04-28,118,1.0,118.0,4907be21c5742dc3_4903a58584260936d072e8eeb5d7_...,/mnt/obi0/phi/echo/npyFiles/BWH/4907/4907be21c...,2010-08-24 12:33:11,4907be21c5742dc3_4903a58584260936d072e8eeb5d7_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,52.701148,30.0,86.0,0.056266,...,1.124111e-06,2.798097e-08,0.007670134,3.653413e-07,4.662313e-09,1.317948e-05,2.608606e-06,1.417438e-09,2.320615e-09,4.24158e-09,4.034444e-08,0.01942723,0.3962738,0.001779584,3.719391e-06,0.01911558,0.0001082126,4.211617e-08,0.08341381,a4c,1.0,train,19.0,1.581034,frame_rate
2,1.487,0.999,0.671822,0,rubidium,19561356,4a1fe3f9078e7151_4903a44ab122d65b390411ae3c51,2009-06-01,2009-09-16,19561356_2009-06-01,107,1.0,107.0,4a1fe3f9078e7151_4903a44ab122d65b390411ae3c51_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1f/4a1fe3f90...,2009-09-16 15:34:05,4a1fe3f9078e7151_4903a44ab122d65b390411ae3c51_...,BWH,Vivid i,GEMS Ultrasound,0.0,40.400649,69.0,63.0,0.046036,...,9.787160999999999e-21,3.942709e-15,1.849095e-15,3.431561e-18,1.7720159999999998e-20,2.6537100000000003e-17,1.500834e-15,9.473301e-16,2.092604e-18,2.0608620000000002e-17,1.2419659999999999e-19,1.8641500000000002e-18,9.522944e-15,5.759288e-20,9.217348e-17,2.532945e-14,1.161603e-20,1.007748e-21,2.01598e-18,a4c,1.0,train,24.8,2.787645,frame_rate
3,2.373,2.812,1.184998,0,rubidium,17451121,4a113a00a31bee44_4903a44ab128db06815e6e25ad79,2009-04-02,2009-03-16,17451121_2009-04-02,-17,1.0,17.0,4a113a00a31bee44_4903a44ab128db06815e6e25ad79_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a11/4a113a00a...,2009-03-16 14:41:37,4a113a00a31bee44_4903a44ab128db06815e6e25ad79_...,BWH,Vivid E9,GE Vingmed Ultrasound,0.0,33.498168,92.0,59.0,0.050216,...,6.936445000000001e-17,1.378393e-13,3.42856e-10,1.387009e-13,1.360533e-12,5.482224e-12,2.400814e-13,6.971834e-13,6.190041e-13,6.104367e-14,2.194962e-12,1.069013e-13,3.026332e-11,1.54106e-11,1.730941e-13,1.533053e-12,6.218407e-13,1.404187e-15,3.817291e-14,a4c,1.0,train,29.9,3.081831,frame_rate
4,0.607,0.594,0.978583,0,rubidium,405811,4b7b42ea7990f522_4903a44ab0957d4f8209bbcd5dc9,2009-03-27,2009-12-18,405811_2009-03-27,266,1.0,266.0,4b7b42ea7990f522_4903a44ab0957d4f8209bbcd5dc9_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b42ea7...,2009-12-18 15:15:58,4b7b42ea7990f522_4903a44ab0957d4f8209bbcd5dc9_...,BWH,Vivid i,GEMS Ultrasound,0.0,40.075227,77.0,60.0,0.048593,...,7.930772e-08,1.246466e-06,7.848919e-09,7.811391e-08,1.704199e-08,1.668181e-07,9.328541e-05,5.796964e-07,1.043296e-07,3.657285e-07,4.645654e-11,4.340004e-05,6.116669e-07,7.990619e-06,0.0001186512,1.030577e-05,0.0003338596,1.787704e-05,4.431965e-06,a4c,1.0,train,25.0,3.085792,frame_rate
5,0.668,1.31,1.961078,0,ammonia,26346544,49053540f5b45e7c_4903a580509e8bfc7c215619fa44,2015-08-12,2015-09-02,26346544_2015-08-12,21,1.0,21.0,49053540f5b45e7c_4903a580509e8bfc7c215619fa44_...,/mnt/obi0/phi/echo/npyFiles/BWH/4905/49053540f...,2015-09-02 09:56:25,49053540f5b45e7c_4903a580509e8bfc7c215619fa44_...,BWH,EPIQ 7C,Philips Medical Systems,0.0,54.59,84.0,48.0,0.036816,...,1.859211e-13,5.542149e-08,9.000821e-07,5.188083e-10,7.879942e-08,3.573153e-11,0.2928109,4.269598e-09,1.311914e-08,7.544939e-08,1.841928e-11,3.010075e-10,1.166488e-10,8.906148e-12,3.842252e-12,5.567179e-06,2.844534e-09,1.579103e-12,3.59783e-10,a4c,1.0,train,18.3,4.58556,frame_rate
6,0.623,1.05,1.685393,0,rubidium,10551521,4a16bbc8d17695b0_4903a44517b58d9ea5b3ec70daf6,2006-12-18,2006-12-08,10551521_2006-12-18,-10,1.0,10.0,4a16bbc8d17695b0_4903a44517b58d9ea5b3ec70daf6_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a16/4a16bbc8d...,2006-12-08 15:01:29,4a16bbc8d17695b0_4903a44517b58d9ea5b3ec70daf6_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.227272,45.0,77.0,0.056266,...,1.221619e-07,1.166104e-08,2.958817e-06,2.317289e-08,0.000126921,3.331988e-05,4.659791e-05,2.12421e-08,4.842994e-07,2.473405e-10,1.181511e-09,1.685985e-07,1.305832e-05,7.239636e-07,1.283002e-05,0.001968394,9.419044e-05,1.104952e-07,1.775529e-05,a4c,1.0,train,24.9,1.810227,frame_rate
7,0.64,1.328,2.075,0,ammonia,9462359,4b7297888f75b8c1_4903a582ec77f38d2c43dac436b0,2017-02-01,2017-06-08,9462359_2017-02-01,127,1.0,127.0,4b7297888f75b8c1_4903a582ec77f38d2c43dac436b0_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b72/4b7297888...,2017-06-08 09:44:32,4b7297888f75b8c1_4903a582ec77f38d2c43dac436b0_...,BWH,iE33,Philips Medical Systems,0.0,33.333,27.0,87.0,0.039451,...,3.614179e-14,1.950641e-14,5.850936e-11,3.489171e-15,1.318146e-11,1.914055e-14,3.577661e-12,9.999347e-13,1.363396e-14,4.840279e-16,1.967542e-14,9.58677e-14,1.613477e-12,1.440955e-13,1.306101e-14,1.18066e-09,6.403413e-14,2.9681699999999996e-19,2.254818e-13,a4c,1.0,train,30.0,0.899991,video_len
8,1.116,2.131,1.909498,0,rubidium,1834175,4b7a8f56cd8195eb_4903a44ab12c996797429c61ffcf,2010-07-09,2009-07-29,1834175_2010-07-09,-345,1.0,345.0,4b7a8f56cd8195eb_4903a44ab12c996797429c61ffcf_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7a/4b7a8f56c...,2009-07-29 09:11:53,4b7a8f56cd8195eb_4903a44ab12c996797429c61ffcf_...,BWH,iE33,Philips Medical Systems,0.0,33.333,61.0,67.0,0.039046,...,3.309157e-11,2.784723e-12,1.352769e-10,3.253669e-14,3.653664e-15,5.268406e-15,3.476586e-13,1.093098e-15,2.393558e-13,2.078671e-13,1.503573e-16,2.141381e-16,4.056394e-13,2.488645e-13,1.151283e-11,3.007573e-11,6.980672e-16,2.8853e-20,3.255406e-15,a4c,1.0,train,30.0,2.033313,video_len
9,1.26,2.021,1.603968,0,rubidium,25001462,4906a124ef51f926_4903a5858428385915c8b6c3d39b,2010-06-30,2010-06-29,25001462_2010-06-30,-1,1.0,1.0,4906a124ef51f926_4903a5858428385915c8b6c3d39b_...,/mnt/obi0/phi/echo/npyFiles/BWH/4906/4906a124e...,2010-06-29 14:45:35,4906a124ef51f926_4903a5858428385915c8b6c3d39b_...,BWH,Vivid E9,GE Vingmed Ultrasound,0.0,40.026882,63.0,77.0,0.053166,...,2.196304e-14,7.064956e-15,5.74384e-13,3.921948e-15,2.252118e-14,5.923541e-15,1.272169e-14,2.051514e-16,4.993836e-14,1.352577e-14,4.533649e-13,2.059582e-16,2.0429e-12,1.190869e-14,9.3937e-16,6.380581e-13,2.361612e-15,4.644116e-18,1.495485e-15,a4c,1.0,train,25.0,2.521694,frame_rate


In [46]:
# Sanity check: number of distinct videos (by filename) that were converted successfully.
# tf_data holds the same rows as the combined train/eval/test table, so the count does
# not depend on what the generic name `df` is currently bound to.
print(len(tf_data.filename.unique()))
# Save the table of videos that failed the TFR conversion to the metadata directory.
# The file name carries the same date stamp as the rest of the data set.
failed_file_name = f'global_pet_echo_dataset_{meta_date}.failed'
tf_failed_data.to_parquet(os.path.join(meta_dir, failed_file_name))

7445


In [6]:
# List the column names of the training table.
train_df.columns

Index(['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018', 'tracer_obi', 'mrn', 'study', 'pet_date', 'echo_date', 'petmrn_identifier', 'days_post_pet', 'difference(days)', 'pet_measurement', 'filename', 'dir', 'datetime', 'fileid', 'institution', 'model', 'manufacturer', 'index', 'frame_time', 'number_of_frames', 'heart_rate', 'deltaX', 'deltaY', 'a2c', 'a2c_laocc', 'a2c_lvocc_s', 'a3c', 'a3c_laocc', 'a3c_lvocc_s', 'a4c', 'a4c_far', 'a4c_laocc', 'a4c_lvocc_s', 'a4c_rv', 'a4c_rv_laocc', 'a5c', 'apex', 'other', 'plax_far', 'plax_lac', 'plax_laz', 'plax_laz_ao', 'plax_plax', 'psax_avz', 'psax_az', 'psax_mv', 'psax_pap', 'rvinf', 'subcostal', 'suprasternal', 'max_view', 'sum_views', 'dset_mode', 'rate', 'im_array_shape'], dtype='object')

In [9]:
def print_numbers(df):
    """Print a short summary of one data table.

    Shows the distinct values of `dset_mode` and `max_view`, followed by the
    number of distinct patients (`mrn`), PET studies (`petmrn_identifier`),
    echo studies (`study`) and echo videos (`filename`) found in `df`.
    Returns None; the summary is written to standard output.
    """
    modes = list(df['dset_mode'].unique())
    print(f'Dataset mode: {modes}')
    views = list(df['max_view'].unique())
    print(f'View        : {views}')
    # dropna=False counts a missing value as its own entry, like len(unique()) does.
    n_patients = df['mrn'].nunique(dropna=False)
    print(f'Patients    : {n_patients}')
    n_pet_studies = df['petmrn_identifier'].nunique(dropna=False)
    print(f'PET  studies: {n_pet_studies}')
    n_echo_studies = df['study'].nunique(dropna=False)
    print(f'Echo studies: {n_echo_studies}')
    n_echo_videos = df['filename'].nunique(dropna=False)
    print(f'Echo videos : {n_echo_videos}')

# Print the summary for each split. The loop variable is deliberately not called `df`,
# so the notebook-level combined table of that name is not overwritten by the last split.
for split_df in (train_df, eval_df, test_df):
    print_numbers(split_df)
    print()

Dataset mode: ['train']
View        : ['a4c']
Patients    : 1319
PET  studies: 1375
Echo studies: 1923
Echo videos : 5830

Dataset mode: ['eval']
View        : ['a4c']
Patients    : 140
PET  studies: 143
Echo studies: 203
Echo videos : 588

Dataset mode: ['test']
View        : ['a4c']
Patients    : 250
PET  studies: 264
Echo studies: 364
Echo videos : 1027



In [29]:
# Target size (in pixels) for the longer image side when computing scale factors
im_size = 299

# Overall image stats: all three splits in one table, extended with
#   width / height -- taken from im_array_shape, indexed as [height, width]
#   sf             -- scale factor that maps the longer image side onto im_size
# NOTE(review): this rebinds `dset`, which earlier holds the dataset name string used to
# build the file paths; re-running those earlier cells after this one will not work as intended.
dset = (
    pd.concat([train_df, eval_df, test_df], ignore_index=True)
    .assign(
        width=lambda table: table['im_array_shape'].apply(lambda shape: shape[1]),
        height=lambda table: table['im_array_shape'].apply(lambda shape: shape[0]),
        sf=lambda table: table['im_array_shape'].apply(
            lambda shape: im_size / np.amax([shape[0], shape[1]])
        ),
    )
)

In [30]:
# Largest height and largest width found over all images
max_image_size = (dset.height.max(), dset.width.max())
print('Maximum image height {}'.format(max_image_size[0]))
print('Maximum image width  {}'.format(max_image_size[1]))
# Scale factor that fits the largest image side into im_size pixels. Uses the im_size
# constant instead of repeating the literal 299, so both stay in sync.
image_scale_factor = im_size/np.amax(max_image_size)
print('Image scale factor {:.4f}'.format(image_scale_factor))

Maximum image height 298
Maximum image width  398
Image scale factor 0.7513


In [31]:
# Quartile boundaries (25th, 50th, 75th percentile) of the per-image scale factors,
# rounded to three decimals
print('Size of table:', dset.shape)
sf_array = dset['sf'].to_numpy()
p_list = list(np.round(np.percentile(sf_array, (25, 50, 75)), decimals=3))
print('Percentile boundaries:', p_list)

Size of table: (7819, 60)
Percentile boundaries: [1.154, 1.262, 1.391]


In [32]:
# Image size whose scale factor equals the smallest percentile boundary in p_list
# (the 25th percentile). Uses the im_size constant instead of repeating the literal 299.
print('Maximum image size that scales at 25th percentile: {}'.format(im_size/np.min(p_list)))

Maximum image size that scales at 25th percentile: 259.0987868284229
