### Summary statistics for the training, evaluation and testing data sets ###

In [8]:
import os
import glob
import numpy as np
import pandas as pd

# Widen pandas' display limits so the wide tables below are shown without
# being cut off: up to 500 rows, 50 columns and 1000 characters per line.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50
pd.options.display.width = 1000

In [9]:
# Root of the CFR data tree and the date tag used for the metadata directory.
# NOTE(review): this is a hard-coded absolute path; consider making it configurable.
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
cfr_meta_date = '200208'

# Both working directories sit directly below the data root.
tfr_subdir = 'tfr_200208'
meta_subdir = 'metadata_'+cfr_meta_date
tfr_dir = os.path.join(cfr_data_root, tfr_subdir)
meta_dir = os.path.join(cfr_data_root, meta_subdir)

# Show which TFR directory the cells below read from.
print(tfr_dir)

/mnt/obi0/andreas/data/cfr/tfr_200208


In [11]:
# TFR .parquet data files, one list of shard paths per split (train / eval / test).
# glob.glob returns matches in arbitrary, filesystem-dependent order, so each list
# is sorted: the tables built by concatenating these files then have the same row
# order on every run and on every machine.
train_files = sorted(glob.glob(os.path.join(tfr_dir, 'cfr_resized_a4c_train_200208_*.parquet')))
eval_files = sorted(glob.glob(os.path.join(tfr_dir, 'cfr_resized_a4c_eval_200208_*.parquet')))
test_files = sorted(glob.glob(os.path.join(tfr_dir, 'cfr_resized_a4c_test_200208_*.parquet')))

In [12]:
# Fail early with a readable message when a split has no shard files; otherwise
# pd.concat would raise the far less helpful "No objects to concatenate".
for split_name, split_files in (('train', train_files), ('eval', eval_files), ('test', test_files)):
    if not split_files:
        raise ValueError('No .parquet files found for the {} split'.format(split_name))

# One table per split, built by stacking all shards of that split.
train_df = pd.concat([pd.read_parquet(path) for path in train_files])
eval_df = pd.concat([pd.read_parquet(path) for path in eval_files])
test_df = pd.concat([pd.read_parquet(path) for path in test_files])

# Combined table over all splits. ignore_index=True already gives a fresh 0..n-1
# index, so no additional reset_index is needed.
dset = pd.concat([train_df, eval_df, test_df], axis=0, ignore_index=True)

# NOTE(review): the preview below shows patient-level columns (e.g. mrn and study
# dates) -- consider clearing the cell output before sharing this notebook.
dset.head(2)

Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,filename,dir,datetime,fileid,institution,model,manufacturer,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode,rate,im_array_shape
0,1540947,4b7a826cb58d6913_4903a44516df7dbe0da035089a96,2006-04-19,85786,-225,1723,0607871K,2006-11-30,2006-04-19,1.003046,4b7a826cb58d6913_4903a44516df7dbe0da035089a96_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7a/4b7a826cb...,2006-04-19 16:26:32,4b7a826cb58d6913_4903a44516df7dbe0da035089a96_...,BWH,,Philips_Medical_Systems,33.0,96.0,56.0,0.068052,0.054441,4.141169e-12,1.869846e-08,1.003276e-13,...,1.0,1.051286e-08,6.757379e-11,2.952959e-11,1.496102e-14,6.830049e-14,1.754431e-13,1.585634e-13,7.59067e-12,1.79054e-12,1.061786e-14,3.461857e-13,1.712119e-14,9.724757e-12,7.152225e-13,5.908565e-12,3.865749e-12,6.979855e-10,1.509561e-16,4.563077e-16,7.794424e-17,a4c,train,30.3,"[234, 327, 40]"
1,15302599,4a13402152a4fc32_4903a584a1b2945bd16771efa32d,2011-08-01,114848,-175,4781,EVS0312663,2012-01-23,2011-08-01,2.182906,4a13402152a4fc32_4903a584a1b2945bd16771efa32d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a13/4a1340215...,2011-08-01 14:11:51,4a13402152a4fc32_4903a584a1b2945bd16771efa32d_...,BWH,iE33,Philips Medical Systems,33.333,67.0,55.0,0.035325,0.035325,9.463757e-14,4.190171e-18,3.30517e-15,...,1.0,2.304627e-15,1.034795e-14,8.43824e-16,1.689003e-09,8.075573e-16,7.749743000000001e-17,4.93797e-15,1.53619e-16,6.843063e-14,4.661182e-14,2.7553830000000003e-17,4.825357999999999e-19,3.320423e-15,3.721341e-13,4.607266e-13,3.44647e-15,7.108752e-11,2.382514e-15,1.083428e-18,6.530801e-15,a4c,train,30.0,"[212, 283, 40]"


In [13]:
# Summary table with one row per value of `mode`: the view(s) present and the
# number of distinct mrns, studies, video filenames and cfr values.
df_list = []
for m in dset['mode'].unique():

    # Select the rows of this mode once instead of re-filtering for every statistic.
    rows = dset[dset['mode'] == m]

    # All views are joined into one string so that this entry always has length 1.
    # A bare list of several views next to the length-1 lists below would make
    # pd.DataFrame raise a length-mismatch ValueError. With a single view the
    # resulting value is just that view's name.
    s = {'view': [', '.join(str(v) for v in rows.max_view.unique())],
         'mode': [m],
         # dropna=False counts a missing value as one distinct value,
         # the same as len(series.unique()).
         'mrns': [rows.mrn.nunique(dropna=False)],
         'studies': [rows.study.nunique(dropna=False)],
         'videos': [rows.filename.nunique(dropna=False)],
         'unique_cfr_values': [rows.cfr.nunique(dropna=False)]}

    df_list.append(pd.DataFrame(s))

# ignore_index=True already renumbers the rows 0..n-1.
df_stat = pd.concat(df_list, ignore_index=True)
print(df_stat)

  view   mode  mrns  studies  videos  unique_cfr_values
0  a4c  train   960     1347    3738               1025
1  a4c   eval   101      122     282                102
2  a4c   test   272      356    1044                288


In [14]:
# Reference size used for the per-video scale factor.
im_size = 299

# im_array_shape is indexed as [0] -> height, [1] -> width.
shape_col = dset.im_array_shape
widths = shape_col.apply(lambda shape: shape[1])
heights = shape_col.apply(lambda shape: shape[0])
dset = dset.assign(width = widths, height = heights)

# Scale factor: im_size divided by the larger of height and width of each video.
scale_factors = dset.im_array_shape.apply(lambda shape: im_size/np.amax([shape[0], shape[1]]))
dset = dset.assign(sf = scale_factors)

In [16]:
# Preview the first two rows to check the newly added width, height and sf columns.
dset.head(n=2)

Unnamed: 0,mrn,study,echo_study_date,reportID,days_post_cfr,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,filename,dir,datetime,fileid,institution,model,manufacturer,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,...,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode,rate,im_array_shape,width,height,sf
0,1540947,4b7a826cb58d6913_4903a44516df7dbe0da035089a96,2006-04-19,85786,-225,1723,0607871K,2006-11-30,2006-04-19,1.003046,4b7a826cb58d6913_4903a44516df7dbe0da035089a96_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7a/4b7a826cb...,2006-04-19 16:26:32,4b7a826cb58d6913_4903a44516df7dbe0da035089a96_...,BWH,,Philips_Medical_Systems,33.0,96.0,56.0,0.068052,0.054441,4.141169e-12,1.869846e-08,1.003276e-13,...,2.952959e-11,1.496102e-14,6.830049e-14,1.754431e-13,1.585634e-13,7.59067e-12,1.79054e-12,1.061786e-14,3.461857e-13,1.712119e-14,9.724757e-12,7.152225e-13,5.908565e-12,3.865749e-12,6.979855e-10,1.509561e-16,4.563077e-16,7.794424e-17,a4c,train,30.3,"[234, 327, 40]",327,234,0.914373
1,15302599,4a13402152a4fc32_4903a584a1b2945bd16771efa32d,2011-08-01,114848,-175,4781,EVS0312663,2012-01-23,2011-08-01,2.182906,4a13402152a4fc32_4903a584a1b2945bd16771efa32d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a13/4a1340215...,2011-08-01 14:11:51,4a13402152a4fc32_4903a584a1b2945bd16771efa32d_...,BWH,iE33,Philips Medical Systems,33.333,67.0,55.0,0.035325,0.035325,9.463757e-14,4.190171e-18,3.30517e-15,...,8.43824e-16,1.689003e-09,8.075573e-16,7.749743000000001e-17,4.93797e-15,1.53619e-16,6.843063e-14,4.661182e-14,2.7553830000000003e-17,4.825357999999999e-19,3.320423e-15,3.721341e-13,4.607266e-13,3.44647e-15,7.108752e-11,2.382514e-15,1.083428e-18,6.530801e-15,a4c,train,30.0,"[212, 283, 40]",283,212,1.056537


In [10]:
# Largest height and largest width over all videos (taken independently, so they
# need not come from the same video).
max_image_size = (dset.height.max(), dset.width.max())
print('Maximum image height {}'.format(max_image_size[0]))
print('Maximum image width  {}'.format(max_image_size[1]))

# Factor that maps the larger of the two maxima onto im_size. im_size is used
# instead of repeating the literal 299 so the reference size has one definition;
# it is set in the same cell that creates the height/width columns read above.
image_scale_factor = im_size/np.amax(max_image_size)
print('Image scale factor {:.4f}'.format(image_scale_factor))

Maximum image height 398
Maximum image width  530
Image scale factor 0.5642


In [17]:
# 25th, 50th and 75th percentiles of the per-video scale factors, rounded to 3 decimals
print('Size of table:', dset.shape)
sf_array = dset.sf.values
quartiles = np.percentile(sf_array, (25, 50, 75))
p_list = [np.round(q, decimals = 3) for q in quartiles]
print('Percentile boundaries:', p_list)

Size of table: (5064, 56)
Percentile boundaries: [0.849, 0.946, 1.031]


In [19]:
# p_list holds the rounded 25th/50th/75th percentile scale factors, so its minimum
# is the 25th-percentile value. Dividing im_size by it gives the image size whose
# scale factor equals that percentile. im_size replaces the repeated literal 299.
print('Maximum image size that scales at 25th percentile: {}'.format(im_size/np.amin(p_list)))

Maximum image size that scales at 25th percentile: 352.17903415783275
