### Summary statistics for the training, evaluation and testing data sets ###

In [2]:
import os
import glob
import numpy as np
import pandas as pd

# Widen the pandas display limits so wide/long frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 50),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Date tag shared by the data directory names and the parquet file names.
cfr_meta_date = '200304'
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')

# Both directories sit directly under the data root and end with the date tag.
tfr_dir, meta_dir = (
    os.path.join(cfr_data_root, prefix + cfr_meta_date)
    for prefix in ('tfr_', 'metadata_')
)
print(tfr_dir)

/mnt/obi0/andreas/data/cfr/tfr_200304


In [11]:
# TFR .parquet data files: one sorted list of matching paths per split.
# The file name pattern differs only in the split name, so build it once per split.
train_files, eval_files, test_files = (
    sorted(glob.glob(os.path.join(
        tfr_dir, 'cfr_resized75_a4c_' + split_name + '_' + cfr_meta_date + '_*.parquet')))
    for split_name in ('train', 'eval', 'test')
)

In [14]:
# Read all parquet files of each split and stack them into one frame per split.
split_frames = []
for split_files in (train_files, eval_files, test_files):
    split_frames.append(pd.concat([pd.read_parquet(path) for path in split_files]))
train_df, eval_df, test_df = split_frames

# Combined table of all three splits; ignore_index=True already yields a fresh 0..n-1 index.
dset = pd.concat(split_frames, axis=0, ignore_index=True)
print(dset.shape)
dset.head(2)

(4110, 60)


Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode,rate,im_array_shape
0,4907b1bf3b33766b_4903a584a1b9a5737afe76d98d69,24867491,2011-03-22,2262,302,2010-05-24,24867491_2010-05-24,0,normal,2.437252,2.450845,0.790412,1.937176,17,0.757,1.845,4907b1bf3b33766b_4903a584a1b9a5737afe76d98d69_...,/mnt/obi0/phi/echo/npyFiles/BWH/4907/4907b1bf3...,2011-03-22 08:06:27,4907b1bf3b33766b_4903a584a1b9a5737afe76d98d69_...,BWH,iE33,Philips Medical Systems,0.0,33.333,...,1.0,3.078564e-13,6.936902e-15,5.975178e-13,1.128219e-13,1.31249e-13,8.505656e-16,3.279004e-12,3.921602e-16,3.154265e-12,2.399879e-14,2.179827e-11,4.844475e-15,7.093673e-12,1.226187e-10,5.04656e-13,7.947762e-13,1.796938e-12,3.000783e-15,3.7378049999999997e-19,1.314921e-16,a4c,train,30.0,"[169, 225, 40]"
1,4906a0b6cbb1f945_4903a58792a07bfce0b19c9eeea2,25132044,2012-11-19,2303,-2,2012-11-21,25132044_2012-11-21,0,abnormal,1.329975,1.364016,0.824444,1.124556,9,0.794,1.056,4906a0b6cbb1f945_4903a58792a07bfce0b19c9eeea2_...,/mnt/obi0/phi/echo/npyFiles/BWH/4906/4906a0b6c...,2012-11-19 08:19:14,4906a0b6cbb1f945_4903a58792a07bfce0b19c9eeea2_...,BWH,Vivid S6,GEMS Ultrasound,0.0,33.550724,...,0.853535,1.943321e-07,0.001056534,3.3109e-07,0.00229831,2.954689e-07,4.571272e-06,5.513391e-06,1.074721e-08,2.505709e-07,6.072935e-05,3.855222e-07,6.210845e-09,1.552717e-07,3.10684e-07,2.133884e-06,1.033491e-06,0.0001975696,4.156411e-06,3.346775e-10,6.949776e-07,a4c,train,29.8,"[137, 206, 40]"


In [15]:
# Per-split summary table: one row per distinct value of the `mode` column,
# with the number of distinct patients (mrn), studies, video files and
# unaffected_cfr values found in that split.
stat_rows = []
for m in dset['mode'].unique():
    # Filter once per split instead of once per statistic.
    subset = dset[dset['mode'] == m]
    stat_rows.append({
        # Collapse the views into a single string. Passing the raw list of views
        # breaks as soon as a split holds more than one view, because the other
        # entries all have length one and the frame constructor then sees columns
        # of unequal length. With a single view the value is unchanged.
        'view': ', '.join(str(v) for v in subset.max_view.unique()),
        'mode': m,
        # dropna=False keeps the counts identical to len(series.unique()),
        # which counts a missing value as one distinct value.
        'mrns': subset.mrn.nunique(dropna=False),
        'studies': subset.study.nunique(dropna=False),
        'videos': subset.filename.nunique(dropna=False),
        'unique_cfr_values': subset.unaffected_cfr.nunique(dropna=False),
    })

# Build the table in one go from the row records (column order follows the dict keys).
df_stat = pd.DataFrame(stat_rows)
print(df_stat)

  view   mode  mrns  studies  videos  unique_cfr_values
0  a4c  train   922     1141    3175                942
1  a4c   eval   105      139     350                107
2  a4c   test   156      211     585                167


In [16]:
# Target edge length used for the per-row scale factor.
im_size = 299

# im_array_shape is indexed as [0] -> height and [1] -> width.
# Add both as columns, plus a scale factor column `sf` that maps the longer of
# the two edges onto im_size.
dset = dset.assign(
    width=dset.im_array_shape.apply(lambda shape: shape[1]),
    height=dset.im_array_shape.apply(lambda shape: shape[0]),
    sf=dset.im_array_shape.apply(lambda shape: im_size/np.amax([shape[0], shape[1]])),
)

In [17]:
# Preview the first two rows again; the frame now also carries the width, height and sf columns.
dset.head(2)

Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,...,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode,rate,im_array_shape,width,height,sf
0,4907b1bf3b33766b_4903a584a1b9a5737afe76d98d69,24867491,2011-03-22,2262,302,2010-05-24,24867491_2010-05-24,0,normal,2.437252,2.450845,0.790412,1.937176,17,0.757,1.845,4907b1bf3b33766b_4903a584a1b9a5737afe76d98d69_...,/mnt/obi0/phi/echo/npyFiles/BWH/4907/4907b1bf3...,2011-03-22 08:06:27,4907b1bf3b33766b_4903a584a1b9a5737afe76d98d69_...,BWH,iE33,Philips Medical Systems,0.0,33.333,...,5.975178e-13,1.128219e-13,1.31249e-13,8.505656e-16,3.279004e-12,3.921602e-16,3.154265e-12,2.399879e-14,2.179827e-11,4.844475e-15,7.093673e-12,1.226187e-10,5.04656e-13,7.947762e-13,1.796938e-12,3.000783e-15,3.7378049999999997e-19,1.314921e-16,a4c,train,30.0,"[169, 225, 40]",225,169,1.328889
1,4906a0b6cbb1f945_4903a58792a07bfce0b19c9eeea2,25132044,2012-11-19,2303,-2,2012-11-21,25132044_2012-11-21,0,abnormal,1.329975,1.364016,0.824444,1.124556,9,0.794,1.056,4906a0b6cbb1f945_4903a58792a07bfce0b19c9eeea2_...,/mnt/obi0/phi/echo/npyFiles/BWH/4906/4906a0b6c...,2012-11-19 08:19:14,4906a0b6cbb1f945_4903a58792a07bfce0b19c9eeea2_...,BWH,Vivid S6,GEMS Ultrasound,0.0,33.550724,...,3.3109e-07,0.00229831,2.954689e-07,4.571272e-06,5.513391e-06,1.074721e-08,2.505709e-07,6.072935e-05,3.855222e-07,6.210845e-09,1.552717e-07,3.10684e-07,2.133884e-06,1.033491e-06,0.0001975696,4.156411e-06,3.346775e-10,6.949776e-07,a4c,train,29.8,"[137, 206, 40]",206,137,1.451456


In [18]:
# Largest height and width over all rows of dset, as a (height, width) pair.
max_image_size = (dset.height.max(), dset.width.max())
print('Maximum image height {}'.format(max_image_size[0]))
print('Maximum image width  {}'.format(max_image_size[1]))
# Factor that maps the largest edge in the whole data set onto the target size.
# Uses im_size (the same target size the per-row `sf` column is based on)
# rather than repeating the literal 299, so the two cannot drift apart.
image_scale_factor = im_size/np.amax(max_image_size)
print('Image scale factor {:.4f}'.format(image_scale_factor))

Maximum image height 298
Maximum image width  398
Image scale factor 0.7513


In [19]:
# Quartile boundaries (25th, 50th, 75th percentile) of the per-row scale factors,
# rounded to three decimals.
print('Size of table:', dset.shape)
sf_array = dset.sf.values
quartile_values = np.percentile(sf_array, (25, 50, 75))
p_list = list(np.round(quartile_values, decimals = 3))
print('Percentile boundaries:', p_list)

Size of table: (4110, 63)
Percentile boundaries: [1.177, 1.262, 1.41]


In [53]:
# Edge length that corresponds to the smallest listed scale factor (the 25th
# percentile boundary): size = im_size / scale factor. im_size is the shared
# target size, used here instead of a second copy of the literal 299.
print('Maximum image size that scales at 25th percentile: {}'.format(im_size/np.min(p_list)))

Maximum image size that scales at 25th percentile: 254.035683942226
