### Summary statistics for the training, evaluation and testing data sets ###

In [44]:
import os
import glob
import numpy as np
import pandas as pd

# Raise pandas' display limits for rows, columns and line width in one place
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 50,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [45]:
# Tag appended to the 'tfr_' and 'metadata_' directory names
# (looks like a YYMMDD date -- TODO confirm)
cfr_meta_date = '200227'
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')

# Both directories sit directly under the data root and share the same tag
tfr_dir, meta_dir = (os.path.join(cfr_data_root, prefix + cfr_meta_date)
                     for prefix in ('tfr_', 'metadata_'))
print(tfr_dir)

/mnt/obi0/andreas/data/cfr/tfr_200227


In [46]:
# TFR .parquet data files, one list of shard paths per split.
# The date tag in the file names is taken from cfr_meta_date (the same tag used
# to build tfr_dir) instead of being repeated as a literal, so changing the tag
# in one place keeps directory and file pattern in agreement.
# NOTE(review): assumes the file-name tag always equals the directory tag -- confirm.
# glob.glob returns matches in a filesystem-dependent order; sorting makes the
# row order of the concatenated data reproducible between runs and machines.
train_files, eval_files, test_files = (
    sorted(glob.glob(os.path.join(
        tfr_dir, 'cfr_resized75_a4c_{}_{}_*.parquet'.format(split, cfr_meta_date))))
    for split in ('train', 'eval', 'test')
)

In [47]:
# Read every parquet shard of a split and stack the shards into one frame per split
train_df, eval_df, test_df = (
    pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths])
    for shard_paths in (train_files, eval_files, test_files)
)

# Stack the three splits; ignore_index=True already yields a fresh 0..n-1 index
dset = pd.concat([train_df, eval_df, test_df], axis=0, ignore_index=True)
print(dset.shape)
# NOTE(review): this preview renders the mrn and study-date columns; clear the
# output before sharing if those are real patient identifiers.
dset.head(2)

(4026, 60)


Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,...,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode,rate,im_array_shape
0,4a16b78fdee0edbe_4903a582ec78ee72194cd5aa653a,10955292,2017-09-26,950,-21,2017-10-17,10955292_2017-10-17,0,abnormal,1.749164,1.811592,0.857298,1.553073,7,0.897,1.569,4a16b78fdee0edbe_4903a582ec78ee72194cd5aa653a_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a16/4a16b78fd...,2017-09-26 09:46:10,4a16b78fdee0edbe_4903a582ec78ee72194cd5aa653a_...,BWH,Affiniti 70C,Philips Medical Systems,0.0,33.333,...,0.813025,0.1864794,8.62887e-09,2.45034e-10,2.697658e-10,1.356396e-09,0.0004879497,1.454432e-08,1.625594e-12,1.447865e-14,8.245728e-10,3.372125e-07,4.602742e-08,1.063295e-10,7.083745e-06,4.548125e-10,1.278141e-10,2.140425e-08,5.318327e-08,6.353986e-11,4.304011e-09,a4c,train,30.0,"[157, 209, 40]"
1,4a14f8a0ab909b97_4903a44b32edba1d33791f30f319,12246930,2008-01-25,1038,-81,2008-04-15,12246930_2008-04-15,0,normal,1.81073,1.69186,0.718353,1.215353,17,0.671,1.215,4a14f8a0ab909b97_4903a44b32edba1d33791f30f319_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f8a0a...,2008-01-25 13:01:06,4a14f8a0ab909b97_4903a44b32edba1d33791f30f319_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.571427,...,1.0,1.048707e-14,2.831727e-13,1.358871e-14,2.769936e-10,2.626319e-16,1.0762e-12,2.360171e-15,4.545515e-15,7.299319e-16,1.136504e-14,1.720896e-14,2.156752e-17,3.669402e-17,5.922067e-13,1.024678e-13,2.603482e-12,4.049234e-14,9.746905000000001e-17,1.233754e-19,9.313774e-16,a4c,train,24.6,"[142, 207, 40]"


In [48]:
# Per-split summary: one row per distinct value of `mode`, with the view(s)
# present and the number of distinct mrn, study, filename and unaffected_cfr
# values in that split.
summary_rows = []
for split_name in dset['mode'].unique():
    # Filter once per split instead of once per statistic
    split_rows = dset[dset['mode'] == split_name]
    summary_rows.append({
        # Join the views into one string so that a split holding several views
        # still gives exactly one row; a list whose length differs from the
        # other (length-1) fields makes pd.DataFrame raise a ValueError.
        'view': ', '.join(str(view) for view in split_rows['max_view'].unique()),
        'mode': split_name,
        # nunique(dropna=False) counts NaN as a value, exactly like len(Series.unique())
        'mrns': split_rows['mrn'].nunique(dropna=False),
        'studies': split_rows['study'].nunique(dropna=False),
        'videos': split_rows['filename'].nunique(dropna=False),
        'unique_cfr_values': split_rows['unaffected_cfr'].nunique(dropna=False),
    })

# Build the table once from the collected records; the explicit column list
# fixes the column order and also covers the case of an empty dset.
df_stat = pd.DataFrame(
    summary_rows,
    columns=['view', 'mode', 'mrns', 'studies', 'videos', 'unique_cfr_values'])
print(df_stat)

  view   mode  mrns  studies  videos  unique_cfr_values
0  a4c  train   843     1071    2928                868
1  a4c   eval    95      115     336                 98
2  a4c   test   237      279     762                240


In [49]:
# Target size that the larger of the first two image dimensions is scaled to
im_size = 299

# im_array_shape entries are indexed as [height, width, ...] in this cell
shapes = dset['im_array_shape']

# Add width and height of the images, plus the per-video scale factor
# sf = im_size / max(height, width)
dset = dset.assign(
    width=shapes.map(lambda shape: shape[1]),
    height=shapes.map(lambda shape: shape[0]),
    sf=shapes.map(lambda shape: im_size/np.amax(shape[:2])),
)

In [50]:
# Preview the first two rows; the width, height and sf columns appear at the end
dset.head(n=2)

Unnamed: 0,study,mrn,echo_study_date,mrnstudyid,days_post_cfr,cfr_study_date,petmrn_identifier,post-2018,myocardial_perfusion,global_cfr_calc,unaffected_cfr,rest_mbf_unaff,stress_mbf_unaff,num_unaffected_segs,rest_global_mbf,stress_global_mbf,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,...,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,max_view,mode,rate,im_array_shape,width,height,sf
0,4a16b78fdee0edbe_4903a582ec78ee72194cd5aa653a,10955292,2017-09-26,950,-21,2017-10-17,10955292_2017-10-17,0,abnormal,1.749164,1.811592,0.857298,1.553073,7,0.897,1.569,4a16b78fdee0edbe_4903a582ec78ee72194cd5aa653a_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a16/4a16b78fd...,2017-09-26 09:46:10,4a16b78fdee0edbe_4903a582ec78ee72194cd5aa653a_...,BWH,Affiniti 70C,Philips Medical Systems,0.0,33.333,...,2.45034e-10,2.697658e-10,1.356396e-09,0.0004879497,1.454432e-08,1.625594e-12,1.447865e-14,8.245728e-10,3.372125e-07,4.602742e-08,1.063295e-10,7.083745e-06,4.548125e-10,1.278141e-10,2.140425e-08,5.318327e-08,6.353986e-11,4.304011e-09,a4c,train,30.0,"[157, 209, 40]",209,157,1.430622
1,4a14f8a0ab909b97_4903a44b32edba1d33791f30f319,12246930,2008-01-25,1038,-81,2008-04-15,12246930_2008-04-15,0,normal,1.81073,1.69186,0.718353,1.215353,17,0.671,1.215,4a14f8a0ab909b97_4903a44b32edba1d33791f30f319_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a14/4a14f8a0a...,2008-01-25 13:01:06,4a14f8a0ab909b97_4903a44b32edba1d33791f30f319_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.571427,...,1.358871e-14,2.769936e-10,2.626319e-16,1.0762e-12,2.360171e-15,4.545515e-15,7.299319e-16,1.136504e-14,1.720896e-14,2.156752e-17,3.669402e-17,5.922067e-13,1.024678e-13,2.603482e-12,4.049234e-14,9.746905000000001e-17,1.233754e-19,9.313774e-16,a4c,train,24.6,"[142, 207, 40]",207,142,1.444444


In [51]:
# Largest values of the height and width columns over all rows of dset
max_image_size = (dset.height.max(), dset.width.max())
print('Maximum image height {}'.format(max_image_size[0]))
print('Maximum image width  {}'.format(max_image_size[1]))
# Factor that maps the largest dimension onto the target size. Reuse im_size
# (the constant the sf column was computed with) instead of repeating 299, so
# the two scale computations cannot drift apart.
image_scale_factor = im_size/np.amax(max_image_size)
print('Image scale factor {:.4f}'.format(image_scale_factor))

Maximum image height 298
Maximum image width  398
Image scale factor 0.7513


In [52]:
# Quartile boundaries (25th, 50th, 75th percentile) of the scale-factor column
print('Size of table:', dset.shape)
sf_array = dset['sf'].to_numpy()
# One vectorised percentile call; list() keeps the entries as numpy scalars
quartiles = np.percentile(sf_array, (25, 50, 75))
p_list = list(np.round(quartiles, decimals=3))
print('Percentile boundaries:', p_list)

Size of table: (4026, 63)
Percentile boundaries: [1.177, 1.262, 1.41]


In [53]:
# Invert sf = im_size / max(height, width) at the 25th-percentile boundary.
# p_list is ordered (25, 50, 75), so its first entry is the 25th percentile
# (and also the smallest value). im_size replaces the repeated literal 299 so
# this stays consistent with how the sf column was computed.
print('Maximum image size that scales at 25th percentile: {}'.format(im_size/p_list[0]))

Maximum image size that scales at 25th percentile: 254.035683942226
