### Summary statistics for the training, evaluation and testing data sets ###

In [1]:
import os
import glob
import numpy as np
import pandas as pd

# Raise pandas' display limits (rows, columns, line width) for the tables shown below.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 50),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Dataset name and metadata date stamp; both are embedded in the paths built below.
dset = 'cfr'
meta_date = '200617'

# TFR data lives in <root>/tfr_<date>/<dset>, metadata in <root>/metadata_<date>.
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
tfr_dir = os.path.join(cfr_data_root, f'tfr_{meta_date}', dset)
meta_dir = os.path.join(cfr_data_root, f'metadata_{meta_date}')

print(tfr_dir)

/mnt/obi0/andreas/data/cfr/tfr_200617/cfr


In [3]:
# TFR .parquet data files: one sorted list of paths per split, matched on the
# '<dset>_a4c_<split>_<meta_date>_*.parquet' naming pattern inside tfr_dir.
train_files, eval_files, test_files = (
    sorted(glob.glob(os.path.join(tfr_dir, f'{dset}_a4c_{split}_{meta_date}_*.parquet')))
    for split in ('train', 'eval', 'test')
)

In [5]:
def _load_split(files, split_name):
    """Read every parquet file of one split and stack the rows into one DataFrame.

    Args:
        files: list of parquet file paths belonging to the split.
        split_name: label of the split, used only in the error message.

    Returns:
        Row-wise concatenation of all files; each file's own index is kept.

    Raises:
        FileNotFoundError: if `files` is empty. Without this check pd.concat
            fails with the uninformative "No objects to concatenate".
    """
    if not files:
        raise FileNotFoundError(f'No parquet files matched for the {split_name} split')
    return pd.concat([pd.read_parquet(file) for file in files])


train_df = _load_split(train_files, 'train')
eval_df = _load_split(eval_files, 'eval')
test_df = _load_split(test_files, 'test')

# Combined table over all splits; ignore_index=True already yields a fresh
# 0..n-1 index, so no additional reset_index is needed.
df = pd.concat([train_df, eval_df, test_df], ignore_index=True)

In [7]:
# Number of distinct values in the filename column of the combined table.
print(len(df['filename'].unique()))
# File name for the table of all image data that was converted to TFR.
# The date stamp is taken from meta_date so it stays in step with the
# directory names derived from the same value.
# NOTE(review): nothing in this cell actually writes the file — confirm where it is saved.
dataset_file_name = f'global_pet_echo_dataset_{meta_date}_.parquet'


7445


In [6]:
# Display the column names of the training table.
train_df.columns

Index(['rest_global_mbf', 'stress_global_mbf', 'global_cfr_calc', 'post-2018', 'tracer_obi', 'mrn', 'study', 'pet_date', 'echo_date', 'petmrn_identifier', 'days_post_pet', 'difference(days)', 'pet_measurement', 'filename', 'dir', 'datetime', 'fileid', 'institution', 'model', 'manufacturer', 'index', 'frame_time', 'number_of_frames', 'heart_rate', 'deltaX', 'deltaY', 'a2c', 'a2c_laocc', 'a2c_lvocc_s', 'a3c', 'a3c_laocc', 'a3c_lvocc_s', 'a4c', 'a4c_far', 'a4c_laocc', 'a4c_lvocc_s', 'a4c_rv', 'a4c_rv_laocc', 'a5c', 'apex', 'other', 'plax_far', 'plax_lac', 'plax_laz', 'plax_laz_ao', 'plax_plax', 'psax_avz', 'psax_az', 'psax_mv', 'psax_pap', 'rvinf', 'subcostal', 'suprasternal', 'max_view', 'sum_views', 'dset_mode', 'rate', 'im_array_shape'], dtype='object')

In [9]:
def print_numbers(df):
    """Print the split label(s), view(s) and distinct-value counts of a table.

    Args:
        df: DataFrame providing the columns dset_mode, max_view, mrn,
            petmrn_identifier, study and filename.

    Prints six lines: the distinct dset_mode and max_view values as lists,
    then the number of distinct values in mrn (patients), petmrn_identifier
    (PET studies), study (echo studies) and filename (echo videos).
    """
    def distinct(column):
        # Distinct values found in one column of the table.
        return df[column].unique()

    print(f'Dataset mode: {list(distinct("dset_mode"))}')
    print(f'View        : {list(distinct("max_view"))}')
    print(f'Patients    : {len(distinct("mrn"))}')
    print(f'PET  studies: {len(distinct("petmrn_identifier"))}')
    print(f'Echo studies: {len(distinct("study"))}')
    print(f'Echo videos : {len(distinct("filename"))}')

# Summarise each split in turn. The loop variable is deliberately not named
# `df`: that name is bound to the combined table in an earlier cell, and
# reusing it here would leave `df` pointing at the test split afterwards.
for split_df in [train_df, eval_df, test_df]:
    print_numbers(split_df)
    print()

Dataset mode: ['train']
View        : ['a4c']
Patients    : 1319
PET  studies: 1375
Echo studies: 1923
Echo videos : 5830

Dataset mode: ['eval']
View        : ['a4c']
Patients    : 140
PET  studies: 143
Echo studies: 203
Echo videos : 588

Dataset mode: ['test']
View        : ['a4c']
Patients    : 250
PET  studies: 264
Echo studies: 364
Echo videos : 1027



In [29]:
# Overall image stats
# NOTE(review): this rebinds `dset`, which the configuration cell defines as the
# dataset-name string used to build paths and file patterns; re-running those
# cells after this one would no longer work as intended. Consider a distinct name.
dset = pd.concat(
    [train_df, eval_df, test_df],
    ignore_index=True,
).reset_index(drop=True)

# Add width and height of the images; im_array_shape is read as [height, width, ...].
dset = dset.assign(
    width=dset['im_array_shape'].apply(lambda shape: shape[1]),
    height=dset['im_array_shape'].apply(lambda shape: shape[0]),
)

# Scale factor per image: target size divided by the larger of height and width.
im_size = 299
dset = dset.assign(
    sf=dset['im_array_shape'].apply(
        lambda shape: im_size/np.amax([shape[0], shape[1]])
    )
)

In [30]:
# Largest height and largest width in the table (taken independently per column).
max_image_size = (dset['height'].max(), dset['width'].max())
print('Maximum image height {}'.format(max_image_size[0]))
print('Maximum image width  {}'.format(max_image_size[1]))
# Factor that scales the largest dimension to the target size. Uses im_size
# (defined with the scale-factor column) instead of repeating the literal 299,
# so the target size is specified in one place only.
image_scale_factor = im_size/np.amax(max_image_size)
print('Image scale factor {:.4f}'.format(image_scale_factor))

Maximum image height 298
Maximum image width  398
Image scale factor 0.7513


In [31]:
# Quartile boundaries (25th, 50th, 75th percentile) of the per-image scale factors
print('Size of table:', dset.shape)
sf_array = dset['sf'].values
# One vectorised percentile call; converted back to a list of rounded values.
p_list = list(np.round(np.percentile(sf_array, (25, 50, 75)), decimals=3))
print('Percentile boundaries:', p_list)

Size of table: (7819, 60)
Percentile boundaries: [1.154, 1.262, 1.391]


In [32]:
# Image size whose scale factor equals the smallest percentile boundary in p_list.
# im_size replaces the repeated literal 299 so the target size is defined once.
print('Maximum image size that scales at 25th percentile: {}'.format(im_size/np.min(p_list)))

Maximum image size that scales at 25th percentile: 259.0987868284229
