### Summary statistics for the training, evaluation and testing data sets ###

In [22]:
import os
import glob
import numpy as np
import pandas as pd
import lz4.frame

from werdich_cfr.utils.processing import Videoconverter

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [2]:
dset='cfr'
meta_date = '200617'
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
tfr_dir = os.path.join(cfr_data_root, 'tfr_'+meta_date, dset)

meta_dir = os.path.join(cfr_data_root, 'metadata_'+meta_date)
print(tfr_dir)

/mnt/obi0/andreas/data/cfr/tfr_200617/cfr


In [3]:
# TFR .parquet data files
train_files = sorted(glob.glob(os.path.join(tfr_dir, dset+'_a4c_train_'+meta_date+'_*.parquet')))
eval_files = sorted(glob.glob(os.path.join(tfr_dir, dset+'_a4c_eval_'+meta_date+'_*.parquet')))
test_files = sorted(glob.glob(os.path.join(tfr_dir, dset+'_a4c_test_'+meta_date+'_*.parquet')))

# List of files that failed TFR conversion
train_failed_files = [file.replace('.parquet', '.failed') for file in train_files]
eval_failed_files = [file.replace('.parquet', '.failed') for file in eval_files]
test_failed_files = [file.replace('.parquet', '.failed') for file in test_files]

print(train_failed_files)

['/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_0.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_1.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_2.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_3.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_4.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_5.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_6.failed', '/mnt/obi0/andreas/data/cfr/tfr_200617/cfr/cfr_a4c_train_200617_7.failed']


In [4]:
train_df = pd.concat([pd.read_parquet(file) for file in train_files])
eval_df = pd.concat([pd.read_parquet(file) for file in eval_files])
test_df = pd.concat([pd.read_parquet(file) for file in test_files])
df = pd.concat([train_df, eval_df, test_df], ignore_index=True).reset_index(drop=True)

train_failed_df = pd.concat([pd.read_parquet(file) for file in train_failed_files])
eval_failed_df = pd.concat([pd.read_parquet(file) for file in eval_failed_files])
test_failed_df = pd.concat([pd.read_parquet(file) for file in test_failed_files])

tf_data = pd.concat([train_df, eval_df, test_df], ignore_index=True).reset_index(drop=True)
tf_failed_data = pd.concat([train_failed_df, eval_failed_df, test_failed_df], ignore_index=True).reset_index(drop=True)

tf_failed_data = tf_failed_data.assign(dur = tf_failed_data.frame_time*1e-3*tf_failed_data.number_of_frames)
tf_data = tf_data.assign(dur = tf_data.frame_time*1e-3*tf_data.number_of_frames)

In [8]:
n_videos_success = len(tf_data.filename.unique())
n_videos_failed = len(tf_failed_data.filename.unique())
n_videos = n_videos_success + n_videos_failed
n_videos_success_frac = np.around(n_videos_success/n_videos, decimals=2)
n_videos_failed_frac = np.around(n_videos_failed/n_videos, decimals=2)
print(f'Successful conversions: {n_videos_success} of {n_videos}, {n_videos_success_frac}')
print(f'Failed conversions:     {n_videos_failed}  of {n_videos}, {n_videos_failed_frac}')

Successful conversions: 7445 of 11077, 0.67
Failed conversions:     3632  of 11077, 0.33


In [6]:
# Reason for failure
failed_stats = tf_failed_data.err.value_counts().to_frame().reset_index(drop=False)
failed_stats = failed_stats.rename(columns={'index':'reason', 'err': 'n'})
failed_stats

Unnamed: 0,reason,n
0,frame_rate,2418
1,video_len,1050
2,deltaXY,164


In [7]:
tf_failed_data.head(20)

Unnamed: 0,rest_global_mbf,stress_global_mbf,global_cfr_calc,post-2018,tracer_obi,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,pet_measurement,difference(days),filename,dir,datetime,file_base,identifier,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,...,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,year_month,study_full_time,institution,model,manufacturer,max_view,sum_views,dset_mode,rate,dur,err
0,1.629,3.554,2.181707,0,rubidium,24694317,4907bffcc55e193e_4903a5858429801f3d28657787f6,2010-07-09,2010-07-09,24694317_2010-07-09,0,1.0,0.0,4907bffcc55e193e_4903a5858429801f3d28657787f6_...,/mnt/obi0/phi/echo/npyFiles/BWH/4907/4907bffcc...,2010-07-09 15:15:24,4907bffcc55e193e_4903a5858429801f3d28657787f6_...,4907bffcc55e193e_4903a5858429801f3d28657787f6_...,33.333,67.0,81.0,0.041472,0.041472,8.206877e-11,4.298955e-07,...,9.621187e-10,2.958736e-11,2.449592e-11,2.121141e-12,2.387291e-10,5.524107e-14,1.667631e-10,1.614435e-10,1.91818e-10,1.632679e-08,2.787947e-09,6.720195e-10,6.062132e-13,3.009142e-12,2010.0,20100709151524,BWH,iE33,Philips Medical Systems,a4c,1.0,train,30.0,2.233311,video_len
1,0.716,1.208,1.687151,0,rubidium,4518163,4b7f092645290121_4903a44ab097f2fccf05c961ebaa,2009-04-02,2009-10-29,4518163_2009-04-02,210,1.0,210.0,4b7f092645290121_4903a44ab097f2fccf05c961ebaa_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7f/4b7f09264...,2009-10-29 10:27:47,4b7f092645290121_4903a44ab097f2fccf05c961ebaa_...,4b7f092645290121_4903a44ab097f2fccf05c961ebaa_...,40.416667,65.0,72.0,0.050217,0.050217,2.563471e-12,1.705865e-07,...,9.946922e-10,7.782781e-09,1.267277e-10,2.655376e-10,1.797094e-10,8.629686e-09,1.361794e-09,8.984329e-09,2.733633e-06,2.379117e-09,7.179961e-09,9.537369e-10,5.890232e-10,3.312e-11,2009.0,20091029102747,BWH,Vivid E9,GE Vingmed Ultrasound,a4c,1.0,train,24.7,2.627083,frame_rate
2,1.211,2.373,1.959538,0,rubidium,24316077,4907ba5fe3d7f446_4903a44ab097f14e1c70db9830ab,2009-10-13,2009-10-14,24316077_2009-10-13,1,1.0,1.0,4907ba5fe3d7f446_4903a44ab097f14e1c70db9830ab_...,/mnt/obi0/phi/echo/npyFiles/BWH/4907/4907ba5fe...,2009-10-14 10:27:29,4907ba5fe3d7f446_4903a44ab097f14e1c70db9830ab_...,4907ba5fe3d7f446_4903a44ab097f14e1c70db9830ab_...,33.333,34.0,73.0,0.035144,0.035144,2.372665e-11,2.564562e-15,...,5.420644e-14,5.345586e-12,3.797113e-13,3.104103e-15,2.651655e-14,1.875427e-17,1.229814e-16,1.516357e-12,6.276083e-15,6.199515e-14,1.641613e-12,4.226151e-15,1.892683e-17,3.064699e-15,2009.0,20091014102729,BWH,iE33,Philips Medical Systems,a4c,1.0,train,30.0,1.133322,video_len
3,0.797,1.011,1.268507,0,rubidium,14644843,4a12b7a74d0e36c6_4903a444c30e2e1e8adab9816e85,2007-06-26,2007-06-26,14644843_2007-06-26,0,1.0,0.0,4a12b7a74d0e36c6_4903a444c30e2e1e8adab9816e85_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a12/4a12b7a74...,2007-06-26 11:27:12,4a12b7a74d0e36c6_4903a444c30e2e1e8adab9816e85_...,4a12b7a74d0e36c6_4903a444c30e2e1e8adab9816e85_...,101.306174,28.0,69.0,0.061381,0.061381,0.0001702231,1.011546e-05,...,0.0001914554,0.011474,3.336459e-05,2.521699e-07,3.554875e-05,1.364855e-07,4.516478e-08,1.005326e-05,4.93061e-08,2.443395e-09,0.0008864172,2.209445e-05,4.628018e-09,1.516717e-07,2007.0,20070626112712,BWH,Vivid i,GEMS Ultrasound,a4c,1.0,train,9.9,2.836573,frame_rate
4,1.654,4.754,2.874244,0,rubidium,15311319,4a1340201c063f18_4903a584a1b9a68867e4c4b60c54,2011-03-10,2011-03-10,15311319_2011-03-10,0,1.0,0.0,4a1340201c063f18_4903a584a1b9a68867e4c4b60c54_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a13/4a1340201...,2011-03-10 07:56:16,4a1340201c063f18_4903a584a1b9a68867e4c4b60c54_...,4a1340201c063f18_4903a584a1b9a68867e4c4b60c54_...,33.333,65.0,60.0,0.046853,0.046853,4.936139e-13,1.213953e-15,...,3.746063e-15,1.725824e-13,3.917265e-17,8.781217e-15,5.889839e-13,4.3025360000000003e-17,3.890459e-17,1.018989e-13,5.406511e-13,3.916223e-15,5.155264e-15,2.758181e-16,1.856173e-19,3.461369e-16,2011.0,20110310075616,BWH,iE33,Philips Medical Systems,a4c,1.0,train,30.0,2.166645,video_len
5,0.93,3.132,3.367742,0,rubidium,24045387,4907b902358a4c64_4903a5858429822916e2540beccf,2010-10-25,2010-07-29,24045387_2010-10-25,-88,1.0,88.0,4907b902358a4c64_4903a5858429822916e2540beccf_...,/mnt/obi0/phi/echo/npyFiles/BWH/4907/4907b9023...,2010-07-29 09:31:46,4907b902358a4c64_4903a5858429822916e2540beccf_...,4907b902358a4c64_4903a5858429822916e2540beccf_...,40.387597,87.0,54.0,0.047267,0.047267,8.647122e-15,7.420977e-15,...,1.465861e-15,1.474246e-13,3.520148e-15,4.2796030000000006e-17,7.532801000000001e-17,1.786237e-17,5.716181e-16,3.764442e-15,3.2521880000000003e-17,1.503784e-17,2.800366e-16,1.7497569999999999e-19,2.738206e-19,1.4494230000000002e-17,2010.0,20100729093146,BWH,Vivid E9,GE Vingmed Ultrasound,a4c,1.0,train,24.8,3.513721,frame_rate
6,1.538,2.693,1.750975,0,rubidium,22207716,490177d05b03d4ad_4903a44b32ee56a55353c140b7e8,2007-09-11,2008-02-01,22207716_2007-09-11,143,1.0,143.0,490177d05b03d4ad_4903a44b32ee56a55353c140b7e8_...,/mnt/obi0/phi/echo/npyFiles/BWH/4901/490177d05...,2008-02-01 15:20:11,490177d05b03d4ad_4903a44b32ee56a55353c140b7e8_...,490177d05b03d4ad_4903a44b32ee56a55353c140b7e8_...,40.123458,55.0,88.0,0.043478,0.043478,3.628767e-11,1.556447e-15,...,1.159955e-13,3.001388e-16,4.193659e-17,3.806855e-15,3.418707e-13,1.313262e-15,1.386445e-14,1.932937e-11,3.491538e-17,1.128745e-12,2.447341e-12,8.573352e-15,3.690611e-18,3.597121e-12,2008.0,20080201152011,BWH,Vivid7,GE Vingmed Ultrasound,a4c,1.0,train,24.9,2.20679,frame_rate
7,0.492,0.872,1.772358,0,ammonia,26777318,490531628e339f13_4903a5864e6f0a03bbe8d16f4eff,2014-09-22,2013-09-30,26777318_2014-09-22,-357,1.0,357.0,490531628e339f13_4903a5864e6f0a03bbe8d16f4eff_...,/mnt/obi0/phi/echo/npyFiles/BWH/4905/490531628...,2013-09-30 14:26:33,490531628e339f13_4903a5864e6f0a03bbe8d16f4eff_...,490531628e339f13_4903a5864e6f0a03bbe8d16f4eff_...,40.447154,42.0,105.0,0.061381,0.061381,9.68684e-08,1.052777e-08,...,6.193774e-10,3.138468e-05,2.469585e-10,8.297696e-09,6.351979e-08,9.346096e-08,3.307147e-06,2.148017e-05,8.157374e-09,4.829132e-07,5.115066e-07,1.422789e-12,1.969297e-09,4.122657e-09,2013.0,20130930142633,BWH,Vivid7,GE Vingmed Ultrasound,a4c,1.0,train,24.7,1.69878,frame_rate
8,0.543,0.419,0.771639,0,rubidium,18895060,4a1edade99c52430_4903a44ab12389f5027f455a8755,2009-01-08,2009-08-19,18895060_2009-01-08,223,1.0,223.0,4a1edade99c52430_4903a44ab12389f5027f455a8755_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1edade9...,2009-08-19 10:06:56,4a1edade99c52430_4903a44ab12389f5027f455a8755_...,4a1edade99c52430_4903a44ab12389f5027f455a8755_...,33.666668,76.0,53.0,0.021269,0.021269,0.02036329,0.07786521,...,0.08791157,0.002373015,0.0003595549,0.0009721497,0.1931262,0.1970933,0.1006609,0.000592171,6.435654e-05,1.985194e-05,0.04343905,0.01388689,2.666904e-06,1.565516e-05,2009.0,20090819100656,BWH,Vivid7,GE Vingmed Ultrasound,a4c,1.0,train,29.7,2.558667,frame_rate
9,1.162,2.432,2.092943,0,rubidium,4868063,4b7f04c9020420d3_4903a44517b6077dd658288bec8e,2007-05-31,2006-11-21,4868063_2007-05-31,-191,1.0,191.0,4b7f04c9020420d3_4903a44517b6077dd658288bec8e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7f/4b7f04c90...,2006-11-21 07:29:46,4b7f04c9020420d3_4903a44517b6077dd658288bec8e_...,4b7f04c9020420d3_4903a44517b6077dd658288bec8e_...,33.333,74.0,73.0,1.0,1.0,7.643438e-08,1.062404e-09,...,4.625091e-11,2.93488e-06,3.194065e-12,2.026071e-11,5.703438e-12,1.626033e-11,8.147382e-14,6.898128e-10,5.741164e-14,1.629153e-14,3.31714e-10,8.135783e-10,5.264187e-15,9.129731e-15,2006.0,20061121072946,BWH,SEQUOIA,ACUSON,a4c,1.0,train,30.0,2.466642,deltaXY


In [30]:
im_array_ser_list = [] # list of pd.Series object for the files in im_array_list

ser_df = tf_failed_data.loc[tf_failed_data.filename == filename, :]
file = os.path.join(ser_df.dir.values[0], filename)
frame_time = ser_df.frame_time.values[0] * 1e-3
rate = 1 / frame_time
ser = ser_df.iloc[0]

try:
    with lz4.frame.open(file, 'rb') as fp:
        data = np.load(fp)
except IOError as err:
    print('Cannot open npy file.')
    print(err)
    error='load'
else:
    video_len = data.shape[0] / rate
    ser_df2 = ser_df.assign(data_n_frames = data.shape[0],
                            data_rows = data.shape[1],
                            data_cols = data.shape[2],
                            data_video_len = video_len)

    im_array_ser_list.append(ser_df2)

In [31]:
im_array_ser_list[0]

Unnamed: 0,rest_global_mbf,stress_global_mbf,global_cfr_calc,post-2018,tracer_obi,mrn,study,pet_date,echo_date,petmrn_identifier,days_post_pet,pet_measurement,difference(days),filename,dir,datetime,file_base,identifier,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,...,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,year_month,study_full_time,institution,model,manufacturer,max_view,sum_views,dset_mode,rate,dur,err,data_n_frames,data_rows,data_cols,data_video_len
68,1.495,2.647,1.770569,0,rubidium,1878966,4b7a8f5267194433_4903a584a1bcb4d4843409be2b9a,2011-06-17,2011-06-17,1878966_2011-06-17,0,1.0,0.0,4b7a8f5267194433_4903a584a1bcb4d4843409be2b9a_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7a/4b7a8f526...,2011-06-17 07:58:23,4b7a8f5267194433_4903a584a1bcb4d4843409be2b9a_...,4b7a8f5267194433_4903a584a1bcb4d4843409be2b9a_...,33.333,46.0,78.0,0.049313,0.049313,6.988266e-11,1.016306e-09,...,1.713143e-07,3.257266e-12,5.395293e-12,1.480544e-12,4.761209e-13,4.893514e-12,7.838013e-09,4.015477e-13,2.496494e-12,3.612823e-12,2011.0,20110617075823,BWH,iE33,Philips Medical Systems,a4c,1.0,train,30.0,1.533318,video_len,20,480,640,0.66666


In [12]:
# Lets get the true size of the arrays
max_frame_time_ms = 33.34 # Maximum frame_time acceptable in ms
min_rate = 1/max_frame_time_ms*1e3
min_frames = 40 # Minimum number of frames at min_rate (2 s)
min_length = max_frame_time_ms*min_frames*1e-3
file_list = list(tf_failed_data.filename.unique())

im_array_ser_list = [] # list of pd.Series object for the files in im_array_list

for f, filename in enumerate(file_list):

    if (f+1) % 200 == 0:
        print('Loading video {} of {} into memory.'.format(f+1, len(file_list)))

    ser_df = tf_failed_data.loc[tf_failed_data.filename == filename, :]
    file = os.path.join(ser_df.dir.values[0], filename)
    frame_time = ser_df.frame_time.values[0] * 1e-3
    rate = 1 / frame_time
    ser = ser_df.iloc[0]
    
    try:
        with lz4.frame.open(file, 'rb') as fp:
            data = np.load(fp)
    except IOError as err:
        print('Cannot open npy file.')
        print(err)
        error='load'
    else:
        video_len = data.shape[0] / rate
        ser_df2 = ser_df.assign(data_n_frames = data.shape[0],
                                data_rows = data.shape[1],
                                data_cols = data.shape[2],
                                data_video_len = video_len)
        im_array_ser_list.append(ser_df2)
  
# When this is done, save the parquet file
im_array_df = pd.concat(im_array_ser_list)
print(len(df.filename.unique()))
failed_file_name = 'global_pet_echo_dataset_200617_shape.failed'
im_array_df.to_parquet(os.path.join(meta_dir, parquet_filename))

Video is too short: 1.10s. Skipping.
Frame rate is too low: 24.74s^-1. Skipping.
Video is too short: 1.13s. Skipping.
Frame rate is too low: 9.87s^-1. Skipping.
Video is too short: 1.30s. Skipping.
Frame rate is too low: 24.76s^-1. Skipping.
Frame rate is too low: 24.92s^-1. Skipping.
Frame rate is too low: 24.72s^-1. Skipping.
Frame rate is too low: 29.70s^-1. Skipping.
Meta data invalid for 4b7f04c9020420d3_4903a44517b6077dd658288bec8e_Image-25.npy.lz4. Skipping
Video is too short: 1.04s. Skipping.
Frame rate is too low: 24.60s^-1. Skipping.
Video is too short: 1.13s. Skipping.
Video is too short: 1.20s. Skipping.
Frame rate is too low: 24.86s^-1. Skipping.
Video is too short: 0.77s. Skipping.
Frame rate is too low: 29.90s^-1. Skipping.
Video is too short: 0.70s. Skipping.
Frame rate is too low: 24.76s^-1. Skipping.
Frame rate is too low: 29.95s^-1. Skipping.
Video is too short: 1.00s. Skipping.
Frame rate is too low: 22.71s^-1. Skipping.
Video is too short: 1.00s. Skipping.
Video is

KeyboardInterrupt: 

In [15]:
error, im_array = vc.process_video(filename)

Video is too short: 0.67s. Skipping.


In [17]:
im_array

array([0.])

In [9]:
def print_numbers(df):
    print(f'Dataset mode: {list(df["dset_mode"].unique())}')
    print(f'View        : {list(df.max_view.unique())}')
    print(f'Patients    : {len(df.mrn.unique())}')
    print(f'PET  studies: {len((df.petmrn_identifier.unique()))}')
    print(f'Echo studies: {len((df.study.unique()))}') 
    print(f'Echo videos : {len((df.filename.unique()))}')

for df in [train_df, eval_df, test_df]:
    print_numbers(df)
    print()

Dataset mode: ['train']
View        : ['a4c']
Patients    : 1319
PET  studies: 1375
Echo studies: 1923
Echo videos : 5830

Dataset mode: ['eval']
View        : ['a4c']
Patients    : 140
PET  studies: 143
Echo studies: 203
Echo videos : 588

Dataset mode: ['test']
View        : ['a4c']
Patients    : 250
PET  studies: 264
Echo studies: 364
Echo videos : 1027



In [29]:
# Overall image stats
dset = pd.concat([train_df, eval_df, test_df], ignore_index=True).reset_index(drop=True)

# Add width and height of the images
dset = dset.assign(width = dset.im_array_shape.apply(lambda s: s[1]),
                   height = dset.im_array_shape.apply(lambda s: s[0]))

# Let's define a scale factor column
im_size = 299
dset = dset.assign(sf = dset.im_array_shape.apply(lambda s: im_size/np.amax([s[0], s[1]])))

In [30]:
max_image_size = (dset.height.max(), dset.width.max())
print('Maximum image height {}'.format(max_image_size[0]))
print('Maximum image width  {}'.format(max_image_size[1]))
image_scale_factor = 299/np.amax(max_image_size)
print('Image scale factor {:.4f}'.format(image_scale_factor))

Maximum image height 298
Maximum image width  398
Image scale factor 0.7513


In [31]:
# Percentile the scale factors
print('Size of table:', dset.shape)
sf_array = dset.sf.values
p_list = [np.round(np.percentile(sf_array, p), decimals = 3) for p in (25, 50, 75)]
print('Percentile boundaries:', p_list)

Size of table: (7819, 60)
Percentile boundaries: [1.154, 1.262, 1.391]


In [32]:
print('Maximum image size that scales at 25th percentile: {}'.format(299/np.min(p_list)))

Maximum image size that scales at 25th percentile: 259.0987868284229
