### Summary statistics for the training, evaluation and testing data sets ###

In [25]:
import os
import glob
import numpy as np
import pandas as pd

import lz4.frame

# Raise the pandas display limits (rows, columns, line width) used when tables are shown below
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [31]:
# Frame-time / clip-length thresholds and data locations
# This should give us ~70% useful files
max_frame_time_ms = 33.34 # Maximum frame_time acceptable in ms
min_rate = 1/max_frame_time_ms*1e3 # Matching minimum frame rate in frames per second (~30)
min_frames = 40 # Minimum number of frames at min_rate
min_length = max_frame_time_ms*min_frames*1e-3 # Minimum clip length in s (40 frames * 33.34 ms = ~1.33 s)
dset='cfr'
meta_date = '200617'
location='BWH'
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
meta_dir = os.path.join(cfr_data_root, 'metadata_'+meta_date)

# Meta data with files that failed
# The file name is built from meta_date so it cannot drift away from meta_dir
df = pd.read_parquet(os.path.join(meta_dir, 'global_pet_echo_dataset_'+meta_date+'.failed'))

In [227]:
# Pick one file that failed
filename = df.iloc[37].filename
print(filename)

# Load that file
meta = df[df.filename==filename]
file = os.path.join(meta.dir.values[0], filename)

# Reset first, so a failed load cannot leave `data` unbound or silently reuse
# an array left over from an earlier run of this cell
data = None
try:
    with lz4.frame.open(file, 'rb') as fp:
        data = np.load(fp)
except IOError as err:
    print('Cannot open npy file.')
    print(err)
    error='load'

# Frame count recorded in the meta data vs. shape of the array that was loaded
print(meta.number_of_frames.values[0])
if data is not None:
    print(data.shape)
# Clearly, something is not right. The number of frames does not match. Then, is the frame_time correct?

4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_Image-51.npy.lz4
91.0
(33, 480, 640, 1)


In [222]:
# Show the name of the file selected above
filename

'4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_Image-51.npy.lz4'

In [148]:
# Is the correct number of frames recorded in the meta data?
# Read the table that maps each npy file to its feather meta files
file_df_file = f'echo_{location}_npy_feather_files_{meta_date}.parquet'
file_df_path = os.path.join(meta_dir, file_df_file)
file_df = pd.read_parquet(file_df_path)

# Each npy file is linked to several meta files. Let's keep only the kinds we need.
feather_dsc_list = [
    'video_metadata_withScale',
    'viewPredictionsVideo_withRV',
    'study_metadata',
]
wanted_dsc = file_df.dsc.isin(feather_dsc_list)
file_df2 = file_df.loc[wanted_dsc]

# Rows that belong to the file picked above
file_meta = file_df2.loc[file_df2.filename == filename]

file_df2.head()

Unnamed: 0,filename,dir,study,mrn,datetime,meta_filename,meta_dir,dsc
0,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed43e1...,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4,18682831,2015-09-17 10:57:55,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/featherFiles/BWH/4a1e,video_metadata_withScale
1,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed43e1...,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4,18682831,2015-09-17 10:57:55,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/featherFiles/BWH/4a1e,study_metadata
2,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed43e1...,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4,18682831,2015-09-17 10:57:55,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/featherFiles/BWH/4a1e,viewPredictionsVideo_withRV
3,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed43e1...,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4,18682831,2015-09-17 10:57:55,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/featherFiles/BWH/4a1e,video_metadata_withScale
4,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed43e1...,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4,18682831,2015-09-17 10:57:55,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/featherFiles/BWH/4a1e,study_metadata


### Collect meta data for each file ###

In [153]:
# Distinct studies in the filtered file table
study_list = list(file_df2['study'].unique())
print(len(study_list))
# Study that the selected file belongs to (first matching row)
rows_for_file = file_df2.filename == filename
study = file_df2.loc[rows_for_file, 'study'].values[0]
print(study)

46075
4a1753541a1c22ef_4903a44b330893f3ea40ffd10108


In [333]:
# NOTE(review): meta_file_df is not assigned in any cell visible here — presumably left over from a deleted cell; confirm before a fresh run
meta_file_df.filename.unique()

array(['4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_Image-42.npy.lz4',
       '4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_Image-46.npy.lz4',
       '4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_Image-27.npy.lz4',
       ...,
       '4d8a355ecd18035c_4903a58050928222b0548efc9e0d_Image-58.npy.lz4',
       '4d8a355ecd18035c_4903a58050928222b0548efc9e0d_Image-35.npy.lz4',
       '4d8a355ecd18035c_4903a58050928222b0548efc9e0d_Image-16.npy.lz4'],
      dtype=object)

In [338]:
# Rows of the file/meta table that belong to the selected study.
# NOTE(review): in the cells visible here, meta_df is only assigned further down (meta_df=file_df2),
# so a top-to-bottom run would presumably fail on this line — confirm and reorder if so.
meta_df_study = meta_df[meta_df.study==study]

In [339]:
# Show the study identifier currently selected
study

'4a1753541a1c22ef_4903a44b330893f3ea40ffd10108'

In [342]:
# Which meta data descriptions (dsc) are listed for this study
meta_df_study.dsc.unique()

array(['video_metadata_withScale', 'viewPredictionsVideo_withRV',
       'study_metadata'], dtype=object)

In [351]:
# Manual walk-through of one loop iteration of get_study_metadata:
# distinct (meta_dir, meta_filename) pairs listed for this dsc in the selected study
dsc = 'video_metadata_withScale'
m = meta_df_study[meta_df_study.dsc==dsc][['meta_dir', 'meta_filename']].drop_duplicates()

In [353]:
# Path of the first listed meta file for this dsc (any further rows in m are ignored)
dsc_file = os.path.join(m.meta_dir.values[0], m.meta_filename.values[0])

In [354]:
# Show the resulting meta file path
dsc_file

'/mnt/obi0/phi/echo/featherFiles/BWH/4a17/4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_video_metadata_withScale.feather'

In [343]:
# Load the feather meta data tables that are listed for one study
def get_study_metadata(study, meta_df):
    """Read every meta data table listed for `study` and return them keyed by dsc.

    Parameters
    ----------
    study
        Value matched against ``meta_df.study``.
    meta_df : pandas.DataFrame
        Table with (at least) the columns ``study``, ``dsc``, ``meta_dir``
        and ``meta_filename``.

    Returns
    -------
    dict
        Maps each dsc value to the DataFrame read from its feather file.
        A dsc is left out when its file raises IOError on opening (the error
        is printed) or when the table that was read has no rows.
    """
    rows = meta_df[meta_df.study == study]
    loaded = {}

    for dsc in set(rows.dsc.unique()):
        # Only the first distinct (meta_dir, meta_filename) pair of a dsc is used
        locations = rows.loc[rows.dsc == dsc, ['meta_dir', 'meta_filename']].drop_duplicates()
        path = os.path.join(locations.meta_dir.values[0], locations.meta_filename.values[0])

        try:
            with open(path, 'rb') as handle:
                table = pd.read_feather(handle)
        except IOError as err:
            print(err)
            continue

        # Tables without rows are treated like missing ones
        if table.shape[0] > 0:
            loaded[dsc] = table

    return loaded

In [358]:
# NOTE: only the last expression of a cell is echoed, so the file count computed
# on the next line is evaluated but never displayed
len(file_df2[file_df2.study==study].filename.unique())
file_df2.head(2)

Unnamed: 0,filename,dir,study,mrn,datetime,meta_filename,meta_dir,dsc
0,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed43e1...,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4,18682831,2015-09-17 10:57:55,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/featherFiles/BWH/4a1e,video_metadata_withScale
1,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed43e1...,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4,18682831,2015-09-17 10:57:55,4a1ed43e1337d2ed_4903a580509e8af268eea9d211f4_...,/mnt/obi0/phi/echo/featherFiles/BWH/4a1e,study_metadata


In [344]:
# Get a list of studies and then collect metadata for all files in the study
default_dsc_list = ['video_metadata_withScale', 'viewPredictionsVideo_withRV', 'study_metadata']
def get_study_metadata_files(study, meta_df, meta_dsc_list=default_dsc_list):
    """Combine the meta data tables of one study into a table with one row per file.

    Parameters
    ----------
    study
        Value matched against ``meta_df.study``.
    meta_df : pandas.DataFrame
        File table with (at least) the columns ``filename``, ``study``,
        ``meta_filename``, ``meta_dir`` and ``dsc``.
    meta_dsc_list : iterable of str
        dsc names that must have been loaded for the study to be processed.
        The body always reads the three tables named in ``default_dsc_list``,
        so those have to be available whatever is passed here.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated file rows of the study (without ``meta_filename``,
        ``meta_dir`` and ``dsc``) left-joined on ``file_base`` and ``study``
        with the video meta data, the view predictions and the study meta
        data. An empty DataFrame when the study is skipped; the reason is
        printed.
    """

    def _file_base(name):
        # Text before the first '.', used as the join key between the tables
        return name.split('.')[0]

    study_df = pd.DataFrame()
    meta_dict = get_study_metadata(study=study, meta_df=meta_df)

    if len(meta_dict) == 0:
        print(f'No meta data files for study {study}. Skipping this study.')
        return study_df

    # Open meta data files only if they are all present. Otherwise, skip.
    # A one-sided difference is used on purpose: extra tables that happen to be
    # loaded are not "missing" and must not cause the study to be skipped.
    meta_file_difference = set(meta_dsc_list).difference(meta_dict.keys())
    if len(meta_file_difference) > 0:
        print(f'Meta data file {meta_file_difference} missing. Skipping this study.')
        return study_df

    # Now we can collect the meta data for each file that we expect in this study
    meta_df_study = meta_df[meta_df.study==study]
    meta_df_study = meta_df_study.assign(file_base = meta_df_study.filename.apply(_file_base))
    # One row per file. We need this for later.
    meta_df_study_file = meta_df_study.drop(columns = ['meta_filename', 'meta_dir', 'dsc']).drop_duplicates()
    file_base_list = list(meta_df_study_file.file_base.unique())

    # Video meta data (file name is taken from the 'identifier' column)
    video_df = meta_dict['video_metadata_withScale']
    video_df = video_df.assign(file_base = video_df.identifier.apply(_file_base))
    video_df_files = video_df[video_df.file_base.isin(file_base_list)].drop(columns=['index'])

    # View classification results (file name is taken from the 'index' column)
    view_df = meta_dict['viewPredictionsVideo_withRV']
    view_df = view_df.assign(file_base = view_df['index'].apply(_file_base))
    view_df = view_df.drop(columns=['index'])
    view_df_files = view_df[view_df.file_base.isin(file_base_list)]

    # Combine meta data and view classification and add study meta data
    study_df = video_df_files.merge(view_df_files, on='file_base', how='outer')
    study_meta = meta_dict['study_metadata'].drop(columns=['index'])
    study_df = study_df.merge(study_meta, on='study', how='left')

    # Add the original meta data for the study (with the file names and directories)
    study_df = meta_df_study_file.merge(study_df, on=['file_base', 'study'], how = 'left').reset_index(drop=True)

    return study_df

In [345]:
# Collect the combined meta data for the selected study.
# The two meta_df_study assignments repeat the first steps inside get_study_metadata_files
# so the intermediate table can be inspected in the cells below.
# NOTE(review): the NameError recorded under this cell names 'meta_df_filename', which does not
# occur in the code visible here — presumably output of an earlier version; re-run to confirm.
meta_df=file_df2
meta_df_study = meta_df[meta_df.study==study]
meta_df_study = meta_df_study.assign(file_base = meta_df_study.filename.apply(lambda s: s.split('.')[0]))
study_df = get_study_metadata_files(study, meta_df=file_df2, meta_dsc_list=default_dsc_list)

NameError: name 'meta_df_filename' is not defined

In [337]:
# First rows of the combined per-file table for the selected study
study_df.head()

Unnamed: 0,filename,dir,study,mrn,datetime,file_base,identifier,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,year_month,study_full_time,institution,model,manufacturer
0,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a1753541...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108,11029667,2008-12-29 16:55:12,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,33.333,91,125,0.0415,0.0415,2.685842e-13,2.825765e-13,1.302806e-12,2.225345e-10,1.835168e-11,1.401866e-13,4.634683e-14,2.658381e-13,3.9045e-14,2.478305e-13,2.906231e-14,2.302654e-12,2.326174e-11,1.743731e-14,1.0,5.144916e-13,1.954599e-11,9.542664e-13,2.991231e-13,1.896425e-12,1.179467e-14,1.090652e-10,3.407125e-10,4.022104e-12,1.073732e-15,1.888418e-13,6.947489e-13,2008,20081229165512,BWH,iE33,Philips Medical Systems
1,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a1753541...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108,11029667,2008-12-29 16:55:12,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,33.333,64,107,0.02769,0.02769,4.046573e-11,5.67993e-09,1.353802e-07,1.363567e-05,4.625845e-08,2.005825e-08,2.85734e-09,1.606812e-08,5.127291e-07,3.68391e-09,3.585221e-08,4.436864e-07,2.233337e-07,0.9917065,2.203399e-06,1.144577e-09,8.171689e-10,4.914823e-08,1.776646e-08,1.051622e-09,9.493733e-07,0.0002621131,4.330292e-05,0.00789558,7.397324e-05,1.48692e-11,3.372187e-07,2008,20081229165512,BWH,iE33,Philips Medical Systems
2,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a1753541...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108,11029667,2008-12-29 16:55:12,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,33.333,91,111,0.02489,0.02489,2.60623e-11,7.962099e-05,1.606098e-08,1.800338e-07,4.317098e-11,9.959678e-09,3.14553e-11,5.368899e-09,0.0003096618,1.431539e-09,5.149242e-10,3.172148e-08,6.527061e-08,0.9932988,1.150622e-06,4.771579e-11,5.093928e-08,3.504902e-09,2.460463e-09,0.0002068681,4.723653e-09,2.634437e-11,0.005783168,2.974741e-06,0.0003173997,2.295583e-13,1.262687e-11,2008,20081229165512,BWH,iE33,Philips Medical Systems
3,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a1753541...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108,11029667,2008-12-29 16:55:12,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,33.333,91,114,0.024868,0.024868,2.792368e-08,2.519359e-07,1.921771e-06,9.777278e-06,1.640833e-07,3.225352e-07,1.475316e-06,2.975731e-06,0.003500136,4.112342e-07,3.863657e-08,1.579902e-07,9.252796e-09,0.993275,2.563382e-06,6.666567e-07,5.922639e-06,3.917077e-07,2.948325e-10,7.21507e-06,8.256555e-09,2.177372e-08,0.001884932,0.001299192,6.324653e-06,5.245846e-08,9.716207e-10,2008,20081229165512,BWH,iE33,Philips Medical Systems
4,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a17/4a1753541...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108,11029667,2008-12-29 16:55:12,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,33.333,91,96,0.02769,0.02769,5.098568e-08,0.9603143,7.107756e-06,3.019427e-05,1.289327e-07,5.925548e-06,2.565577e-05,0.0002607862,0.007173009,2.106631e-07,2.144775e-06,4.349567e-05,2.769985e-06,0.02947678,9.615594e-11,6.269208e-07,4.735822e-07,5.51149e-06,4.789714e-09,9.701697e-06,3.994692e-05,7.600558e-07,0.001517604,0.0001064391,0.0009763179,7.420279e-09,1.738659e-08,2008,20081229165512,BWH,iE33,Philips Medical Systems


In [316]:
# Columns of the study's file table after file_base was added
meta_df_study.columns

Index(['filename', 'dir', 'study', 'mrn', 'datetime', 'meta_filename', 'meta_dir', 'dsc', 'file_base'], dtype='object')

In [317]:
# Same reduction as inside get_study_metadata_files: drop the per-meta-file columns
# and de-duplicate what is left, then report the resulting shape
meta_df_study_file = meta_df_study.drop(columns = ['meta_filename', 'meta_dir', 'dsc']).drop_duplicates()
print(meta_df_study_file.shape)

(74, 6)


In [318]:
# Number of distinct file names; equal to the row count when every file occurs on exactly one row
len(meta_df_study_file.filename.unique())

74

Unnamed: 0,study,identifier,frame_time,number_of_frames,heart_rate,deltaX,deltaY,file_base,index,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal,year_month,study_full_time,institution,model,manufacturer
0,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,33.333,70,129,0.035079,0.035079,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,2.09589e-12,5.2297e-15,9.954627e-13,1.077487e-09,1.8237e-09,1.250664e-12,9.969169e-08,7.668651e-13,1.863756e-13,5.14646e-12,3.713086e-10,2.011728e-10,5.722893e-11,2.689352e-14,1.0,3.006103e-12,8.312597e-13,2.260718e-13,6.539299e-12,3.46888e-11,2.960676e-17,1.801901e-11,1.02285e-10,8.118004e-10,7.475331e-14,8.059665e-11,1.528645e-10,2008,20081229165512,BWH,iE33,Philips Medical Systems
1,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,33.333,48,115,0.041474,0.041474,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,4a1753541a1c22ef_4903a44b330893f3ea40ffd10108_...,1.670232e-11,5.581027e-13,6.865124e-12,8.887784e-10,1.239729e-09,3.488358e-12,3.303299e-11,4.008863e-08,5.607043e-12,1.352887e-11,1.600065e-09,2.7309e-11,6.646223e-11,7.316615e-09,1.0,1.50627e-10,2.177096e-12,7.665604e-11,3.999467e-09,2.89519e-12,2.100343e-13,3.940963e-12,2.647996e-10,1.280517e-10,2.912668e-14,2.318695e-13,8.445708e-13,2008,20081229165512,BWH,iE33,Philips Medical Systems
