In [61]:
import os
import pickle
import glob
import numpy as np
import pandas as pd

# Raise the pandas display limits (rows, columns, console width) for this notebook.
display_limits = {
    'display.max_rows': 1000,
    'display.max_columns': 100,
    'display.width': 2000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [2]:
# Date stamp of the metadata folder, and the data root that contains it.
cfr_meta_date = '200519'
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')

# Metadata folder name is 'metadata_' followed by the date stamp.
meta_dir = os.path.join(cfr_data_root, f'metadata_{cfr_meta_date}')
print(meta_dir)

/mnt/obi0/andreas/data/cfr/metadata_200519


In [6]:
# Read the matched metadata table and report the number of distinct
# PET studies, patients and echo studies found in it.
metadata_matched_file = 'pet_match365_diff_files_200519.parquet'
pet_echo_meta = pd.read_parquet(os.path.join(meta_dir, metadata_matched_file))

print('After combining with metadata (data containes echos with failed conversions):')
for label, column in (
    ('Unique PET studies with metadata: ', 'petmrn_identifier'),
    ('Unique patients with metadata:    ', 'mrn'),
    ('Echo studies with metadata:       ', 'study'),
):
    # One line per identifier column: label followed by its unique-value count.
    print(label + str(len(pet_echo_meta[column].unique())))

After combining with metadata (data containes echos with failed conversions):
Unique PET studies with metadata: 3590
Unique patients with metadata:    3257
Echo studies with metadata:       6443


In [12]:
# REMOVING STUDIES THAT FAILED CONVERSION
# Rows with a null frame_time are the ones treated as failed conversions.
failed_conversions_file = 'pet_match365_failed_conversions.parquet'
failed_mask = pet_echo_meta.frame_time.isnull()  # computed once, used for both subsets

pet_echo_meta_failed = pet_echo_meta.loc[failed_mask]
print(f'Echo studies that failed conversions: {len(pet_echo_meta_failed.study.unique())}')
print(f'Patients with failed conversions:     {len(pet_echo_meta_failed.mrn.unique())}')
# Save the failed rows alongside the other metadata files.
pet_echo_meta_failed.to_parquet(os.path.join(meta_dir, failed_conversions_file))

# Everything that is not flagged as failed is kept for the following steps.
pet_echo_meta_non_failed = pet_echo_meta.loc[~failed_mask]
print('Patients and studies with meta data:')
print(f'Unique PET studies: {len(pet_echo_meta_non_failed.petmrn_identifier.unique())}')
print(f'Unique patients:    {len(pet_echo_meta_non_failed.mrn.unique())}')
print(f'Echo studies:       {len(pet_echo_meta_non_failed.study.unique())}')

Echo studies that failed conversions: 172
Patients with failed conversions:     140
Patients and studies with meta data:
Unique PET studies: 3567
Unique patients:    3246
Echo studies:       6271


In [57]:
# After splitting the patients in train, validate and test sets.
dataset_filename = f'global_pet_echo_dataset_{cfr_meta_date}.parquet'

# Load the data set and keep only rows with a frame_time,
# i.e. drop the studies that failed video conversion.
df_global_dataset = (
    pd.read_parquet(os.path.join(meta_dir, dataset_filename))
    .loc[lambda frame: frame.frame_time.notnull()]
)

for label, column in (
    ('Unique PET studies: ', 'petmrn_identifier'),
    ('Unique patients:    ', 'mrn'),
    ('Echo studies:       ', 'study'),
):
    print(f'{label}{len(df_global_dataset[column].unique())}')

# PET studies present in the non-failed metadata but absent from the global data set.
kept_pet_ids = set(df_global_dataset.petmrn_identifier.unique())
lost_PET = set(pet_echo_meta_non_failed.petmrn_identifier.unique()) - kept_pet_ids
lost_PET_list = list(lost_PET)

# Patients (mrn) attached to those removed PET studies.
lost_rows = pet_echo_meta_non_failed.petmrn_identifier.isin(lost_PET_list)
lost_mrn = pet_echo_meta_non_failed.loc[lost_rows].mrn.unique()
lost_mrn_list = list(lost_mrn)

print()
print(f'Removed PET studies after making specific data set: {len(lost_PET)}')
print(f'Removed PET studies corresponded to {len(lost_mrn_list)} patients.')

Unique PET studies: 2670
Unique patients:    2562
Echo studies:       4348

Removed PET studies after making specific data set: 897
Removed PET studies corresponded to 721 patients.


In [94]:
# Filter out the echo studies that did not have a4c views
view = 'a4c'
df = df_global_dataset.copy()
# Rows whose max_view equals the requested view, and the studies they belong to.
df_view = df.loc[df.max_view==view]
study_list_with_view = df_view.study.unique()

# Studies that do not have a single row with the requested view
df_no_view = df.loc[~df.study.isin(study_list_with_view)]
print()
print(f'Echo studies without {view} view: {len(df_no_view.study.unique())}')
print(f'Patients from these studies without {view} view: {len(df_no_view.mrn.unique())}')
print()
print(f'PET studies with {view} views: {len(df_view.petmrn_identifier.unique())}')
print(f'patients with {view} views:    {len(df_view.mrn.unique())}')
print(f'Studies with {view} views:     {len(df_view.study.unique())}')
print(f'Video files with {view} views: {len(df_view.filename.unique())}')

dset_list = ['train', 'eval', 'test']
# Derive the label from `view` so it stays correct if the view is changed.
view_label = view.upper()
for dset in dset_list:
    # Select the rows of this split once instead of re-filtering for every count.
    # The column is accessed as ["mode"] because DataFrame.mode is a method.
    df_dset = df_view.loc[df_view["mode"]==dset]
    print()
    print(f'Data set: {dset}')
    print(f'Number of patients in {dset}: {len(df_dset.mrn.unique())}')
    print(f'PET studies in {dset}:        {len(df_dset.petmrn_identifier.unique())}')
    print(f'ECHO studies in {dset}:       {len(df_dset.study.unique())}')
    print(f'{view_label} videos in {dset}:         {len(df_dset.filename.unique())}')
    


Echo studies without a4c view: 523
Patients from these studies without a4c view: 448

PET studies with a4c views: 2496
patients with a4c views:    2402
Studies with a4c views:     3825
Video files with a4c views: 11536

Data set: train
Number of patients in train: 1876
PET studies in train:        1945
ECHO studies in train:       2966
A4C videos in train:         9027

Data set: eval
Number of patients in eval: 206
PET studies in eval:        220
ECHO studies in eval:       327
A4C videos in eval:         961

Data set: test
Number of patients in test: 320
PET studies in test:        331
ECHO studies in test:       532
A4C videos in test:         1548


In [87]:
# Load the final data sets from the tfr_files
tfr_dir = os.path.join(cfr_data_root, 'tfr_200519', 'global')
# NOTE(review): `?` in a glob pattern matches exactly one character, so a file
# suffix of two or more characters (e.g. `_10`) would not be matched - confirm
# that this is intended.
train_files = sorted(glob.glob(os.path.join(tfr_dir, 'cfr_global_a4c_train_200519_?.parquet')))
eval_files = sorted(glob.glob(os.path.join(tfr_dir, 'cfr_global_a4c_eval_200519_?.parquet')))
# The test list must glob the 'test' files; it used to repeat the 'eval' pattern.
test_files = sorted(glob.glob(os.path.join(tfr_dir, 'cfr_global_a4c_test_200519_?.parquet')))

def dset_numbers(data_dir=None):
    """Print unique patient, PET study, echo study and video counts per data set.

    For each of the 'train', 'eval' and 'test' data sets, all parquet files in
    the data directory matching ``cfr_global_a4c_<dset>_200519_?.parquet`` are
    concatenated, and the number of unique values in the ``mrn``,
    ``petmrn_identifier``, ``study`` and ``filename`` columns is printed.
    The same four counts are then printed for all data sets combined.

    Args:
        data_dir: Directory that holds the parquet files. When omitted, the
            notebook-level ``tfr_dir`` is used.

    Returns:
        None. The counts are only printed.

    Raises:
        ValueError: If no file matches the pattern for one of the data sets.
    """
    if data_dir is None:
        data_dir = tfr_dir

    dset_list = ['train', 'eval', 'test']
    df_list = []

    for dset in dset_list:

        # NOTE(review): `?` matches exactly one character, so a file suffix of
        # two or more characters (e.g. `_10`) would be skipped - confirm.
        pattern = os.path.join(data_dir, 'cfr_global_a4c_'+dset+'_200519_?.parquet')
        file_list = sorted(glob.glob(pattern))
        if not file_list:
            # pd.concat([]) would raise a ValueError without naming the cause.
            raise ValueError(f'No parquet files found for data set {dset!r}: {pattern}')

        df = pd.concat([pd.read_parquet(file) for file in file_list]).reset_index(drop=True)
        df_list.append(df)

        print()
        print(f'Data set: {dset}')
        print(f'Number of patients in {dset}: {len(df.mrn.unique())}')
        print(f'PET studies in {dset}:        {len(df.petmrn_identifier.unique())}')
        print(f'ECHO studies in {dset}:       {len(df.study.unique())}')
        print(f'A4C videos in {dset}:         {len(df.filename.unique())}')

    # Counts over the union of all data sets.
    df_all = pd.concat(df_list).reset_index(drop=True)
    print()
    print('Numbers for all data sets')
    print(f'Number of patients:           {len(df_all.mrn.unique())}')
    print(f'PET studies:                  {len(df_all.petmrn_identifier.unique())}')
    print(f'ECHO studies:                 {len(df_all.study.unique())}')
    print(f'A4C videos:                   {len(df_all.filename.unique())}')
    
# Print the unique-count summary for each data set and for all of them combined.
dset_numbers()


Data set: train
Number of patients in train: 1478
PET studies in train:        1526
ECHO studies in train:       2159
A4C videos in train:         6147

Data set: eval
Number of patients in eval: 160
PET studies in eval:        168
ECHO studies in eval:       234
A4C videos in eval:         620

Data set: test
Number of patients in test: 263
PET studies in test:        270
ECHO studies in test:       392
A4C videos in test:         1052

Numbers for all data sets
Number of patients:           1901
PET studies:                  1964
ECHO studies:                 2785
A4C videos:                   7819


In [81]:
# Combine all training files and list the distinct max_view values they contain.
train_frames = [pd.read_parquet(train_path) for train_path in train_files]
df = pd.concat(train_frames)
df['max_view'].unique()

array(['a4c'], dtype=object)

### Forward pass on events ###

In [96]:
# File names of the prediction table and of the disqualified table.
predictions_global_file = 'BWH_2015-05-01_2015-10-31_FirstEcho_a4c_global_cfr_calc.parquet'
disqualified_file = 'BWH_2015-05-01_2015-10-31_FirstEcho_a4c_disqualified.parquet'
echo_dir = os.path.join(cfr_data_root, 'predictions_echodata', 'FirstEcho')

# Read both tables from the same folder, predictions first.
df, df_disqualified = (
    pd.read_parquet(os.path.join(echo_dir, parquet_name))
    for parquet_name in (predictions_global_file, disqualified_file)
)

In [106]:
# Count distinct values over both tables together, so that an mrn or filename
# that occurs in the predictions AND in the disqualified table is counted once
# (adding the two per-table unique counts would count it twice).
input_mrns = pd.concat([df.mrn, df_disqualified.mrn]).unique()
input_videos = pd.concat([df.filename, df_disqualified.filename]).unique()
print(f'Input patients: {len(input_mrns)}')
print(f'Input videos:   {len(input_videos)}')

Input patients: 7879
Input videos:   21735


In [101]:
# Unique patients, echo studies and video files in the prediction table.
forward_counts = (
    ('Patients in forward pass: ', 'mrn'),
    ('Echo studies in forward:  ', 'study'),
    ('Video files in forward:   ', 'filename'),
)
for label, column in forward_counts:
    print(f'{label}{len(df[column].unique())}')

Patients in forward pass: 5184
Echo studies in forward:  5184
Video files in forward:   14930


In [104]:
# Unique patients, echo studies and videos in the disqualified table.
disqualified_counts = (
    ('Disqualified patients:     ', 'mrn'),
    ('Disqualified echo studies: ', 'study'),
    ('Disqualified videos:       ', 'filename'),
)
for label, column in disqualified_counts:
    print(f'{label}{len(df_disqualified[column].unique())}')

Disqualified patients:     2695
Disqualified echo studies: 2695
Disqualified videos:       6805
