In [None]:
import os
import h5py

import numpy as np
import xml.etree.ElementTree as et 
from collections import Counter, defaultdict

%matplotlib inline
import matplotlib.pyplot as plt

# Directory that the cells below scan (via os.listdir) for '.hd5' files.
DATA_DIRECTORY = '/mnt/disks/sax-lax-40k/2019-11-08/'

In [None]:
def _copy_hd5_dataset(source_dataset, destination_hd5, dataset_path):
    """Copy one dataset into `destination_hd5` at `dataset_path`.

    The copy is gzip-compressed when the source dataset is chunked
    (`chunks` is not None) and written without compression otherwise.
    """
    if source_dataset.chunks is None:
        destination_hd5.create_dataset(dataset_path, data=source_dataset)
    else:
        destination_hd5.create_dataset(dataset_path, data=source_dataset, compression='gzip')


def random_copy_hd5_datasets(source_hd5, source_hd6, destination_hd5, group_path='/'):
    """Recursively build `destination_hd5` by randomly mixing datasets of two HD5 files.

    The hierarchy of `source_hd5` is walked starting at `group_path`. For every
    dataset found, a uniform random number is drawn; when it exceeds 0.5 and
    `source_hd6` holds a dataset at the same path, the dataset is copied from
    `source_hd6`, otherwise it is copied from `source_hd5`.

    :param source_hd5: open h5py file/group whose structure drives the traversal
    :param source_hd6: open h5py file/group that may supply alternative datasets
    :param destination_hd5: writable h5py file/group receiving the copies
    :param group_path: absolute group path to walk; must end with '/'
    """
    for k in source_hd5[group_path]:
        item = source_hd5[group_path][k]
        item_path = group_path + k
        if not isinstance(item, h5py.Dataset):
            # Anything that is not a dataset is treated as a group and descended into.
            random_copy_hd5_datasets(source_hd5, source_hd6, destination_hd5, group_path=item_path + '/')
            continue

        chosen = item
        dice = np.random.rand()
        if dice > 0.5 and item_path in source_hd6:
            candidate = source_hd6[item_path]
            # The second file may hold a group where the first holds a dataset;
            # a group has no `.chunks`, so keep the first file's dataset then.
            if isinstance(candidate, h5py.Dataset):
                chosen = candidate
        _copy_hd5_dataset(chosen, destination_hd5, item_path)

In [None]:
# Build synthetic 'fake_{i}.hd5' files, each a random mix of the datasets of two
# files that are adjacent in the directory listing. Only listing indices 0..10 are used.
files = os.listdir(DATA_DIRECTORY)
# Pair each entry with its successor. zip() stops at the shorter sequence, so the
# last entry is never paired with a non-existent files[i + 1] (previously an IndexError).
for i, (file1, file2) in enumerate(zip(files[:11], files[1:])):
    if not (file1.endswith('.hd5') and file2.endswith('.hd5')):
        continue
    # Sources are opened explicitly read-only; only the output file is opened for writing.
    with h5py.File(os.path.join(DATA_DIRECTORY, file1), 'r') as hd5, \
            h5py.File(os.path.join(DATA_DIRECTORY, file2), 'r') as hd6, \
            h5py.File(f'fake_{i}.hd5', 'w') as hdfake:
        random_copy_hd5_datasets(hd5, hd6, hdfake)

In [None]:
# Maps each rhythm label returned by semi_coarse_rhythm to a Counter of the
# 'ecg_rest_text' entries seen for files with that label (filled by the loop below).
summary_dict = defaultdict(Counter)
def return_rhythm_class(hd5):
    """Classify a recording into a coarse rhythm class using its 'categorical' group.

    Checks are applied in priority order:
    1. 'poor_data_quality' present            -> 'poor_data_quality'
    2. any of the sinus rhythm labels present -> 'Sinus_rhythm'
    3. 'Atrial_fibrillation' present          -> 'Atrial_fibrillation'
    4. otherwise                              -> 'Other_rhythm'

    :param hd5: mapping-like object (e.g. an open h5py file) with a 'categorical'
        entry that supports membership tests by label name
    :return: one of the four class names listed above
    """
    categorical = hd5['categorical']
    if 'poor_data_quality' in categorical:
        return 'poor_data_quality'
    sinus_labels = ('Normal_sinus_rhythm', 'Sinus_bradycardia', 'Marked_sinus_bradycardia')
    if any(label in categorical for label in sinus_labels):
        return 'Sinus_rhythm'
    if 'Atrial_fibrillation' in categorical:
        # Previously returned the misspelled 'Atrial_fibirillation', which did not
        # match the label used by new_rhythm, semi_coarse_rhythm and channel_map.
        return 'Atrial_fibrillation'
    return 'Other_rhythm'

def new_rhythm(hd5):
    """Classify a recording's rhythm from the first 'ecg_rest_text' entry.

    A 'poor_data_quality' entry in the 'categorical' group takes precedence over
    everything else. Otherwise the first element of 'ecg_rest_text' is converted
    with str() and searched (case-sensitively) for 'Atrial fibrillation', then
    for 'sinus' / 'Sinus'.

    :param hd5: mapping-like object with 'categorical' and 'ecg_rest_text' entries
    :return: 'poor_data_quality', 'Atrial_fibrillation', 'Sinus_rhythm' or 'Other_rhythm'
    """
    if 'poor_data_quality' in hd5['categorical']:
        return 'poor_data_quality'

    interpretation = str(hd5['ecg_rest_text'][0])
    if 'Atrial fibrillation' in interpretation:
        return 'Atrial_fibrillation'
    if any(token in interpretation for token in ('sinus', 'Sinus')):
        return 'Sinus_rhythm'
    return 'Other_rhythm'

# Rhythm label -> row index used by the contingency tables built further down.
channel_map = {
    'Normal_sinus_rhythm': 0,
    'Sinus_bradycardia': 1,
    'Marked_sinus_bradycardia': 2,
    'Other_sinus_rhythm': 3,
    'Atrial_fibrillation': 4,
    'Other_rhythm': 5,
}


def semi_coarse_rhythm(hd5):
    """Return the channel_map label that best describes the recording's rhythm.

    The first channel_map key (in dictionary order) that is present in the
    'categorical' group wins. If none is present, the first 'ecg_rest_text'
    entry is converted with str() and searched case-sensitively:
    'Atrial fibrillation' -> 'Atrial_fibrillation', then 'sinus' / 'Sinus' ->
    'Other_sinus_rhythm'; anything else yields 'Other_rhythm'.

    Note: unlike new_rhythm, no 'poor_data_quality' short-circuit is applied
    here (that check was commented out).

    :param hd5: mapping-like object with 'categorical' and 'ecg_rest_text' entries
    :return: one of the keys of channel_map
    """
    # Read the free-text interpretation first, so a missing 'ecg_rest_text'
    # fails even when a categorical label would have matched.
    interpretation = str(hd5['ecg_rest_text'][0])
    categorical = hd5['categorical']

    matched = next((label for label in channel_map if label in categorical), None)
    if matched is not None:
        return matched

    if 'Atrial fibrillation' in interpretation:
        return 'Atrial_fibrillation'
    if 'sinus' in interpretation or 'Sinus' in interpretation:
        return 'Other_sinus_rhythm'
    return 'Other_rhythm'

# Cross-tabulate the rhythm label of every '.hd5' file against two flags stored in the file.
num_files = 0
# One row per channel_map rhythm class; column 0 = flag absent/zero, column 1 = flag non-zero.
a_fib_tab = np.zeros((len(channel_map), 2))
brady_tab = np.zeros((len(channel_map), 2))
for file in os.listdir(DATA_DIRECTORY):
    if not file.endswith('.hd5'):
        continue
    num_files += 1
    if num_files % 1000 == 0:
        print('.')  # progress marker every 1000 files
    # Explicit read-only mode: nothing is written to the source files.
    with h5py.File(os.path.join(DATA_DIRECTORY, file), 'r') as hd5:
        # Classify once per file and reuse the label for the summary and both tables.
        rhythm = semi_coarse_rhythm(hd5)
        summary_dict[rhythm].update(hd5['ecg_rest_text'][:])
        a_fib = 1 if 'atrial_fibrillation_or_flutter' in hd5 and int(hd5['atrial_fibrillation_or_flutter'][0]) != 0 else 0
        brady = 1 if 'bradyarrhythmia_general_inclusive_definition' in hd5 and int(hd5['bradyarrhythmia_general_inclusive_definition'][0]) != 0 else 0
        row = channel_map[rhythm]
        a_fib_tab[row, a_fib] += 1
        brady_tab[row, brady] += 1
# Disabled CSV export of summary_dict; re-enabling it requires `import csv`.
# with open('semi_coarse_rhythm_summary2.csv', mode='w') as file_:
#     file_writer = csv.writer(file_, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
#     for key, counter in summary_dict.items():
#         for text, count in counter.most_common():
#             file_writer.writerow([key, text, count])
print(a_fib_tab)

In [None]:
from matplotlib.table import Table
# Column header labels for the table drawn by checkerboard_table. The trailing
# comment keeps the alternative pair of labels for the aFib table.
afibs = ['No EHR Bradyarrhythmia', 'EHR Bradyarrhythmia']  # ['No EHR aFib', 'EHR aFib']
# Table dimensions: one row per channel_map entry, one column per header in afibs.
nrows = len(channel_map)
ncols = len(afibs)
def checkerboard_table(data, fmt='{:1.0f}', bkg_colors=('yellow', 'white'), col_labels=None, row_labels=None):
    """Draw a 2-D array as a table with alternating (checkerboard) cell colors.

    Cell sizes are derived from the shape of `data` itself, so the table stays
    consistent with its contents regardless of module-level settings.

    :param data: 2-D array-like of values to display (ValueError if not 2-D)
    :param fmt: format string applied to every value
    :param bkg_colors: two colors; cell (i, j) uses bkg_colors[(i + j) % 2]
    :param col_labels: column headers, left to right; defaults to the module-level `afibs`
    :param row_labels: row headers, top to bottom; defaults to the keys of `channel_map`
    :return: the matplotlib Figure holding the table
    """
    data = np.asarray(data)
    n_rows, n_cols = data.shape
    if col_labels is None:
        col_labels = afibs
    if row_labels is None:
        row_labels = list(channel_map.keys())

    fig, ax = plt.subplots(figsize=(16, 16))
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    # max(..., 1) keeps an empty array from raising ZeroDivisionError.
    width, height = 1.0 / max(n_cols, 1), 1.0 / max(n_rows, 1)

    # Add cells, alternating the background color in a checkerboard pattern.
    for (i, j), val in np.ndenumerate(data):
        color = bkg_colors[(i + j) % 2]
        tb.add_cell(i, j, width, height, text=fmt.format(val), loc='center', facecolor=color)

    # Labels are annotations in axes-fraction coordinates around the table.
    col_offset = height / 2
    row_offset = -width / 6
    font_size = 28
    for count, string in enumerate(col_labels):
        ax.annotate('  ' + string, xy=(count * width, 1), xycoords='axes fraction', ha='left', va='bottom',
                    rotation=30, size=font_size)
    # Table row 0 is drawn at the top while the y coordinate grows upwards,
    # so the row labels are placed bottom-up in reversed order.
    for count, string in enumerate(reversed(list(row_labels))):
        ax.annotate('  ' + string, xy=(row_offset, col_offset + count * height), xycoords='axes fraction', ha='right',
                    va='center', size=font_size)

    tb.auto_set_font_size(False)
    tb.set_fontsize(font_size)
    ax.add_table(tb)
    return fig

# Draw the bradyarrhythmia table (brady_tab, built by the tabulation loop) and display the figure.
checkerboard_table(brady_tab)
plt.show()