In [1]:
import os

import numpy as np
import pandas as pd
from scipy.io import loadmat

diags = [
    { 
        'desc': 'Atrial Fibrilation', 
        'diag': 'AF', 
        'code': '164889003' 
    },
    {
        'desc': 'Sinus Rhythm',
        'diag': 'SR',
        'code': '426783006'
    }
]
diags_code = np.array(list(map(lambda x: x['code'], diags)))
path_folder = '../databases/'
available_databases = os.listdir(path_folder)
available_databases.remove('.gitkeep')
available_databases



['WFDB_Ga',
 'WFDB_StPetersburg',
 'WFDB_PTB',
 'WFDB_Ningbo',
 'WFDB_CPSC2018_2',
 'WFDB_CPSC2018',
 'WFDB_PTBXL',
 'WFDB_ChapmanShaoxing']

In [13]:
recordings = []

for path in available_databases:
    path = f'../databases/{path}/'
    headers = list(filter(lambda x: os.path.splitext(x)[1] == '.hea', 
                          sorted(os.listdir(path))))  
    hea_file = open(path + headers[0]).read()
    freq = int(hea_file.split('\n')[0].split(' ')[2])
           
    for filename in headers:
        filename, extension = os.path.splitext(filename)
        record_path = path + filename + '.mat'
        header_path = path + filename + '.hea'
        
        if (not os.path.isfile(record_path) or 
            not os.path.isfile(header_path)):
            continue
            
        header_file = open(header_path, 'r').read()
        
        rec_diags = []
        for row in header_file.split('\n'):
            if not row.startswith('#Dx'):
                continue
            entries = row.split(': ')[1].split(',')
            for entry in entries:
                rec_diags.append(entry.strip())

        rec_diags = np.array(rec_diags)
        diags_code = np.array(diags_code)
        intersec = np.intersect1d(rec_diags, diags_code)
        
        # if not intersec.size == 1 and rec_diags == 1:
        if not intersec.size == 1:
            '''
             0 - not found diag
             1 - found only one diag in diags
             2 - found two in diags? af and sr
            '''
            continue
            
        leads, sampling = loadmat(record_path)['val'].shape
        
        if int(leads) != 12:
            continue
        
        database = record_path.split('/')[-2]
        filename = record_path.split('/')[-1]
        diagnose = list(filter(lambda x: x['code'] == intersec[0], diags))[0]['diag']
        total_diags = int(rec_diags.size)
        time = float(sampling/freq)
        
        recordings.append([database, 
                           filename, 
                           diagnose, 
                           total_diags, 
                           freq,
                           time])
    print(len(recordings), path)

2322 ../databases/WFDB_Ga/
2324 ../databases/WFDB_StPetersburg/
2419 ../databases/WFDB_PTB/
8718 ../databases/WFDB_Ningbo/
8875 ../databases/WFDB_CPSC2018_2/
11014 ../databases/WFDB_CPSC2018/
30546 ../databases/WFDB_PTBXL/
33169 ../databases/WFDB_ChapmanShaoxing/


In [14]:
df = pd.DataFrame(recordings, columns=['db', 
                                       'filename', 
                                       'diag', 
                                       'total_diags', 
                                       'db_freq',
                                       'time'])
df.head()
print(df.shape)

(33169, 6)


In [15]:
df = df.sample(frac=1, ignore_index=True, random_state=32)
df.head()

Unnamed: 0,db,filename,diag,total_diags,db_freq,time
0,WFDB_CPSC2018,A4644.mat,SR,1,500,10.0
1,WFDB_PTBXL,HR15248.mat,SR,1,500,10.0
2,WFDB_Ga,E02092.mat,SR,1,500,10.0
3,WFDB_PTBXL,HR18712.mat,SR,5,500,10.0
4,WFDB_PTBXL,HR21119.mat,AF,4,500,10.0


In [16]:
print(f"AF: {str(df[df.diag == 'AF'].shape[0])} rows")
print(f"SR: {str(df[df.diag == 'SR'].shape[0])} rows")
df.groupby(['diag', 'db']).size()

AF: 4711 rows
SR: 28458 rows


diag  db                  
AF    WFDB_CPSC2018            1221
      WFDB_CPSC2018_2           153
      WFDB_ChapmanShaoxing     1273
      WFDB_Ga                   570
      WFDB_PTB                   15
      WFDB_PTBXL               1477
      WFDB_StPetersburg           2
SR    WFDB_CPSC2018             918
      WFDB_CPSC2018_2             4
      WFDB_ChapmanShaoxing     1350
      WFDB_Ga                  1752
      WFDB_Ningbo              6299
      WFDB_PTB                   80
      WFDB_PTBXL              18055
dtype: int64

In [18]:
df.to_csv('../workdata/dohko/dohko.csv', index=False)