In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast

In [2]:
def load_raw_data(df, sampling_rate, path):
    if sampling_rate == 100:
        data = [wfdb.rdsamp(path+f) for f in df.filename_lr]
    else:
        data = [wfdb.rdsamp(path+f) for f in df.filename_hr]
    data = np.array([signal for signal, meta in data])
    return data

In [4]:
path = 'dat/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/'
sampling_rate=100
Y = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

In [10]:
Y.scp_codes.head()

ecg_id
1    {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2                {'NORM': 80.0, 'SBRAD': 0.0}
3                  {'NORM': 100.0, 'SR': 0.0}
4                  {'NORM': 100.0, 'SR': 0.0}
5                  {'NORM': 100.0, 'SR': 0.0}
Name: scp_codes, dtype: object

In [6]:
# Load raw signal data
X = load_raw_data(Y, sampling_rate, path)

# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

In [9]:
def aggregate_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))

In [17]:
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

# Split data into train and test


In [15]:
Y[['scp_codes','diagnostic_superclass']]

Unnamed: 0_level_0,scp_codes,diagnostic_superclass
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"{'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}",[NORM]
2,"{'NORM': 80.0, 'SBRAD': 0.0}",[NORM]
3,"{'NORM': 100.0, 'SR': 0.0}",[NORM]
4,"{'NORM': 100.0, 'SR': 0.0}",[NORM]
5,"{'NORM': 100.0, 'SR': 0.0}",[NORM]
...,...,...
21833,"{'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...",[STTC]
21834,"{'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}",[NORM]
21835,"{'ISCAS': 50.0, 'SR': 0.0}",[STTC]
21836,"{'NORM': 100.0, 'SR': 0.0}",[NORM]


In [20]:
# convert list to a string
Y['diagnosis_combined'] = Y['diagnostic_superclass'].apply(lambda x: ' '.join(x))

Y[['diagnosis_combined','diagnostic_superclass']]
print(Y['diagnosis_combined'].nunique())

32


In [23]:
print(Y['diagnosis_combined'].value_counts())

diagnosis_combined
NORM              9069
MI                2532
STTC              2400
CD                1708
CD MI             1278
STTC HYP           608
MI STTC            599
HYP                535
CD STTC            471
                   411
CD NORM            407
STTC MI HYP        340
CD HYP             300
CD MI STTC         220
MI HYP             183
HYP STTC           173
STTC CD HYP        156
STTC CD MI HYP     140
CD MI HYP          111
HYP CD STTC         55
STTC NORM           28
HYP MI STTC         21
MI CD               19
HYP CD MI STTC      14
MI CD HYP            6
STTC CD NORM         5
MI CD STTC           3
HYP CD NORM          2
HYP NORM             2
MI STTC CD HYP       1
MI HYP CD NORM       1
MI HYP CD STTC       1
Name: count, dtype: int64


In [None]:
# Split into train and test. 

test_fold = 10
# Train
X_train = X[np.where(Y.strat_fold != test_fold)]
y_train = Y[(Y.strat_fold != test_fold)].diagnostic_superclass
# Test
X_test = X[np.where(Y.strat_fold == test_fold)]
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass