In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast

def load_raw_data(df, sampling_rate):
    """Read every record listed in ``df`` with ``wfdb.rdsamp`` and stack the signals.

    Parameters
    ----------
    df
        Object exposing ``filename_lr`` and ``filename_hr`` iterables of
        record names that ``wfdb.rdsamp`` can open.
    sampling_rate
        ``100`` selects the names in ``filename_lr``; any other value selects
        ``filename_hr``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the first element (the signal) of each
        ``wfdb.rdsamp`` result, in the same order as the record names.
    """
    record_names = df.filename_lr if sampling_rate == 100 else df.filename_hr

    signals = []
    for record_name in record_names:
        # rdsamp yields a (signal, metadata) pair; only the signal is kept.
        signal, _metadata = wfdb.rdsamp(record_name)
        signals.append(signal)

    return np.array(signals)

sampling_rate = 100

# Read the annotation table; scp_codes is stored as Python-literal text,
# so each cell is parsed back into an object with ast.literal_eval.
df = pd.read_csv('ptbxl_database.csv')
df['scp_codes'] = df['scp_codes'].apply(ast.literal_eval)

# Read the raw signal of every record listed in the annotation table.
Signals = load_raw_data(df, sampling_rate)

# Read the SCP statement table (first column as index) and keep only the
# rows flagged diagnostic == 1; it is used for diagnostic aggregation.
agg_df = pd.read_csv('scp_statements.csv', index_col=0).loc[
    lambda statements: statements['diagnostic'] == 1
]

def aggregate_diagnostic(y_dic):
    """Map one record's SCP codes to its distinct diagnostic classes.

    Parameters
    ----------
    y_dic : dict
        Mapping whose keys are looked up in the index of the module-level
        ``agg_df``. The values are not read.

    Returns
    -------
    list
        The ``diagnostic_class`` entry of every key present in
        ``agg_df.index``, de-duplicated and ordered by first appearance in
        ``y_dic``. Keys absent from the index are skipped, so the result may
        be empty.
    """
    classes = [
        agg_df.at[key, 'diagnostic_class']
        for key in y_dic
        if key in agg_df.index
    ]
    # dict.fromkeys drops duplicates while keeping insertion order, so the
    # returned order is the same on every run (a set of strings can iterate
    # in a different order from one interpreter run to the next).
    return list(dict.fromkeys(classes))

# Derive each record's diagnostic superclasses from its SCP codes.
# reset_index() moves the current index into a regular column and renumbers
# the rows from 0 before the labels are computed.
df = df.reset_index()
Labels = df['scp_codes'].apply(aggregate_diagnostic)

In [8]:
def _labels_as_array(labels):
    """Return ``labels`` as an array with one entry per sample along axis 0."""
    if isinstance(labels, np.ndarray):
        return labels
    if hasattr(labels, "to_numpy"):
        # pandas objects: take the values positionally, ignoring the index.
        return labels.to_numpy()
    items = list(labels)
    try:
        return np.asarray(items)
    except ValueError:
        # Variable-length label lists cannot form a rectangular array;
        # keep each one as a single object element instead.
        ragged = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            ragged[position] = item
        return ragged


def augment_signals(signals, labels, augmentation_factor = 5, noise_factor=0.01, rng=None):
    """Append Gaussian-noise copies of every signal, with matching labels.

    Parameters
    ----------
    signals : array-like
        One signal per entry along axis 0.
    labels : array-like, pandas Series/DataFrame, or sequence
        One label per signal, aligned by position. Rectangular labels
        (e.g. a one-hot matrix) stay an N-d array; variable-length label
        lists are kept as elements of a 1-D object array.
    augmentation_factor : int, default 5
        Number of noisy copies generated per signal.
    noise_factor : float, default 0.01
        Multiplier applied to the standard-normal noise added to each copy.
    rng : optional
        Object with a ``standard_normal(size)`` method (for example
        ``np.random.default_rng(seed)``). When omitted, the global
        ``np.random`` state is used, as before.

    Returns
    -------
    X : numpy.ndarray
        The original signals followed by, for each signal in order, its
        ``augmentation_factor`` noisy copies.
    Y : numpy.ndarray
        Labels in the same order as ``X``.

    Raises
    ------
    ValueError
        If ``signals`` and ``labels`` differ in length or
        ``augmentation_factor`` is negative.
    """
    signals = np.asarray(signals)
    label_array = _labels_as_array(labels)
    n_samples = len(signals)

    if len(label_array) != n_samples:
        raise ValueError(
            "signals and labels must have the same length, got "
            f"{n_samples} and {len(label_array)}"
        )
    if augmentation_factor < 0:
        raise ValueError("augmentation_factor must be non-negative")

    noise_source = np.random if rng is None else rng

    # Allocate the result once and fill it in place; this avoids holding a
    # list of per-sample arrays and a second full copy from a final concatenate.
    X = np.empty(
        (n_samples * (augmentation_factor + 1),) + signals.shape[1:],
        dtype=np.result_type(signals.dtype, np.float64),
    )
    X[:n_samples] = signals
    for index in range(n_samples):
        start = n_samples + index * augmentation_factor
        copies = X[start:start + augmentation_factor]  # view into X
        copies[...] = signals[index]
        copies += noise_factor * noise_source.standard_normal(copies.shape)

    # np.repeat yields the same per-sample grouping as the copies written above.
    Y = np.concatenate(
        [label_array, np.repeat(label_array, augmentation_factor, axis=0)]
    )

    return X, Y

In [None]:
# Build the augmented dataset: augment_signals returns the inputs followed by
# their noisy copies, together with the matching labels.
# NOTE(review): the later split cell in this notebook passes Signals, not X/Y,
# to train_test_split — confirm whether the augmented arrays are meant to be used.
X, Y = augment_signals(signals=Signals, labels=Labels)

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Turn each record's list of diagnostic classes into one multi-label indicator row.
mlb = MultiLabelBinarizer()
one_hot_encoded_labels = mlb.fit_transform(Labels)

# Hold out 20% of the records for testing. random_state pins the shuffle so
# the same split is produced on every run.
# NOTE(review): PTB-XL reportedly ships a `strat_fold` column meant for
# patient-wise splits; a random split may place recordings of one patient on
# both sides — confirm against the dataset documentation.
X_train, X_test, y_train, y_test = train_test_split(
    Signals,
    one_hot_encoded_labels,
    test_size=0.2,
    random_state=42,
)