In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
import tensorflow as tf
# noinspection PyUnresolvedReferences
from tensorflow.keras.models import Sequential
# noinspection PyUnresolvedReferences
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization

# Constants
SAMPLING_RATE = 500  # Hz
WINDOW_SIZE = 2 * SAMPLING_RATE
STEP_SIZE = 1 * SAMPLING_RATE
ecg_folder = "../../../../Datasets/12-lead electrocardiogram database/ECGData"
diagnostics_file = "../../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"

# Label mapping
rhythm_mapping = {
    'AFIB': 'AFIB',
    'AF': 'AFIB',
    'SVT': 'GSVT',
    'AT': 'GSVT',
    'SAAWR': 'GSVT',
    'ST': 'GSVT',
    'AVNRT': 'GSVT',
    'AVRT': 'GSVT',
    'SB': 'SB',
    'SR': 'SR',
    'SA': 'SR'
}

# Load diagnostics data
diagnostics_df = pd.read_excel(diagnostics_file)
diagnostics_df['Rhythm'] = diagnostics_df['Rhythm'].map(rhythm_mapping)

2024-11-27 12:04:12.482559: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-27 12:04:12.493308: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-27 12:04:12.496560: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-27 12:04:12.505958: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
diagnostics_df

Unnamed: 0,FileName,Rhythm,Beat,PatientAge,Gender,VentricularRate,AtrialRate,QRSDuration,QTInterval,QTCorrected,RAxis,TAxis,QRSCount,QOnset,QOffset,TOffset
0,MUSE_20180113_171327_27000,AFIB,RBBB TWC,85,MALE,117,234,114,356,496,81,-27,19,208,265,386
1,MUSE_20180112_073319_29000,SB,TWC,59,FEMALE,52,52,92,432,401,76,42,8,215,261,431
2,MUSE_20180111_165520_97000,SR,NONE,20,FEMALE,67,67,82,382,403,88,20,11,224,265,415
3,MUSE_20180113_121940_44000,SB,NONE,66,MALE,53,53,96,456,427,34,3,9,219,267,447
4,MUSE_20180112_122850_57000,AFIB,STDD STTC,73,FEMALE,162,162,114,252,413,68,-40,26,228,285,354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10641,MUSE_20181222_204306_99000,GSVT,NONE,80,FEMALE,196,73,168,284,513,258,244,32,177,261,319
10642,MUSE_20181222_204309_22000,GSVT,NONE,81,FEMALE,162,81,162,294,482,110,-75,27,173,254,320
10643,MUSE_20181222_204310_31000,GSVT,NONE,39,MALE,152,92,152,340,540,250,38,25,208,284,378
10644,MUSE_20181222_204312_58000,GSVT,NONE,76,MALE,175,178,128,310,529,98,-83,29,205,269,360


In [3]:
from scipy.signal import butter, filtfilt

# Define a Butterworth bandpass filter
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    return b, a

# Apply the Butterworth bandpass filter
def bandpass_filter(data, lowcut=0.5, highcut=50.0, fs=500, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    filtered_data = filtfilt(b, a, data)
    return filtered_data


def preprocess_ecg_data(ecg_folder, diagnostics_df):
    segments = []
    segment_labels = []

    for _, row in tqdm(diagnostics_df.iterrows(), total=diagnostics_df.shape[0]):
        file_name = row['FileName']
        rhythm_label = row['Rhythm']

        # Skip if rhythm label is invalid
        if pd.isnull(rhythm_label) or rhythm_label not in rhythm_mapping.values():
            print("Invalid rhythm label", rhythm_label)
            continue

        # Load ECG file
        ecg_file = os.path.join(ecg_folder, f"{file_name}.csv")
        if not os.path.exists(ecg_file):
            print("File not found", ecg_file)
            continue

        ecg_data = pd.read_csv(ecg_file, header=0).iloc[:, 1].values
        ecg_data = ecg_data.astype(float)

        ecg_data = bandpass_filter(ecg_data, lowcut=0.5, highcut=50.0, fs=500)

        # Normalize the signal
        # ecg_data = (ecg_data - np.mean(ecg_data)) / np.std(ecg_data)
        # print(len(ecg_data))
        # Segment the data using sliding window
        for start in range(0, len(ecg_data) - WINDOW_SIZE + 1, STEP_SIZE):
            segment = ecg_data[start:start + WINDOW_SIZE]
            segments.append(segment)
            segment_labels.append(rhythm_label)

    return np.array(segments), np.array(segment_labels)


# Preprocess data
segments, segment_labels = preprocess_ecg_data(ecg_folder, diagnostics_df)
segments

100%|██████████| 10646/10646 [00:58<00:00, 183.38it/s]


array([[ -54.80315802,  -52.03122887,  -50.81890882, ..., -106.69487746,
         -93.32025106,  -80.6932936 ],
       [  16.71588221,   26.90299297,   31.29498008, ...,   38.26315102,
          33.3374666 ,   26.74318279],
       [ -70.11533772,  -62.67772659,  -59.09794289, ...,  -30.88801473,
         -33.07046557,  -33.53648724],
       ...,
       [  -7.04697507,   -2.45773578,    1.7604805 , ..., -157.73240267,
        -155.50880072, -153.71140666],
       [ -35.10222605,  -30.23766349,  -24.70039684, ..., -130.82928096,
        -124.10242375, -117.10045159],
       [-152.25653906, -150.93842796, -149.45201828, ...,   86.40206903,
          82.0118788 ,   77.65999621]])

In [4]:
# Encode labels
label_encoder = LabelEncoder()
segment_labels_encoded = label_encoder.fit_transform(segment_labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(segments, segment_labels_encoded, test_size=0.2, random_state=42)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train.reshape(X_train.shape[0], -1), y_train)
y_pred_dt = dt.predict(X_test.reshape(X_test.shape[0], -1))
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))
print("Depth of the tree", dt.get_depth())
print("Leaf nodes of the tree", dt.get_n_leaves())

Decision Tree Classification Report:
              precision    recall  f1-score   support

        AFIB       0.36      0.36      0.36      3963
        GSVT       0.59      0.58      0.59      4051
          SB       0.71      0.71      0.71      7037
          SR       0.48      0.50      0.49      4112

    accuracy                           0.57     19163
   macro avg       0.54      0.54      0.54     19163
weighted avg       0.57      0.57      0.57     19163

Depth of the tree 76
Leaf nodes of the tree 11490


In [5]:
# MLP
mlp = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])
mlp.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
mlp.fit(X_train, y_train, epochs=100, batch_size=1024, validation_data=(X_test, y_test))
y_pred_mlp = np.argmax(mlp.predict(X_test), axis=1)
print("MLP Classification Report:")
print(classification_report(y_test, y_pred_mlp, target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1732687668.193342  163749 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732687668.224041  163749 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732687668.225301  163749 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732687668.22767

Epoch 1/100


I0000 00:00:1732687669.718019  165943 service.cc:146] XLA service 0x7f599800d1f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732687669.718050  165943 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-11-27 12:07:49.752868: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-27 12:07:49.867306: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907




[1m74/75[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.3246 - loss: 1.8421

I0000 00:00:1732687671.396825  165943 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.3257 - loss: 1.8367 - val_accuracy: 0.5123 - val_loss: 1.2850
Epoch 2/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4542 - loss: 1.3119 - val_accuracy: 0.6044 - val_loss: 0.9977
Epoch 3/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5152 - loss: 1.1650 - val_accuracy: 0.6240 - val_loss: 0.9427
Epoch 4/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5508 - loss: 1.0777 - val_accuracy: 0.6432 - val_loss: 0.9078
Epoch 5/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5750 - loss: 1.0325 - val_accuracy: 0.6505 - val_loss: 0.8815
Epoch 6/100
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5879 - loss: 1.0033 - val_accuracy: 0.6596 - val_loss: 0.8569
Epoch 7/100
[1m75/75[0m [32m━━━━━━━━━━━━━━

In [6]:
# Standardize input shape for CNN
X_train_cnn = X_train[..., np.newaxis]
X_test_cnn = X_test[..., np.newaxis]

# CNN
cnn = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])
cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn.fit(X_train_cnn, y_train, epochs=50, batch_size=64, validation_data=(X_test_cnn, y_test))
y_pred_cnn = np.argmax(cnn.predict(X_test_cnn), axis=1)
print("CNN Classification Report:")
print(classification_report(y_test, y_pred_cnn, target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.4138 - loss: 3.7438 - val_accuracy: 0.7395 - val_loss: 0.7103
Epoch 2/50
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.6801 - loss: 0.8278 - val_accuracy: 0.7853 - val_loss: 0.5922
Epoch 3/50
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7369 - loss: 0.6960 - val_accuracy: 0.8021 - val_loss: 0.5351
Epoch 4/50
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7669 - loss: 0.6236 - val_accuracy: 0.8103 - val_loss: 0.5115
Epoch 5/50
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7887 - loss: 0.5640 - val_accuracy: 0.8212 - val_loss: 0.5006
Epoch 6/50
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8077 - loss: 0.5166 - val_accuracy: 0.8301 - val_loss: 0.4708
Epoch 7/50
[1m