In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
import tensorflow as tf
# noinspection PyUnresolvedReferences
from tensorflow.keras.models import Sequential
# noinspection PyUnresolvedReferences
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization

# Constant seeding
# tf.random.set_seed() only seeds TensorFlow's own generators. The Keras helper
# below seeds Python's `random`, NumPy and TensorFlow in one call, so any
# randomness drawn from the other two (presumably Keras weight initialisation
# and Dropout on recent Keras versions - verify for the installed version)
# is reproducible as well.
SEED = 5950
tf.keras.utils.set_random_seed(SEED)

# Constants
# Windowing is expressed in seconds and converted to sample counts below.
SAMPLING_RATE = 500  # Hz
WINDOW_SECONDS = 2
STEP_SECONDS = 1
WINDOW_SIZE = WINDOW_SECONDS * SAMPLING_RATE  # samples per segment
STEP_SIZE = STEP_SECONDS * SAMPLING_RATE  # samples between consecutive segment starts

# Dataset locations, relative to the notebook's working directory.
ecg_folder = "../../../../Datasets/12-lead electrocardiogram database/ECGData"
diagnostics_file = "../../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"

# Label mapping
# Each merged class (key) lists the raw rhythm codes that collapse onto it.
_RHYTHM_GROUPS = {
    'AFIB': ('AFIB', 'AF'),
    'GSVT': ('SVT', 'AT', 'SAAWR', 'ST', 'AVNRT', 'AVRT'),
    'SB': ('SB',),
    'SR': ('SR', 'SA'),
}

# Flatten the groups into the raw-code -> merged-class lookup used by the rest
# of the notebook.
rhythm_mapping = {
    raw_code: merged_class
    for merged_class, raw_codes in _RHYTHM_GROUPS.items()
    for raw_code in raw_codes
}

# Load diagnostics data
# Read the diagnostics sheet and, in the same expression, replace the raw
# rhythm code of every row with its merged class. Codes that have no entry in
# rhythm_mapping become NaN.
diagnostics_df = pd.read_excel(diagnostics_file).assign(
    Rhythm=lambda frame: frame['Rhythm'].map(rhythm_mapping)
)

2024-11-27 11:53:34.600836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-27 11:53:34.611933: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-27 11:53:34.615328: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-27 11:53:34.624441: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Display the diagnostics table; its Rhythm column already holds the merged
# class labels produced by the mapping step in the first cell.
diagnostics_df

Unnamed: 0,FileName,Rhythm,Beat,PatientAge,Gender,VentricularRate,AtrialRate,QRSDuration,QTInterval,QTCorrected,RAxis,TAxis,QRSCount,QOnset,QOffset,TOffset
0,MUSE_20180113_171327_27000,AFIB,RBBB TWC,85,MALE,117,234,114,356,496,81,-27,19,208,265,386
1,MUSE_20180112_073319_29000,SB,TWC,59,FEMALE,52,52,92,432,401,76,42,8,215,261,431
2,MUSE_20180111_165520_97000,SR,NONE,20,FEMALE,67,67,82,382,403,88,20,11,224,265,415
3,MUSE_20180113_121940_44000,SB,NONE,66,MALE,53,53,96,456,427,34,3,9,219,267,447
4,MUSE_20180112_122850_57000,AFIB,STDD STTC,73,FEMALE,162,162,114,252,413,68,-40,26,228,285,354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10641,MUSE_20181222_204306_99000,GSVT,NONE,80,FEMALE,196,73,168,284,513,258,244,32,177,261,319
10642,MUSE_20181222_204309_22000,GSVT,NONE,81,FEMALE,162,81,162,294,482,110,-75,27,173,254,320
10643,MUSE_20181222_204310_31000,GSVT,NONE,39,MALE,152,92,152,340,540,250,38,25,208,284,378
10644,MUSE_20181222_204312_58000,GSVT,NONE,76,MALE,175,178,128,310,529,98,-83,29,205,269,360


In [3]:
def preprocess_ecg_data(ecg_folder, diagnostics_df):
    """Slice one signal column of every labelled recording into fixed-size windows.

    For each row of ``diagnostics_df`` the file ``<ecg_folder>/<FileName>.csv``
    is read, the column at positional index 1 is taken as the signal, and it is
    cut into windows of ``WINDOW_SIZE`` samples whose starts are ``STEP_SIZE``
    samples apart. Trailing samples that do not fill a whole window are
    dropped. The signal is used as read; no normalisation is applied.

    Rows are skipped (with a printed message) when the rhythm label is missing
    or is not one of the merged classes in ``rhythm_mapping``, or when the CSV
    file does not exist.

    Parameters
    ----------
    ecg_folder : str
        Directory that contains one ``<FileName>.csv`` per recording.
    diagnostics_df : pandas.DataFrame
        Must provide the columns ``FileName`` and ``Rhythm``.

    Returns
    -------
    segments : numpy.ndarray
        Float array of shape ``(n_segments, WINDOW_SIZE)``. When nothing could
        be extracted the shape is ``(0, WINDOW_SIZE)``.
    segment_labels : numpy.ndarray
        Array of shape ``(n_segments,)`` holding the rhythm label of the
        recording each segment was cut from.
    """
    # Built once instead of re-scanning the mapping values for every row.
    valid_labels = set(rhythm_mapping.values())

    segments = []
    segment_labels = []

    # Only two columns are needed, so iterate over them directly rather than
    # materialising a Series per row with iterrows().
    records = zip(diagnostics_df['FileName'], diagnostics_df['Rhythm'])
    for file_name, rhythm_label in tqdm(records, total=diagnostics_df.shape[0]):
        # Skip if rhythm label is invalid
        if pd.isnull(rhythm_label) or rhythm_label not in valid_labels:
            print("Invalid rhythm label", rhythm_label)
            continue

        # Load ECG file
        ecg_file = os.path.join(ecg_folder, f"{file_name}.csv")
        if not os.path.exists(ecg_file):
            print("File not found", ecg_file)
            continue

        # Parse only the column that is actually used (positional index 1).
        ecg_data = pd.read_csv(ecg_file, header=0, usecols=[1]).iloc[:, 0].to_numpy()
        ecg_data = ecg_data.astype(float)

        # Segment the data using sliding window; the range is empty when the
        # recording is shorter than one window.
        starts = range(0, len(ecg_data) - WINDOW_SIZE + 1, STEP_SIZE)
        segments.extend(ecg_data[start:start + WINDOW_SIZE] for start in starts)
        segment_labels.extend([rhythm_label] * len(starts))

    if not segments:
        # np.array([]) would be 1-D; keep the documented 2-D layout so callers
        # can still rely on segments.shape[1].
        return np.empty((0, WINDOW_SIZE), dtype=float), np.array([], dtype=str)

    return np.array(segments), np.array(segment_labels)


# Preprocess data
# Build the windowed dataset from the recordings listed in diagnostics_df,
# then echo the segment matrix (one row per window) as the cell output.
segments, segment_labels = preprocess_ecg_data(ecg_folder, diagnostics_df)
segments

100%|██████████| 10646/10646 [00:56<00:00, 189.99it/s]


array([[ 263.52,  263.52,  263.52, ...,    0.  ,  -19.52,  -14.64],
       [ -68.32,  -58.56,  -53.68, ...,  -53.68,  -53.68,  -78.08],
       [   0.  ,  -14.64,  -19.52, ...,  112.24,  126.88,  141.52],
       ...,
       [ 136.64,  131.76,  136.64, ...,  -14.64,   -4.88,    4.88],
       [  73.2 ,   68.32,   73.2 , ..., -107.36,  -97.6 ,  -97.6 ],
       [   4.88,    4.88,    9.76, ..., -214.72, -224.48, -234.24]])

In [4]:
# Encode labels
# Turn the class-name strings into integer ids; the fitted encoder's classes_
# are reused later as target names in the classification reports.
label_encoder = LabelEncoder()
segment_labels_encoded = label_encoder.fit_transform(segment_labels)

# Train-test split
# 80/20 split performed per segment with a fixed random_state.
# NOTE(review): windows overlap (STEP_SIZE < WINDOW_SIZE) and windows cut from
# the same recording can end up on both sides of this split, so test scores are
# likely optimistic. Consider splitting per recording (group-wise) instead;
# the split is also not stratified by class.
X_train, X_test, y_train, y_test = train_test_split(segments, segment_labels_encoded, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Decision Tree Classification Report:
              precision    recall  f1-score   support

        AFIB       0.29      0.30      0.29      1740
        GSVT       0.51      0.47      0.49      1868
          SB       0.66      0.67      0.66      3100
          SR       0.40      0.39      0.40      1809

    accuracy                           0.49      8517
   macro avg       0.46      0.46      0.46      8517
weighted avg       0.49      0.49      0.49      8517

Depth of the tree 98
Leaf nodes of the tree 5361


In [5]:
# MLP
# Fully connected classifier that takes the samples of one window as a flat
# feature vector and outputs one softmax probability per rhythm class.
mlp = Sequential([
    # Declare the input with an explicit Input object; passing input_shape to
    # the first Dense layer makes Keras emit a warning when the model is built.
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical cross-entropy.
mlp.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data. No callback acts on
# it here, so it only feeds the per-epoch val_* metrics, but a separate
# validation split would be needed before adding early stopping or tuning.
mlp.fit(X_train, y_train, epochs=100, batch_size=1024, validation_data=(X_test, y_test))

# Predicted class = index of the highest softmax probability.
y_pred_mlp = np.argmax(mlp.predict(X_test), axis=1)
print("MLP Classification Report:")
print(classification_report(y_test, y_pred_mlp, target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1732686932.751535  146919 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732686932.791939  146919 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732686932.799561  146919 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732686932.80640

Epoch 1/100


I0000 00:00:1732686934.292837  149296 service.cc:146] XLA service 0x705ce8002da0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732686934.292875  149296 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-11-27 11:55:34.329007: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-27 11:55:34.442135: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907




[1m21/34[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.2547 - loss: 2.1497 

I0000 00:00:1732686935.985014  149296 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 75ms/step - accuracy: 0.2736 - loss: 2.0485 - val_accuracy: 0.4086 - val_loss: 1.8751
Epoch 2/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4123 - loss: 1.4548 - val_accuracy: 0.5530 - val_loss: 1.1695
Epoch 3/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4781 - loss: 1.2791 - val_accuracy: 0.6015 - val_loss: 1.0106
Epoch 4/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5167 - loss: 1.1629 - val_accuracy: 0.6319 - val_loss: 0.9427
Epoch 5/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5467 - loss: 1.1003 - val_accuracy: 0.6541 - val_loss: 0.8887
Epoch 6/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5725 - loss: 1.0378 - val_accuracy: 0.6663 - val_loss: 0.8543
Epoch 7/100
[1m34/34[0m [32m━━━━━━━━━━━━━━

In [None]:
# Decision Tree
# Baseline classifier: a decision tree with default (unconstrained) settings,
# fitted on the windows flattened to one feature row per segment.
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

dt = DecisionTreeClassifier(random_state=42).fit(X_train_flat, y_train)
y_pred_dt = dt.predict(X_test_flat)

# Per-class metrics first, then the size of the fitted tree.
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))
print("Depth of the tree", dt.get_depth())
print("Leaf nodes of the tree", dt.get_n_leaves())