In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
import logging

# Set up logging
logging.basicConfig(filename='ecg_processing.log', level=logging.INFO,
                   format='%(asctime)s:%(levelname)s:%(message)s')

# Constants
SAMPLING_RATE = 500  # Hz
SEQUENCE_LENGTH = 5000  # 10 seconds * 500 Hz
N_LEADS = 12


ecg_folder = "../../../Datasets/12-lead electrocardiogram database/ECGData"
diagnostics_file = "../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"

# Rhythm Mapping
rhythm_mapping = {
    'AFIB': 'AFIB',
    'AF': 'AFIB',
    'SVT': 'GSVT',
    'AT': 'GSVT',
    'SAAWR': 'GSVT',
    'ST': 'GSVT',
    'AVNRT': 'GSVT',
    'AVRT': 'GSVT',
    'SB': 'SB',
    'SR': 'SR',
    'SA': 'SR'
}

2024-12-26 04:24:31.279237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-26 04:24:31.290806: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-26 04:24:31.294341: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-26 04:24:31.304210: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_and_preprocess_data(ecg_folder, diagnostics_file, rhythm_mapping):
    """Load and preprocess ECG data and labels."""
    # Load diagnostics data
    diagnostics = pd.read_excel(diagnostics_file)

    # Map rhythms to reduced set of labels
    diagnostics['Rhythm'] = diagnostics['Rhythm'].map(rhythm_mapping)

    valid_files = []
    valid_data = []
    valid_labels = []

    # Process each ECG file
    for idx, row in diagnostics.iterrows():
        file_path = os.path.join(ecg_folder, row['FileName'] + ".csv")

        try:
            # Load ECG data
            ecg_data = pd.read_csv(file_path, header=0)

            # Check for missing or zero values
            if ecg_data.isnull().any().any() or (ecg_data == 0).all().any():
                logging.warning(f"File {row['FileName']} contains null or all-zero leads - skipped")
                continue

            # Check if data has expected length
            if len(ecg_data) != SEQUENCE_LENGTH:
                logging.warning(f"File {row['FileName']} has unexpected length {len(ecg_data)} - skipped")
                continue

            # Store valid data
            valid_files.append(row['FileName'])
            valid_data.append(ecg_data.values)
            valid_labels.append(row['Rhythm'])

        except Exception as e:
            logging.error(f"Error processing {row['FileName']}: {str(e)}")
            continue

    # Convert to numpy arrays
    X = np.array(valid_data)
    y = np.array(valid_labels)

    return X, y, valid_files

def prepare_data(X, y, use_single_lead=False):
    """Prepare data for training - normalize and split."""
    # Reshape data if using single lead
    if use_single_lead:
        X = X[:, :, 1:2]  # Keep only second lead

    # Reshape for preprocessing
    original_shape = X.shape
    X_reshaped = X.reshape(-1, X.shape[-1])

    # Normalize using z-score
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X_reshaped)
    X = X_normalized.reshape(original_shape)

    # Convert string labels to integer indices
    label_encoder = tf.keras.preprocessing.text.Tokenizer()
    label_encoder.fit_on_texts(y)
    y_encoded = label_encoder.texts_to_sequences(y)
    y_encoded = np.array(y_encoded).reshape(-1)

    # Convert to one-hot encoding
    num_classes = len(label_encoder.word_index)
    y_onehot = tf.keras.utils.to_categorical(y_encoded - 1, num_classes)  # Subtract 1 since word_index starts from 1

    # Get class names in order
    classes = [k for k, v in sorted(label_encoder.word_index.items(), key=lambda x: x[1])]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_onehot, test_size=0.3, random_state=42, stratify=y_onehot
    )

    return X_train, X_test, y_train, y_test, classes

def create_mlp_model(input_shape, num_classes):
    """Create MLP model."""
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=input_shape),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def create_cnn_model(input_shape, num_classes):
    """Create CNN model."""
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 5, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 5, activation='relu'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def train_and_evaluate(model, X_train, X_test, y_train, y_test, classes, model_name):
    """Train model and print evaluation metrics."""
    # Compile model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=200,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )

    # Evaluate model
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    # Print classification report
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test_classes, y_pred_classes,
                                target_names=classes, digits=5))

    return history

# Load and preprocess data
X, y, valid_files = load_and_preprocess_data(ecg_folder, diagnostics_file, rhythm_mapping)
logging.info(f"Successfully processed {len(valid_files)} files")

# Train and evaluate models using all leads
print("\nTraining models with all leads:")
X_train, X_test, y_train, y_test, classes = prepare_data(X, y, use_single_lead=False)

# Train MLP
# mlp_model = create_mlp_model((SEQUENCE_LENGTH, N_LEADS), len(classes))
# mlp_history = train_and_evaluate(mlp_model, X_train, X_test, y_train, y_test,
#                                  classes, "MLP (All Leads)")

# Train CNN
cnn_model = create_cnn_model((SEQUENCE_LENGTH, N_LEADS), len(classes))
cnn_history = train_and_evaluate(cnn_model, X_train, X_test, y_train, y_test,
                                 classes, "CNN (All Leads)")

# Train and evaluate models using single lead
print("\nTraining models with single lead:")
X_train, X_test, y_train, y_test, classes = prepare_data(X, y, use_single_lead=True)

# Train MLP
# mlp_model_single = create_mlp_model((SEQUENCE_LENGTH, 1), len(classes))
# mlp_history_single = train_and_evaluate(mlp_model_single, X_train, X_test,
#                                         y_train, y_test, classes, "MLP (Single Lead)")

# Train CNN
cnn_model_single = create_cnn_model((SEQUENCE_LENGTH, 1), len(classes))
cnn_history_single = train_and_evaluate(cnn_model_single, X_train, X_test,
                                        y_train, y_test, classes, "CNN (Single Lead)")



Training models with all leads:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1735165547.418528  151252 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735165547.453757  151252 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735165547.455727  151252 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735165547.45905

Epoch 1/200


I0000 00:00:1735165550.170294  152075 service.cc:146] XLA service 0x77b038003e70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735165550.170327  152075 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-12-26 04:25:50.204745: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-26 04:25:50.320599: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m 13/186[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 14ms/step - accuracy: 0.2834 - loss: 1.3944

I0000 00:00:1735165554.111579  152075 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.3990 - loss: 1.2921 - val_accuracy: 0.5753 - val_loss: 0.9687
Epoch 2/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.6220 - loss: 0.9354 - val_accuracy: 0.6472 - val_loss: 0.8401
Epoch 3/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.6499 - loss: 0.8437 - val_accuracy: 0.6888 - val_loss: 0.7782
Epoch 4/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.6982 - loss: 0.7598 - val_accuracy: 0.7103 - val_loss: 0.7425
Epoch 5/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.7144 - loss: 0.7345 - val_accuracy: 0.7238 - val_loss: 0.6957
Epoch 6/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.7334 - loss: 0.6929 - val_accuracy: 0.7258 - val_loss: 0.6971
Epoch 7/200
[1m186/186[0m 

2024-12-26 04:34:24.736476: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 765360000 exceeds 10% of free system memory.
2024-12-26 04:34:25.197929: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 765360000 exceeds 10% of free system memory.


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step

Classification Report for CNN (All Leads):
              precision    recall  f1-score   support

          sb    0.91339   0.89460   0.90390      1167
        gsvt    0.81519   0.82464   0.81988       690
          sr    0.76369   0.79460   0.77884       667
        afib    0.84557   0.83158   0.83851       665

    accuracy                        0.84541      3189
   macro avg    0.83446   0.83636   0.83528      3189
weighted avg    0.84669   0.84541   0.84593      3189


Training models with single lead:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.3804 - loss: 1.3381 - val_accuracy: 0.5081 - val_loss: 1.1130
Epoch 2/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.5219 - loss: 1.1045 - val_accuracy: 0.5753 - val_loss: 0.9960
Epoch 3/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5727 - loss: 1.0229 - val_accuracy: 0.6116 - val_loss: 0.9412
Epoch 4/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5920 - loss: 0.9664 - val_accuracy: 0.6492 - val_loss: 0.8783
Epoch 5/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.6267 - loss: 0.9189 - val_accuracy: 0.6734 - val_loss: 0.8346
Epoch 6/200
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.6402 - loss: 0.8673 - val_accuracy: 0.6915 - val_loss: 0.7885
Epoch 7/200
[1m