In [1]:
import numpy as np
import pandas as pd
import os

import pywt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import logging
from scipy import signal

# Set up logging
logging.basicConfig(filename='ecg_processing_v3.log', level=logging.INFO,
                   format='%(asctime)s:%(levelname)s:%(message)s')

# Constants
SAMPLING_RATE = 500  # Hz
SEQUENCE_LENGTH = 5000  # 10 seconds * 500 Hz
N_LEADS = 12


ecg_folder = "../../../Datasets/12-lead electrocardiogram database/ECGData"
diagnostics_file = "../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"

# Rhythm Mapping
rhythm_mapping = {
    'AFIB': 'AFIB',
    'AF': 'AFIB',
    'SVT': 'GSVT',
    'AT': 'GSVT',
    'SAAWR': 'GSVT',
    'ST': 'GSVT',
    'AVNRT': 'GSVT',
    'AVRT': 'GSVT',
    'SB': 'SB',
    'SR': 'SR',
    'SA': 'SR'
}

2024-12-26 05:07:39.517648: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-26 05:07:39.530183: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-26 05:07:39.533738: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-26 05:07:39.543962: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def denoise_signal(data, wavelet='db4', level=4):
    """Denoise ECG signal using wavelet transform."""
    # Wavelet decomposition
    coeffs = pywt.wavedec(data, wavelet, level=level)

    # Threshold calculation and applying
    threshold = np.std(coeffs[-1]) * np.sqrt(2 * np.log(len(data)))

    # Apply threshold to detail coefficients
    for i in range(1, len(coeffs)):
        coeffs[i] = pywt.threshold(coeffs[i], threshold, mode='soft')

    # Reconstruct signal
    return pywt.waverec(coeffs, wavelet)

def apply_filters(data):
    """Apply bandpass and notch filters to ECG signal."""
    # Bandpass filter (0.5-40 Hz)
    nyquist = SAMPLING_RATE / 2
    low = 0.5 / nyquist
    high = 40.0 / nyquist
    b, a = signal.butter(4, [low, high], btype='band')
    data = signal.filtfilt(b, a, data)

    # Notch filter (50 Hz)
    b_notch, a_notch = signal.iirnotch(50.0, 30.0, SAMPLING_RATE)
    data = signal.filtfilt(b_notch, a_notch, data)

    return data

def normalize_signal(data):
    """Normalize signal using z-score normalization."""
    mean = np.mean(data)
    std = np.std(data)
    if std == 0:
        return np.zeros_like(data)
    return (data - mean) / std

def preprocess_signal(data):
    """Apply full signal preprocessing pipeline."""
    processed_data = np.zeros_like(data)

    # Process each lead
    for lead in range(data.shape[1]):
        signal_1d = data[:, lead]
        # Apply preprocessing steps
        signal_1d = denoise_signal(signal_1d)
        signal_1d = apply_filters(signal_1d)
        signal_1d = normalize_signal(signal_1d)
        processed_data[:, lead] = signal_1d

    return processed_data

def load_and_preprocess_data(ecg_folder, diagnostics_file, rhythm_mapping):
    """Load and preprocess ECG data and labels."""
    # Load diagnostics data
    diagnostics = pd.read_excel(diagnostics_file)
    diagnostics['Rhythm'] = diagnostics['Rhythm'].map(rhythm_mapping)

    valid_files = []
    valid_data = []
    valid_labels = []

    for idx, row in diagnostics.iterrows():
        file_path = os.path.join(ecg_folder, row['FileName'] + ".csv")

        try:
            # Load ECG data
            ecg_data = pd.read_csv(file_path, header=0)

            if ecg_data.isnull().any().any() or (ecg_data == 0).all().any():
                logging.warning(f"File {row['FileName']} contains null or all-zero leads - skipped")
                continue

            if len(ecg_data) != SEQUENCE_LENGTH:
                logging.warning(f"File {row['FileName']} has unexpected length {len(ecg_data)} - skipped")
                continue

            # Preprocess the signal
            processed_data = preprocess_signal(ecg_data.values)

            valid_files.append(row['FileName'])
            valid_data.append(processed_data)
            valid_labels.append(row['Rhythm'])

        except Exception as e:
            logging.error(f"Error processing {row['FileName']}: {str(e)}")
            continue

    return np.array(valid_data), np.array(valid_labels), valid_files

def prepare_data(X, y, use_single_lead=False):
    """Prepare data for training."""
    if use_single_lead:
        X = X[:, :, 1:2]  # Keep only lead II

    # Convert labels to one-hot encoding
    label_encoder = tf.keras.preprocessing.text.Tokenizer()
    label_encoder.fit_on_texts(y)
    y_encoded = label_encoder.texts_to_sequences(y)
    y_encoded = np.array(y_encoded).reshape(-1)
    num_classes = len(label_encoder.word_index)
    y_onehot = tf.keras.utils.to_categorical(y_encoded - 1, num_classes)

    # Get class names
    classes = [k for k, v in sorted(label_encoder.word_index.items(), key=lambda x: x[1])]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_onehot, test_size=0.2, random_state=42, stratify=y_onehot
    )

    return X_train, X_test, y_train, y_test, classes

def create_simple_cnn(input_shape, num_classes):
    """Create a simplified CNN model optimized for ECG classification."""
    model = tf.keras.Sequential([
        # First convolutional block
        tf.keras.layers.Conv1D(16, 7, activation='relu', input_shape=input_shape, padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling1D(2),

        # Second convolutional block
        tf.keras.layers.Conv1D(32, 5, activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling1D(2),

        # Third convolutional block
        tf.keras.layers.Conv1D(32, 3, activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.GlobalAveragePooling1D(),

        # Dense layers
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def train_and_evaluate(model, X_train, X_test, y_train, y_test, classes, model_name):
    """Train model and print evaluation metrics."""
    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Add early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=30,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate model
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    # Print classification report
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test_classes, y_pred_classes,
                                target_names=classes, digits=5))

    return history

# Load and preprocess data
X, y, valid_files = load_and_preprocess_data(ecg_folder, diagnostics_file, rhythm_mapping)
logging.info(f"Successfully processed {len(valid_files)} files")

# Test with all leads
print("\nTraining model with all leads:")
X_train, X_test, y_train, y_test, classes = prepare_data(X, y, use_single_lead=False)
cnn_model = create_simple_cnn((SEQUENCE_LENGTH, N_LEADS), len(classes))
cnn_history = train_and_evaluate(cnn_model, X_train, X_test, y_train, y_test,
                                 classes, "CNN (All Leads)")

# Test with lead II only
print("\nTraining model with Lead II only:")
X_train, X_test, y_train, y_test, classes = prepare_data(X, y, use_single_lead=True)
cnn_model_single = create_simple_cnn((SEQUENCE_LENGTH, 1), len(classes))
cnn_history_single = train_and_evaluate(cnn_model_single, X_train, X_test,
                                        y_train, y_test, classes, "CNN (Lead II)")



Training model with all leads:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1735168207.043861  200900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735168207.084815  200900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735168207.086984  200900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735168207.09011

Epoch 1/30


I0000 00:00:1735168212.321722  202641 service.cc:146] XLA service 0x7a57f4004750 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735168212.321752  202641 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-12-26 05:10:12.380883: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-26 05:10:12.589186: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m 14/213[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 13ms/step - accuracy: 0.3805 - loss: 1.3405

I0000 00:00:1735168216.824935  202641 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.5093 - loss: 1.1392

2024-12-26 05:10:22.585837: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 408240000 exceeds 10% of free system memory.
2024-12-26 05:10:22.637770: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 408240000 exceeds 10% of free system memory.


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 34ms/step - accuracy: 0.5097 - loss: 1.1385 - val_accuracy: 0.5779 - val_loss: 0.9302
Epoch 2/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6985 - loss: 0.7678 - val_accuracy: 0.7490 - val_loss: 0.6481
Epoch 3/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7494 - loss: 0.6444 - val_accuracy: 0.7531 - val_loss: 0.6217
Epoch 4/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7693 - loss: 0.5784 - val_accuracy: 0.7972 - val_loss: 0.5409
Epoch 5/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7756 - loss: 0.5651 - val_accuracy: 0.7937 - val_loss: 0.5120
Epoch 6/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8008 - loss: 0.5082 - val_accuracy: 0.7325 - val_loss: 0.6903
Epoch 7/30
[1m213/213[0m [32m

2024-12-26 05:11:30.872393: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 510240000 exceeds 10% of free system memory.


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step

Classification Report for CNN (All Leads):
              precision    recall  f1-score   support

          sb    0.86425   0.98201   0.91937       778
        gsvt    0.85683   0.85870   0.85776       460
          sr    0.91643   0.71622   0.80405       444
        afib    0.87327   0.85360   0.86333       444

    accuracy                        0.87300      2126
   macro avg    0.87770   0.85263   0.86113      2126
weighted avg    0.87543   0.87300   0.87025      2126


Training model with Lead II only:
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.4945 - loss: 1.1841 - val_accuracy: 0.3651 - val_loss: 2.3632
Epoch 2/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6523 - loss: 0.8481 - val_accuracy: 0.3804 - val_loss: 2.4498
Epoch 3/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7190 - loss: 0.7077 - val_accuracy: 0.5544 - val_loss: 1.1706
Epoch 4/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.7294 - loss: 0.6774 - val_accuracy: 0.5197 - val_loss: 1.3225
Epoch 5/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.7490 - loss: 0.6472 - val_accuracy: 0.6731 - val_loss: 0.8549
Epoch 6/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.7650 - loss: 0.6092 - val_accuracy: 0.7366 - val_loss: 0.6358
Epoch 7/30
[1m213/213[0m [32m━━