In [5]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
import logging

# Send pipeline diagnostics to a log file
logging.basicConfig(
    filename='ecg_processing.log',
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
)

# Recording geometry
SAMPLING_RATE = 500  # Hz
SEQUENCE_LENGTH = 10 * SAMPLING_RATE  # 10 seconds * 500 Hz = 5000 samples
N_LEADS = 12

# Dataset locations (relative paths)
_DATASET_ROOT = "../../../Datasets/12-lead electrocardiogram database"
ecg_folder = _DATASET_ROOT + "/ECGData"
diagnostics_file = _DATASET_ROOT + "/Diagnostics.xlsx"

# Rhythm mapping, written as merged label -> raw rhythm codes folded into it,
# then flattened into the raw-code -> merged-label lookup used downstream.
_RHYTHM_GROUPS = {
    'AFIB': ('AFIB', 'AF'),
    'GSVT': ('SVT', 'AT', 'SAAWR', 'ST', 'AVNRT', 'AVRT'),
    'SB': ('SB',),
    'SR': ('SR', 'SA'),
}
rhythm_mapping = {
    raw_code: merged_label
    for merged_label, raw_codes in _RHYTHM_GROUPS.items()
    for raw_code in raw_codes
}

In [6]:
def load_and_preprocess_data(ecg_folder, diagnostics_file, rhythm_mapping):
    """Load ECG recordings and their rhythm labels, keeping only usable records.

    Args:
        ecg_folder: Directory containing one ``<FileName>.csv`` per recording.
        diagnostics_file: Excel workbook with at least the ``FileName`` and
            ``Rhythm`` columns.
        rhythm_mapping: Dict mapping raw rhythm codes to the merged label set.

    Returns:
        Tuple ``(X, y, valid_files)`` where ``X`` stacks the accepted
        recordings (each ``SEQUENCE_LENGTH`` rows by ``N_LEADS`` columns),
        ``y`` holds the mapped rhythm label of each recording and
        ``valid_files`` lists the accepted file names, all in the same order.

    A recording is skipped (and the reason logged) when its rhythm code is
    not in ``rhythm_mapping``, when it contains nulls or an all-zero column,
    when its row or column count is unexpected, or when reading it fails.
    """
    # Load diagnostics data
    diagnostics = pd.read_excel(diagnostics_file)

    # Map rhythms to reduced set of labels; codes missing from the mapping become NaN
    mapped_rhythms = diagnostics['Rhythm'].map(rhythm_mapping)

    valid_files = []
    valid_data = []
    valid_labels = []

    # Process each ECG file
    for file_name, rhythm in zip(diagnostics['FileName'], mapped_rhythms):
        # Without this guard np.array() below would stringify NaN into a
        # spurious 'nan' class label.
        if pd.isna(rhythm):
            logging.warning(f"File {file_name} has an unmapped rhythm label - skipped")
            continue

        file_path = os.path.join(ecg_folder, file_name + ".csv")

        try:
            # Load ECG data
            ecg_data = pd.read_csv(file_path, header=0)

            # Check for missing values or leads that are zero throughout
            if ecg_data.isnull().any().any() or (ecg_data == 0).all().any():
                logging.warning(f"File {file_name} contains null or all-zero leads - skipped")
                continue

            # Check if data has expected length
            if len(ecg_data) != SEQUENCE_LENGTH:
                logging.warning(f"File {file_name} has unexpected length {len(ecg_data)} - skipped")
                continue

            # A differing column count would make the stacked array ragged
            if ecg_data.shape[1] != N_LEADS:
                logging.warning(f"File {file_name} has unexpected lead count {ecg_data.shape[1]} - skipped")
                continue

            # Store valid data
            valid_files.append(file_name)
            valid_data.append(ecg_data.values)
            valid_labels.append(rhythm)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load
            logging.error(f"Error processing {file_name}: {str(e)}")
            continue

    # Convert to numpy arrays
    X = np.array(valid_data)
    y = np.array(valid_labels)

    return X, y, valid_files

def prepare_data(X, y, use_single_lead=False):
    """Encode labels, split into train/test, and z-score normalize the signals.

    Args:
        X: Array of shape ``(n_samples, n_timesteps, n_leads)``.
        y: Sequence of ``n_samples`` string labels.
        use_single_lead: When True, keep only the lead at column index 1
            (the second lead), preserving the trailing channel axis.

    Returns:
        ``(X_train, X_test, y_train, y_test, classes)`` where the ``y_*``
        arrays are one-hot encoded and ``classes[i]`` is the label name for
        one-hot column ``i`` (labels sorted alphabetically, original case).

    Raises:
        ValueError: If ``X`` is not 3-D or ``X`` and ``y`` differ in length.

    The scaler is fitted on the training portion only and then applied to the
    test portion, so no test-set statistics leak into training.
    """
    X = np.asarray(X)
    if X.ndim != 3:
        raise ValueError(f"X must be 3-D (samples, timesteps, leads); got shape {X.shape}")

    labels = np.asarray(y).astype(str).reshape(-1)
    if len(labels) != len(X):
        raise ValueError(f"X has {len(X)} samples but y has {len(labels)} labels")

    # Reshape data if using single lead
    if use_single_lead:
        X = X[:, :, 1:2]  # Keep only second lead

    # Convert string labels to integer indices (one index per distinct label)
    class_names, y_idx = np.unique(labels, return_inverse=True)
    y_idx = y_idx.reshape(-1)
    classes = class_names.tolist()

    # Convert to one-hot encoding
    y_onehot = np.eye(len(classes), dtype=np.float32)[y_idx]

    # Split data first so normalization statistics come from training data only
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_onehot, test_size=0.2, random_state=42, stratify=y_idx
    )

    # Normalize using z-score, one mean/std per lead across samples and timesteps
    n_channels = X.shape[-1]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.reshape(-1, n_channels)).reshape(X_train.shape)
    X_test = scaler.transform(X_test.reshape(-1, n_channels)).reshape(X_test.shape)

    return X_train, X_test, y_train, y_test, classes

def create_mlp_model(input_shape, num_classes):
    """Create an uncompiled MLP classifier.

    Args:
        input_shape: Shape of one sample, excluding the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``tf.keras.Sequential`` model: the input is flattened, then passed
        through Dense(256) -> Dropout(0.3) -> Dense(128) -> Dropout(0.2) ->
        Dense(num_classes, softmax).
    """
    model = tf.keras.Sequential([
        # Explicit Input layer instead of input_shape= on the first layer,
        # which newer Keras versions warn about.
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def create_cnn_model(input_shape, num_classes):
    """Create an uncompiled 1-D CNN classifier.

    Args:
        input_shape: Shape of one sample, excluding the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``tf.keras.Sequential`` model with three Conv1D layers (kernel
        size 5; 32, 64 and 64 filters), max pooling after the first two,
        global average pooling, then Dense(64) -> Dropout(0.3) ->
        Dense(num_classes, softmax).
    """
    model = tf.keras.Sequential([
        # Explicit Input layer instead of input_shape= on the first layer,
        # which newer Keras versions warn about.
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Conv1D(32, 5, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 5, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 5, activation='relu'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def train_and_evaluate(model, X_train, X_test, y_train, y_test, classes, model_name,
                       epochs=20, batch_size=32, validation_split=0.2):
    """Compile and train ``model``, then print a classification report.

    Args:
        model: Uncompiled Keras model; it is compiled here with the Adam
            optimizer and categorical cross-entropy loss.
        X_train, y_train: Training inputs and one-hot targets.
        X_test, y_test: Held-out inputs and one-hot targets used for the report.
        classes: Class names; ``classes[i]`` names one-hot column ``i``.
        model_name: Label used in the printed report heading.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        validation_split: Fraction of the training data ``model.fit`` holds
            out for validation.

    Returns:
        The object returned by ``model.fit``.
    """
    # Compile model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        verbose=1
    )

    # Evaluate model: collapse probabilities / one-hot rows to class indices
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    # Print classification report. Passing labels= keeps target_names aligned
    # even when a class is absent from both the test labels and the
    # predictions, which would otherwise raise a length-mismatch error.
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test_classes, y_pred_classes,
                                labels=list(range(len(classes))),
                                target_names=classes, digits=5,
                                zero_division=0))

    return history

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, shuffling and dropout are repeatable between runs
# (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

# Load and preprocess data
X, y, valid_files = load_and_preprocess_data(ecg_folder, diagnostics_file, rhythm_mapping)
logging.info(f"Successfully processed {len(valid_files)} files")

# Fail early with a clear message instead of an obscure reshape error later
if len(valid_files) == 0:
    raise RuntimeError(
        "No valid ECG recordings were loaded; check ecg_folder, diagnostics_file "
        "and ecg_processing.log"
    )

# Train and evaluate models using all leads
print("\nTraining models with all leads:")
X_train, X_test, y_train, y_test, classes = prepare_data(X, y, use_single_lead=False)

# Train MLP
mlp_model = create_mlp_model((SEQUENCE_LENGTH, N_LEADS), len(classes))
mlp_history = train_and_evaluate(mlp_model, X_train, X_test, y_train, y_test,
                                 classes, "MLP (All Leads)")

# Train CNN
cnn_model = create_cnn_model((SEQUENCE_LENGTH, N_LEADS), len(classes))
cnn_history = train_and_evaluate(cnn_model, X_train, X_test, y_train, y_test,
                                 classes, "CNN (All Leads)")

# Train and evaluate models using single lead.
# NOTE: this rebinds X_train / X_test / y_train / y_test / classes to the
# single-lead split; the all-leads split is no longer available afterwards.
print("\nTraining models with single lead:")
X_train, X_test, y_train, y_test, classes = prepare_data(X, y, use_single_lead=True)

# Train MLP
mlp_model_single = create_mlp_model((SEQUENCE_LENGTH, 1), len(classes))
mlp_history_single = train_and_evaluate(mlp_model_single, X_train, X_test,
                                        y_train, y_test, classes, "MLP (Single Lead)")

# Train CNN
cnn_model_single = create_cnn_model((SEQUENCE_LENGTH, 1), len(classes))
cnn_history_single = train_and_evaluate(cnn_model_single, X_train, X_test,
                                        y_train, y_test, classes, "CNN (Single Lead)")



Training models with all leads:


  super().__init__(**kwargs)
I0000 00:00:1735164332.812718  132452 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735164332.949153  132452 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735164332.953200  132452 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735164332.957563  132452 cuda_executor.cc:1015] successful

Epoch 1/20


I0000 00:00:1735164336.332438  133997 service.cc:146] XLA service 0x77c290005a30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735164336.332474  133997 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-12-26 04:05:36.369173: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-26 04:05:36.484487: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m 49/213[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.3223 - loss: 7.0677

I0000 00:00:1735164337.481436  133997 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.3791 - loss: 6.2313 - val_accuracy: 0.5026 - val_loss: 1.3597
Epoch 2/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5909 - loss: 1.3674 - val_accuracy: 0.5150 - val_loss: 1.0982
Epoch 3/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6760 - loss: 0.9010 - val_accuracy: 0.5514 - val_loss: 1.0322
Epoch 4/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7099 - loss: 0.7855 - val_accuracy: 0.5708 - val_loss: 1.0261
Epoch 5/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7704 - loss: 0.6254 - val_accuracy: 0.5661 - val_loss: 1.0296
Epoch 6/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7812 - loss: 0.6106 - val_accuracy: 0.5826 - val_loss: 1.0423
Epoch 7/20
[1m213/213[0m [32m━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-12-26 04:06:01.772833: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1632480000 exceeds 10% of free system memory.
2024-12-26 04:06:03.180653: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1632480000 exceeds 10% of free system memory.


Epoch 1/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - accuracy: 0.4364 - loss: 1.2809 - val_accuracy: 0.6326 - val_loss: 0.8985
Epoch 2/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.6268 - loss: 0.9075 - val_accuracy: 0.6267 - val_loss: 0.8764
Epoch 3/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.6710 - loss: 0.8061 - val_accuracy: 0.7002 - val_loss: 0.7656
Epoch 4/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.6889 - loss: 0.7664 - val_accuracy: 0.7255 - val_loss: 0.6978
Epoch 5/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.7240 - loss: 0.6915 - val_accuracy: 0.7343 - val_loss: 0.6810
Epoch 6/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.7281 - loss: 0.6790 - val_accuracy: 0.7372 - val_loss: 0.6615
Epoch 7/20
[1m213/21

  super().__init__(**kwargs)


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.3757 - loss: 1.7221 - val_accuracy: 0.5362 - val_loss: 1.0768
Epoch 2/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6033 - loss: 0.9958 - val_accuracy: 0.5614 - val_loss: 1.0333
Epoch 3/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6905 - loss: 0.7992 - val_accuracy: 0.5697 - val_loss: 1.0273
Epoch 4/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7325 - loss: 0.6993 - val_accuracy: 0.5885 - val_loss: 1.0721
Epoch 5/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7633 - loss: 0.6386 - val_accuracy: 0.5791 - val_loss: 1.0366
Epoch 6/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7934 - loss: 0.5598 - val_accuracy: 0.5791 - val_loss: 1.0698
Epoch 7/20
[1m213/213[0m [32m━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.4054 - loss: 1.3159 - val_accuracy: 0.5391 - val_loss: 1.0765
Epoch 2/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5443 - loss: 1.0671 - val_accuracy: 0.5691 - val_loss: 0.9968
Epoch 3/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.5704 - loss: 1.0030 - val_accuracy: 0.5197 - val_loss: 1.1117
Epoch 4/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.6145 - loss: 0.9440 - val_accuracy: 0.5938 - val_loss: 0.9373
Epoch 5/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.6388 - loss: 0.8898 - val_accuracy: 0.6520 - val_loss: 0.8575
Epoch 6/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.6624 - loss: 0.8288 - val_accuracy: 0.6631 - val_loss: 0.8277
Epoch 7/20
[1m213/213[0m [32m━