In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import biosppy.signals.ecg as ecg
from typing import Tuple, List, Dict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
# noinspection PyUnresolvedReferences
from tensorflow.keras.models import Sequential
# noinspection PyUnresolvedReferences
from tensorflow.keras.layers import Dense, Dropout
# noinspection PyUnresolvedReferences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns

# Defining file path
ecg_folder = "../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised"
attributes_file = "../../../../Datasets/12-lead electrocardiogram database/AttributesDictionary.xlsx"
diagnostics_file = "../../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"
rhythm_names_file = "../../../../Datasets/12-lead electrocardiogram database/RhythmNames.xlsx"

# Checking for missing files and stuff
for file_path in [attributes_file, diagnostics_file, rhythm_names_file]:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Required file not found: {file_path}")

# Loading metadata
attributes_df = pd.read_excel(attributes_file)
diagnostics_df = pd.read_excel(diagnostics_file)
rhythm_names_df = pd.read_excel(rhythm_names_file)

# Removing trailing spaces in acronym columns for accurate matching
rhythm_names_df['Acronym Name'] = rhythm_names_df['Acronym Name'].str.strip()

# Creating sets of valid acronyms for rhythm
valid_rhythms = set(rhythm_names_df['Acronym Name'])
print(valid_rhythms)

2024-11-25 10:41:00.744223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-25 10:41:00.754843: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-25 10:41:00.758024: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-25 10:41:00.766631: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'AVNRT', 'SAAWR', 'SVT', 'SB', 'SR', 'AF', 'AFIB', 'SI', 'ST', 'AVRT', 'AT'}


In [None]:
def load_and_preprocess_data(ecg_folder: str, diagnostics_df: pd.DataFrame, rhythm_mapping: Dict[str, str]) -> Tuple[
    np.ndarray, np.ndarray]:
    """
    Load ECG signals and extract features using BioSPPy
    """
    features_list = []
    labels = []

    # Process each ECG file
    for idx, row in tqdm(diagnostics_df.iterrows(), desc="Processing ECG files"):
        file_path = os.path.join(ecg_folder, f"{row['FileName']}.csv")

        if os.path.exists(file_path):
            # Load ECG signal
            signal = pd.read_csv(file_path).values.T  # Transpose to get [leads, samples]

            # Extract features for each lead
            lead_features = []
            for lead_signal in signal:
                try:
                    # Extract features using BioSPPy
                    out = ecg.ecg(signal=lead_signal, sampling_rate=500.0, show=False)

                    # Extract relevant features
                    features = {
                        'heart_rate': np.mean(out['heart_rate']),
                        'rpeaks_count': len(out['rpeaks']),
                        'templates_mean': np.mean(out['templates'], axis=0),
                        'templates_std': np.std(out['templates'], axis=0)
                    }

                    # Flatten features into a 1D array
                    lead_features.extend([
                        features['heart_rate'],
                        features['rpeaks_count'],
                        *features['templates_mean'],
                        *features['templates_std']
                    ])
                except Exception as e:
                    print(f"Error processing file {file_path}: {str(e)}")
                    continue

            features_list.append(lead_features)

            # Map rhythm to reduced set
            rhythm = row['Rhythm']
            mapped_rhythm = rhythm_mapping.get(rhythm, None)
            if mapped_rhythm:
                labels.append(mapped_rhythm)

    return np.array(features_list), np.array(labels)


def create_mlp_model(input_shape: int, num_classes: int) -> tf.keras.Model:
    """
    Create a simple MLP model
    """
    model = Sequential([
        Dense(256, activation='relu', input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


def train_and_evaluate(X: np.ndarray, y: np.ndarray) -> Tuple[tf.keras.Model, dict]:
    """
    Train and evaluate the MLP model
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert labels to categorical
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)
    y_train_cat = to_categorical(y_train_encoded)
    y_test_cat = to_categorical(y_test_encoded)

    # Create and train model
    model = create_mlp_model(X_train.shape[1], len(set(y_train)))

    history = model.fit(
        X_train_scaled,
        y_train_cat,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True
            )
        ]
    )

    # Evaluate model
    y_pred = model.predict(X_test_scaled)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test_cat, axis=1)

    # Generate classification report
    report = classification_report(
        y_test_classes,
        y_pred_classes,
        target_names=le.classes_,
        output_dict=True
    )

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        confusion_matrix(y_test_classes, y_pred_classes),
        annot=True,
        fmt='d',
        xticklabels=le.classes_,
        yticklabels=le.classes_
    )
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return model, report


# Your rhythm mapping
rhythm_mapping = {
    'AFIB': 'AFIB',
    'AF': 'AFIB',
    'SVT': 'GSVT',
    'AT': 'GSVT',
    'SAAWR': 'GSVT',
    'ST': 'GSVT',
    'AVNRT': 'GSVT',
    'AVRT': 'GSVT',
    'SB': 'SB',
    'SR': 'SR',
    'SA': 'SR'
}

# Extract features
X, y = load_and_preprocess_data(ecg_folder, diagnostics_df, rhythm_mapping)

# Train and evaluate model
model, report = train_and_evaluate(X, y)

# Print classification report
print("\nClassification Report:")
print(pd.DataFrame(report).transpose())

Processing ECG files: 459it [01:42,  4.30it/s]

Error processing file ../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised/MUSE_20180712_153140_95000.csv: Not enough beats to compute heart rate.


Processing ECG files: 730it [02:48,  4.40it/s]

Error processing file ../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised/MUSE_20180114_124230_39000.csv: Not enough beats to compute heart rate.
Error processing file ../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised/MUSE_20180114_124230_39000.csv: Not enough beats to compute heart rate.
Error processing file ../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised/MUSE_20180114_124230_39000.csv: Not enough beats to compute heart rate.
Error processing file ../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised/MUSE_20180114_124230_39000.csv: Not enough beats to compute heart rate.
Error processing file ../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised/MUSE_20180114_124230_39000.csv: Not enough beats to compute heart rate.
Error processing file ../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised/MUSE_20180114_124230_39000.csv: Not enough beats to compute heart rate.


Processing ECG files: 1027it [04:06,  3.59it/s]