In [1]:
import numpy as np
import pandas as pd
import os
from typing import Tuple, List, Dict, Union
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
# noinspection PyUnresolvedReferences
from tensorflow.keras.models import Sequential
# noinspection PyUnresolvedReferences
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, BatchNormalization, Flatten
# noinspection PyUnresolvedReferences
from tensorflow.keras.utils import to_categorical
from scipy.signal import find_peaks, butter, filtfilt

# Dataset locations (relative to the notebook's working directory)
ecg_folder = "../../../Datasets/12-lead electrocardiogram database/ECGData"
attributes_file = "../../../Datasets/12-lead electrocardiogram database/AttributesDictionary.xlsx"
diagnostics_file = "../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"
rhythm_names_file = "../../../Datasets/12-lead electrocardiogram database/RhythmNames.xlsx"

# Fail fast when the signal folder is missing: the per-record loader skips any
# CSV it cannot find, so a wrong folder would otherwise only show up much later
# as an empty dataset.
if not os.path.isdir(ecg_folder):
    raise FileNotFoundError(f"Required folder not found: {ecg_folder}")

# Checking that every metadata workbook is present before reading any of them
for file_path in [attributes_file, diagnostics_file, rhythm_names_file]:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Required file not found: {file_path}")

# Loading metadata
attributes_df = pd.read_excel(attributes_file)
diagnostics_df = pd.read_excel(diagnostics_file)
rhythm_names_df = pd.read_excel(rhythm_names_file)

# Removing trailing spaces in acronym columns for accurate matching
rhythm_names_df['Acronym Name'] = rhythm_names_df['Acronym Name'].str.strip()

# Creating sets of valid acronyms for rhythm
valid_rhythms = set(rhythm_names_df['Acronym Name'])
print(valid_rhythms)

2024-11-25 11:47:16.506723: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-25 11:47:16.551282: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-25 11:47:16.565279: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-25 11:47:16.633501: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'SVT', 'AT', 'ST', 'SR', 'AF', 'AFIB', 'SAAWR', 'SB', 'AVNRT', 'SI', 'AVRT'}


In [2]:

def preprocess_signal(signal: np.ndarray, sampling_rate: int = 500) -> np.ndarray:
    """
    Preprocess ECG signal with filtering and normalization

    The signal is band-pass filtered (2nd-order Butterworth, 0.5-45 Hz) with a
    zero-phase forward/backward pass, then standardized to zero mean and unit
    standard deviation.

    Args:
        signal: 1-D array of samples for a single lead.
        sampling_rate: Sampling frequency in Hz; used to express the cut-off
            frequencies as fractions of the Nyquist frequency.

    Returns:
        Array with the same shape as ``signal`` holding the filtered,
        standardized samples. A (numerically) flat signal has no spread to
        normalize by, so it is returned as all zeros instead of NaN/inf.
    """
    # Apply bandpass filter (0.5-45 Hz); butter() expects cut-offs normalized by Nyquist
    nyquist = sampling_rate / 2
    low = 0.5 / nyquist
    high = 45 / nyquist
    b, a = butter(2, [low, high], btype='band')
    filtered = filtfilt(b, a, signal)

    # Normalize
    centered = filtered - np.mean(filtered)
    spread = np.std(filtered)

    # Guard the division: a flat lead gives spread == 0 and dividing by it would
    # fill the output with NaN/inf. NaN spreads (NaN input) fail this comparison
    # and still propagate, exactly as before.
    if spread < 1e-12:
        return np.zeros_like(centered)

    return centered / spread

def detect_beats(signal: np.ndarray, sampling_rate: int = 500) -> np.ndarray:
    """
    Locate candidate R-peaks as local maxima of the signal.

    Neighbouring peaks must be at least half a second apart
    (``sampling_rate // 2`` samples). Returns the sample indices of the
    peaks that survive that spacing rule.
    """
    min_gap_samples = sampling_rate // 2  # 0.5 s expressed in samples
    peak_indices = find_peaks(signal, distance=min_gap_samples)[0]
    return peak_indices

def segment_beats(signal: np.ndarray, peaks: np.ndarray, window: int = 250) -> np.ndarray:
    """
    Segment signal into individual beats

    For every peak a slice of exactly ``window`` samples is cut, starting
    ``window // 2`` samples before the peak. Peaks whose slice would run past
    either end of the signal are dropped, so every returned row is complete.

    Args:
        signal: 1-D array of samples.
        peaks: Sample indices of the detected peaks.
        window: Number of samples per segment (must be positive). Odd values
            are supported; the extra sample goes after the peak.

    Returns:
        Array of shape ``(n_segments, window)``; ``(0, window)`` when no peak
        has a complete window.

    Raises:
        ValueError: If ``window`` is not positive.
    """
    if window < 1:
        raise ValueError(f"window must be a positive number of samples, got {window}")

    samples_before = window // 2
    segments = []
    for peak in peaks:
        # Derive the end from the start so the slice is exactly `window` long
        # even for odd windows (peak +/- window // 2 would be one sample short).
        start = peak - samples_before
        end = start + window

        if start >= 0 and end <= len(signal):  # Only use complete segments
            segments.append(signal[start:end])

    if not segments:
        # Keep the result 2-D so callers can rely on shape[1] == window
        return np.empty((0, window), dtype=np.asarray(signal).dtype)

    return np.array(segments)

def load_and_preprocess_data(
    ecg_folder: str,
    diagnostics_df: pd.DataFrame,
    rhythm_mapping: Dict[str, str],
    selected_leads: Union[int, List[int]],
    num_timesteps: int = 5000
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load and preprocess ECG signals with beat detection

    For every row of ``diagnostics_df`` the CSV ``<ecg_folder>/<FileName>.csv``
    is read; each selected lead is filtered/normalized, its beats are detected
    and cut into fixed-length segments, and every segment is labelled with the
    row's mapped rhythm.

    Args:
        ecg_folder: Directory containing one CSV per recording.
        diagnostics_df: Frame with at least the columns ``FileName`` and ``Rhythm``.
        rhythm_mapping: Maps a raw rhythm acronym to its reduced class name.
            Records whose rhythm is not in the mapping are skipped entirely.
        selected_leads: Column index (or list of indexes) of the lead(s) to use
            from the CSV's value matrix.
        num_timesteps: Required number of rows in a recording; recordings of any
            other length are skipped.

    Returns:
        ``(segments, labels)`` where ``labels[i]`` is the class of ``segments[i]``.
        Both always have the same length.
    """
    if isinstance(selected_leads, int):
        selected_leads = [selected_leads]

    processed_segments = []
    labels = []

    for _, row in tqdm(diagnostics_df.iterrows(), total=len(diagnostics_df), desc="Loading ECG files"):
        # Resolve the label first. Appending segments before knowing whether the
        # rhythm maps to a class would leave segments without labels and shift
        # every later label against its segment.
        mapped_rhythm = rhythm_mapping.get(row['Rhythm'], None)
        if not mapped_rhythm:
            continue

        file_path = os.path.join(ecg_folder, f"{row['FileName']}.csv")
        if not os.path.exists(file_path):
            continue

        try:
            # Load ECG signal
            signal = pd.read_csv(file_path).values  # Shape: [timesteps, leads]

            if signal.shape[0] != num_timesteps:
                continue

            # Process each selected lead
            for lead in selected_leads:
                processed = preprocess_signal(signal[:, lead])
                peaks = detect_beats(processed)
                segments = segment_beats(processed, peaks)

                if len(segments) > 0:  # If beats were detected
                    # Segments and labels are always extended together
                    processed_segments.extend(segments)
                    labels.extend([mapped_rhythm] * len(segments))

        except Exception as e:
            # Best effort: report the unreadable/invalid recording and move on
            print(f"Error processing file {file_path}: {str(e)}")
            continue

    return np.array(processed_segments), np.array(labels)

def create_cnn_model(input_shape: tuple, num_classes: int) -> tf.keras.Model:
    """
    Create a 1D CNN model for ECG classification

    Architecture: three Conv1D blocks (64, 128, 256 filters, kernel size 5),
    each followed by batch normalization, max pooling (2) and dropout (0.2),
    then a 128-unit dense layer with batch normalization and dropout (0.5) and
    a softmax output layer.

    Args:
        input_shape: Shape of one sample, without the batch dimension.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A model compiled with the Adam optimizer, categorical cross-entropy
        loss and the accuracy metric.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing `input_shape=` to the
        # first layer, which Keras warns against for Sequential models.
        tf.keras.Input(shape=input_shape),

        # First Conv Block
        Conv1D(64, kernel_size=5, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.2),

        # Second Conv Block
        Conv1D(128, kernel_size=5, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.2),

        # Third Conv Block
        Conv1D(256, kernel_size=5, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.2),

        # Dense layers
        Flatten(),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    # categorical_crossentropy expects one-hot encoded targets
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

def train_and_evaluate(
    X: np.ndarray,
    y: np.ndarray,
    batch_size: int = 32,
    epochs: int = 50,
    experiment_name: str = ""
) -> Tuple[tf.keras.Model, dict, float]:
    """
    Train and evaluate the CNN model

    The data is split 80/20 into train and test sets (fixed seed), the model is
    trained with a further 20% of the training data held out for validation,
    and the test set is used for the final report.

    Args:
        X: 2-D array of shape (samples, timesteps).
        y: Class label of every row of ``X``.
        batch_size: Mini-batch size used for training.
        epochs: Maximum number of epochs (early stopping may end training sooner).
        experiment_name: Optional title printed before training starts.

    Returns:
        ``(model, report, test_accuracy)``: the trained model, the per-class
        classification report as a dict, and the accuracy on the test split.

    Raises:
        ValueError: If ``X`` is not a non-empty 2-D array or if ``X`` and ``y``
            have different numbers of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2 or X.shape[0] == 0:
        raise ValueError(f"X must be a non-empty 2-D array (samples, timesteps), got shape {X.shape}")
    if X.shape[0] != y.shape[0]:
        raise ValueError(f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}")

    if experiment_name:
        print(f"\nExperiment: {experiment_name}")

    # Reshape input for CNN (samples, timesteps, channels)
    X = X.reshape(X.shape[0], X.shape[1], 1)

    # Encode labels on the full label set before splitting, so that a class
    # appearing only in the test split cannot make the encoder fail and the
    # class count is the same for the model, both one-hot matrices and the report.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    num_classes = len(le.classes_)

    # Split the data
    X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    # Convert labels to categorical; the explicit width keeps the one-hot
    # matrices aligned with the softmax layer even if a split lacks a class
    y_train_cat = to_categorical(y_train_encoded, num_classes=num_classes)
    y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

    # Create and train model
    model = create_cnn_model((X.shape[1], 1), num_classes)

    # Train the model
    history = model.fit(
        X_train,
        y_train_cat,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2,
        callbacks=[
            # Stop when validation loss stalls and roll back to the best epoch
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True
            ),
            # Halve the learning rate after 5 epochs without improvement
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-6
            )
        ],
        verbose=1
    )

    # Plot training history and confusion matrix (similar to before)
    # ... (previous visualization code remains the same)

    # Evaluate model
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)

    # `labels` pins the report rows to every known class so `target_names`
    # always lines up, even when a class is absent from the test split.
    report = classification_report(
        y_test_encoded,
        y_pred_classes,
        labels=np.arange(num_classes),
        target_names=le.classes_,
        output_dict=True,
        zero_division=0
    )

    _test_loss, test_accuracy = model.evaluate(X_test, y_test_cat, verbose=0)

    return model, report, test_accuracy

# Main execution
if __name__ == "__main__":
    # Mapping from the raw rhythm acronyms to the four reduced classes
    rhythm_mapping = {
        'AFIB': 'AFIB',
        'AF': 'AFIB',
        'SVT': 'GSVT',
        'AT': 'GSVT',
        'SAAWR': 'GSVT',
        'ST': 'GSVT',
        'AVNRT': 'GSVT',
        'AVRT': 'GSVT',
        'SB': 'SB',
        'SR': 'SR',
        'SA': 'SR',
        # The acronym set printed from RhythmNames.xlsx contains 'SI' but no 'SA'.
        # Presumably both denote the same rhythm, so 'SI' is accepted as an alias
        # to avoid dropping such records.
        # NOTE(review): confirm against the 'Rhythm' column of Diagnostics.xlsx.
        'SI': 'SR'
    }

    # Load and preprocess data (Lead II)
    X, y = load_and_preprocess_data(ecg_folder, diagnostics_df, rhythm_mapping, selected_leads=1)

    # Train and evaluate model
    model, report, accuracy = train_and_evaluate(X, y, experiment_name="Lead II - CNN with Beat Detection")

    # Print results
    print("\nClassification Report:")
    print(pd.DataFrame(report).transpose())
    print(f"\nTest Accuracy: {accuracy:.4f}")


Loading ECG files: 10646it [01:04, 164.95it/s]
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1732513704.060002  430233 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732513704.161235  430233 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732513704.166929  430233 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs

Epoch 1/50


I0000 00:00:1732513705.928054  430981 service.cc:146] XLA service 0x75515c0091e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732513705.928105  430981 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-11-25 11:48:25.993926: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-25 11:48:26.225990: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m  56/3100[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 3ms/step - accuracy: 0.4442 - loss: 1.8590

I0000 00:00:1732513709.377295  430981 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3100/3100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.6648 - loss: 0.8959 - val_accuracy: 0.7423 - val_loss: 0.6200 - learning_rate: 0.0010
Epoch 2/50
[1m3100/3100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7476 - loss: 0.6372 - val_accuracy: 0.7710 - val_loss: 0.5706 - learning_rate: 0.0010
Epoch 3/50
[1m3100/3100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7556 - loss: 0.6112 - val_accuracy: 0.7792 - val_loss: 0.5738 - learning_rate: 0.0010
Epoch 4/50
[1m3100/3100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7693 - loss: 0.5836 - val_accuracy: 0.7999 - val_loss: 0.5096 - learning_rate: 0.0010
Epoch 5/50
[1m3100/3100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7842 - loss: 0.5503 - val_accuracy: 0.7924 - val_loss: 0.5179 - learning_rate: 0.0010
Epoch 6/50
[1m3100/3100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [