In [3]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from scipy import signal, stats
import pywt
from sklearn.decomposition import FastICA
import logging


# Set up logging
logging.basicConfig(filename='ecg_processing_v4.log', level=logging.INFO,
                   format='%(asctime)s:%(levelname)s:%(message)s')

# Constants
SAMPLING_RATE = 500  # Hz
SEQUENCE_LENGTH = 5000  # 10 seconds * 500 Hz
N_LEADS = 12


ecg_folder = "../../../Datasets/12-lead electrocardiogram database/ECGData"
diagnostics_file = "../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"

# Rhythm Mapping
rhythm_mapping = {
    'AFIB': 'AFIB',
    'AF': 'AFIB',
    'SVT': 'GSVT',
    'AT': 'GSVT',
    'SAAWR': 'GSVT',
    'ST': 'GSVT',
    'AVNRT': 'GSVT',
    'AVRT': 'GSVT',
    'SB': 'SB',
    'SR': 'SR',
    'SA': 'SR'
}

In [4]:
def extract_dwt_features(signal_data, wavelet='db4', level=4):
    """
    Extract Discrete Wavelet Transform features from the signal.
    Returns both coefficients and statistical features of the coefficients.
    """
    # Perform DWT
    coeffs = pywt.wavedec(signal_data, wavelet, level=level)

    # Extract statistical features from each coefficient level
    features = []
    for coeff in coeffs:
        features.extend([
            np.mean(coeff),      # Mean
            np.std(coeff),       # Standard deviation
            np.max(coeff),       # Maximum
            np.min(coeff),       # Minimum
            np.median(coeff),    # Median
            np.var(coeff),       # Variance
            np.sum(coeff**2),    # Energy
            np.mean(abs(coeff)), # Mean absolute value
            stats.kurtosis(coeff), # Kurtosis
            stats.skew(coeff)    # Skewness
        ])

    return np.array(features)

def apply_ica(signals, n_components=None):
    """Apply Independent Component Analysis to the signals."""
    ica = FastICA(n_components=n_components, random_state=42)
    ica_signals = ica.fit_transform(signals.T).T
    return ica_signals

def extract_time_domain_features(signal):
    """Extract time domain features from the signal."""
    features = [
        np.mean(signal),         # Mean
        np.std(signal),          # Standard deviation
        np.var(signal),          # Variance
        np.max(signal),          # Maximum
        np.min(signal),          # Minimum
        np.median(signal),       # Median
        np.sum(signal**2),       # Energy
        stats.kurtosis(signal),  # Kurtosis
        stats.skew(signal),      # Skewness
        np.mean(abs(signal)),    # Mean absolute value
        np.sqrt(np.mean(signal**2))  # Root mean square
    ]
    return np.array(features)

def extract_frequency_domain_features(signal):
    """Extract frequency domain features from the signal."""
    # Compute FFT
    fft_vals = np.abs(np.fft.fft(signal))
    freqs = np.fft.fftfreq(len(signal), 1/SAMPLING_RATE)

    # Get positive frequencies only
    pos_mask = freqs >= 0
    freqs = freqs[pos_mask]
    fft_vals = fft_vals[pos_mask]

    features = [
        np.max(fft_vals),        # Peak frequency amplitude
        freqs[np.argmax(fft_vals)],  # Peak frequency
        np.mean(fft_vals),       # Mean frequency
        np.std(fft_vals),        # Spectral spread
        np.sum(fft_vals**2),     # Spectral energy
        stats.kurtosis(fft_vals),# Spectral kurtosis
        stats.skew(fft_vals)     # Spectral skewness
    ]
    return np.array(features)

def extract_all_features(data):
    """Extract all features from the signal."""
    features_list = []

    for i in range(data.shape[0]):  # For each sample
        sample_features = []

        # Apply ICA to the multi-lead signal
        ica_signals = apply_ica(data[i], n_components=5)  # Reduce to 5 components

        for signal in ica_signals:
            # Extract DWT features
            dwt_features = extract_dwt_features(signal)

            # Extract time domain features
            time_features = extract_time_domain_features(signal)

            # Extract frequency domain features
            freq_features = extract_frequency_domain_features(signal)

            # Combine all features
            signal_features = np.concatenate([dwt_features, time_features, freq_features])
            sample_features.extend(signal_features)

        features_list.append(sample_features)

    return np.array(features_list)

def create_mlp_model(input_shape, num_classes):
    """Create MLP model for feature-based classification."""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(input_shape,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def create_feature_cnn(input_shape, num_classes):
    """Create CNN model for feature-based classification."""
    model = tf.keras.Sequential([
        tf.keras.layers.Reshape((input_shape, 1), input_shape=(input_shape,)),
        tf.keras.layers.Conv1D(32, 3, activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 3, activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def load_and_process_data(ecg_folder, diagnostics_file, rhythm_mapping):
    """Load and process ECG data with feature extraction."""
    # Load diagnostics data
    diagnostics = pd.read_excel(diagnostics_file)
    diagnostics['Rhythm'] = diagnostics['Rhythm'].map(rhythm_mapping)

    valid_data = []
    valid_labels = []

    for idx, row in diagnostics.iterrows():
        file_path = os.path.join(ecg_folder, row['FileName'] + ".csv")

        try:
            # Load ECG data
            ecg_data = pd.read_csv(file_path, header=0)

            if ecg_data.isnull().any().any() or (ecg_data == 0).all().any():
                continue

            if len(ecg_data) != SEQUENCE_LENGTH:
                continue

            valid_data.append(ecg_data.values)
            valid_labels.append(row['Rhythm'])

        except Exception as e:
            logging.error(f"Error processing {row['FileName']}: {str(e)}")
            continue

    X = np.array(valid_data)
    y = np.array(valid_labels)

    # Extract features
    X_features = extract_all_features(X)

    return X_features, y

def prepare_data(X, y):
    """Prepare data for training."""
    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Convert labels to one-hot encoding
    label_encoder = tf.keras.preprocessing.text.Tokenizer()
    label_encoder.fit_on_texts(y)
    y_encoded = label_encoder.texts_to_sequences(y)
    y_encoded = np.array(y_encoded).reshape(-1)
    num_classes = len(label_encoder.word_index)
    y_onehot = tf.keras.utils.to_categorical(y_encoded - 1, num_classes)

    # Get class names
    classes = [k for k, v in sorted(label_encoder.word_index.items(), key=lambda x: x[1])]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_onehot, test_size=0.2, random_state=42, stratify=y_onehot
    )

    return X_train, X_test, y_train, y_test, classes

def train_and_evaluate_models(X_train, X_test, y_train, y_test, classes):
    """Train and evaluate all models."""
    # Decision Tree
    dt_model = DecisionTreeClassifier(random_state=42)
    dt_model.fit(X_train, np.argmax(y_train, axis=1))
    dt_pred = dt_model.predict(X_test)
    print("\nDecision Tree Classification Report:")
    print(classification_report(np.argmax(y_test, axis=1), dt_pred,
                                target_names=classes, digits=5))

    # MLP
    mlp_model = create_mlp_model(X_train.shape[1], len(classes))
    mlp_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    mlp_history = mlp_model.fit(X_train, y_train, epochs=30, batch_size=32,
                                validation_split=0.2, verbose=1)
    mlp_pred = np.argmax(mlp_model.predict(X_test), axis=1)
    print("\nMLP Classification Report:")
    print(classification_report(np.argmax(y_test, axis=1), mlp_pred,
                                target_names=classes, digits=5))

    # CNN
    cnn_model = create_feature_cnn(X_train.shape[1], len(classes))
    cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    cnn_history = cnn_model.fit(X_train, y_train, epochs=30, batch_size=32,
                                validation_split=0.2, verbose=1)
    cnn_pred = np.argmax(cnn_model.predict(X_test), axis=1)
    print("\nCNN Classification Report:")
    print(classification_report(np.argmax(y_test, axis=1), cnn_pred,
                                target_names=classes, digits=5))

# Load and process data
X_features, y = load_and_process_data(ecg_folder, diagnostics_file, rhythm_mapping)
logging.info(f"Successfully extracted features from {len(y)} samples")

# Prepare data
X_train, X_test, y_train, y_test, classes = prepare_data(X_features, y)

# Train and evaluate models
train_and_evaluate_models(X_train, X_test, y_train, y_test, classes)





Decision Tree Classification Report:
              precision    recall  f1-score   support

          sb    0.39708   0.38432   0.39059       778
        gsvt    0.22428   0.23696   0.23044       460
          sr    0.20501   0.20270   0.20385       444
        afib    0.23884   0.24099   0.23991       444

    accuracy                        0.28457      2126
   macro avg    0.26630   0.26624   0.26620      2126
weighted avg    0.28653   0.28457   0.28547      2126

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1735170906.458770  225315 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735170906.510877  225315 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735170906.514597  225315 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735170906.51887

[1m142/213[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.2601 - loss: 1.7565

I0000 00:00:1735170909.153258  233728 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.2659 - loss: 1.7070 - val_accuracy: 0.3163 - val_loss: 1.4180
Epoch 2/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 927us/step - accuracy: 0.3544 - loss: 1.3797 - val_accuracy: 0.3304 - val_loss: 1.4085
Epoch 3/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 871us/step - accuracy: 0.3736 - loss: 1.3149 - val_accuracy: 0.3374 - val_loss: 1.3865
Epoch 4/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 881us/step - accuracy: 0.4048 - loss: 1.2873 - val_accuracy: 0.3574 - val_loss: 1.3808
Epoch 5/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4243 - loss: 1.2533 - val_accuracy: 0.3510 - val_loss: 1.3938
Epoch 6/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 888us/step - accuracy: 0.4472 - loss: 1.2357 - val_accuracy: 0.3410 - val_loss: 1.4076
Epoch 7/30
[1m213/213[0m [3

  super().__init__(**kwargs)


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.3431 - loss: 1.3660 - val_accuracy: 0.3651 - val_loss: 1.3570
Epoch 2/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3670 - loss: 1.3459 - val_accuracy: 0.3081 - val_loss: 1.3834
Epoch 3/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3695 - loss: 1.3440 - val_accuracy: 0.3651 - val_loss: 1.3514
Epoch 4/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3650 - loss: 1.3419 - val_accuracy: 0.3445 - val_loss: 1.3621
Epoch 5/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3582 - loss: 1.3419 - val_accuracy: 0.3645 - val_loss: 1.3609
Epoch 6/30
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3699 - loss: 1.3327 - val_accuracy: 0.3633 - val_loss: 1.3482
Epoch 7/30
[1m213/213[0m [32m━━━━━━