In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from biosppy.signals import ecg
from biosppy.features import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization

# Random seed
tf.random.set_seed(6950)

# Constants
SAMPLING_RATE = 500  # Hz

# Rhythm mapping
rhythm_mapping = {
    'AFIB': 'AFIB',
    'AF': 'AFIB',
    'SVT': 'GSVT',
    'AT': 'GSVT',
    'SAAWR': 'GSVT',
    'ST': 'GSVT',
    'AVNRT': 'GSVT',
    'AVRT': 'GSVT',
    'SB': 'SB',
    'SR': 'SR',
    'SA': 'SR'
}


def prepare_dataset(ecg_folder, diagnostics_df):
    signals, signal_labels, ecg_metadata = [], [], []
    additional_features = []

    for _, row in diagnostics_df.iterrows():
        file_name = row['FileName']
        rhythm_label = row['Rhythm']

        if pd.isnull(rhythm_label) or rhythm_label not in rhythm_mapping.values():
            continue

        ecg_file = os.path.join(ecg_folder, f"{file_name}.csv")
        if not os.path.exists(ecg_file):
            continue

        # Read the CSV and extract lead II data
        lead_ii = pd.read_csv(ecg_file, header=0).iloc[:, 1].values

        # Clean the signal using BioSPPy
        cleaned = ecg.ecg(signal=lead_ii, sampling_rate=SAMPLING_RATE, show=False)
        signals.append(cleaned['filtered'])
        signal_labels.append(rhythm_label)
        ecg_metadata.append({'sampling_rate': SAMPLING_RATE})

        # Extract additional features
        rpeaks = cleaned['rpeaks']  # R-peak locations
        heart_rate = cleaned['heart_rate']  # Heart rate values
        templates = np.mean(cleaned['templates'], axis=0) if cleaned['templates'].size else [
                                                                                                0] * 100  # Mean template (if available)

        # Concatenate additional features (e.g., mean, std, etc.)
        additional_features.append([
            len(rpeaks),  # Number of R-peaks
            np.mean(heart_rate) if len(heart_rate) > 0 else 0,  # Mean heart rate
            np.std(heart_rate) if len(heart_rate) > 0 else 0,  # HR variability
            np.mean(templates) if len(templates) > 0 else 0,  # Mean of heartbeat template
            np.std(templates) if len(templates) > 0 else 0,  # STD of heartbeat template
        ])

    return np.array(signals), np.array(signal_labels), ecg_metadata, np.array(additional_features)


def extract_time_features(signals, metadata):
    all_features = []
    for signal, mdata in zip(signals, metadata):
        # Extract time-domain features
        time_feats = time.time(signal=signal, sampling_rate=mdata["sampling_rate"])
        # Convert the ReturnTuple to a list by directly accessing its elements
        all_features.append(list(time_feats))  # Convert ReturnTuple to a list
        # print(time_feats)
        # print(len(time_feats))

    collected_features = np.array(all_features)
    # print(collected_features.shape)
    return collected_features


def extract_combined_features(signals, metadata, additional_features):
    time_features = extract_time_features(signals, metadata)  # Extract time-domain features
    combined_features = np.hstack([time_features, additional_features])  # Combine both feature sets
    return combined_features


def build_mlp(input_shape):
    mlp = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dense(4, activation='softmax')
    ])
    mlp.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return mlp


def build_cnn(input_shape):
    fcnn = Sequential([
        Conv1D(64, 3, activation='relu', input_shape=input_shape),
        MaxPooling1D(2),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(4, activation='softmax')  # Assuming 4 classes
    ])
    fcnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return fcnn



2024-12-02 09:42:58.989646: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-02 09:42:59.000319: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-02 09:42:59.003602: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-02 09:42:59.012283: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# --- Main Pipeline ---
# Load diagnostics
diagnostics_file = "../../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"
ecg_folder = "../../../../Datasets/12-lead electrocardiogram database/ECGData"
diagnostics_df = pd.read_excel(diagnostics_file)
diagnostics_df['Rhythm'] = diagnostics_df['Rhythm'].map(rhythm_mapping)

# Prepare dataset
ecg_signals, ecg_labels, ecg_metadata, extra_features = prepare_dataset(ecg_folder, diagnostics_df)
features = extract_combined_features(ecg_signals, ecg_metadata, extra_features)

# Encode labels
label_encoder = LabelEncoder()
ecg_labels_encoded = label_encoder.fit_transform(ecg_labels)

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(features_scaled, ecg_labels_encoded, test_size=0.2, random_state=42)

# One-hot encode labels
y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=4)
y_test_oh = tf.keras.utils.to_categorical(y_test, num_classes=4)

# Train MLP
mlp = build_mlp(X_train.shape[1])
mlp.fit(X_train, y_train_oh, epochs=100, batch_size=32, verbose=1)
mlp_eval = mlp.evaluate(X_test, y_test_oh, verbose=0)
print(f"MLP Accuracy: {mlp_eval[1]}")
mlp_predictions = np.argmax(mlp.predict(X_test), axis=1)
print(classification_report(y_test, mlp_predictions, target_names=label_encoder.classes_, digits=5))

# Train CNN
X_train_cnn = np.expand_dims(X_train, axis=2)  # Reshape for CNN
X_test_cnn = np.expand_dims(X_test, axis=2)
cnn = build_cnn(X_train_cnn.shape[1:])
cnn.fit(X_train_cnn, y_train_oh, epochs=100, batch_size=32, verbose=1)
cnn_eval = cnn.evaluate(X_test_cnn, y_test_oh, verbose=0)
print(f"CNN Accuracy: {cnn_eval[1]}")
cnn_predictions = np.argmax(cnn.predict(X_test_cnn), axis=1)
print(classification_report(y_test, cnn_predictions, target_names=label_encoder.classes_, digits=5))

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1733111346.240202   70719 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733111346.361320   70719 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733111346.366201   70719 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733111346.37073

[1m122/267[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.4972 - loss: 1.2465

I0000 00:00:1733111349.212989   74302 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5862 - loss: 1.0360
Epoch 2/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 863us/step - accuracy: 0.8002 - loss: 0.5388
Epoch 3/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.8300 - loss: 0.4742
Epoch 4/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 660us/step - accuracy: 0.8431 - loss: 0.4344
Epoch 5/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 621us/step - accuracy: 0.8537 - loss: 0.4064
Epoch 6/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 585us/step - accuracy: 0.8601 - loss: 0.3963
Epoch 7/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 584us/step - accuracy: 0.8652 - loss: 0.3719
Epoch 8/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 603us/step - accuracy: 0.8666 - loss: 0.3653
Epoch 9/100
[1m267/267[0m [

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6814 - loss: 0.8167
Epoch 2/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 715us/step - accuracy: 0.8668 - loss: 0.4163
Epoch 3/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 713us/step - accuracy: 0.8784 - loss: 0.3646
Epoch 4/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 719us/step - accuracy: 0.8919 - loss: 0.3260
Epoch 5/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 717us/step - accuracy: 0.8967 - loss: 0.2996
Epoch 6/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 715us/step - accuracy: 0.9054 - loss: 0.2814
Epoch 7/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 717us/step - accuracy: 0.9096 - loss: 0.2542
Epoch 8/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 725us/step - accuracy: 0.9162 - loss: 0.2438
Epoch 9/100
[1m267/267[0m [

In [4]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predict on test data
y_pred = dt.predict(X_test)

print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, digits=5))
print(dt.get_depth())
print(dt.get_n_leaves())

              precision    recall  f1-score   support

        AFIB    0.73585   0.73585   0.73585       424
        GSVT    0.85350   0.83402   0.84365       482
          SB    0.96104   0.95238   0.95669       777
          SR    0.87097   0.90604   0.88816       447

    accuracy                        0.87277      2130
   macro avg    0.85534   0.85707   0.85609      2130
weighted avg    0.87298   0.87277   0.87277      2130

27
495
