In [1]:
import numpy as np
import pandas as pd
import os
from scipy.signal import find_peaks, butter, filtfilt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
import tensorflow as tf
# noinspection PyUnresolvedReferences
from tensorflow.keras.models import Sequential
# noinspection PyUnresolvedReferences
from tensorflow.keras.layers import Dense, Dropout

# Constants
ecg_folder = "../../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised"
diagnostics_file = "../../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"

# Label mapping
rhythm_mapping = {
    'AFIB': 'AFIB',
    'AF': 'AFIB',
    'SVT': 'GSVT',
    'AT': 'GSVT',
    'SAAWR': 'GSVT',
    'ST': 'GSVT',
    'AVNRT': 'GSVT',
    'AVRT': 'GSVT',
    'SB': 'SB',
    'SR': 'SR',
    'SA': 'SR'
}

# Load diagnostics data
diagnostics_df = pd.read_excel(diagnostics_file)
diagnostics_df['Rhythm'] = diagnostics_df['Rhythm'].map(rhythm_mapping)

2024-11-27 09:39:46.800926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-27 09:39:46.862032: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-27 09:39:46.879608: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-27 09:39:46.987255: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
diagnostics_df

Unnamed: 0,FileName,Rhythm,Beat,PatientAge,Gender,VentricularRate,AtrialRate,QRSDuration,QTInterval,QTCorrected,RAxis,TAxis,QRSCount,QOnset,QOffset,TOffset
0,MUSE_20180113_171327_27000,AFIB,RBBB TWC,85,MALE,117,234,114,356,496,81,-27,19,208,265,386
1,MUSE_20180112_073319_29000,SB,TWC,59,FEMALE,52,52,92,432,401,76,42,8,215,261,431
2,MUSE_20180111_165520_97000,SR,NONE,20,FEMALE,67,67,82,382,403,88,20,11,224,265,415
3,MUSE_20180113_121940_44000,SB,NONE,66,MALE,53,53,96,456,427,34,3,9,219,267,447
4,MUSE_20180112_122850_57000,AFIB,STDD STTC,73,FEMALE,162,162,114,252,413,68,-40,26,228,285,354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10641,MUSE_20181222_204306_99000,GSVT,NONE,80,FEMALE,196,73,168,284,513,258,244,32,177,261,319
10642,MUSE_20181222_204309_22000,GSVT,NONE,81,FEMALE,162,81,162,294,482,110,-75,27,173,254,320
10643,MUSE_20181222_204310_31000,GSVT,NONE,39,MALE,152,92,152,340,540,250,38,25,208,284,378
10644,MUSE_20181222_204312_58000,GSVT,NONE,76,MALE,175,178,128,310,529,98,-83,29,205,269,360


In [3]:
diagnostics_df = diagnostics_df.dropna(subset=['Rhythm'])  # Drop unmapped rows

In [4]:
# Define functions for preprocessing and feature extraction
def preprocess_signal(signal: np.ndarray, sampling_rate: int = 500) -> np.ndarray:
    """
    Preprocess ECG signal with filtering and normalization
    """
    nyquist = sampling_rate / 2
    low = 0.5 / nyquist
    high = 45 / nyquist
    b, a = butter(2, [low, high], btype='band')
    filtered = filtfilt(b, a, signal)
    normalized = (filtered - np.mean(filtered)) / np.std(filtered)
    return normalized


def detect_r_peaks(signal: np.ndarray, sampling_rate: int = 500) -> np.ndarray:
    """
    Detect R-peaks in the signal using find_peaks
    """
    peaks, _ = find_peaks(signal, distance=sampling_rate // 2, height=0.5)  # Adjust threshold as needed
    return peaks


def extract_features(signal: np.ndarray, sampling_rate: int = 500) -> dict:
    """
    Extract features from the ECG signal
    """
    r_peaks = detect_r_peaks(signal, sampling_rate)
    rr_intervals = np.diff(r_peaks) / sampling_rate  # Convert to seconds

    features = {}

    # Basic RR interval-based features
    features['ventricular_rate'] = 60 / np.mean(rr_intervals) if len(rr_intervals) > 0 else 0
    features['mean_rr_interval'] = np.mean(rr_intervals) if len(rr_intervals) > 0 else 0
    features['variance_rr_interval'] = np.var(rr_intervals) if len(rr_intervals) > 0 else 0
    features['qrs_count'] = len(r_peaks)
    features['rr_interval_count'] = len(rr_intervals)

    # QRS Duration
    qrs_durations = []
    for i, r_peak in enumerate(r_peaks):
        # Look for the Q and S points around the R peak
        left_idx = max(0, r_peak - int(0.1 * sampling_rate))  # 100 ms window before
        right_idx = min(len(signal), r_peak + int(0.1 * sampling_rate))  # 100 ms window after
        segment = signal[left_idx:right_idx]

        if len(segment) > 1:
            # Approximate QRS width as the duration of the segment above a threshold
            threshold = 0.5 * np.max(segment)  # 50% of the max amplitude
            significant_points = np.where(segment > threshold)[0]
            if len(significant_points) > 1:
                qrs_duration = (significant_points[-1] - significant_points[0]) / sampling_rate
                qrs_durations.append(qrs_duration)

    features['qrs_duration'] = np.mean(qrs_durations) if len(qrs_durations) > 0 else 0.1

    # QT Interval
    qt_intervals = []
    for i, r_peak in enumerate(r_peaks):
        # Approximate T wave as a prominent feature after the R peak
        left_idx = r_peak
        right_idx = min(len(signal), r_peak + int(0.4 * sampling_rate))  # Up to 400 ms after R peak
        segment = signal[left_idx:right_idx]

        if len(segment) > 1:
            # Find the max point (T peak) and use it to approximate QT interval
            t_peak_idx = np.argmax(segment)
            qt_interval = (t_peak_idx + left_idx - r_peak) / sampling_rate
            qt_intervals.append(qt_interval)

    features['qt_interval'] = np.mean(qt_intervals) if len(qt_intervals) > 0 else 0.35

    # R and T Axes (Placeholder, lead-specific calculations)
    features['r_axis'] = np.sum(signal[r_peaks])  # Sum of R peak amplitudes as a proxy
    features['t_axis'] = np.mean(signal[r_peaks])  # Mean T wave amplitude as a proxy

    return features


def load_and_extract_features(ecg_folder: str, diagnostics_df: pd.DataFrame, selected_leads: int = 1) -> pd.DataFrame:
    """
    Load ECG signals and extract features
    """
    feature_list = []
    labels = []

    for idx, row in tqdm(diagnostics_df.iterrows(), total=len(diagnostics_df), desc="Processing ECG files"):
        file_path = os.path.join(ecg_folder, f"{row['FileName']}.csv")
        if os.path.exists(file_path):
            try:
                signal = pd.read_csv(file_path).values[:, selected_leads - 1]  # Extract selected lead
                signal = preprocess_signal(signal)
                features = extract_features(signal)
                feature_list.append(features)
                labels.append(row['Rhythm'])  # Assuming 'Rhythm' column contains target labels
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
                continue

    features_df = pd.DataFrame(feature_list)
    features_df['label'] = labels
    return features_df


In [5]:
# Load data and extract features
features_df = load_and_extract_features(ecg_folder, diagnostics_df)

features_df

  normalized = (filtered - np.mean(filtered)) / np.std(filtered)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  normalized = (filtered - np.mean(filtered)) / np.std(filtered)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  normalized = (filtered - np.mean(filtered)) / np.std(filtered)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  normalized = (filtered - np.mean(filtered)) / np.std(filtered)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  normalized = (filtered - np.mean(filtered)) / np.std(filtered)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  normalized = (filtered - np.mean(filtered)) / np.std(filtered)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  normalized = (filtered - np.mean(filtered)) / np.std(filtered)
  return _methods

Unnamed: 0,ventricular_rate,mean_rr_interval,variance_rr_interval,qrs_count,rr_interval_count,qrs_duration,qt_interval,r_axis,t_axis,label
0,84.708949,0.708308,0.027421,14,13,0.053429,0.032857,29.602364,2.114455,AFIB
1,53.523640,1.121000,0.011721,9,8,0.028000,0.000000,60.783689,6.753743,SB
2,67.901235,0.883636,0.010112,12,11,0.083333,0.000000,30.355725,2.529644,SR
3,53.309640,1.125500,0.000238,9,8,0.022444,0.000000,58.084291,6.453810,SB
4,87.336245,0.687000,0.006193,15,14,0.024133,0.098133,51.690305,3.446020,AFIB
...,...,...,...,...,...,...,...,...,...,...
10641,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,GSVT
10642,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,GSVT
10643,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,GSVT
10644,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,GSVT


In [6]:
# Encode labels
le = LabelEncoder()
features_df['label'] = le.fit_transform(features_df['label'])
features_df

Unnamed: 0,ventricular_rate,mean_rr_interval,variance_rr_interval,qrs_count,rr_interval_count,qrs_duration,qt_interval,r_axis,t_axis,label
0,84.708949,0.708308,0.027421,14,13,0.053429,0.032857,29.602364,2.114455,0
1,53.523640,1.121000,0.011721,9,8,0.028000,0.000000,60.783689,6.753743,2
2,67.901235,0.883636,0.010112,12,11,0.083333,0.000000,30.355725,2.529644,3
3,53.309640,1.125500,0.000238,9,8,0.022444,0.000000,58.084291,6.453810,2
4,87.336245,0.687000,0.006193,15,14,0.024133,0.098133,51.690305,3.446020,0
...,...,...,...,...,...,...,...,...,...,...
10641,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,1
10642,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,1
10643,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,1
10644,0.000000,0.000000,0.000000,0,0,0.100000,0.350000,0.000000,,1


In [7]:
# Prepare data
X = features_df.drop(columns=['label'])
y = features_df['label']

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8516, 9) (2130, 9) (8516,) (2130,)


In [8]:
def create_mlp_model(input_dim, num_classes):
    mlp = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])

    mlp.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return mlp


num_classes = len(le.classes_)
mlp_model = create_mlp_model(X_train.shape[1], num_classes)

# Train MLP
mlp_model.fit(X_train, y_train, epochs=150, batch_size=128, validation_split=0.2, verbose=1)

# Evaluate MLP
mlp_loss, mlp_accuracy = mlp_model.evaluate(X_test, y_test, verbose=0)
mlp_y_pred = np.argmax(mlp_model.predict(X_test), axis=1)

print("\nTensorFlow MLP Classifier Results")
print(f"Accuracy: {mlp_accuracy:.4f}")
print(classification_report(y_test, mlp_y_pred, target_names=le.classes_, digits=5))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1732678848.360448   33140 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732678848.493219   33140 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732678848.494583   33140 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732678848.49753

Epoch 1/150


I0000 00:00:1732678849.410173   33852 service.cc:146] XLA service 0x760440003fa0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732678849.410205   33852 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2024-11-27 09:40:49.442017: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-27 09:40:49.552490: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m 1/54[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:37[0m 2s/step - accuracy: 0.3203 - loss: nan

I0000 00:00:1732678850.685529   33852 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 63ms/step - accuracy: 0.2190 - loss: nan - val_accuracy: 0.2165 - val_loss: nan
Epoch 2/150
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2024 - loss: nan - val_accuracy: 0.2165 - val_loss: nan
Epoch 3/150
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2080 - loss: nan - val_accuracy: 0.2165 - val_loss: nan
Epoch 4/150
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2105 - loss: nan - val_accuracy: 0.2165 - val_loss: nan
Epoch 5/150
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 916us/step - accuracy: 0.2149 - loss: nan - val_accuracy: 0.2165 - val_loss: nan
Epoch 6/150
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2023 - loss: nan - val_accuracy: 0.2165 - val_loss: nan
Epoch 7/150
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_y_pred = dt.predict(X_test)

# Evaluate Decision Tree
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("\nDecision Tree Classifier Results")
print(f"Accuracy: {dt_accuracy:.4f}")
print(f"Max Depth: {dt.get_depth()}")
print(f"Max Leaf Nodes: {dt.get_n_leaves()}")
print(classification_report(y_test, dt_y_pred, target_names=le.classes_, digits=5))


Decision Tree Classifier Results
Accuracy: 0.8362
Max Depth: 27
Max Leaf Nodes: 931
              precision    recall  f1-score   support

        AFIB    0.66667   0.67925   0.67290       424
        GSVT    0.79917   0.80083   0.80000       482
          SB    0.94650   0.95624   0.95134       777
          SR    0.84651   0.81432   0.83010       447

    accuracy                        0.83615      2130
   macro avg    0.81471   0.81266   0.81359      2130
weighted avg    0.83647   0.83615   0.83622      2130



In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=28, max_leaf_nodes=2400)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("\nRandom Forest Classifier Results")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Max Depth: {rf_model.max_depth}")
print(f"Max Leaf Nodes: {rf_model.max_leaf_nodes}")
print(classification_report(y_test, rf_y_pred, target_names=le.classes_, digits=5))


Random Forest Classifier Results
Accuracy: 0.8775
Max Depth: 28
Max Leaf Nodes: 2400
              precision    recall  f1-score   support

        AFIB    0.74260   0.76887   0.75550       424
        GSVT    0.86111   0.83610   0.84842       482
          SB    0.94823   0.96654   0.95730       777
          SR    0.90255   0.87025   0.88610       447

    accuracy                        0.87746      2130
   macro avg    0.86362   0.86044   0.86183      2130
weighted avg    0.87800   0.87746   0.87755      2130

