In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Dataset locations, relative to this notebook
ecg_folder = "../../../Datasets/12-lead electrocardiogram database/ECGDataDenoised"
attributes_file = "../../../Datasets/12-lead electrocardiogram database/AttributesDictionary.xlsx"
diagnostics_file = "../../../Datasets/12-lead electrocardiogram database/Diagnostics.xlsx"
rhythm_names_file = "../../../Datasets/12-lead electrocardiogram database/RhythmNames.xlsx"

# Fail fast when any of the metadata workbooks is absent
for file_path in (attributes_file, diagnostics_file, rhythm_names_file):
    if os.path.exists(file_path):
        continue
    raise FileNotFoundError(f"Required file not found: {file_path}")

# Read the three metadata workbooks
attributes_df = pd.read_excel(attributes_file)
diagnostics_df = pd.read_excel(diagnostics_file)
rhythm_names_df = pd.read_excel(rhythm_names_file)

# Strip surrounding whitespace from the acronyms so later lookups match exactly
stripped_acronyms = rhythm_names_df['Acronym Name'].str.strip()
rhythm_names_df['Acronym Name'] = stripped_acronyms

# The set of rhythm acronyms accepted as labels
valid_rhythms = set(stripped_acronyms)
print(valid_rhythms)

{'AVNRT', 'AF', 'AT', 'SVT', 'SR', 'SI', 'SB', 'AFIB', 'SAAWR', 'AVRT', 'ST'}


In [2]:
# Load ECG data
def load_ecg_data(ecg_folder, diagnostics_df, valid_acronyms=None):
    """Read every ``.csv`` recording in ``ecg_folder`` and pair it with its diagnosis.

    Files are processed in sorted name order so that the returned sample order
    (and any seeded split derived from it) is reproducible; ``os.listdir``
    alone returns entries in arbitrary order.

    Parameters
    ----------
    ecg_folder : str
        Directory holding one headerless CSV file per recording.
    diagnostics_df : pandas.DataFrame
        Must provide the columns ``FileName``, ``Rhythm``, ``PatientAge``,
        ``VentricularRate`` and ``AtrialRate``. ``FileName`` is matched against
        the CSV file name without its extension; if a name occurs more than
        once, the first row is used.
    valid_acronyms : collection of str, optional
        Rhythm acronyms accepted as labels. When omitted, the notebook-level
        ``valid_rhythms`` set is used.

    Returns
    -------
    tuple
        ``(data, labels, metadata)`` where ``data`` is a list of 2-D numpy
        arrays (one per file), ``labels`` is a 1-D numpy array of rhythm
        acronyms and ``metadata`` is a list of dicts with the keys
        ``patient_age``, ``ventricular_rate`` and ``atrial_rate``.

    Raises
    ------
    FileNotFoundError
        If ``ecg_folder`` does not exist.
    ValueError
        If a recording has no diagnostics row, its rhythm cell is not text,
        or its rhythm acronym is not in the accepted set.
    """
    # Check if the folder exists
    if not os.path.exists(ecg_folder):
        raise FileNotFoundError(f"ECG data folder not found: {ecg_folder}")

    if valid_acronyms is None:
        valid_acronyms = valid_rhythms

    # Build the FileName -> row-position lookup once instead of scanning the
    # whole diagnostics frame for every file. setdefault keeps the first row
    # for a repeated name.
    first_row_of = {}
    for position, name in enumerate(diagnostics_df['FileName']):
        first_row_of.setdefault(name, position)

    data = []
    labels = []
    metadata = []

    for file_name in sorted(os.listdir(ecg_folder)):
        if not file_name.endswith('.csv'):
            continue

        # Drop only the trailing extension; str.replace would also remove a
        # '.csv' occurring in the middle of the name.
        record_id = os.path.splitext(file_name)[0]

        # A recording without label information is an error, not a skip
        position = first_row_of.get(record_id)
        if position is None:
            raise ValueError(f"No diagnostic information found for file: {file_name}")
        record_info = diagnostics_df.iloc[[position]]

        # A blank spreadsheet cell is read as a non-string (NaN); report it
        # clearly instead of failing inside .strip()
        raw_rhythm = record_info['Rhythm'].values[0]
        if not isinstance(raw_rhythm, str):
            raise ValueError(f"Missing or non-text rhythm for file: {file_name}")
        rhythm_acronym = raw_rhythm.strip()

        # The diagnostics sheet uses "SA" where the rhythm-name list has "SI"
        if rhythm_acronym == "SA":
            rhythm_acronym = "SI"

        # Validate the rhythm acronym
        if rhythm_acronym not in valid_acronyms:
            raise ValueError(f"Unknown rhythm acronym '{rhythm_acronym}' found in file: {file_name}")

        # Read the signal itself only after the label has been validated
        ecg_data = pd.read_csv(os.path.join(ecg_folder, file_name), header=None).to_numpy()

        data.append(ecg_data)
        labels.append(rhythm_acronym)  # the rhythm acronym is the class label
        metadata.append({
            "patient_age": record_info["PatientAge"].values[0],
            "ventricular_rate": record_info["VentricularRate"].values[0],
            "atrial_rate": record_info["AtrialRate"].values[0]
        })

    return data, np.array(labels), metadata


# Read all recordings together with their labels and metadata
ecg_data, ecg_labels, ecg_metadata = load_ecg_data(ecg_folder, diagnostics_df)

# Re-fit the scaler on every recording so each one is standardized on its own
scaler = StandardScaler()
ecg_data = list(map(scaler.fit_transform, ecg_data))

# Target number of rows per recording; longer signals are cut, shorter ones padded
sequence_length = 5000  # Adjust as per dataset requirements


# Bring every ECG signal to the same number of rows
def preprocess_sequence(data, length):
    """Truncate or zero-pad each 2-D sample so it has exactly ``length`` rows.

    Samples longer than ``length`` keep their first ``length`` rows; all others
    get rows of zeros appended at the end. Columns are left untouched.

    Returns the processed samples stacked into one numpy array.
    """
    def _fit_to_length(signal):
        n_rows = signal.shape[0]
        if n_rows > length:
            return signal[:length]
        missing_rows = length - n_rows
        return np.pad(signal, ((0, missing_rows), (0, 0)), mode="constant")

    return np.array([_fit_to_length(signal) for signal in data])


# Pad/truncate all recordings to the common sequence length
ecg_data = preprocess_sequence(data=ecg_data, length=sequence_length)

In [4]:
from collections import Counter

# Count recordings per class and keep only classes that occur more than once
label_counts = Counter(ecg_labels)
print(label_counts)
keep_labels = {name for name, count in label_counts.items() if count > 1}
filtered_indices = [idx for idx in range(len(ecg_labels)) if ecg_labels[idx] in keep_labels]
filtered_ecg_data = ecg_data[filtered_indices]
filtered_ecg_labels = ecg_labels[filtered_indices]

# Stratified 80/20 train-test split on the filtered recordings
X_train, X_test, y_train, y_test = train_test_split(
    filtered_ecg_data,
    filtered_ecg_labels,
    test_size=0.2,
    random_state=42,
    stratify=filtered_ecg_labels,
)

# Report the resulting shapes and one metadata record
for caption, values in (
    ("Training data shape:", X_train),
    ("Testing data shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(caption, values.shape)
print("Metadata example:", ecg_metadata[0])

Counter({'SB': 3889, 'SR': 1826, 'AFIB': 1780, 'ST': 1568, 'SVT': 587, 'AF': 445, 'SI': 399, 'AT': 121, 'AVNRT': 16, 'AVRT': 8, 'SAAWR': 7})
Training data shape: (8516, 5000, 12)
Testing data shape: (2130, 5000, 12)
Training labels shape: (8516,)
Testing labels shape: (2130,)
Metadata example: {'patient_age': 82, 'ventricular_rate': 143, 'atrial_rate': 144}


In [6]:
# Show the training array as this cell's output
X_train

array([[[-1.19834224, -1.58384304, -0.37023501, ..., -0.80524247,
         -0.77701309, -0.96014917],
        [-0.9744344 , -1.4363066 , -0.46541014, ..., -0.77506742,
         -0.75135008, -0.92402534],
        [-0.77719196, -1.28160379, -0.52247645, ..., -0.7455109 ,
         -0.72733714, -0.88961118],
        ...,
        [ 0.07976439,  0.25268815,  0.22958151, ...,  0.05945142,
         -0.05137077,  0.05352415],
        [ 0.432521  ,  0.43992082,  0.02639328, ...,  0.08508032,
         -0.04433624,  0.09182093],
        [ 0.80992444,  0.66420089, -0.16536231, ...,  0.11480031,
         -0.03701946,  0.13601885]],

       [[-0.46075294, -0.4575776 , -0.01879292, ..., -0.27069364,
         -0.25128915, -0.27899669],
        [-0.4973392 , -0.35601608,  0.17672614, ..., -0.2476225 ,
         -0.20753578, -0.19351574],
        [-0.52019522, -0.28206487,  0.31349408, ..., -0.22921116,
         -0.17284226, -0.12926609],
        ...,
        [ 1.05736149,  0.12538666, -0.66076744, ...,  

In [7]:
# Show the training labels as this cell's output
y_train

array(['SR', 'SB', 'SR', ..., 'SB', 'SB', 'AFIB'], dtype='<U5')

In [8]:
import os

# Directory to save the train-test split data
output_dir = "../../../Datasets/12-lead electrocardiogram database/FirstRevFull5000Sample"

# Create the directory (and any missing parents). exist_ok=True makes the call
# safe to re-run and removes the gap between checking for the directory and
# creating it.
os.makedirs(output_dir, exist_ok=True)

# Write one split (samples and labels) to a pair of text files
def save_split(data, labels, split_name, out_dir=None):
    """Save a data split as ``<split_name>_data.txt`` and ``<split_name>_labels.txt``.

    Each sample is flattened and written as one line of space-separated
    values; each label is written on its own line, so line ``k`` of the two
    files describes the same sample.

    Parameters
    ----------
    data : sequence of numpy.ndarray
        Samples to write; each must support ``.flatten()``.
    labels : sequence
        One label per sample, written via ``str()``.
    split_name : str
        Prefix of the two output file names.
    out_dir : str, optional
        Target directory. When omitted, the notebook-level ``output_dir``
        is used.

    Raises
    ------
    ValueError
        If ``data`` and ``labels`` differ in length. Nothing is written in
        that case, so misaligned files are never produced.
    """
    if len(data) != len(labels):
        raise ValueError(
            f"Cannot save split '{split_name}': {len(data)} samples but {len(labels)} labels"
        )

    target_dir = output_dir if out_dir is None else out_dir
    data_file = os.path.join(target_dir, f"{split_name}_data.txt")
    labels_file = os.path.join(target_dir, f"{split_name}_labels.txt")

    # Saving the data: one flattened sample per line
    with open(data_file, "w") as f_data:
        for sample in data:
            f_data.write(" ".join(map(str, sample.flatten())) + "\n")

    # Save labels: one per line, in the same order as the samples
    with open(labels_file, "w") as f_labels:
        for label in labels:
            f_labels.write(str(label) + "\n")

# Write both splits to disk, training set first
for split_label, (split_data, split_labels) in (
    ("train", (X_train, y_train)),
    ("test", (X_test, y_test)),
):
    save_split(split_data, split_labels, split_label)

print(f"Train and test data have been saved to the '{output_dir}' directory.")

Train and test data have been saved to the '../../../Datasets/12-lead electrocardiogram database/FirstRevFull5000Sample' directory.
