In [None]:
import wfdb
import numpy as np
import pandas as pd
import os

In [1]:
def load_physionet_data_and_get_RRIs_one_subject(database_path, record_name, annotation_format, annotations_only=False, max_rr_sec=2.0):
    """Load one PhysioNet record and return its cleaned normal-to-normal RR intervals.

    An RR interval is kept only if BOTH beats that bound it are labeled 'N'
    (normal) and no other beat annotation lies between them. Simply dropping the
    abnormal beats and differencing what is left would report the gap across a
    removed beat (e.g. N-V-N) as one long "RR interval".

    Parameters
    ----------
    database_path : str
        Directory that contains the record files.
    record_name : str
        The record name of the subject. E.g. 'chf01'.
    annotation_format : str
        Annotation file extension. E.g. 'ecg' or 'atr'.
    annotations_only : bool, default False
        False indicates that the dataset has continuous ECG signal data, which is
        read with wfdb.rdsamp so its header fields can be returned.
        True indicates annotations only; wfdb.rdsamp is not called.
    max_rr_sec : float, default 2.0
        RR intervals longer than this many seconds are removed.

    Returns
    -------
    tuple
        (RRI_data_cleaned, fields, annotations, unique_symbols) if annotations_only
        is False, otherwise (RRI_data_cleaned, annotations, unique_symbols).
        RRI_data_cleaned is a DataFrame with the columns 'RR_interval_sec',
        'sample_start', 'sampling_frequency' and 'record', indexed 0..n-1.
        unique_symbols is the set of all annotation symbols used in the record.

    Raises
    ------
    ValueError
        If the annotation object carries no positive sampling frequency, so sample
        counts cannot be converted to seconds.
    """
    # Beat annotation symbols according to the PhysioNet annotation code table.
    # Everything else (e.g. '+', '~', '|', '"') is a non-beat annotation and must
    # not break the adjacency of two beats. A beat symbol missing from this set
    # would merely be skipped, i.e. handled like a non-beat annotation.
    beat_symbols = ['N', 'L', 'R', 'B', 'A', 'a', 'J', 'S', 'V', 'r',
                    'F', 'e', 'j', 'n', 'E', '/', 'f', 'Q', '?']

    record_path = os.path.join(database_path, record_name)

    if not annotations_only:  # Only load ECG signal if it's available in the dataset.
        # rdsamp returns (signal, fields); only the header fields are passed on.
        _, fields = wfdb.rdsamp(record_path)

    annotations = wfdb.rdann(record_path, annotation_format)

    sampling_frequency = annotations.fs
    if sampling_frequency is None or sampling_frequency <= 0:
        raise ValueError(
            f"Record '{record_name}': annotation file '.{annotation_format}' has no valid "
            f"sampling frequency (fs={sampling_frequency!r}); cannot convert samples to seconds."
        )

    # dtype=str keeps an empty symbol list a string array, so the comparisons below stay valid.
    symbols = np.asarray(annotations.symbol, dtype=str)
    samples = np.asarray(annotations.sample)

    # Reduce the annotation stream to actual heartbeats.
    is_beat = np.isin(symbols, beat_symbols)
    beat_labels = symbols[is_beat]
    beat_samples = samples[is_beat]

    # Interval i runs from beat i to beat i+1; keep it only if both beats are normal.
    both_normal = (beat_labels[:-1] == 'N') & (beat_labels[1:] == 'N')
    rr_intervals = np.diff(beat_samples)[both_normal] / sampling_frequency  # RR intervals in seconds
    rr_start_samples = beat_samples[:-1][both_normal]

    # Create a dataframe for the RR intervals
    RRI_data = pd.DataFrame({
        'RR_interval_sec': rr_intervals,  # RRIs in seconds
        'sample_start': rr_start_samples,  # Samples that mark the beginning of each RRI
        'sampling_frequency': sampling_frequency,
        'record': record_name
    })

    # Remove implausibly long RRIs; reset_index returns an independent, gap-free frame.
    RRI_data_cleaned = RRI_data[RRI_data['RR_interval_sec'] <= max_rr_sec].reset_index(drop=True)

    # Get all unique annotation symbols used in this record
    unique_symbols = set(annotations.symbol)

    if not annotations_only:
        return RRI_data_cleaned, fields, annotations, unique_symbols
    return RRI_data_cleaned, annotations, unique_symbols

In [2]:
# Save cleaned RRI data to CSV files
#
# For every database listed below, read its RECORDS file, extract the cleaned RR
# intervals of each record and write them to
# ../data/cleaned_RRIs/<database>/<database>_<record>_RRI.csv

databases = ['BIDMC-CHF', 'CHF-RR', 'NSR', 'NSR-RR', 'FD']

# Data directory of each database, relative to ../data/
db_data_directories = {
    'BIDMC-CHF': 'BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/',
    'CHF-RR': 'CHF-RR_congestive-heart-failure-rr-interval-database-1.0.0/',
    'NSR': 'NSR_mit-bih-normal-sinus-rhythm-database-1.0.0/',
    'NSR-RR': 'NSR-RR_normal-sinus-rhythm-rr-interval-database-1.0.0/',
    'FD': 'FD_fantasia-database-1.0.0/'
}

# Dictionary for annotation formats
db_annotation_formats = {
    'BIDMC-CHF': 'ecg',
    'CHF-RR': 'ecg',
    'NSR': 'atr',
    'NSR-RR': 'ecg',
    'FD': 'ecg'
}

# True: the database ships annotations only; False: it also has the ECG signal
db_annotation_only = {
    'BIDMC-CHF': False,
    'CHF-RR': True,
    'NSR': False,
    'NSR-RR': True,
    'FD': False
}

# Save CSVs in folder: ../data/cleaned_RRIs/

for db_name in databases:
    # Index the dicts directly: a database missing from one of them then fails
    # right here with a KeyError naming it, instead of a TypeError further down.
    db_directory = db_data_directories[db_name]
    data_dir_path = '../data/' + db_directory # Directory where the heart data is located
    records_file_path = data_dir_path + 'RECORDS' # The RECORDS file contains the subject code names/labels (e.g. 'chf01', 'chf02', ...)
    annotation_format = db_annotation_formats[db_name]
    annotations_only = db_annotation_only[db_name]

    with open(records_file_path, 'r') as file: # Retrieve the record labels from the RECORDS file
        # Skip blank lines (e.g. a trailing empty line) so they are not treated as record names.
        record_names = [line.strip() for line in file if line.strip()]
        # For example: record_names = ['chf201', ... 'chf229']

    # The output directory is the same for all records of a database: create it once.
    csv_file_dir = f'../data/cleaned_RRIs/{db_name}/' # The directory where the csv files will be saved
    os.makedirs(csv_file_dir, exist_ok=True) # Create the output directory if it doesn't exist

    for record_name in record_names:
        # The loader returns a 3- or 4-tuple depending on annotations_only; the RRI
        # dataframe is always its first element and the only one needed here.
        rri_dataframe = load_physionet_data_and_get_RRIs_one_subject(
            data_dir_path, record_name, annotation_format, annotations_only
        )[0]

        # Save rri_data to CSV
        csv_file_name = f'{db_name}_{record_name}_RRI.csv'
        output_file_path = csv_file_dir + csv_file_name

        rri_dataframe.to_csv(output_file_path, index=False)

../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf01
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf02
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf03
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf04
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf05
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf06
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf07
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf08
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf09
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf10
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf11
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf12
../data/BIDMC-CHF_bidmc-congestive-heart-failure-database-1.0.0/files/chf13
../data/BIDM

In [2]:
# Partition each subject's RRI data into segments.
# This is done by adding a column 'segment_id' for a chosen segment length (e.g. N=500).
# A column 'label' (0 = no heart disease, 1 = heart disease) is added as well.
# Each updated dataframe is saved to CSV.

# Segment length
N = 500

# Base directory where the CSV files are stored
base_dir = '../data/cleaned_RRIs'

# Directory where the CSV files with segment ids are to be stored
with_segment_ids_dir = f'../data/cleaned_RRIs/with_segment_ids_N{N}'
print(f'with_segment_ids_dir: {with_segment_ids_dir}')
os.makedirs(with_segment_ids_dir, exist_ok=True)

# List of subdirectories for each database
databases = ['BIDMC-CHF', 'CHF-RR', 'NSR', 'NSR-RR', 'FD']
databases_nsr = ['NSR', 'NSR-RR', 'FD'] # no heart disease present (label = 0)
databases_chf = ['BIDMC-CHF', 'CHF-RR'] # heart disease present (label = 1)

# Iterate through each database
for db in databases:
    print(f"\nProcessing database: {db}")
    db_path_load = os.path.join(base_dir, db)
    db_path_save = os.path.join(with_segment_ids_dir, db)
    os.makedirs(db_path_save, exist_ok=True)

    # Nothing to do if the cleaned RRIs of this database were never exported
    if not os.path.isdir(db_path_load):
        print(f"  Warning: input directory not found, skipping: {db_path_load}")
        continue

    # The label depends on the database only, so determine it once per database
    if db in databases_nsr:
        label = 0
    elif db in databases_chf:
        label = 1
    else:
        label = np.nan
        print(f"Warning: Database [{db}] was not in the listed CHF or NSR databases.")

    # sorted(): os.listdir returns entries in arbitrary order
    for file in sorted(os.listdir(db_path_load)):
        if not file.endswith(".csv"):
            continue
        print(f"  Processing subject file: {file}")

        # Load each CSV file
        file_path_load = os.path.join(db_path_load, file)
        df_rris = pd.read_csv(file_path_load) # RRI data of one subject

        if df_rris.empty:
            # No rows means no 'record' value to read and no segment to build;
            # the file is still written so every input has a matching output.
            print(f"  Warning: {file} contains no RR intervals; no segments assigned.")
            subject_code_name = ''
        else:
            # Retrieve the record (subject code name); str() because purely numeric
            # record names are parsed as integers by read_csv
            subject_code_name = str(df_rris['record'].iloc[0])

        # Partition the data into intervals of N RRIs (complete segments only)
        num_segments = len(df_rris) // N

        # Zero-pad width of the segment number. It is derived from num_segments + 1,
        # which keeps the ids identical to those of previously generated files.
        id_width = len(str(num_segments + 1))

        # 1-based segment number of every row: rows 0..N-1 -> 1, rows N..2N-1 -> 2, ...
        segment_numbers = np.arange(len(df_rris)) // N + 1
        padded_numbers = pd.Series(segment_numbers, index=df_rris.index).astype(str).str.zfill(id_width)
        segment_ids = subject_code_name + '_' + padded_numbers

        # Rows of the trailing incomplete segment get NaN instead of an id
        df_rris['segment_id'] = segment_ids.where(segment_numbers <= num_segments)

        # Add column 'label'
        df_rris['label'] = label

        # Save to CSV
        file_path_save = os.path.join(db_path_save, file)
        df_rris.to_csv(file_path_save, index=False)

print("\nProcessing complete.")

Segment length: N = 500

with_segment_ids_dir: ../data/cleaned_RRIs/with_segment_ids_N500

Processing database: BIDMC-CHF
  Processing subject file: BIDMC-CHF_chf01_RRI.csv
  Processing subject file: BIDMC-CHF_chf02_RRI.csv
  Processing subject file: BIDMC-CHF_chf03_RRI.csv
  Processing subject file: BIDMC-CHF_chf04_RRI.csv
  Processing subject file: BIDMC-CHF_chf05_RRI.csv
  Processing subject file: BIDMC-CHF_chf06_RRI.csv
  Processing subject file: BIDMC-CHF_chf07_RRI.csv
  Processing subject file: BIDMC-CHF_chf08_RRI.csv
  Processing subject file: BIDMC-CHF_chf09_RRI.csv
  Processing subject file: BIDMC-CHF_chf10_RRI.csv
  Processing subject file: BIDMC-CHF_chf11_RRI.csv
  Processing subject file: BIDMC-CHF_chf12_RRI.csv
  Processing subject file: BIDMC-CHF_chf13_RRI.csv
  Processing subject file: BIDMC-CHF_chf14_RRI.csv
  Processing subject file: BIDMC-CHF_chf15_RRI.csv

Processing database: CHF-RR
  Processing subject file: CHF-RR_chf201_RRI.csv
  Processing subject file: CHF-RR_c