In [2]:
from pathlib import Path
import xml.etree.ElementTree as ET
import pandas as pd

In [3]:
def extract_ecg_data(root):
    """Flatten one parsed ECG XML document into a single flat record.

    Args:
        root: Root element of the parsed document. All lookups are done in
            the ``urn:hl7-org:v3`` namespace.

    Returns:
        A dict with ``sequence_id``, ``subject_id`` and ``acq_time`` (the raw
        attribute strings) plus one ``lead_<code>`` entry per extracted
        sequence. Each lead entry is a list holding the integer digits
        multiplied by that sequence's scale value.

    Raises:
        AttributeError: If a required element is missing (``find`` gave None).
        KeyError: If a required attribute is missing on a found element.
        ValueError: If the scale or a digit cannot be parsed as a number.
    """
    hl7 = {"ns": "urn:hl7-org:v3"}

    def attribute(node, xpath, key):
        # First element matching ``xpath`` below ``node``, then one attribute of it.
        return node.find(xpath, hl7).attrib[key]

    # Document-level metadata.
    record = {
        "sequence_id": attribute(root, ".//ns:id", "root"),
        "subject_id": attribute(root, ".//ns:trialSubject/ns:id", "extension"),
        "acq_time": attribute(root, ".//ns:effectiveTime/ns:low", "value"),
    }

    components = root.findall(".//ns:sequenceSet/ns:component", hl7)
    # Only the components at positions 1..12 are read; position 0 is skipped
    # (presumably the time axis followed by 12 leads -- TODO confirm).
    for entry in components[1:13]:
        seq = entry.find(".//ns:sequence", hl7)
        if seq is None:
            continue

        lead_code = attribute(seq, ".//ns:code", "code")
        factor = float(attribute(seq, ".//ns:scale", "value"))
        raw_digits = seq.find(".//ns:digits", hl7).text.strip().split()

        # Store the scaled samples under a column name derived from the lead code.
        record[f'lead_{lead_code}'] = [int(token) * factor for token in raw_digits]

    return record

In [4]:
# Parse every ECG XML file below the dataset directory into one record per file.
data_path = Path("/media/nvme1/pbecg-data/fda")

# "*.xml" (with the dot) so that only names carrying the .xml extension match;
# the previous "*xml" pattern also matched any name merely ending in "xml".
# Directories are filtered out because ET.parse needs a file, and the paths are
# sorted so the row order of the resulting frame is the same on every run.
xml_files = sorted(path for path in data_path.rglob("*.xml") if path.is_file())

all_ecg_data = [
    extract_ecg_data(ET.parse(file_path).getroot())
    for file_path in xml_files
]

# One row per file; lead columns hold the list of scaled samples for that file.
df = pd.DataFrame(all_ecg_data)

In [5]:
# Three leads appear under two spellings of the lead code (aVR/aVL/aVF and
# AVR/AVL/AVF), so each of them ends up in two columns with NaN wherever a file
# used the other spelling. Fold every upper-case column into its mixed-case one.
LEAD_COLUMN_ALIASES = {
    'lead_MDC_ECG_LEAD_AVR': 'lead_MDC_ECG_LEAD_aVR',
    'lead_MDC_ECG_LEAD_AVL': 'lead_MDC_ECG_LEAD_aVL',
    'lead_MDC_ECG_LEAD_AVF': 'lead_MDC_ECG_LEAD_aVF',
}

for alias, canonical in LEAD_COLUMN_ALIASES.items():
    if alias not in df.columns:
        # Nothing to merge (no file used this spelling, or the cell already ran);
        # skipping keeps the cell safe to re-run instead of raising KeyError.
        continue
    if canonical in df.columns:
        # Assign the result back rather than calling fillna(inplace=True) on
        # df[...]: the in-place call on a selected column is chained assignment,
        # which warns on pandas 2.x and does not update df under Copy-on-Write.
        df[canonical] = df[canonical].fillna(df[alias])
        df = df.drop(columns=alias)
    else:
        # Only the alias spelling exists: just give it the canonical name.
        df = df.rename(columns={alias: canonical})

In [6]:
# Parse the compact YYYYmmddHHMMSS strings into pandas datetimes
acq_timestamps = pd.to_datetime(df["acq_time"], format="%Y%m%d%H%M%S")
df["acq_time"] = acq_timestamps

# Persist the table as parquet, leaving the index out of the file
save_path = Path("/media/nvme1/pbecg-data/signal.parquet")
df.to_parquet(path=save_path, index=False)

  if _pandas_api.is_sparse(col):


PermissionError: [Errno 13] Permission denied: '/media/nvme1/pbecg-data/signal.parquet'

In [2]:
# Shell escape: run the `groups` command and show its output in the notebook.
# NOTE(review): presumably a leftover diagnostic for the PermissionError raised
# when writing the parquet file -- confirm, and drop the cell if no longer needed.
!groups

ldapusers dremt pse-dunn-login
