In [86]:
from pathlib import Path
import xml.etree.ElementTree as ET
import pandas as pd

In [148]:
def extract_demographics(root):
    """Collect device, acquisition and participant metadata from one ECG XML tree.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a parsed file. Its ``ACQUISITION_TIME``, ``NUM_QRS``
        and ``AVERAGE_RR`` attributes are read, and its descendants are
        searched for ``SOURCE``, ``SITE`` and ``DEMOGRAPHIC_FIELD`` elements.

    Returns
    -------
    dict
        Always contains ``"Activity"``, ``"Model"``, ``"Acquisition Time"``,
        ``"QRS Count"``, ``"Average RR"`` and ``"Heart Rate"``. Values are the
        raw attribute strings, or ``None`` when the element/attribute is
        absent. ``"Heart Rate"`` is ``60000 / int(AVERAGE_RR)`` as a float
        (presumably beats per minute with AVERAGE_RR in milliseconds -- TODO
        confirm units), or ``None`` when AVERAGE_RR is missing, not an
        integer, or not positive. ``"Site Number:"`` is present when a SITE
        element exists, and each whitelisted demographic label found in the
        file is stored under its original label text.
    """
    # Labels of DEMOGRAPHIC_FIELD entries worth keeping; the same quantity
    # appears under different label spellings across files.
    wanted_labels = frozenset(
        ["Site Number:", "Participant #:", "Age:", "Gender:", "ID:&ID:", "Age:&", "Sex:"]
    )

    p_data = {}

    # Get the device source and activity type. SOURCE is checked for None the
    # same way SITE is below, so a file without it yields empty values
    # instead of raising AttributeError.
    source = root.find(".//SOURCE")
    p_data["Activity"] = source.get("TYPE") if source is not None else None
    p_data["Model"] = source.get("MODEL") if source is not None else None

    # Get the site number and ECG metadata + acquisition time
    site = root.find(".//SITE")
    if site is not None:
        p_data["Site Number:"] = site.get("ID")

    # .get() returns None for a missing attribute rather than raising KeyError,
    # so one incomplete file cannot abort a whole directory scan.
    p_data["Acquisition Time"] = root.get("ACQUISITION_TIME")
    p_data["QRS Count"] = root.get("NUM_QRS")
    p_data["Average RR"] = root.get("AVERAGE_RR")

    # Derive the heart rate only from a usable RR value: a missing,
    # non-integer or zero AVERAGE_RR would otherwise raise
    # TypeError / ValueError / ZeroDivisionError.
    try:
        average_rr = int(p_data["Average RR"])
    except (TypeError, ValueError):
        average_rr = 0
    p_data["Heart Rate"] = 60000 / average_rr if average_rr > 0 else None

    # A demographic field labelled "Site Number:" overrides the SITE ID
    # stored above, because it is written to the same key afterwards.
    for demo in root.findall(".//DEMOGRAPHIC_FIELD"):
        label = demo.get("LABEL")
        if label in wanted_labels:
            p_data[label] = demo.get("VALUE")

    return p_data

In [149]:
# Parse every XML file below the Mortara data directory into one metadata
# dict per file.
p_demographics = []

mort_path = Path("/media/nvme1/pbecg-data/mortara")
# rglob("*xml") matches any name ending in "xml" -- directories included --
# and yields paths in arbitrary order. Keep regular files only (ET.parse
# cannot open a directory) and sort them so the row order of the resulting
# table is the same on every run.
xml_files = sorted(path for path in mort_path.rglob("*xml") if path.is_file())
for file_path in xml_files:
    tree = ET.parse(file_path)
    root = tree.getroot()
    p_data = extract_demographics(root)
    p_demographics.append(p_data)

In [150]:
# Rename the keys of every dictionary in the list: each canonical column name
# is paired with the label variants that may occur in the files.
canonical_labels = {
    "Age": ("Age:&", "Age:"),
    "ID": ("ID:&ID:", "Participant #:"),
    "Sex": ("Sex:", "Gender:"),
    "Site Number": ("Site Number:",),
}

for record in p_demographics:
    for canonical, variants in canonical_labels.items():
        for variant in variants:
            if variant in record:
                # If a record carries several variants, the one listed last
                # provides the final value.
                record[canonical] = record.pop(variant)

In [151]:
# One row per parsed file; rows identical in every column are kept only once.
all_records_df = pd.DataFrame(p_demographics)
demo_df = all_records_df.drop_duplicates()
demo_df.shape

(3447, 10)

In [152]:
# Count distinct participant IDs. Records that carried no ID field have NaN in
# this column, and Series.unique() reports NaN as a value of its own, so
# missing IDs are dropped first to avoid counting them as an extra patient.
unique_ids = demo_df["ID"].dropna().unique()
print(f"Number of unique patients in Mortara files: {len(unique_ids)}")

Number of unique patients in Mortara files: 2477


In [153]:
# Write the de-duplicated metadata table to disk, leaving out the row index.
demo_df.to_csv(path_or_buf="metadata.csv", index=False)