In [86]:
from pathlib import Path
import xml.etree.ElementTree as ET
import pandas as pd

In [82]:
# Demographic labels worth keeping. The same field appears under different
# labels in different files (e.g. "Age:" vs "Age:&"), so every variant is listed.
_DEMOGRAPHIC_LABELS = frozenset(
    {"Site Number:", "Participant #:", "Age:", "Gender:", "ID:&ID:", "Age:&", "Sex:"}
)


def extract_demographics(root):
    """Collect acquisition and participant metadata from one parsed XML tree.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of the parsed file.

    Returns
    -------
    dict
        Always contains "Activity" and "Model" (the TYPE and MODEL attributes
        of the first SOURCE element, or None when there is no SOURCE element
        or the attribute is absent). Contains "Site Number:" when a SITE
        element exists, plus one entry per DEMOGRAPHIC_FIELD whose LABEL is in
        ``_DEMOGRAPHIC_LABELS``. Keys keep the raw labels (trailing colons
        included); values are the attribute strings, or None when the
        attribute is absent.
    """
    source = root.find(".//SOURCE")
    # A file without a SOURCE element must not abort a whole batch run, so fall
    # back to None instead of raising AttributeError on `None.get`.
    p_data = {
        "Activity": source.get("TYPE") if source is not None else None,
        "Model": source.get("MODEL") if source is not None else None,
    }

    site = root.find(".//SITE")
    if site is not None:
        p_data["Site Number:"] = site.get("ID")

    # Demographic fields are applied after SITE, so a "Site Number:" field
    # overrides the SITE element's ID; for repeated labels the last one wins.
    for demo in root.findall(".//DEMOGRAPHIC_FIELD"):
        label = demo.get("LABEL")
        if label in _DEMOGRAPHIC_LABELS:
            p_data[label] = demo.get("VALUE")

    return p_data

In [83]:
# Parse every XML file under the data directory into one demographics dict each.
p_demographics = []

mort_path = Path("/media/nvme1/pbecg-data/mortara")
# "*.xml" (with the dot) avoids matching names that merely end in "xml".
# rglob yields paths in filesystem order, so sort them to keep the resulting
# row order, and therefore the DataFrame index, reproducible between runs.
for file_path in sorted(mort_path.rglob("*.xml")):
    if not file_path.is_file():
        # A directory whose name ends in ".xml" would make ET.parse fail.
        continue
    tree = ET.parse(file_path)
    root = tree.getroot()
    p_data = extract_demographics(root)
    p_demographics.append(p_data)

In [84]:
# Collapse the label variants found in the files onto one canonical key each.
# Pairs are applied in order: when a record carries two variants of the same
# field, the value from the later pair is the one that is kept.
label_renames = (
    ("Age:&", "Age"),
    ("Age:", "Age"),
    ("ID:&ID:", "ID"),
    ("Participant #:", "ID"),
    ("Sex:", "Sex"),
    ("Gender:", "Sex"),
    ("Site Number:", "Site Number"),
)

# The dicts are edited in place; popping moves each renamed key to the end,
# which determines the column order of any DataFrame built from them.
for p_demo in p_demographics:
    for raw_label, clean_label in label_renames:
        if raw_label in p_demo:
            p_demo[clean_label] = p_demo.pop(raw_label)

In [87]:
# One row per record in p_demographics; the columns are the union of the dict
# keys, and a key missing from a record shows up as NaN in that row.
demo_df = pd.DataFrame(p_demographics)

In [100]:
# Every row whose ID occurs more than once; keep=False flags all occurrences,
# not just the repeats after the first.
# NOTE(review): rows with a missing ID compare equal to each other here, so
# they are all flagged as duplicates too. Confirm that is intended.
duplicate_rows = demo_df.loc[demo_df.duplicated(subset=["ID"], keep=False)]

In [103]:
# Preview the first ten duplicated rows, grouped by ID. The IDs were read as
# XML attribute strings, so unless they are converted elsewhere this ordering
# is lexicographic rather than numeric.
duplicate_rows.sort_values(by=["ID"]).head(10)

Unnamed: 0,Activity,Model,Age,ID,Sex,Site Number
3994,RESTING,el150/250,32,1003195522,Female,15
1444,RESTING,el150/250,30,1003195522,Female,62
3698,RESTING,el150/250,35,1021213008,Male,7
2131,RESTING,el150/250,32,1021213008,Male,62
1776,RESTING,el150/250,32,1021213008,Male,62
2184,RESTING,el150/250,30,1032132066,Male,8
3642,RESTING,el150/250,30,1032132066,Male,8
3074,RESTING,el150/250,32,1032132066,Male,8
1829,RESTING,el150/250,30,1032132066,Male,8
2210,RESTING,el150/250,20,1036836926,Male,15
