In [None]:
import os.path

import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from tqdm import tqdm
from scipy.stats import ttest_ind

In [None]:
# One example ECG XML file, used to try the parser on a single input.
# NOTE(review): both paths are absolute and machine-specific — adjust for your environment.
sample_data = '/mnt/data/ukb_heartmri/ukb_20205/1000134_20205_2_0.xml'
# Directory searched for per-Eid ECG XML files named '<Eid>_20205_2_0.xml'.
ecg_xml_path = '/mnt/data/ukb_heartmri/ukb_20205/'

In [None]:
def extract_qt_intervals(xml_path):
    """Read the summary ECG measurements from one ECG XML file.

    Each measurement is looked up by tag name anywhere below the document
    root (first match wins) and converted with ``int()``.

    Parameters
    ----------
    xml_path : str or path-like
        Path of the XML file to parse.

    Returns
    -------
    dict
        Keys ``'BPM'`` (read from ``VentricularRate``), ``'PQInterval'``,
        ``'PDuration'``, ``'QRSDuration'``, ``'QTInterval'``,
        ``'QTCInterval'``, ``'RRInterval'`` and ``'PPInterval'``, in that
        order. A value is ``None`` when the element is absent or its text is
        empty or whitespace-only.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    ValueError
        If an element contains non-blank text that ``int()`` cannot parse.
    """
    # Output key -> XML tag name; dict order fixes the order of the result.
    tag_by_key = {
        'BPM': 'VentricularRate',
        'PQInterval': 'PQInterval',
        'PDuration': 'PDuration',
        'QRSDuration': 'QRSDuration',
        'QTInterval': 'QTInterval',
        'QTCInterval': 'QTCInterval',
        'RRInterval': 'RRInterval',
        'PPInterval': 'PPInterval',
    }
    root = ET.parse(xml_path).getroot()

    def _int_or_none(tag):
        # Return the first matching element's text as int, or None if missing/blank.
        node = root.find('.//' + tag)
        if node is None or node.text is None:
            return None
        # Strip first: whitespace-only text (e.g. a pretty-printed empty
        # element) is truthy but would make int() raise ValueError.
        text = node.text.strip()
        return int(text) if text else None

    return {key: _int_or_none(tag) for key, tag in tag_by_key.items()}
# Try the parser on the single sample file; as the cell's last expression,
# the returned dict is shown as the cell output.
extract_qt_intervals(sample_data)

In [None]:
# Load the table whose 'Eid' column supplies the IDs used to locate ECG XML files.
df = pd.read_csv('data/ukb_ecg_data.csv')

In [None]:
# Build (or reload) one row of ECG summary measurements per Eid.
# The result is cached as a CSV so the XML files are only parsed once.
eids = df['Eid'].values
if os.path.exists('data/ukb_ecg_stats.csv'):
    # Cache hit: reuse the previously extracted table.
    df_res = pd.read_csv('data/ukb_ecg_stats.csv')
else:
    # Measurement names, in the column order of the output table.
    stat_names = [
        'BPM',
        'PQInterval',
        'PDuration',
        'QRSDuration',
        'QTInterval',
        'QTCInterval',
        'RRInterval',
        'PPInterval',
    ]
    # One growing list per measurement, kept aligned with `eids`.
    columns = {name: [] for name in stat_names}

    for eid in tqdm(eids):
        xml_file = os.path.join(ecg_xml_path, str(eid) + '_20205_2_0.xml')
        if os.path.exists(xml_file):
            parsed = extract_qt_intervals(xml_file)
            for name in stat_names:
                columns[name].append(parsed[name])
        else:
            # No recording for this Eid: report it and keep the row as all-NaN.
            print(f'File not found for Eid {eid}')
            for name in stat_names:
                columns[name].append(np.nan)

    df_res = pd.DataFrame({'Eid': eids, **columns})
    df_res.to_csv('data/ukb_ecg_stats.csv', index=False)