## Imports

In [1]:
import ast
import re

import pandas as pd
from scipy.stats import circmean, circstd

from labels import DIAGNOSTICS, SUB2SUPER, DIAG2SUB, FORM, RHYTHM

## Demographic features

In [2]:
# Load the metadata table and store the patient identifier as an integer column.
ptbxl = pd.read_csv("../data/ptbxl_database.csv").astype({"patient_id": int})
print(ptbxl.shape)
ptbxl.head()

(21799, 28)


Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [3]:
# Number of distinct patient identifiers in the metadata table.
ptbxl["patient_id"].nunique()

18869

In [4]:
# Summarise the age distribution with its median and interquartile range.
# NOTE(review): the merged table displayed at the end of this notebook contains an
# age of 300.0 — presumably a placeholder value; confirm before using mean-based statistics.
age = ptbxl["age"]
median_age = age.median()
q1, q3 = age.quantile([0.25, 0.75])
iqr = q3 - q1

print(f"Median age: {median_age}")
print(f"Interquartile range (IQR) of age: {iqr}")

Median age: 62.0
Interquartile range (IQR) of age: 22.0


## Target

In [5]:
def save_y(df, path, ecg_id_col):
    """Expand per-record label dicts into a table and write it to ``path`` as CSV.

    Parameters
    ----------
    df : iterable
        One entry per record. Dict entries become a row whose columns are the
        dict keys; any non-dict entry becomes an empty row.
    path : str
        Destination CSV file (written without the index).
    ecg_id_col : array-like or pandas.Series
        Record identifiers, inserted as the first column named ``"ecg_id"``.

    The shape of the resulting table and the destination path are printed.
    """
    rows = []
    for entry in df:
        rows.append(entry if isinstance(entry, dict) else {})

    table = pd.DataFrame(rows)
    table.insert(loc=0, column="ecg_id", value=ecg_id_col)
    print(f"{table.shape} \t {path}")
    table.to_csv(path, index=False)

def get_labels(x, keys=None, key_transform=None):
    """Turn one serialized statement entry into a ``{label: value}`` dict.

    Parameters
    ----------
    x : str, dict, or iterable of (key, value) pairs
        Usually the raw CSV cell text holding a Python literal such as
        ``"[('NORM', 100.0), ('SR', 0.0)]"``. A dict literal
        (``"{'NORM': 100.0}"``) or an already-parsed object is accepted too.
    keys : container, optional
        When given and non-empty, only entries whose key is in ``keys`` are kept.
    key_transform : callable, optional
        Applied to every retained key before it is stored (e.g. to map a
        statement onto a coarser class).

    Returns
    -------
    dict
        For each (transformed) key, the largest value seen for it.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is a string that is not a plain Python literal.
    """
    # literal_eval only accepts literals, so text coming from a data file can
    # never execute code (unlike eval).
    parsed = ast.literal_eval(x) if isinstance(x, str) else x
    pairs = parsed.items() if isinstance(parsed, dict) else parsed

    result = {}
    for key, value in pairs:
        if keys and key not in keys:
            continue
        if key_transform:
            key = key_transform(key)
        # Several source keys may collapse onto the same transformed key: keep the maximum.
        if key not in result or value > result[key]:
            result[key] = value
    return result

# Load the per-record statements once, then derive one target table per label family.
statements = pd.read_csv("../data/ptbxl_statements.csv")
ecg_id_col = statements["ecg_id"]
scp_codes = statements["scp_codes"]

# Every statement, unfiltered. (Not named `all`, which would shadow the builtin
# for the rest of the kernel session.)
all_labels = scp_codes.apply(get_labels)
save_y(all_labels, "../data/y_all.csv", ecg_id_col)

# Diagnostic statements only (the keys of DIAG2SUB).
diag = scp_codes.apply(get_labels, keys=DIAG2SUB.keys())
save_y(diag, "../data/y_diag.csv", ecg_id_col)

# Diagnostic statements mapped through DIAG2SUB.
subdiag = scp_codes.apply(get_labels, keys=DIAG2SUB.keys(), key_transform=lambda k: DIAG2SUB[k])
save_y(subdiag, "../data/y_subdiag.csv", ecg_id_col)

# Diagnostic statements mapped through DIAG2SUB and then SUB2SUPER.
superdiag = scp_codes.apply(get_labels, keys=DIAG2SUB.keys(), key_transform=lambda k: SUB2SUPER[DIAG2SUB[k]])
save_y(superdiag, "../data/y_superdiag.csv", ecg_id_col)

# Statements listed in FORM.
form = scp_codes.apply(get_labels, keys=FORM)
save_y(form, "../data/y_form.csv", ecg_id_col)

# Statements listed in RHYTHM.
rhythm = scp_codes.apply(get_labels, keys=RHYTHM)
save_y(rhythm, "../data/y_rhythm.csv", ecg_id_col)

(21799, 72) 	 ../data/y_all.csv
(21799, 45) 	 ../data/y_diag.csv
(21799, 24) 	 ../data/y_subdiag.csv
(21799, 6) 	 ../data/y_superdiag.csv
(21799, 20) 	 ../data/y_form.csv
(21799, 13) 	 ../data/y_rhythm.csv


## 3DFMMecg parameters

In [None]:
LEADS = ["I", "II", "III", "aVR", "aVL", "aVF", "V1", "V2", "V3", "V4", "V5", "V6"]
WAVES = ["P", "Q", "R", "S", "T"]

# Parameter prefixes rewritten to Greek symbols (applied in this order), only for
# columns whose name mentions one of WAVES.
_WAVE_PARAM_RENAMES = [("Alpha", "α_"), ("Beta", "β_"), ("Omega", "ω_")]


def _rename_fmm_column(name):
    """Return the normalised name of one raw FMM column.

    * ``Alpha``/``Beta``/``Omega`` become ``α_``/``β_``/``ω_`` when the name
      contains a wave letter from ``WAVES``.
    * A leading ``A`` becomes ``A_`` when the name contains a lead from ``LEADS``.
    * ``R2`` becomes ``R²`` in every name.
    """
    if any(wave in name for wave in WAVES):
        for pattern, replacement in _WAVE_PARAM_RENAMES:
            name = re.sub(pattern, replacement, name)
    if any(lead in name for lead in LEADS):
        name = re.sub("^A", "A_", name)
    return re.sub("R2", "R²", name)


def _columns_matching(columns, pattern):
    """Return, in their original order, the column names in which ``pattern`` is found."""
    return [col for col in columns if re.search(pattern, col)]


# NOTE(review): the saved output of this cell echoes the read_csv line below, which
# looks like a pandas warning raised while parsing (possibly a DtypeWarning about
# mixed-type columns) — confirm, and if so pass explicit dtypes or low_memory=False.
fmm = pd.read_csv("../data/datFinalV10_PTBXL12leads.csv", sep=",")
fmm = fmm.rename(columns={"EcgId": "ecg_id"}).rename(columns=_rename_fmm_column)

# Column groups, one per parameter family; later cells aggregate them by name.
m = _columns_matching(fmm.columns, "M_")
alpha = _columns_matching(fmm.columns, "α")
beta = _columns_matching(fmm.columns, "β")
omega = _columns_matching(fmm.columns, "ω")
amplitude = _columns_matching(fmm.columns, "A_")
r2 = _columns_matching(fmm.columns, "R²")
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
dist = _columns_matching(fmm.columns, r"^d\w\w")

# Keep only the identifier, demographics and the parameter families above.
fmm = fmm.loc[
    :, ["ecg_id", "Age", "Sex"] + alpha + beta + omega + amplitude + r2 + dist + m
]

print(fmm.shape)
fmm.head()

  fmm = pd.read_csv("../data/datFinalV10_PTBXL12leads.csv", sep=",")


(233124, 157)


Unnamed: 0,ecg_id,Age,Sex,α_P,α_Q,α_R,α_S,α_T,β_P_I,β_Q_I,...,M_V1,M_V2,M_V3,M_V4,M_V5,M_V6,M_III,M_aVR,M_aVL,M_aVF
0,1,56.0,1,5.010413,5.636898,5.776905,5.910941,1.374845,3.722224,1.322646,...,-204.209706,-264.459822,38.620864,161.164849,266.677823,303.715931,-72.592268,-312.715556,210.801978,101.913577
1,1,56.0,1,5.054832,5.659873,5.788284,5.929163,1.378343,3.751417,1.648857,...,-207.999974,-220.299195,81.180581,175.520055,286.369548,283.620493,-50.438189,-293.652064,184.654674,108.99739
2,1,56.0,1,5.012415,5.625464,5.75949,5.892059,1.321735,3.468903,1.121659,...,-248.411574,-277.124301,65.246924,198.814581,297.30582,347.99226,-51.290885,-298.69512,187.815723,110.879397
3,1,56.0,1,4.972855,5.607787,5.740498,5.87407,1.293562,3.788385,1.146849,...,-211.884421,-233.831417,67.931549,175.809645,280.152882,328.385076,-52.521193,-301.586348,190.184069,111.402279
4,1,56.0,1,5.019927,5.630378,5.757434,5.895252,1.303729,3.617541,1.608754,...,-190.667492,-182.765771,90.969996,187.477355,268.697426,314.512501,-52.511093,-346.740985,212.753812,133.987172


In [7]:
# Collapse the per-row FMM parameters into one row per ECG record:
# linear statistics for amplitudes/frequencies/fit quality/distances/M, and
# circular statistics for the alpha and beta columns.
# NOTE(review): "mean"/"std" skip NaN values, whereas scipy's circmean/circstd
# propagate NaN by default, so one NaN among a record's values makes its circular
# statistics NaN — confirm this is intended.
linear_stats = ["mean", "std"]
circular_stats = [circmean, circstd]
vars_dict = {col: list(linear_stats) for col in amplitude + omega + r2 + dist + m}
circvars_dict = {col: list(circular_stats) for col in alpha + beta}

aggregated = fmm.groupby("ecg_id").agg({**vars_dict, **circvars_dict})
# Flatten the (variable, statistic) column pairs into "<variable>_<statistic>".
aggregated.columns = [f"{variable}_{statistic}" for variable, statistic in aggregated.columns]
fmm_features = aggregated.reset_index()
print(fmm_features.shape)
fmm_features.head()

(21607, 309)


Unnamed: 0,ecg_id,A_P_I_mean,A_P_I_std,A_Q_I_mean,A_Q_I_std,A_R_I_mean,A_R_I_std,A_S_I_mean,A_S_I_std,A_T_I_mean,...,β_S_aVF_circmean,β_S_aVF_circstd,β_T_III_circmean,β_T_III_circstd,β_T_aVR_circmean,β_T_aVR_circstd,β_T_aVL_circmean,β_T_aVL_circstd,β_T_aVF_circmean,β_T_aVF_circstd
0,1,63.644417,17.746255,140.204135,16.215102,304.716906,21.137502,21.538722,14.748456,96.74534,...,5.197656,1.425333,2.740595,0.205495,0.092604,0.089625,3.441853,0.175244,3.127213,0.097689
1,2,30.480021,19.24232,117.139372,18.53942,365.250199,22.692962,140.423452,28.146895,223.356168,...,0.570667,0.912055,0.959192,2.026022,0.395953,0.195223,3.618928,0.1578,3.431155,0.521418
2,3,52.808632,6.816969,93.192237,17.091079,444.403055,38.835345,68.197228,20.434927,125.669194,...,,,,,,,,,,
3,4,61.791648,10.818636,223.413351,18.800768,253.92667,26.423692,184.091114,20.918125,188.780994,...,6.090814,0.678336,3.144125,0.646862,0.074764,0.14495,3.348633,1.045292,3.165907,0.329886
4,5,29.571852,7.502069,77.724568,13.965687,248.819946,14.038717,91.767584,11.122428,105.434106,...,3.990044,0.195563,2.900109,0.204306,6.242107,0.110389,4.931426,0.870782,3.000628,0.145071


## Rhythm features

In [8]:
def RR2BPM(x):
    """Convert RR interval(s) given in milliseconds to beats per minute."""
    return 60000 / x


FREQ = 500  # divisor turning annoRef differences into seconds (presumably the 500 Hz sampling rate — confirm)
SEC2MS = 1000

beat_annot_df = pd.read_csv("../data/finalDefAll.txt", sep="\t")
print(beat_annot_df.shape)

# RR interval (ms) between consecutive annotations of the same record; the first
# annotation of each record has no predecessor and therefore gets NaN.
beat_annot_df["RR"] = (
    beat_annot_df.groupby("ecg_id")["annoRef"].diff() / FREQ * SEC2MS
)
beat_annot_df["HR"] = RR2BPM(beat_annot_df["RR"])
beat_annot_df_group = beat_annot_df.groupby("ecg_id")

# Aggregate and carry the group key along with the statistics. Pairing
# `ecg_id.unique()` (order of first appearance) with `groupby(...).values`
# (sorted key order) would attach features to the wrong record whenever the
# annotation file is not sorted by ecg_id.
rhythm_features = beat_annot_df_group[["RR", "HR"]].agg(["mean", "std"])
rhythm_features.columns = ["_".join(col) for col in rhythm_features.columns]
rhythm_features = rhythm_features.reset_index()
print(rhythm_features.shape)
rhythm_features.head()

(233443, 15)
(21607, 5)


Unnamed: 0,ecg_id,RR_mean,RR_std,HR_mean,HR_std
0,1,937.0,15.964246,64.050366,1.087776
1,2,1271.333333,82.507373,47.36228,3.104412
2,3,940.222222,20.011108,63.840161,1.345999
3,4,806.0,38.812369,74.595881,3.525347
4,5,905.2,51.53812,66.478286,3.802364


In [9]:
# Assemble the final feature table: demographics plus the fold column, then the FMM
# features and rhythm features, left-joined on ecg_id so that every metadata row is kept.
base_columns = ["ecg_id", "strat_fold", "age", "sex", "height", "weight"]
ptbxl_df = (
    ptbxl[base_columns]
    .merge(fmm_features, how="left", on="ecg_id")
    .merge(rhythm_features, how="left", on="ecg_id")
)
ptbxl_df.to_csv("../data/3dfmmecg_features.csv", index=False)
ptbxl_df

Unnamed: 0,ecg_id,strat_fold,age,sex,height,weight,A_P_I_mean,A_P_I_std,A_Q_I_mean,A_Q_I_std,...,β_T_aVR_circmean,β_T_aVR_circstd,β_T_aVL_circmean,β_T_aVL_circstd,β_T_aVF_circmean,β_T_aVF_circstd,RR_mean,RR_std,HR_mean,HR_std
0,1,3,56.0,1,,63.0,63.644417,17.746255,140.204135,16.215102,...,0.092604,0.089625,3.441853,0.175244,3.127213,0.097689,937.000000,15.964246,64.050366,1.087776
1,2,2,19.0,0,,70.0,30.480021,19.242320,117.139372,18.539420,...,0.395953,0.195223,3.618928,0.157800,3.431155,0.521418,1271.333333,82.507373,47.362280,3.104412
2,3,5,37.0,1,,69.0,52.808632,6.816969,93.192237,17.091079,...,,,,,,,940.222222,20.011108,63.840161,1.345999
3,4,3,24.0,0,,82.0,61.791648,10.818636,223.413351,18.800768,...,0.074764,0.144950,3.348633,1.045292,3.165907,0.329886,806.000000,38.812369,74.595881,3.525347
4,5,4,19.0,1,,70.0,29.571852,7.502069,77.724568,13.965687,...,6.242107,0.110389,4.931426,0.870782,3.000628,0.145071,905.200000,51.538120,66.478286,3.802364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21794,21833,7,67.0,1,,,108.372848,20.610001,210.236812,37.390333,...,3.786069,1.297332,0.172350,1.102046,3.385121,1.155450,676.000000,446.773734,102.040984,21.679995
21795,21834,4,300.0,0,,,52.565324,5.384216,217.278226,10.787319,...,6.205999,0.176360,2.990590,0.341713,3.085173,0.183288,894.666667,10.908712,67.072906,0.814045
21796,21835,2,59.0,1,,,33.989115,5.999980,179.317673,18.125746,...,5.796808,0.357647,3.850983,0.764846,2.498182,0.304308,1022.750000,44.077043,58.759608,2.500835
21797,21836,8,64.0,1,,,47.889257,5.194711,158.591167,22.519993,...,0.228123,0.060595,1.155216,0.143639,3.478373,0.041405,1101.714286,594.837153,62.167330,16.680216
