In [None]:
import ast
import json
import os
import shutil
from multiprocessing import Pool

import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler

from util.preprocessing import extract_window, plot_and_save_ecg_window

In [1]:
# Read the PTB-XL metadata table from ptb-xl/ptbxl_database.csv
df = pd.read_csv("ptb-xl/ptbxl_database.csv")

# The scp_codes column holds dict literals written with single quotes
# (presumably of the form {'CODE': likelihood} -- confirm against the CSV).
# Parse them with ast.literal_eval rather than swapping quote characters and
# JSON-decoding: the quote swap corrupts any value that itself contains a
# quote character, and JSON rejects Python-only literals such as None/True.
df["scp_codes"] = df["scp_codes"].apply(ast.literal_eval)

# Print the first 5 rows of the dataframe and the per-column non-null counts
print(df.head())
print(df.count())

FileNotFoundError: [Errno 2] No such file or directory: 'ptb-xl/ptbxl_database.csv'

In [2]:
# Target classes (anything else is folded into "OTHERS"):
#   NORM          -> Normal
#   1AVB          -> First-Degree Atrioventricular Block (1dAVb)
#   IRBBB, CRBBB  -> RBBB, Right Bundle Branch Block
#   ILBBB, CLBBB  -> LBBB, Left Bundle Branch Block
#   AFLT          -> Atrial Flutter
#   AFIB          -> Atrial Fibrillation
labels = ["NORM", "1AVB", "RBBB", "LBBB", "AFLT", "AFIB"]

# Bundle-branch-block variants that collapse into a single class each
merged_codes = {
    "IRBBB": "RBBB",
    "CRBBB": "RBBB",
    "ILBBB": "LBBB",
    "CLBBB": "LBBB",
}

# For every record keep only the scp code carrying the highest value
top_code = df["scp_codes"].apply(lambda codes: max(codes, key=codes.get))

# Merge the variants, then relabel everything outside `labels` as OTHERS
top_code = top_code.replace(merged_codes)
df["diagnosis"] = top_code.where(top_code.isin(labels), "OTHERS")

# Keep just the label and the two waveform paths, renumbering rows from 0
df = df[["diagnosis", "filename_lr", "filename_hr"]].reset_index(drop=True)

# Preview, per-column counts, and the resulting class distribution
print(df.head())
print(df.count())
print(df["diagnosis"].value_counts())

  diagnosis                filename_lr                filename_hr
0      NORM  records100/00000/00001_lr  records500/00000/00001_hr
1      NORM  records100/00000/00002_lr  records500/00000/00002_hr
2      NORM  records100/00000/00003_lr  records500/00000/00003_hr
3      NORM  records100/00000/00004_lr  records500/00000/00004_hr
4      NORM  records100/00000/00005_lr  records500/00000/00005_hr
diagnosis      21799
filename_lr    21799
filename_hr    21799
dtype: int64
diagnosis
OTHERS    10576
NORM       9134
RBBB       1216
LBBB        572
1AVB        201
AFLT         55
AFIB         45
Name: count, dtype: int64


In [3]:
# One {"diagnosis", "filename_lr"} dict per ECG record
records = df[["diagnosis", "filename_lr"]].to_dict(orient="records")
total_records = len(records)

X_windows = []
y_labels = []

# enumerate() supplies the running position directly. The previous
# records.index(rec) lookup rescanned the list on every iteration (quadratic
# over the whole dataset) and would report the first match for equal records.
for position, rec in enumerate(records, start=1):
    # Print progress
    print(f"Extracting {position} of {total_records}")

    win = extract_window(f"ptb-xl/{rec['filename_lr']}")
    # Flatten to a 1-D feature vector (2400 values per the original note,
    # matching the later reshape to (200, 12))
    X_windows.append(win.flatten())
    y_labels.append(rec["diagnosis"])

# Feature matrix with one row per record, plus the aligned label array
X = np.stack(X_windows, axis=0)
y = np.array(y_labels)

Extracting 1 of 21799
Extracting 2 of 21799
Extracting 3 of 21799
Extracting 4 of 21799
Extracting 5 of 21799
Extracting 6 of 21799
Extracting 7 of 21799
Extracting 8 of 21799
Extracting 9 of 21799
Extracting 10 of 21799
Extracting 11 of 21799
Extracting 12 of 21799
Extracting 13 of 21799
Extracting 14 of 21799
Extracting 15 of 21799
Extracting 16 of 21799
Extracting 17 of 21799
Extracting 18 of 21799
Extracting 19 of 21799
Extracting 20 of 21799
Extracting 21 of 21799
Extracting 22 of 21799
Extracting 23 of 21799
Extracting 24 of 21799
Extracting 25 of 21799
Extracting 26 of 21799
Extracting 27 of 21799
Extracting 28 of 21799
Extracting 29 of 21799
Extracting 30 of 21799
Extracting 31 of 21799
Extracting 32 of 21799
Extracting 33 of 21799
Extracting 34 of 21799
Extracting 35 of 21799
Extracting 36 of 21799
Extracting 37 of 21799
Extracting 38 of 21799
Extracting 39 of 21799
Extracting 40 of 21799
Extracting 41 of 21799
Extracting 42 of 21799
Extracting 43 of 21799
Extracting 44 of 217

In [4]:
# Step 1: randomly under-sample the two majority classes to 5000 each
majority_targets = dict.fromkeys(["NORM", "OTHERS"], 5000)
rus = RandomUnderSampler(sampling_strategy=majority_targets, random_state=42)
X_mid, y_mid = rus.fit_resample(X, y)

# Step 2: request 5000 samples for each remaining class with SMOTE+Tomek
# NOTE(review): the recorded output shows NORM, OTHERS and RBBB ending below
# 5000 after this step (presumably Tomek-link removal) -- confirm that exact
# balance is not required downstream.
minority_targets = dict.fromkeys(["RBBB", "LBBB", "1AVB", "AFLT", "AFIB"], 5000)
smk = SMOTETomek(sampling_strategy=minority_targets, random_state=42)
X_bal, y_bal = smk.fit_resample(X_mid, y_mid)

# Show the per-class sample counts after resampling
print(pd.Series(y_bal).value_counts())

1AVB      5000
AFIB      5000
AFLT      5000
LBBB      5000
RBBB      4990
OTHERS    4677
NORM      4669
Name: count, dtype: int64


In [6]:
def process_window(args):
    """Pass one window and its destination path to the plotting helper.

    Args:
        args: An ``(index, (window, label))`` tuple, i.e. one element of an
            enumerated ``zip(windows, labels)`` sequence. A single packed
            argument lets the function be mapped over such a sequence.

    Returns:
        The index of the item that was processed.
    """
    index, payload = args
    window, label = payload
    # Files are grouped per label and numbered with a zero-padded index
    destination = f"output/{label}/{label}_{index:05d}.png"
    plot_and_save_ecg_window(window, destination)
    return index

# In case the kernel dies, we can start from the last index
start_from_index = 0

if start_from_index == 0:
    # Fresh run: force delete any previous output/ tree
    shutil.rmtree("output/", ignore_errors=True)

# Make sure output/ and one folder per diagnosis exist. This now runs on every
# execution (not only fresh ones) so a resumed run cannot fail on a missing
# folder; exist_ok=True keeps it a no-op when the folders are already there.
os.makedirs("output/", exist_ok=True)
for diagnosis in df["diagnosis"].unique():
    os.makedirs(f"output/{diagnosis}", exist_ok=True)

# Reshape back into windows
windows_bal = X_bal.reshape(-1, 200, 12)
dataset = list(enumerate(zip(windows_bal, y_bal)))

# Filter based on start index
dataset = [item for item in dataset if item[0] >= start_from_index]

# os.cpu_count() may return None when the count cannot be determined
num_processes = os.cpu_count() or 1

# Split into chunks. The size is clamped to at least 1: when fewer items remain
# than there are CPUs (e.g. resuming near the end, or an empty dataset) the
# integer division yields 0 and range() raises ValueError on a zero step.
# NOTE(review): `chunks` is not consumed by the pool below, which maps over
# `dataset` directly -- kept only in case other cells rely on it.
chunk_size = max(1, len(dataset) // num_processes)
chunks = [dataset[i:i + chunk_size] for i in range(0, len(dataset), chunk_size)]

# Process items in parallel; results arrive in completion order, so `i` counts
# finished items rather than identifying which item finished
with Pool(processes=num_processes) as pool:
    for i, _ in enumerate(pool.imap_unordered(process_window, dataset)):
        print(f"Processed {i+1} of {len(dataset)}")

Processing 22257 of 34336
Processing 22258 of 34336
Processing 22259 of 34336
Processing 22260 of 34336
Processing 22261 of 34336
Processing 22262 of 34336
Processing 22263 of 34336
Processing 22264 of 34336
Processing 22265 of 34336
Processing 22266 of 34336
Processing 22267 of 34336
Processing 22268 of 34336
Processing 22269 of 34336
Processing 22270 of 34336
Processing 22271 of 34336
Processing 22272 of 34336
Processing 22273 of 34336
Processing 22274 of 34336
Processing 22275 of 34336
Processing 22276 of 34336
Processing 22277 of 34336
Processing 22278 of 34336
Processing 22279 of 34336
Processing 22280 of 34336
Processing 22281 of 34336
Processing 22282 of 34336
Processing 22283 of 34336
Processing 22284 of 34336
Processing 22285 of 34336
Processing 22286 of 34336
Processing 22287 of 34336
Processing 22288 of 34336
Processing 22289 of 34336
Processing 22290 of 34336
Processing 22291 of 34336
Processing 22292 of 34336
Processing 22293 of 34336
Processing 22294 of 34336
Processing 2