In [30]:
!pip -q install wfdb

In [31]:
import os, ast, random, time, math
import numpy as np
import pandas as pd
from pathlib import Path
import wfdb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report


In [32]:
def set_seed(seed=42):
    """Seed every RNG used by the notebook and ask cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy's legacy global
            RNG, and PyTorch (CPU and all CUDA devices).

    Note:
        cuDNN autotuning (``benchmark``) is switched off and deterministic
        kernels are requested. Autotuning may select different convolution
        algorithms from run to run, which would defeat the purpose of seeding;
        the price is potentially slower convolutions.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Reproducibility over speed: no autotuned kernel selection, deterministic kernels only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all RNGs once, before any model or data loader is created.
set_seed(seed=42)
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device


device(type='cuda')

In [34]:
# Dataset locations: same file paths as already used in the notebook/files area.
DATA_ROOT = "/kaggle/input/ptbxl-dataset"
CSV_DB   = DATA_ROOT + "/ptbxl_database.csv"
CSV_SCP  = DATA_ROOT + "/scp_statements.csv"
RECORDS  = DATA_ROOT + "/records100"  # contains 00000, 01000, ..., 21000

# Stop right here if any expected input is missing.
assert os.path.exists(CSV_DB), "ptbxl_database.csv not found at expected path"
assert os.path.exists(CSV_SCP), "scp_statements.csv not found at expected path"
assert os.path.exists(RECORDS),  "records100/ folder not found at expected path"


In [35]:
# Record-level metadata and the table describing each SCP statement.
df = pd.read_csv(CSV_DB)
scp = pd.read_csv(CSV_SCP)

# Keep only the diagnostic SCP statements and pair each code with its
# diagnostic superclass (mapping as described in the PTB-XL documentation).
scp_diag = scp.loc[scp['diagnostic'] == 1, ['Unnamed: 0', 'diagnostic_class']].rename(
    columns={'Unnamed: 0': 'scp_code', 'diagnostic_class': 'superclass'}
)

# Fixed 5-class superclass setup; the list order defines the target vector layout.
superclasses = ['NORM','MI','STTC','CD','HYP']

# Lookup table: SCP code -> superclass name.
scp_map = {
    code: superclass
    for code, superclass in zip(scp_diag['scp_code'].values, scp_diag['superclass'].values)
}

def extract_targets(s):
    """Turn a stringified ``scp_codes`` dict into a multi-hot superclass vector.

    Args:
        s: String holding a Python dict literal whose keys are SCP codes.

    Returns:
        float32 array with one entry per name in ``superclasses``; an entry is
        1.0 when at least one key of the dict maps (through ``scp_map``) to
        that superclass and 0.0 otherwise. Keys missing from ``scp_map`` are
        ignored.
    """
    present = {scp_map[code] for code in ast.literal_eval(s).keys() if code in scp_map}
    return np.array(
        [1.0 if name in present else 0.0 for name in superclasses],
        dtype=np.float32,
    )

# Attach a multi-hot superclass vector to every record ...
df['y'] = df['scp_codes'].apply(extract_targets)
# ... and keep only the records that carry at least one superclass label.
df = df.loc[df['y'].apply(lambda vec: vec.sum() > 0)].reset_index(drop=True)
print("Total labeled records:", df.shape[0])


Total labeled records: 21388


In [36]:
# Split by the dataset's strat_fold column: folds 1-8 train, fold 9 validation, fold 10 test.
train_df = df.loc[df['strat_fold'].isin(range(1, 9))].copy()
val_df   = df.loc[df['strat_fold'].eq(9)].copy()
test_df  = df.loc[df['strat_fold'].eq(10)].copy()

print("Train/Val/Test:", *(len(part) for part in (train_df, val_df, test_df)))


Train/Val/Test: 17084 2146 2158


In [37]:
SIG_LEN = 1000  # samples per record: 10 s at 100 Hz
N_LEADS = 12    # leads per record; passed to the multi-lead model as its input channel count

def resolve_lr_path(filename_lr: str) -> Path:
    """Resolve a ``filename_lr`` entry to an absolute path under ``DATA_ROOT``.

    Args:
        filename_lr: Relative record path such as ``'records100/00000/00001_lr'``,
            optionally prefixed with ``'./'`` or ``'/'``.

    Returns:
        The resolved ``Path`` of the record base name (no extension).
    """
    rel = filename_lr
    # Strip only literal "./" prefixes and leading slashes. str.lstrip("./")
    # removes a *set of characters*, so it would also eat the dots of
    # "../x" or ".hidden/x" and silently point at a different location.
    while rel.startswith(("./", "/")):
        rel = rel[2:] if rel.startswith("./") else rel[1:]
    return (Path(DATA_ROOT) / rel).resolve()

def read_record_lr(row):
    """Load one record and return it as a standardized ``(leads, SIG_LEN)`` array.

    Args:
        row: Mapping-like object with a ``'filename_lr'`` entry.

    Returns:
        float32 array, transposed so that leads come first. The signal is
        zero-padded at the end or truncated to ``SIG_LEN`` samples and then
        standardized per lead (mean 0, std 1, with 1e-6 added to the std).
    """
    # wfdb.rdsamp takes the record base path (no extension) and reads .hea/.dat itself.
    record_base = str(resolve_lr_path(row['filename_lr']))
    samples, _meta = wfdb.rdsamp(record_base)
    samples = samples.astype(np.float32)

    # Force exactly SIG_LEN time steps.
    n_samples = samples.shape[0]
    if n_samples > SIG_LEN:
        samples = samples[:SIG_LEN]
    elif n_samples < SIG_LEN:
        missing = SIG_LEN - n_samples
        samples = np.pad(samples, ((0, missing), (0, 0)), mode='constant')

    # Per-lead standardization; statistics are taken after padding/cropping.
    center = samples.mean(axis=0, keepdims=True)
    scale = samples.std(axis=0, keepdims=True) + 1e-6
    return ((samples - center) / scale).T  # (leads, SIG_LEN)


In [38]:
class PTBXLDataset(Dataset):
    """Dataset that loads one record per item from disk via ``read_record_lr``.

    Args:
        frame: DataFrame whose rows provide what ``read_record_lr`` needs plus
            a ``'y'`` column holding the target array of each record.
        single_lead: Row index of the lead to keep, or ``None`` to return all
            leads.

    Each item is a ``(signal, target)`` pair of tensors. ``signal`` keeps its
    leading lead axis even when a single lead is selected.
    """

    def __init__(self, frame, single_lead=None):
        self.frame = frame.reset_index(drop=True)
        self.single_lead = single_lead  # index 0..11 or None for multi-lead
        self.num_classes = len(superclasses)

    def __len__(self):
        return len(self.frame)

    def __getitem__(self, idx):
        record = self.frame.iloc[idx]
        signal = read_record_lr(record)  # (leads, T)
        lead = self.single_lead
        if lead is not None:
            # Slice rather than index so the lead axis is kept: (1, T).
            signal = signal[lead:lead + 1, :]
        target = record['y'].astype(np.float32)
        return torch.from_numpy(signal), torch.from_numpy(target)

# Example: use lead II (index 1) as single-lead; adjust if your prior lead differs
SINGLE_LEAD_INDEX = 1

# One dataset per split, first for the single-lead setup, then for all leads.
train_single, val_single, test_single = (
    PTBXLDataset(split, single_lead=SINGLE_LEAD_INDEX) for split in (train_df, val_df, test_df)
)
train_multi, val_multi, test_multi = (
    PTBXLDataset(split, single_lead=None) for split in (train_df, val_df, test_df)
)

BATCH_TRAIN = 64
BATCH_EVAL  = 128

# Loader settings shared by both setups: training shuffles and drops the last
# partial batch; evaluation keeps the order and every sample.
_TRAIN_LOADER_KW = dict(batch_size=BATCH_TRAIN, shuffle=True,
                        num_workers=2, pin_memory=True, drop_last=True)
_EVAL_LOADER_KW  = dict(batch_size=BATCH_EVAL, shuffle=False,
                        num_workers=2, pin_memory=True)

trainloader_single = DataLoader(train_single, **_TRAIN_LOADER_KW)
valloader_single   = DataLoader(val_single, **_EVAL_LOADER_KW)
testloader_single  = DataLoader(test_single, **_EVAL_LOADER_KW)

trainloader = DataLoader(train_multi, **_TRAIN_LOADER_KW)
valloader   = DataLoader(val_multi, **_EVAL_LOADER_KW)
testloader  = DataLoader(test_multi, **_EVAL_LOADER_KW)


In [39]:
class ConvBlock(nn.Module):
    """Two Conv1d -> BatchNorm1d -> ReLU stages followed by ``MaxPool1d(2)``.

    Args:
        c_in: Input channels of the first convolution.
        c_out: Output channels of both convolutions.
        k, s, p: Kernel size, stride and padding of the first convolution.
            The second convolution is fixed at kernel 3, stride 1, padding 1.
    """

    def __init__(self, c_in, c_out, k=7, s=1, p=3):
        super().__init__()
        layers = [
            nn.Conv1d(c_in, c_out, kernel_size=k, stride=s, padding=p, bias=False),
            nn.BatchNorm1d(c_out),
            nn.ReLU(inplace=True),
            nn.Conv1d(c_out, c_out, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm1d(c_out),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class SingleLeadCNN(nn.Module):
    """Classifier for a single-channel signal: three ConvBlocks and a pooled MLP head.

    Args:
        n_cls: Number of output logits.
        feat_dim: Width of the hidden linear layer in the head.

    ``forward`` returns raw logits; no sigmoid is applied inside the model.
    """

    def __init__(self, n_cls=5, feat_dim=128):
        super().__init__()
        self.b1 = ConvBlock(1, 64)
        self.b2 = ConvBlock(64, 128)
        self.b3 = ConvBlock(128, 128)
        head_layers = [
            nn.AdaptiveAvgPool1d(1),   # average over the time axis
            nn.Flatten(),
            nn.Linear(128, feat_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(feat_dim, n_cls),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        for stage in (self.b1, self.b2, self.b3):
            x = stage(x)
        return self.head(x)

class MultiLeadCNN(nn.Module):
    """Classifier for a multi-channel signal: conv stem, three ConvBlocks, pooled MLP head.

    Args:
        n_leads: Number of input channels.
        n_cls: Number of output logits.
        feat_dim: Width of the hidden linear layer in the head.

    ``forward`` returns raw logits; no sigmoid is applied inside the model.
    """

    def __init__(self, n_leads=12, n_cls=5, feat_dim=256):
        super().__init__()
        # Stem mixes all input channels into 64 feature maps without changing the length.
        stem_layers = [
            nn.Conv1d(n_leads, 64, kernel_size=7, stride=1, padding=3, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
        ]
        self.stem = nn.Sequential(*stem_layers)
        self.b1 = ConvBlock(64, 128)
        self.b2 = ConvBlock(128, 256)
        self.b3 = ConvBlock(256, 256)
        head_layers = [
            nn.AdaptiveAvgPool1d(1),   # average over the time axis
            nn.Flatten(),
            nn.Linear(256, feat_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(feat_dim, n_cls),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        x = self.stem(x)
        for stage in (self.b1, self.b2, self.b3):
            x = stage(x)
        return self.head(x)

# Build both networks with one logit per superclass and move them to the selected device.
singlemodel = SingleLeadCNN(n_cls=len(superclasses)).to(device)
multileadmodel = MultiLeadCNN(n_leads=N_LEADS, n_cls=len(superclasses)).to(device)

# Parameter counts, reported in millions.
print("Single params (M):", sum(map(torch.numel, singlemodel.parameters()))/1e6)
print("Multi  params (M):", sum(map(torch.numel, multileadmodel.parameters()))/1e6)


Single params (M): 0.301509
Multi  params (M): 1.262981


In [40]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimization pass over ``loader``.

    Args:
        model: Network to train; it is put into training mode.
        loader: Iterable of ``(inputs, targets)`` batches that supports ``len``.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches are moved to.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        # Cap the global gradient norm at 5.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses, 0.0) / len(loader)

@torch.no_grad()
def eval_probs(model, loader, criterion, device):
    model.eval()
    tot = 0.0
    probs_list, tgts_list = [], []
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)
        probs = torch.sigmoid(logits).cpu().numpy()
        probs_list.append(probs); tgts_list.append(yb.cpu().numpy())
        tot += loss.item()
    P = np.vstack(probs_list); Y = np.vstack(tgts_list)
    f1_05 = f1_score(Y, (P>=0.5).astype(int), average="macro", zero_division=0)
    return tot/len(loader), P, Y, f1_05


In [41]:
def find_best_thresholds(val_probs, val_tgts, grid=None):
    """Choose, for every class, the decision threshold with the highest binary F1.

    Args:
        val_probs: 2-D array of probabilities, one column per class.
        val_tgts: Array of binary targets, indexed the same way.
        grid: Optional array of candidate thresholds; defaults to 81 evenly
            spaced values from 0.1 to 0.9.

    Returns:
        float32 array with one threshold per class. Candidates are the grid
        plus 19 quantiles (5 % to 95 %) of the class's own probabilities; on
        ties the smallest candidate is kept.
    """
    base_grid = np.linspace(0.1, 0.9, 81) if grid is None else grid
    quantile_levels = np.linspace(0.05, 0.95, 19)
    n_classes = val_probs.shape[1]
    thresholds = np.full(n_classes, 0.5, dtype=np.float32)
    for cls in range(n_classes):
        scores = val_probs[:, cls]
        truth = val_tgts[:, cls]
        # np.unique sorts ascending, so max() below returns the smallest of
        # the candidates that share the top F1.
        candidates = np.unique(
            np.concatenate([base_grid, np.quantile(scores, quantile_levels)])
        )
        winner = max(
            candidates,
            key=lambda t: f1_score(truth, (scores >= t).astype(int), zero_division=0),
        )
        thresholds[cls] = float(winner)
    return thresholds


In [44]:
# Multi-label objective on raw logits, Adam optimizer, and a scheduler that
# halves the learning rate after 3 epochs without a better validation score.
singlecriterion = nn.BCEWithLogitsLoss()
singleoptimizer = torch.optim.Adam(singlemodel.parameters(), lr=1e-3)
singlescheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    singleoptimizer, mode='max', factor=0.5, patience=3
)
print("Running")

best_single = -1.0       # best validation macro F1 (tuned thresholds) seen so far
wait, patience = 0, 7    # epochs since the last improvement / early-stopping limit
EPOCHS_SINGLE = 30       # adjust to your prior run length

for epoch in range(1, EPOCHS_SINGLE+1):
    tr = train_epoch(singlemodel, trainloader_single, singlecriterion, singleoptimizer, device)
    vl, Vp, Vy, vf05 = eval_probs(singlemodel, valloader_single, singlecriterion, device)

    # Re-tune the per-class thresholds on this epoch's validation probabilities
    # and score the epoch with them.
    thr = find_best_thresholds(Vp, Vy)
    vbest = f1_score(Vy, (Vp>=thr).astype(int), average="macro", zero_division=0)
    singlescheduler.step(vbest)
    print(f"[Single] Epoch {epoch:02d} | train {tr:.4f} | val {vl:.4f} | F1@0.5 {vf05:.4f} | F1@best {vbest:.4f}")

    if vbest > best_single:
        # New best: checkpoint the weights together with the thresholds that produced the score.
        best_single = vbest
        wait = 0
        torch.save({"model": singlemodel.state_dict(), "thr": thr}, "best_single.pt")
        continue

    wait += 1
    if wait >= patience:
        print("Single-lead early stop.")
        break


Running
[Single] Epoch 01 | train 0.3847 | val 0.4158 | F1@0.5 0.4497 | F1@best 0.6159
[Single] Epoch 02 | train 0.3721 | val 0.3860 | F1@0.5 0.5193 | F1@best 0.6163
[Single] Epoch 03 | train 0.3634 | val 0.3756 | F1@0.5 0.4992 | F1@best 0.6252
[Single] Epoch 04 | train 0.3595 | val 0.3779 | F1@0.5 0.5440 | F1@best 0.6250
[Single] Epoch 05 | train 0.3563 | val 0.3699 | F1@0.5 0.5216 | F1@best 0.6334
[Single] Epoch 06 | train 0.3526 | val 0.3787 | F1@0.5 0.5799 | F1@best 0.6315
[Single] Epoch 07 | train 0.3509 | val 0.3793 | F1@0.5 0.5789 | F1@best 0.6338
[Single] Epoch 08 | train 0.3477 | val 0.3629 | F1@0.5 0.5738 | F1@best 0.6451
[Single] Epoch 09 | train 0.3461 | val 0.3669 | F1@0.5 0.5549 | F1@best 0.6459
[Single] Epoch 10 | train 0.3418 | val 0.3773 | F1@0.5 0.5642 | F1@best 0.6394
[Single] Epoch 11 | train 0.3415 | val 0.3710 | F1@0.5 0.5533 | F1@best 0.6431
[Single] Epoch 12 | train 0.3388 | val 0.3591 | F1@0.5 0.5765 | F1@best 0.6445
[Single] Epoch 13 | train 0.3377 | val 0.367