In [3]:
import os, sys, ast, math, warnings, time
from pathlib import Path
import numpy as np
import pandas as pd
import wfdb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
warnings.filterwarnings("ignore")

# ---------------- CONFIG ----------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)
PTBXL_PATH = "/kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1"
# change above to your local path if needed

MAX_LEN = 500        # sequence length (time samples). Use 500 for 100Hz => 5s; adjust as needed
BATCH = 32
NUM_WORKERS = 2
RANDOM_SEED = 42

# Training epoch settings for each algorithm (small for demo)
EPOCHS_CLASSIFIER = 2
EPOCHS_CAPTION = 1
EPOCHS_GPT = 1

# Target classes (superclasses)
CLASSES = ['CD','HYP','MI','NORM','STTC']

Device: cuda


In [4]:
import math
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Load metadata
meta_csv = os.path.join(PTBXL_PATH, "ptbxl_database.csv")
scp_csv = os.path.join(PTBXL_PATH, "scp_statements.csv")
assert os.path.exists(meta_csv), f"ptbxl_database.csv not found at {meta_csv}"
assert os.path.exists(scp_csv), f"scp_statements.csv not found at {scp_csv}"

meta_df = pd.read_csv(meta_csv, index_col=0)
scp_df = pd.read_csv(scp_csv, index_col=0)

# helper to map scp_codes -> diagnostic_superclass (list)
def scp_to_superclasses(scp_codes_str):
    try:
        d = ast.literal_eval(scp_codes_str)
    except Exception:
        return []
    out=[]
    for k in d.keys():
        if k in scp_df.index and scp_df.loc[k, 'diagnostic_class'] in CLASSES:
            out.append(scp_df.loc[k,'diagnostic_class'])
    return list(set(out))

meta_df['diagnostic_superclass'] = meta_df['scp_codes'].apply(scp_to_superclasses)

In [5]:
# MultiLabel binarizer
mlb = MultiLabelBinarizer(classes=CLASSES)
Y_all = mlb.fit_transform(meta_df['diagnostic_superclass'])

# Safe index mapping for filename paths (some rows store relative paths like 'records100/00000/00001_lr')
def get_full_path(fname):
    # fname may already be a full path or relative to PTBXL_PATH
    p = os.path.join(PTBXL_PATH, fname)
    base,_ = os.path.splitext(p)
    # prefer .dat/.hea pairs; wfdb.rdsamp accepts base path without extension
    if os.path.exists(base + ".dat") and os.path.exists(base + ".hea"):
        return base
    # try adding ptbxl prefix if needed
    if os.path.exists(p):
        return os.path.splitext(p)[0]
    # fallback: return base anyway
    return base

In [6]:
class PTBXL_Dataset(Dataset):
    def __init__(self, df, Y, max_len=MAX_LEN):
        self.df = df.reset_index(drop=True)
        self.Y = Y
        self.max_len = max_len
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        base = get_full_path(row['filename_lr'])
        try:
            sig, _ = wfdb.rdsamp(base)
        except Exception as e:
            # fallback: try hr
            base2 = get_full_path(row['filename_hr']) if 'filename_hr' in row else base
            sig, _ = wfdb.rdsamp(base2)
        sig = sig.astype(np.float32)  # shape (T, 12)
        # ensure shape (T,12)
        if sig.ndim==1: sig = sig[:,None]
        if sig.shape[1] != 12 and sig.shape[0]==12: sig = sig.T
        if sig.shape[1] != 12:
            # pad or trim leads dimension
            if sig.shape[1] < 12:
                sig = np.pad(sig, ((0,0),(0, 12 - sig.shape[1])), mode='constant')
            else:
                sig = sig[:, :12]
        # normalize per lead
        sig = (sig - sig.mean(axis=0)) / (sig.std(axis=0) + 1e-8)
        # pad/trim in time
        if sig.shape[0] > self.max_len:
            sig = sig[:self.max_len, :]
        else:
            pad_len = self.max_len - sig.shape[0]
            sig = np.pad(sig, ((0,pad_len),(0,0)))
        # flip to (channels, seq_len)
        x = torch.tensor(sig.T, dtype=torch.float)  # (12, max_len)
        y = torch.tensor(self.Y[idx].astype(np.float32), dtype=torch.float)
        return x, y

In [7]:
indices = np.arange(len(meta_df))
# filter out records with empty labels? keep them (multi-label may be empty); stratify on first non-empty label or fallback
has_label_mask = np.array([len(l)>0 for l in meta_df['diagnostic_superclass']])
# For stratify, use string representation; if too rare leads to ValueError, fallback to random split
try:
    train_idx, test_idx = train_test_split(indices, test_size=0.1, random_state=RANDOM_SEED, stratify=[str(s) for s in meta_df['diagnostic_superclass']])
    train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=RANDOM_SEED, stratify=[str(meta_df.iloc[i]['diagnostic_superclass']) for i in train_idx])
except Exception:
    train_idx, test_idx = train_test_split(indices, test_size=0.1, random_state=RANDOM_SEED)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=RANDOM_SEED)

train_df = meta_df.iloc[train_idx].copy()
val_df = meta_df.iloc[val_idx].copy()
test_df = meta_df.iloc[test_idx].copy()
Y = mlb.transform(meta_df['diagnostic_superclass'])
train_Y = Y[train_idx]; val_Y = Y[val_idx]; test_Y = Y[test_idx]

print("Sizes (train/val/test):", len(train_df), len(val_df), len(test_df))

train_ds = PTBXL_Dataset(train_df, train_Y, max_len=MAX_LEN)
val_ds = PTBXL_Dataset(val_df, val_Y, max_len=MAX_LEN)
test_ds = PTBXL_Dataset(test_df, test_Y, max_len=MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(val_ds, batch_size=BATCH, shuffle=False, num_workers=NUM_WORKERS)
test_loader = DataLoader(test_ds, batch_size=BATCH, shuffle=False, num_workers=NUM_WORKERS)

Sizes (train/val/test): 17687 1966 2184


In [8]:
def eval_multilabel(model, loader, threshold=0.5):
    model.eval()
    Ys=[]; Ps=[]
    with torch.no_grad():
        for x,y in loader:
            x = x.to(DEVICE)
            # model expects (B, C, T) - many models below follow that
            logits = model(x)
            probs = torch.sigmoid(logits).cpu().numpy()
            Ys.append(y.numpy())
            Ps.append(probs)
    Ys = np.vstack(Ys); Ps = np.vstack(Ps)
    pred = (Ps > threshold).astype(int)
    # metrics per-class
    try:
        auc = roc_auc_score(Ys, Ps, average='macro')
    except Exception:
        auc = float('nan')
    f1 = f1_score(Ys, pred, average='macro', zero_division=0)
    acc = (pred == Ys).all(axis=1).mean()  # strict multi-label exact-match accuracy
    print("AUROC(macro):", auc)
    print("F1(macro):", f1)
    print("Exact-match accuracy:", acc)
    return Ys, Ps, pred

In [9]:
print("\n== Algorithm 1: BPE Tokenizer (demo) ==")
try:
    from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
    sample_reports = meta_df['report'].dropna().astype(str).tolist()
    # train small BPE on reports (demo)
    tmp_file = "ptbxl_reports_corpus.txt"
    with open(tmp_file, "w", encoding="utf-8") as f:
        for r in sample_reports[:5000]:
            f.write(r.replace("\n"," ") + "\n")
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=8000, special_tokens=["<pad>","<s>","</s>","<unk>"])
    tokenizer.train([tmp_file], trainer)
    print("BPE vocab size:", tokenizer.get_vocab_size())
    print("Sample tokens:", tokenizer.encode("Normal sinus rhythm").tokens[:20])
except Exception as e:
    print("Tokenizers not available or corpus missing:", e)


== Algorithm 1: BPE Tokenizer (demo) ==



BPE vocab size: 2928
Sample tokens: ['N', 'orm', 'al', 'sinus', 'rhythm']


In [10]:
print("\n== Algorithm 2: ResNet1D Encoder + Transformer Decoder (captioning sketch) ==")
from torch.nn import TransformerDecoderLayer, TransformerDecoder

class ResNet1D_Enc(nn.Module):
    def __init__(self, out_dim=128):
        super().__init__()
        self.conv1 = nn.Conv1d(12,64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        # adaptively pool to fixed sequence length
        self.pool = nn.AdaptiveAvgPool1d(128)
        self.fc = nn.Linear(64*128, out_dim)
    def forward(self,x):
        # x: (B,12,T)
        h = self.relu(self.bn1(self.conv1(x)))
        h = self.pool(h)       # (B,64,128)
        h = h.flatten(1)
        return self.fc(h)      # (B, out_dim)

class ECGCaptioner(nn.Module):
    def __init__(self, vocab_size=1000, d_model=128):
        super().__init__()
        self.encoder = ResNet1D_Enc(d_model)
        dec_layer = TransformerDecoderLayer(d_model=d_model, nhead=8)
        self.decoder = TransformerDecoder(dec_layer, num_layers=2)
        self.fc_out = nn.Linear(d_model, vocab_size)
    def forward(self, x, tgt):
        # x: (B,12,T), tgt: (tgt_len,B,d_model)
        enc = self.encoder(x).unsqueeze(0)  # (1, B, d_model)
        out = self.decoder(tgt, enc)
        return self.fc_out(out)  # (tgt_len, B, vocab_size)


== Algorithm 2: ResNet1D Encoder + Transformer Decoder (captioning sketch) ==


In [11]:
# Create a small training loop but here we only demo training with fake targets
caption_vocab = 500
caption_model = ECGCaptioner(vocab_size=caption_vocab, d_model=128).to(DEVICE)
opt = torch.optim.Adam(caption_model.parameters(), lr=1e-4)
caption_model.train()
for epoch in range(EPOCHS_CAPTION):
    epoch_loss = 0.0
    for xb, yb in train_loader:
        xb = xb.to(DEVICE)  # (B,12,T)
        # create fake target representations (tgt_len, B, d_model)
        tgt_len = 10
        tgt = torch.randn(tgt_len, xb.size(0), 128, device=DEVICE)
        logits = caption_model(xb, tgt)    # (tgt_len, B, vocab)
        # fake labels
        labels = torch.randint(0, caption_vocab, (tgt_len*xb.size(0),), device=DEVICE)
        loss = F.cross_entropy(logits.view(-1, caption_vocab), labels)
        opt.zero_grad(); loss.backward(); opt.step()
        epoch_loss += loss.item()
    print(f"Captioning epoch {epoch+1}, loss {epoch_loss/len(train_loader):.4f}")
print("Captioning demo done. (Use real tokenized targets for real training.)")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Captioning epoch 1, loss 6.2467
Captioning demo done. (Use real tokenized targets for real training.)


In [12]:
print("\n== Algorithm 3: ChexNet (DenseNet121) on spectrograms + optional GPT-2 sketch ==")
# we will convert ECG -> mel-spectrogram per lead and stack 3 selected leads as RGB channels
try:
    import librosa
    import librosa.display
except Exception:
    librosa = None
    print("librosa not installed â€” install librosa to run spectrogram-based ChexNet approach (pip install librosa).")

from torchvision import models, transforms

def ecg_to_melspec_batch(X_numpy, sr=100, n_mels=128, target_size=(128,128)):
    # X_numpy: (B,12,T) numpy
    B = X_numpy.shape[0]
    imgs = []
    for b in range(B):
        leads = X_numpy[b]  # (12,T)
        channels = []
        # create mel for first 3 leads (I, II, V1) or choose [0,1,6]
        lead_idxs = [0,1,6]
        for li in lead_idxs:
            sig = leads[li]
            if librosa is None:
                # fallback simple STFT using numpy (not optimal)
                S = np.abs(np.fft.rfft(sig, n=target_size[1]*2))[:target_size[1]]
                S = np.tile(S[:target_size[1]], (target_size[0],1))
            else:
                S = librosa.feature.melspectrogram(y=sig, sr=sr, n_mels=target_size[0], n_fft=256, hop_length=max(1, len(sig)//target_size[1]))
                S = librosa.power_to_db(S, ref=np.max)
                # resize/pad to target_size
                if S.shape[1] < target_size[1]:
                    pad = target_size[1] - S.shape[1]
                    S = np.pad(S, ((0,0),(0,pad)), mode='constant')
                else:
                    S = S[:, :target_size[1]]
            # normalize
            S = (S - S.mean()) / (S.std()+1e-8)
            channels.append(S)
        img = np.stack(channels, axis=0)  # (3, H, W)
        imgs.append(img)
    return np.stack(imgs, axis=0).astype(np.float32)  # (B,3,H,W)

# Simple DenseNet classifier on spectrograms
class DenseNetSpec(nn.Module):
    def __init__(self, num_classes=len(CLASSES), pretrained=False):
        super().__init__()
        base = models.densenet121(pretrained=pretrained)
        # adapt first conv to accept 3 channels (already 3)
        self.features = base.features
        self.classifier = nn.Linear(base.classifier.in_features, num_classes)
    def forward(self,x):
        # x: (B,3,H,W)
        f = self.features(x)
        f = F.relu(f, inplace=True)
        f = F.adaptive_avg_pool2d(f, (1,1)).view(x.size(0), -1)
        return self.classifier(f)

if librosa is not None:
    # training loop for DenseNet on spectrograms (quick demo)
    densenet = DenseNetSpec().to(DEVICE)
    opt = torch.optim.Adam(densenet.parameters(), lr=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    for epoch in range(EPOCHS_CLASSIFIER):
        densenet.train(); running=0.0; n=0
        for xb, yb in train_loader:
            # xb: (B,12,T) -> numpy
            xb_np = xb.numpy()
            imgs = ecg_to_melspec_batch(xb_np, sr=100, n_mels=128, target_size=(128,128))
            imgs_t = torch.tensor(imgs).to(DEVICE)
            yb = yb.to(DEVICE)
            logits = densenet(imgs_t)
            loss = criterion(logits, yb)
            opt.zero_grad(); loss.backward(); opt.step()
            running += loss.item(); n+=1
        print(f"DenseNet spectrogram epoch {epoch+1}, loss {running/max(1,n):.4f}")
    # evaluate
    def eval_densenet(loader):
        densenet.eval()
        Ys=[]; Ps=[]
        with torch.no_grad():
            for xb,yb in loader:
                imgs = ecg_to_melspec_batch(xb.numpy())
                imgs_t = torch.tensor(imgs).to(DEVICE)
                probs = torch.sigmoid(densenet(imgs_t)).cpu().numpy()
                Ys.append(yb.numpy()); Ps.append(probs)
        Ys = np.vstack(Ys); Ps = np.vstack(Ps)
        preds = (Ps>0.5).astype(int)
        print("DenseNet AUROC:", roc_auc_score(Ys, Ps, average='macro'))
        print("DenseNet F1:", f1_score(Ys, preds, average='macro', zero_division=0))
    print("Evaluating DenseNet on test set (spectrogram)...")
    eval_densenet(test_loader)
else:
    print("Skipping DenseNet spectrogram training (librosa not installed).")


== Algorithm 3: ChexNet (DenseNet121) on spectrograms + optional GPT-2 sketch ==


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DenseNet spectrogram epoch 1, loss 0.4346


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DenseNet spectrogram epoch 2, loss 0.3836
Evaluating DenseNet on test set (spectrogram)...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DenseNet AUROC: 0.8492949527911964
DenseNet F1: 0.5676289914979151


In [13]:
print("\n== Algorithm 4: Wavelet Scattering + classifier ==")
try:
    from kymatio.torch import Scattering1D
    has_kymatio=True
except Exception:
    has_kymatio=False
    print("kymatio not installed - skip scattering. pip install kymatio to enable.")

if has_kymatio:
    scattering = Scattering1D(J=6, shape=MAX_LEN).to(DEVICE)
    class ScatterNet(nn.Module):
        def __init__(self, out_dim=128, num_classes=len(CLASSES)):
            super().__init__()
            self.fc1 = nn.Linear(scattering.output_size(), out_dim)
            self.classifier = nn.Linear(out_dim, num_classes)
        def forward(self, x):
            # x: (B,12,T) -> we'll average across leads then scatter
            x = x.mean(dim=1)  # (B, T)
            s = scattering(x)  # (B, C, T_sc) or (B, C)
            if s.ndim>2:
                s = s.mean(dim=-1)
            h = F.relu(self.fc1(s))
            return self.classifier(h)
    scatter_model = ScatterNet().to(DEVICE)
    opt = torch.optim.Adam(scatter_model.parameters(), lr=1e-4)
    crit = nn.BCEWithLogitsLoss()
    for epoch in range(EPOCHS_CLASSIFIER):
        scatter_model.train(); r=0; n=0
        for xb,yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            logits = scatter_model(xb)
            loss = crit(logits, yb)
            opt.zero_grad(); loss.backward(); opt.step()
            r+=loss.item(); n+=1
        print(f"Scatter epoch {epoch+1}, loss {r/max(1,n):.4f}")
    print("Eval scattering model:")
    eval_multilabel(scatter_model, test_loader)
else:
    print("Skipping scattering classifier due to missing dependency.")


== Algorithm 4: Wavelet Scattering + classifier ==
kymatio not installed - skip scattering. pip install kymatio to enable.
Skipping scattering classifier due to missing dependency.


In [14]:
print("\n== Algorithm 5: Vision-Text Transformer (ECG-GPT sketch) ==")
# We build a small multimodal transformer: encoder (ResNet1D_Enc) + text decoder (simple transformer decoder),
# then connect decoder to a small LM head. For demo, train with fake text targets.
from torch.nn import Transformer

class ECGMultimodal(nn.Module):
    def __init__(self, d_model=128, vocab_size=500):
        super().__init__()
        self.encoder = ResNet1D_Enc(d_model)
        self.text_decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=8)
        self.decoder = nn.TransformerDecoder(self.text_decoder_layer, num_layers=2)
        self.lm_head = nn.Linear(d_model, vocab_size)
    def forward(self, x, tgt_emb):
        # x: (B,12,T), tgt_emb: (tgt_len,B,d_model)
        enc = self.encoder(x).unsqueeze(0)  # (1,B,d_model)
        out = self.decoder(tgt_emb, enc)    # (tgt_len,B,d_model)
        return self.lm_head(out)

mmodel = ECGMultimodal().to(DEVICE)
opt = torch.optim.Adam(mmodel.parameters(), lr=1e-4)
for epoch in range(EPOCHS_CAPTION):
    mmodel.train(); r=0
    for xb,yb in train_loader:
        xb = xb.to(DEVICE)
        tgt = torch.randn(12, xb.size(0), 128, device=DEVICE)
        logits = mmodel(xb, tgt)
        labels = torch.randint(0,500,(12*xb.size(0),), device=DEVICE)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels)
        opt.zero_grad(); loss.backward(); opt.step()
        r+=loss.item()
    print(f"Multimodal epoch {epoch+1}, loss {r/len(train_loader):.4f}")
print("Multimodal demo done.")


== Algorithm 5: Vision-Text Transformer (ECG-GPT sketch) ==


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Multimodal epoch 1, loss 6.2436
Multimodal demo done.


In [15]:
print("\n== Algorithm 6: Custom Domain-Specific Tokenizer + GPT-2 (sketch) ==")
try:
    from tokenizers import Tokenizer, models, pre_tokenizers, trainers
    from transformers import GPT2Config, GPT2LMHeadModel
    # Build a tiny custom BPE on text reports (take limited subset)
    reports = meta_df['report'].dropna().astype(str).tolist()
    corpus_small = "reports_small.txt"
    with open(corpus_small, "w", encoding="utf-8") as f:
        for r in reports[:2000]:
            f.write(r.replace("\n"," ") + "\n")
    tok = Tokenizer(models.BPE())
    tok.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=2000, special_tokens=["<pad>","<s>","</s>","<unk>"])
    tok.train([corpus_small], trainer)
    vocab_size = tok.get_vocab_size()
    print("Custom tokenizer vocab size:", vocab_size)
    # Lightweight GPT2 config
    cfg = GPT2Config(vocab_size=vocab_size, n_embd=128, n_layer=4, n_head=4)
    gpt = GPT2LMHeadModel(cfg).to(DEVICE)
    # quick mock training loop: use tokenized report substrings as targets (demo)
    opt = torch.optim.Adam(gpt.parameters(), lr=1e-4)
    for epoch in range(EPOCHS_GPT):
        gpt.train(); r=0
        for i,report in enumerate(reports[:500]):
            enc = tok.encode(report)
            ids = enc.ids
            if len(ids)<16: continue
            ids_t = torch.tensor(ids[:64], device=DEVICE).unsqueeze(0)
            outputs = gpt(ids_t, labels=ids_t)
            loss = outputs.loss
            opt.zero_grad(); loss.backward(); opt.step()
            r += loss.item()
            if i>200: break
        print(f"GPT2-proposed epoch {epoch+1}, loss {r/200:.4f}")
    print("Custom tokenizer + GPT2 demo finished.")
except Exception as e:
    print("Transformers or tokenizers not installed or corpus missing:", e)


== Algorithm 6: Custom Domain-Specific Tokenizer + GPT-2 (sketch) ==


2025-09-23 14:30:08.269818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758637808.600847      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758637808.701064      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered





Custom tokenizer vocab size: 2000


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


GPT2-proposed epoch 1, loss 1.1076
Custom tokenizer + GPT2 demo finished.


In [16]:
print("\n== Algorithm 7: Preprocessing & Normalization Pipeline demo ==")
def preprocess_pipeline(x_numpy):
    # x_numpy: (T,12) or (12,T) - expect (12,T)
    if x_numpy.ndim==2 and x_numpy.shape[0]==12:
        X = x_numpy
    else:
        X = x_numpy.T
    # baseline wander removal (high-pass via difference)
    X = X - np.mean(X, axis=1, keepdims=True)
    # robust scaling per lead (clip extreme)
    med = np.median(X, axis=1, keepdims=True)
    mad = np.median(np.abs(X - med), axis=1, keepdims=True) + 1e-6
    Xs = (X - med) / mad
    # bandpass (simple butterworth would be better); here do a simple rolling mean subtraction
    window = 5
    smooth = np.convolve(np.ones(window)/window, Xs[0], mode='same')
    # return scaled
    return Xs

# quick check on one sample
x0,_ = train_ds[0]
print("Preprocess sample shape:", x0.shape)
print("Preprocess example (lead0 first 5) ->", preprocess_pipeline(x0.numpy())[0,:5])


== Algorithm 7: Preprocessing & Normalization Pipeline demo ==
Preprocess sample shape: torch.Size([12, 500])
Preprocess example (lead0 first 5) -> [5.871397  4.8999734 5.014259  3.69998   2.7571282]


In [17]:
print("\n== Final: Train a simple ResNet1D classifier and print metrics ==")
class SimpleResNet1D(nn.Module):
    def __init__(self, out_dim=128, num_classes=len(CLASSES)):
        super().__init__()
        self.conv1 = nn.Conv1d(12,64,7,stride=2,padding=3)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveAvgPool1d(128)
        self.fc = nn.Linear(64*128, out_dim)
        self.head = nn.Linear(out_dim, num_classes)
    def forward(self,x):
        h = self.relu(self.bn1(self.conv1(x)))
        h = self.pool(h).flatten(1)
        h = self.fc(h)
        return self.head(h)

resnet_clf = SimpleResNet1D().to(DEVICE)
opt = torch.optim.Adam(resnet_clf.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(EPOCHS_CLASSIFIER):
    resnet_clf.train(); running=0; n=0
    for xb,yb in train_loader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)
        logits = resnet_clf(xb)
        loss = criterion(logits, yb)
        opt.zero_grad(); loss.backward(); opt.step()
        running += loss.item(); n+=1
    print(f"ResNet clf epoch {epoch+1}, loss {running/max(1,n):.4f}")

print("\nEvaluate ResNet classifier on test set:")
Ys, Ps, preds = None, None, None
Ys, Ps, preds = eval_multilabel(resnet_clf, test_loader)

# detailed per-class report
try:
    print("\nClassification report (per-label) - threshold 0.5")
    y_true = Ys
    y_pred = (Ps>0.5).astype(int)
    for i,cls in enumerate(CLASSES):
        print(f"--- {cls} ---")
        print(classification_report(y_true[:,i], y_pred[:,i], zero_division=0))
except Exception as e:
    print("Error printing classification report:", e)

print("\nAll algorithms demo finished. Increase EPOCHS_* for better performance and replace fake captioning targets with real tokenized text for captioning/GPT training.")


== Final: Train a simple ResNet1D classifier and print metrics ==


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ResNet clf epoch 1, loss 0.4723


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ResNet clf epoch 2, loss 0.4206

Evaluate ResNet classifier on test set:


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AUROC(macro): 0.8011825252731727
F1(macro): 0.42816122628209596
Exact-match accuracy: 0.43131868131868134

Classification report (per-label) - threshold 0.5
--- CD ---
              precision    recall  f1-score   support

         0.0       0.82      0.96      0.89      1674
         1.0       0.71      0.31      0.43       510

    accuracy                           0.81      2184
   macro avg       0.77      0.63      0.66      2184
weighted avg       0.79      0.81      0.78      2184

--- HYP ---
              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94      1937
         1.0       0.40      0.02      0.03       247

    accuracy                           0.89      2184
   macro avg       0.64      0.51      0.49      2184
weighted avg       0.83      0.89      0.84      2184

--- MI ---
              precision    recall  f1-score   support

         0.0       0.79      0.96      0.87      1622
         1.0       0.72      0.28      0.41   