In [1]:
import torch
print(torch.__version__)
print(torch.version.cuda)


2.5.1+cu121
12.1


In [None]:
import os, json, math, time, random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, log_loss

# -----------------------------
# Repro / device
# -----------------------------
def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

set_seed(42)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_GPUS = torch.cuda.device_count()
print("DEVICE:", DEVICE, "| GPUs available:", NUM_GPUS)

def unwrap(m):  # for DataParallel
    return m.module if isinstance(m, nn.DataParallel) else m

# -----------------------------
# Dataset (memmap safe)
# -----------------------------
class NPYDataset(Dataset):
    def __init__(self, Xc, Xn, y):
        self.Xc = Xc; self.Xn = Xn; self.y = y
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        return (torch.from_numpy(self.Xc[idx].astype(np.int64)),
                torch.from_numpy(self.Xn[idx].astype(np.float32)),
                torch.tensor(float(self.y[idx]), dtype=torch.float32))

def make_loader(Xc, Xn, y, batch_size, shuffle, workers=4):
    return DataLoader(NPYDataset(Xc, Xn, y),
                      batch_size=batch_size, shuffle=shuffle,
                      num_workers=workers, pin_memory=True, drop_last=False)

# -----------------------------
# Pretty logger
# -----------------------------
class PrettyLogger:
    def __init__(self, patience=5):
        self.best_auc = -1.0
        self.stale = 0
        self.patience = patience
    def log(self, epoch, train_ll, val_ll, val_auc):
        if val_auc > self.best_auc + 1e-12:
            self.best_auc = val_auc
            self.stale = 0
            tag = " *BEST*"
        else:
            self.stale += 1
            tag = f" stale {self.stale}/{self.patience}"
        print(f"Epoch {epoch:02d} | train_ll={train_ll:.4f} val_ll={val_ll:.4f} val_auc={val_auc:.4f}{tag}")
    def should_stop(self): return self.stale >= self.patience

# -----------------------------
# Models
# -----------------------------
bce = nn.BCEWithLogitsLoss()

class WideLR(nn.Module):
    """Paper-standard 'wide' LR: linear numeric + per-field embedding to scalar."""
    def __init__(self, cat_cards, n_num):
        super().__init__()
        self.cat_w = nn.ModuleList([nn.Embedding(c, 1) for c in cat_cards])
        self.num_w = nn.Linear(n_num, 1)
    def forward(self, x_cat, x_num):
        s = self.num_w(x_num).squeeze(-1)
        for i, emb in enumerate(self.cat_w):
            s = s + emb(x_cat[:, i]).squeeze(-1)
        return s

class CatEmbeddings(nn.Module):
    def __init__(self, cat_cards, d):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, d) for c in cat_cards])
        for e in self.embs:
            nn.init.normal_(e.weight, std=0.01)
    def forward(self, x_cat):
        outs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embs)]
        return torch.stack(outs, dim=1)  # [B,F,d]

class FM(nn.Module):
    def __init__(self, cat_cards, n_num, d=16):
        super().__init__()
        self.lr = WideLR(cat_cards, n_num)
        self.cat_emb = CatEmbeddings(cat_cards, d)
        self.num_proj = nn.Linear(n_num, d, bias=False)
    def forward(self, x_cat, x_num):
        lin = self.lr(x_cat, x_num)
        Ec = self.cat_emb(x_cat)                          # [B,F,d]
        En = self.num_proj(x_num).unsqueeze(1)            # [B,1,d]
        E = torch.cat([Ec, En], dim=1)                    # [B,F+1,d]
        s = E.sum(dim=1)
        fm = 0.5 * (s*s - (E*E).sum(dim=1)).sum(dim=1)
        return lin + fm

class DeepFM(nn.Module):
    def __init__(self, cat_cards, n_num, d=16, hidden=(400,400,400), dropout=0.2):
        super().__init__()
        self.fm = FM(cat_cards, n_num, d)
        self.cat_emb = CatEmbeddings(cat_cards, d)
        self.num_emb = nn.Linear(n_num, d)
        in_dim = (len(cat_cards)+1) * d
        layers = []
        cur = in_dim
        for h in hidden:
            layers += [nn.Linear(cur, h), nn.ReLU(), nn.Dropout(dropout)]
            cur = h
        layers += [nn.Linear(cur, 1)]
        self.mlp = nn.Sequential(*layers)
    def forward(self, x_cat, x_num):
        y_fm = self.fm(x_cat, x_num)
        Ec = self.cat_emb(x_cat)
        En = self.num_emb(x_num).unsqueeze(1)
        x = torch.cat([Ec, En], dim=1).flatten(1)
        y_dnn = self.mlp(x).squeeze(-1)
        return y_fm + y_dnn

class DCN(nn.Module):
    def __init__(self, cat_cards, n_num, d=16, cross_layers=3, hidden=(400,400), dropout=0.2):
        super().__init__()
        self.lr = WideLR(cat_cards, n_num)
        self.cat_emb = CatEmbeddings(cat_cards, d)
        self.num_emb = nn.Linear(n_num, d)
        self.D = (len(cat_cards)+1) * d

        self.cross_w = nn.ParameterList([nn.Parameter(torch.randn(self.D, 1)*0.01) for _ in range(cross_layers)])
        self.cross_b = nn.ParameterList([nn.Parameter(torch.zeros(self.D)) for _ in range(cross_layers)])

        layers = []
        cur = self.D
        for h in hidden:
            layers += [nn.Linear(cur, h), nn.ReLU(), nn.Dropout(dropout)]
            cur = h
        self.deep = nn.Sequential(*layers)
        self.out = nn.Linear(cur + self.D, 1)

    def forward(self, x_cat, x_num):
        Ec = self.cat_emb(x_cat)
        En = self.num_emb(x_num).unsqueeze(1)
        x0 = torch.cat([Ec, En], dim=1).flatten(1)  # [B,D]
        x = x0
        for w, b in zip(self.cross_w, self.cross_b):
            xlw = (x @ w).squeeze(1)               # [B]
            x = x0 * xlw.unsqueeze(1) + b + x
        deep_out = self.deep(x0)
        return self.out(torch.cat([x, deep_out], dim=1)).squeeze(-1) + self.lr(x_cat, x_num)

class FinalMLP(nn.Module):
    """
    Paper-style FinalMLP core:
    - Dual-stream MLP
    - Stream-specific feature selection (bit-wise gates)
    - Multi-head bilinear fusion of stream representations
    """
    def __init__(self, cat_cards, n_num, d=16, hidden=(512,256,128), dropout=0.2, n_heads=4, bilinear_dim=128):
        super().__init__()
        self.F = len(cat_cards); self.d = d
        self.cat_emb = nn.ModuleList([nn.Embedding(c, d) for c in cat_cards])
        self.num_emb = nn.Linear(n_num, d)

        self.gate_a = nn.ModuleList([nn.Linear(d, d) for _ in range(self.F + 1)])
        self.gate_b = nn.ModuleList([nn.Linear(d, d) for _ in range(self.F + 1)])

        in_dim = (self.F + 1) * d
        self.mlp_a, last_dim = self._mlp(in_dim, hidden, dropout)
        self.mlp_b, _        = self._mlp(in_dim, hidden, dropout)

        assert last_dim % n_heads == 0
        self.n_heads = n_heads
        self.head_dim = last_dim // n_heads
        self.W = nn.Parameter(torch.randn(n_heads, self.head_dim, self.head_dim) * 0.01)
        self.bilinear_out = nn.Linear(n_heads * self.head_dim, bilinear_dim)

        self.out = nn.Linear(last_dim*2 + bilinear_dim, 1)

    def _mlp(self, in_dim, hidden, dropout):
        layers = []
        cur = in_dim
        for h in hidden:
            layers += [nn.Linear(cur, h), nn.ReLU(), nn.Dropout(dropout)]
            cur = h
        return nn.Sequential(*layers), cur

    def _embed(self, x_cat, x_num):
        B = x_cat.size(0)
        Ec = torch.stack([emb(x_cat[:, i]) for i, emb in enumerate(self.cat_emb)], dim=1)  # [B,F,d]
        En = self.num_emb(x_num).view(B,1,self.d)                                          # [B,1,d]
        return torch.cat([Ec, En], dim=1)                                                  # [B,F+1,d]

    def _gate(self, E, gates):
        outs = []
        for i in range(E.size(1)):
            g = torch.sigmoid(gates[i](E[:, i, :]))
            outs.append(E[:, i, :] * g)
        return torch.stack(outs, dim=1)

    def _mh_bilinear(self, ha, hb):
        B = ha.size(0)
        ha = ha.view(B, self.n_heads, self.head_dim)
        hb = hb.view(B, self.n_heads, self.head_dim)
        hw = torch.einsum("bhd,hde->bhe", ha, self.W)          # [B,H,Dh]
        z  = (hw * hb).reshape(B, self.n_heads * self.head_dim)
        return self.bilinear_out(z)

    def forward(self, x_cat, x_num):
        E  = self._embed(x_cat, x_num)
        Ea = self._gate(E, self.gate_a).flatten(1)
        Eb = self._gate(E, self.gate_b).flatten(1)
        ha = self.mlp_a(Ea)
        hb = self.mlp_b(Eb)
        hib = self._mh_bilinear(ha, hb)
        return self.out(torch.cat([ha, hb, hib], dim=1)).squeeze(-1)
    
class FeatureTokenizer(nn.Module):
    def __init__(self, cat_cardinalities, n_num, d_model):
        super().__init__()
        self.cat_embs = nn.ModuleList([nn.Embedding(card, d_model) for card in cat_cardinalities])
        self.num_proj = nn.ModuleList([nn.Linear(1, d_model) for _ in range(n_num)])
        self.cls = nn.Parameter(torch.zeros(1, 1, d_model))
        nn.init.trunc_normal_(self.cls, std=0.02)

    def forward(self, x_cat, x_num):
        B = x_cat.size(0)
        cat_tokens = [emb(x_cat[:, i]) for i, emb in enumerate(self.cat_embs)]
        num_tokens = [proj(x_num[:, i:i+1]) for i, proj in enumerate(self.num_proj)]
        field_embs = torch.stack(cat_tokens + num_tokens, dim=1)   # [B, F, d]
        cls = self.cls.expand(B, -1, -1)                           # [B, 1, d]
        tokens = torch.cat([cls, field_embs], dim=1)               # [B, 1+F, d]
        return tokens, field_embs


class FTTransformer(nn.Module):
    def __init__(self, d_model=192, nhead=8, ff=512, n_layers=3, dropout=0.15):
        super().__init__()
        enc = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=ff,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc, num_layers=n_layers)

    def forward(self, x):
        return self.encoder(x)

    
class FTOnly(nn.Module):
    def __init__(self, cat_cards, n_num,
                 d_model=192, nhead=8, ff=512, n_layers=3, dropout=0.15):
        super().__init__()
        self.tok = FeatureTokenizer(cat_cards, n_num, d_model)
        self.backbone = FTTransformer(d_model, nhead, ff, n_layers, dropout)
        self.head = nn.Sequential(
            nn.Linear(d_model, 256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, x_cat, x_num):
        tokens, _ = self.tok(x_cat, x_num)
        H = self.backbone(tokens)
        h_cls = H[:, 0, :]
        return self.head(h_cls).squeeze(1)


# -----------------------------
# Train/Eval
# -----------------------------
@torch.no_grad()
def evaluate(model, loader):
    model.eval()
    ys, ps = [], []
    total_loss, n = 0.0, 0
    for x_cat, x_num, y in loader:
        x_cat = x_cat.to(DEVICE, non_blocking=True)
        x_num = x_num.to(DEVICE, non_blocking=True)
        y     = y.to(DEVICE, non_blocking=True)
        logits = model(x_cat, x_num)
        loss = bce(logits, y)
        total_loss += float(loss) * len(y)
        n += len(y)
        ys.append(y.detach().cpu().numpy())
        ps.append(torch.sigmoid(logits).detach().cpu().numpy())
    y_true = np.concatenate(ys)
    y_pred = np.concatenate(ps)
    return total_loss / n, roc_auc_score(y_true, y_pred)

def init_output_bias_for_ctr(model, base_ctr):
    m = unwrap(model)
    b = math.log(base_ctr/(1.0-base_ctr))

    with torch.no_grad():
        # common "out" (DCN / FinalMLP in our code)
        if hasattr(m, "out") and isinstance(m.out, nn.Linear) and m.out.bias is not None:
            m.out.bias.fill_(b); return

        # your FT-AFM style "head"
        if hasattr(m, "head") and isinstance(m.head, nn.Sequential):
            last = m.head[-1]
            if isinstance(last, nn.Linear) and last.bias is not None:
                last.bias.fill_(b); return

        # WideLR
        if hasattr(m, "num_w") and isinstance(m.num_w, nn.Linear) and m.num_w.bias is not None:
            m.num_w.bias.fill_(b); return

        # DeepFM: last layer inside mlp
        if hasattr(m, "mlp") and isinstance(m.mlp, nn.Sequential):
            for layer in reversed(m.mlp):
                if isinstance(layer, nn.Linear) and layer.bias is not None:
                    layer.bias.fill_(b); return

        print("[warn] Could not init output bias; skipping.")


def train_one(name, build_model_fn, BASE, out_dir,
              batch_size=8192, max_epochs=20, patience=5, lr=1e-3,
              sample_size=None, val_size=None, test_size=None):
    os.makedirs(out_dir, exist_ok=True)

    Xc_tr = np.load(os.path.join(BASE, "Xc_train.npy"), mmap_mode="r")
    Xc_va = np.load(os.path.join(BASE, "Xc_val.npy"),   mmap_mode="r")
    Xc_te = np.load(os.path.join(BASE, "Xc_test.npy"),  mmap_mode="r")
    Xn_tr = np.load(os.path.join(BASE, "Xn_train.npy"), mmap_mode="r")
    Xn_va = np.load(os.path.join(BASE, "Xn_val.npy"),   mmap_mode="r")
    Xn_te = np.load(os.path.join(BASE, "Xn_test.npy"),  mmap_mode="r")
    y_tr  = np.load(os.path.join(BASE, "y_train.npy"),  mmap_mode="r")
    y_va  = np.load(os.path.join(BASE, "y_val.npy"),    mmap_mode="r")
    y_te  = np.load(os.path.join(BASE, "y_test.npy"),   mmap_mode="r")

    schema = json.load(open(os.path.join(BASE, "schema.json"), "r"))
    cat_cards = schema["cat_cards"]
    n_num = len(schema["num_cols"])

    # ----------------------------
    # SMALL-SAMPLE SMOKE TEST
    # ----------------------------
    if sample_size is not None:
        ntr = min(int(sample_size), len(y_tr))
        if val_size is None:
            val_size = max(200, ntr // 5)
        if test_size is None:
            test_size = max(200, ntr // 5)

        nva = min(int(val_size), len(y_va))
        nte = min(int(test_size), len(y_te))

        Xc_tr, Xn_tr, y_tr = Xc_tr[:ntr], Xn_tr[:ntr], y_tr[:ntr]
        Xc_va, Xn_va, y_va = Xc_va[:nva], Xn_va[:nva], y_va[:nva]
        Xc_te, Xn_te, y_te = Xc_te[:nte], Xn_te[:nte], y_te[:nte]

        print(f"[SMOKE] train={len(y_tr)} val={len(y_va)} test={len(y_te)}")

    tr_loader = make_loader(Xc_tr, Xn_tr, y_tr, batch_size=batch_size, shuffle=True)
    va_loader = make_loader(Xc_va, Xn_va, y_va, batch_size=batch_size, shuffle=False)
    te_loader = make_loader(Xc_te, Xn_te, y_te, batch_size=batch_size, shuffle=False)

    model = build_model_fn(cat_cards, n_num).to(DEVICE)
    if NUM_GPUS > 1:
        model = nn.DataParallel(model)

    init_output_bias_for_ctr(model, float(np.mean(y_tr)))

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    logger = PrettyLogger(patience=patience)

    best_auc = -1.0
    best_state = None

    for epoch in range(1, max_epochs + 1):
        model.train()
        total_loss, n = 0.0, 0
        for x_cat, x_num, y in tr_loader:
            x_cat = x_cat.to(DEVICE, non_blocking=True)
            x_num = x_num.to(DEVICE, non_blocking=True)
            y     = y.to(DEVICE, non_blocking=True)

            opt.zero_grad(set_to_none=True)
            logits = model(x_cat, x_num)
            loss = bce(logits, y)
            loss.backward()
            opt.step()

            total_loss += float(loss) * len(y)
            n += len(y)

        train_ll = total_loss / n
        val_ll, val_auc = evaluate(model, va_loader)
        logger.log(epoch, train_ll, val_ll, val_auc)

        if val_auc > best_auc + 1e-12:
            best_auc = val_auc
            best_state = {k: v.detach().cpu().clone() for k, v in unwrap(model).state_dict().items()}

        if logger.should_stop():
            print("Early stopped.")
            break

    print("\nEvaluating on test set...")
    if best_state is not None:
        unwrap(model).load_state_dict(best_state)

    test_ll, test_auc = evaluate(model, te_loader)

    print("\n" + "="*80)
    print(f"{name} RESULTS  |  BASE={BASE}")
    print("="*80)
    print(f"Test AUC:     {test_auc:.4f}")
    print(f"Test LogLoss: {test_ll:.4f}")
    print(f"Saved to: {out_dir}/")
    print("="*80)

    with open(os.path.join(out_dir, "results.json"), "w") as f:
        json.dump({"test_auc": float(test_auc), "test_logloss": float(test_ll)}, f, indent=2)

    return float(test_auc), float(test_ll)

def run_all_baselines(BASE, prefix,
                      batch_size=8192, max_epochs=20, patience=5, lr=1e-3):
    results = {}

    def lr_fn(cat_cards, n_num):      return WideLR(cat_cards, n_num)
    def fm_fn(cat_cards, n_num):      return FM(cat_cards, n_num, d=16)
    def deepfm_fn(cat_cards, n_num):  return DeepFM(cat_cards, n_num, d=16)
    def dcn_fn(cat_cards, n_num):     return DCN(cat_cards, n_num, d=16, cross_layers=3)
    def finalmlp_fn(cat_cards, n_num):return FinalMLP(cat_cards, n_num, d=16, n_heads=4, bilinear_dim=128)
    def ftonly_fn(cat_cards, n_num):  return FTOnly(cat_cards, n_num, d_model=192, nhead=8, ff=512, n_layers=3, dropout=0.15)


    models = [
        ("LR", lr_fn),
        ("FM", fm_fn),
        ("DeepFM", deepfm_fn),
        ("DCN", dcn_fn),
        ("FinalMLP", finalmlp_fn),
        ("FT-only", ftonly_fn),
    ]

    for name, fn in models:
        out_dir = f"runs_{prefix}_{name}_seed42"
        print("\n" + "="*80)
        print(f"RUNNING {prefix.upper()}  |  {name}  |  batch={batch_size}  |  GPUs={NUM_GPUS}")
        print("="*80)
        results[name] = train_one(name=f"{prefix.upper()} {name}",
                                  build_model_fn=fn,
                                  BASE=BASE,
                                  out_dir=out_dir,
                                  batch_size=batch_size,
                                  max_epochs=max_epochs,
                                  patience=patience,
                                  lr=lr)
    return results

# -----------------------------
# CALLS (edit BASE paths if needed)
# -----------------------------
# Use the folder names exactly as they appear in your file browser
results_criteo = run_all_baselines(BASE="criteo_preprocessed", prefix="criteo")
results_outbrain = run_all_baselines(BASE="outbrain_preprocessed_40m", prefix="outbrain")  # change if your folder differs
results_avazu = run_all_baselines(BASE="runs_avazu_40m_improved_ft_afm", prefix="avazu")

print("Ready. Uncomment the dataset runs at the bottom.")


DEVICE: cuda | GPUs available: 4

RUNNING CRITEO  |  LR  |  batch=8192  |  GPUs=4
Epoch 01 | train_ll=0.6735 val_ll=0.5181 val_auc=0.7411 *BEST*
Epoch 02 | train_ll=0.4976 val_ll=0.4871 val_auc=0.7638 *BEST*
Epoch 03 | train_ll=0.4784 val_ll=0.4776 val_auc=0.7719 *BEST*
Epoch 04 | train_ll=0.4707 val_ll=0.4734 val_auc=0.7759 *BEST*
Epoch 05 | train_ll=0.4667 val_ll=0.4712 val_auc=0.7782 *BEST*
Epoch 06 | train_ll=0.4643 val_ll=0.4700 val_auc=0.7795 *BEST*
Epoch 07 | train_ll=0.4627 val_ll=0.4694 val_auc=0.7801 *BEST*
Epoch 08 | train_ll=0.4617 val_ll=0.4691 val_auc=0.7807 *BEST*
Epoch 09 | train_ll=0.4609 val_ll=0.4689 val_auc=0.7811 *BEST*
Epoch 10 | train_ll=0.4604 val_ll=0.4689 val_auc=0.7812 *BEST*
Epoch 11 | train_ll=0.4600 val_ll=0.4688 val_auc=0.7814 *BEST*
Epoch 12 | train_ll=0.4596 val_ll=0.4689 val_auc=0.7815 *BEST*
Epoch 13 | train_ll=0.4594 val_ll=0.4690 val_auc=0.7813 stale 1/5
Epoch 14 | train_ll=0.4592 val_ll=0.4691 val_auc=0.7815 *BEST*
Epoch 15 | train_ll=0.4590 val_ll



Epoch 01 | train_ll=0.4582 val_ll=0.4488 val_auc=0.8028 *BEST*
Epoch 02 | train_ll=0.4462 val_ll=0.4465 val_auc=0.8060 *BEST*
Epoch 03 | train_ll=0.4415 val_ll=0.4454 val_auc=0.8076 *BEST*
Epoch 04 | train_ll=0.4376 val_ll=0.4438 val_auc=0.8080 *BEST*
Epoch 05 | train_ll=0.4337 val_ll=0.4438 val_auc=0.8081 *BEST*
Epoch 06 | train_ll=0.4293 val_ll=0.4486 val_auc=0.8067 stale 1/5
Epoch 07 | train_ll=0.4242 val_ll=0.4511 val_auc=0.8048 stale 2/5
Epoch 08 | train_ll=0.4182 val_ll=0.4589 val_auc=0.8013 stale 3/5
Epoch 09 | train_ll=0.4114 val_ll=0.4610 val_auc=0.7973 stale 4/5
Epoch 10 | train_ll=0.4040 val_ll=0.4690 val_auc=0.7950 stale 5/5
Early stopped.

Evaluating on test set...

CRITEO FT-only RESULTS  |  BASE=criteo_preprocessed
Test AUC:     0.8076
Test LogLoss: 0.4442
Saved to: runs_criteo_FT-only_seed42/

RUNNING OUTBRAIN  |  LR  |  batch=8192  |  GPUs=4
Epoch 01 | train_ll=0.5392 val_ll=0.4653 val_auc=0.6835 *BEST*
Epoch 02 | train_ll=0.4547 val_ll=0.4535 val_auc=0.6976 *BEST*
Epo



Epoch 01 | train_ll=0.4421 val_ll=0.4444 val_auc=0.7112 *BEST*
Epoch 02 | train_ll=0.4400 val_ll=0.4431 val_auc=0.7140 *BEST*
Epoch 03 | train_ll=0.4387 val_ll=0.4421 val_auc=0.7158 *BEST*
Epoch 04 | train_ll=0.4376 val_ll=0.4428 val_auc=0.7151 stale 1/5
Epoch 05 | train_ll=0.4361 val_ll=0.4426 val_auc=0.7150 stale 2/5
Epoch 06 | train_ll=0.4345 val_ll=0.4429 val_auc=0.7150 stale 3/5
Epoch 07 | train_ll=0.4327 val_ll=0.4437 val_auc=0.7136 stale 4/5
Epoch 08 | train_ll=0.4307 val_ll=0.4447 val_auc=0.7113 stale 5/5
Early stopped.

Evaluating on test set...

OUTBRAIN FT-only RESULTS  |  BASE=outbrain_preprocessed_40m
Test AUC:     0.7037
Test LogLoss: 0.4499
Saved to: runs_outbrain_FT-only_seed42/

RUNNING AVAZU  |  LR  |  batch=8192  |  GPUs=4
Epoch 01 | train_ll=0.5831 val_ll=0.4479 val_auc=0.6924 *BEST*
Epoch 02 | train_ll=0.4171 val_ll=0.4177 val_auc=0.7123 *BEST*
Epoch 03 | train_ll=0.4054 val_ll=0.4080 val_auc=0.7219 *BEST*
Epoch 04 | train_ll=0.3990 val_ll=0.4038 val_auc=0.7265 *BE



Epoch 01 | train_ll=0.3938 val_ll=0.3826 val_auc=0.7504 *BEST*
Epoch 02 | train_ll=0.3853 val_ll=0.3811 val_auc=0.7507 *BEST*
Epoch 03 | train_ll=0.3797 val_ll=0.3814 val_auc=0.7526 *BEST*
Epoch 04 | train_ll=0.3759 val_ll=0.3828 val_auc=0.7501 stale 1/5
Epoch 05 | train_ll=0.3731 val_ll=0.3839 val_auc=0.7498 stale 2/5
Epoch 06 | train_ll=0.3706 val_ll=0.3848 val_auc=0.7486 stale 3/5
Epoch 07 | train_ll=0.3683 val_ll=0.3913 val_auc=0.7445 stale 4/5


In [1]:
import os, json, pandas as pd

def collect_results(prefix):
    rows = []
    for d in sorted(os.listdir(".")):
        if d.startswith(f"runs_{prefix}_") and os.path.isdir(d):
            res_path = os.path.join(d, "results.json")
            if os.path.exists(res_path):
                with open(res_path, "r") as f:
                    r = json.load(f)
                model = d.replace(f"runs_{prefix}_", "").replace("_seed42", "")
                rows.append({
                    "Model": model,
                    "Test AUC": r["test_auc"],
                    "Test LogLoss": r["test_logloss"]
                })
    df = pd.DataFrame(rows).sort_values("Test AUC", ascending=False)
    return df.reset_index(drop=True)


In [4]:
criteo_df   = collect_results("criteo")
outbrain_df = collect_results("outbrain")
avazu_df    = collect_results("avazu")

criteo_df
#outbrain_df
#avazu_df

Unnamed: 0,Model,Test AUC,Test LogLoss
0,FT-only,0.807624,0.444239
1,FinalMLP,0.806275,0.445242
2,DCN,0.797264,0.456308
3,DeepFM,0.792818,0.461807
4,FM,0.781562,0.4744
5,LR,0.781301,0.469319
