In [1]:
import inspect
import math
import random
from dataclasses import dataclass
from types import SimpleNamespace
from typing import Optional, Dict, List, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          get_linear_schedule_with_warmup)
from sklearn.metrics import accuracy_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class NeuralNDCG(nn.Module):
    """
    Differentiable ``1 - NDCG`` loss over the class dimension.

    Every example is treated as one ranked list: ``scores`` (logits, shape [B, C])
    rank the C classes and ``gains`` (shape [B, C]) hold their relevance.

    - The hard sort is replaced by a soft item-to-position assignment built from
      sigmoid-based expected ranks and a Gaussian kernel over positions
      (a lightweight surrogate for a NeuralSort permutation); ``tau`` is the temperature.
    - ``topk`` restricts DCG and IDCG to the first k positions (head of the ranking).
    - ``eps`` guards the normalisations against division by zero.
    """
    def __init__(self, tau: float = 1.0, topk: Optional[int] = None, eps: float = 1e-8):
        super().__init__()
        self.tau = tau
        self.topk = topk
        self.eps  = eps

    @staticmethod
    def _discounts(C: int, device) -> torch.Tensor:
        """Return the DCG position discounts ``1 / log2(1 + rank)`` for ranks 1..C, shape [C]."""
        idx = torch.arange(1, C + 1, device=device, dtype=torch.float)
        return 1.0 / torch.log2(idx + 1.0)

    def _softperm_neuralsort(self, scores: torch.Tensor) -> torch.Tensor:
        """
        Build a soft item-to-position assignment for each batch row.

        scores: [B, C]
        return: P_hat [B, C, C]; rows are items, columns are positions, every row sums to ~1.

        Step 1: expected *descending* rank of item i,
            r_i = 1 + sum_{j != i} sigmoid((s_j - s_i) / tau),
        so the highest-scoring item gets rank ~1 and the lowest gets rank ~C.
        Step 2: each item is spread over positions 1..C with a Gaussian kernel
        centred on its expected rank and normalised over positions.
        """
        _, C = scores.shape
        s_i = scores.unsqueeze(-1)  # (B, C, 1)
        s_j = scores.unsqueeze(1)   # (B, 1, C)

        # outranks[b, i, j] ~ probability that item j is scored above item i.
        outranks = torch.sigmoid((s_j - s_i) / self.tau)  # (B, C, C)

        # The diagonal (j == i) always contributes sigmoid(0) = 0.5; dropping it from
        # "1 + sum" leaves "0.5 + sum", which keeps the ranks inside [1, C].
        ranks_expect = 0.5 + outranks.sum(dim=-1)  # (B, C)

        pos = torch.arange(1, C + 1, device=scores.device, dtype=scores.dtype).view(1, 1, C)
        # Kernel bandwidth: empirical choice tau * C / 2 (small constant avoids a zero width).
        bw = self.tau * C / 2.0 + 1e-6
        P = torch.exp(- (ranks_expect.unsqueeze(-1) - pos) ** 2 / (2 * bw**2))  # (B, C, C)
        P = P / (P.sum(dim=-1, keepdim=True) + self.eps)
        return P  # rows: items, cols: positions

    def forward(self, scores: torch.Tensor, gains: torch.Tensor) -> torch.Tensor:
        """
        Compute the batch-mean ``1 - NDCG@K`` under the soft assignment.

        scores: [B, C] (logits)
        gains:  [B, C] (e.g., one-hot for single-label or graded relevance for multi-label)
        return: scalar tensor; lower is better.
        """
        _, C = scores.shape
        P_hat = self._softperm_neuralsort(scores)  # [B, C, C]

        # Cut-off: all positions unless topk is set (never more than C).
        K = C if self.topk is None else min(self.topk, C)
        D = self._discounts(C, scores.device)[:K]  # [K]

        # Expected DCG: sum_i sum_pos gains_i * P[i, pos] * D[pos]
        P_head = P_hat[:, :, :K]  # [B, C, K]
        dcg = torch.sum(gains.unsqueeze(-1) * P_head * D.view(1, 1, K), dim=(1, 2))  # [B]

        # Ideal DCG: hard sort of the gains in descending order.
        sorted_gains, _ = torch.sort(gains, dim=-1, descending=True)
        idcg = torch.sum(sorted_gains[:, :K] * D.view(1, K), dim=-1) + self.eps  # [B]

        ndcg = dcg / idcg
        # Minimising 1 - NDCG maximises NDCG.
        loss = 1.0 - ndcg
        return loss.mean()


In [11]:
from typing import Optional

# Registry of supported tasks, keyed by task name.
# load_text_classification unpacks each value as
# (dataset name, subset or None, text column, label column);
# a None subset means load_dataset is called with the name only.
DATASETS = {
    "sentiment_en": ("glue", "sst2", "sentence", "label"),
    "topic_en":     ("ag_news", None, "text", "label"),
}

def _to_str_list(col) -> list[str]:
    out = []
    for x in col:
        if x is None:
            out.append("")
        elif isinstance(x, str):
            out.append(x)
        elif isinstance(x, (bytes, bytearray)):
            out.append(x.decode("utf-8", errors="ignore"))
        elif isinstance(x, (list, tuple)):
            out.append(" ".join(map(str, x)))
        elif isinstance(x, dict):
            out.append(" ".join(map(str, x.values())))
        else:
            out.append(str(x))
    return out

def load_text_classification(task: str,
                             limit_train: Optional[int] = None,
                             limit_eval: Optional[int] = None):
    """
    Load the train/eval texts and integer labels for a task registered in DATASETS.

    Args:
        task: key of DATASETS.
        limit_train: if truthy, keep only the first ``limit_train`` training rows.
        limit_eval: if truthy, keep only the first ``limit_eval`` evaluation rows.

    Returns:
        ``((texts_train, y_train), (texts_eval, y_eval), num_labels)``.

    Raises:
        KeyError: if ``task`` is not registered in DATASETS.
    """
    name, subset, text_col, y_col = DATASETS[task]
    ds = load_dataset(name) if subset is None else load_dataset(name, subset)

    tr = ds["train"] if "train" in ds else ds["validation"]

    # Prefer "test", but some benchmarks (e.g. GLUE) publish it without labels
    # (every label is -1). Metrics against -1 targets are meaningless, so fall
    # back to "validation" when the test split is unlabelled.
    eval_split = "test" if "test" in ds else "validation"
    if eval_split == "test" and "validation" in ds:
        if any(int(lbl) < 0 for lbl in ds["test"][y_col]):
            eval_split = "validation"
    te = ds[eval_split]

    if limit_train: tr = tr.select(range(min(limit_train, len(tr))))
    if limit_eval:  te = te.select(range(min(limit_eval,  len(te))))

    texts_tr = _to_str_list(tr[text_col])
    texts_te = _to_str_list(te[text_col])
    y_tr = list(map(int, tr[y_col]))
    y_te = list(map(int, te[y_col]))

    # Use the class count declared by the dataset schema when it exists; counting
    # distinct labels alone undercounts if a truncated train subset misses a class.
    features = getattr(tr, "features", None) or {}
    declared = getattr(features.get(y_col), "num_classes", None)
    if declared:
        num_labels = int(declared)
    else:
        num_labels = max(len(set(y_tr)), max(y_tr, default=-1) + 1)
    return (texts_tr, y_tr), (texts_te, y_te), num_labels

@dataclass
class EncodedDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized inputs: row ``i`` of every tensor in ``enc`` plus its label."""
    enc: Dict[str, torch.Tensor]  # tokenizer output, indexed by row in __getitem__
    y: torch.Tensor               # labels; size(0) defines the dataset length

    def __len__(self):
        return self.y.size(0)

    def __getitem__(self, i):
        row = {name: tensor[i] for name, tensor in self.enc.items()}
        row["labels"] = self.y[i]
        return row

def make_encoded_dataset(texts: list[str], labels: list[int]):
    """
    Tokenize ``texts`` with the module-level ``tokenizer`` and wrap the result,
    together with ``labels`` as a long tensor, in an EncodedDataset.

    Texts are normalised through ``_to_str_list`` first; sequences are padded
    and truncated to at most ``MAX_LEN`` tokens.
    """
    clean_texts = _to_str_list(texts)
    encoded = tokenizer(
        clean_texts,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    label_tensor = torch.tensor([int(lbl) for lbl in labels], dtype=torch.long)
    return EncodedDataset(encoded, label_tensor)


In [None]:
# Run configuration shared by the dataset, model and training cells.
MODEL_NAME = "bert-base-uncased"  # can be swapped for "ai-forever/ruBert-base"
MAX_LEN    = 160   # max_length passed to the tokenizer (truncation is enabled there)
BATCH      = 32    # batch size of the train and eval DataLoaders
EPOCHS     = 3     # number of training epochs
LR         = 2e-5  # AdamW learning rate
LAMBDA_NDCG = 0.5   # weight of the NeuralNDCG term added to the cross-entropy loss
TOPK        = 3     # top-k cutoff for the loss and metrics (idea: emphasise the head of the ranking)

# Tokenizer matching MODEL_NAME; used by make_encoded_dataset.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

@dataclass
class EncodedDataset(torch.utils.data.Dataset):
    """
    Tensor-backed dataset that serves one tokenized example (plus "labels") per index.

    NOTE: this cell re-declares the class already defined in an earlier cell;
    whichever cell ran last wins.
    """
    enc: Dict[str, torch.Tensor]  # per-field tensors, sliced along dim 0
    y: torch.Tensor               # label tensor; its first dimension is the length

    def __len__(self):
        return self.y.size(0)

    def __getitem__(self, i):
        example = dict((field, values[i]) for field, values in self.enc.items())
        example["labels"] = self.y[i]
        return example

def make_encoded_dataset(texts: List[str], labels: List[int]):
    """
    Tokenize ``texts`` with the module-level ``tokenizer`` (padded, truncated to
    ``MAX_LEN``) and pair the encoding with ``labels`` in an EncodedDataset.

    NOTE: this cell re-declares the function already defined in an earlier cell;
    unlike that version, the inputs are passed to the tokenizer as-is.
    """
    tokenizer_kwargs = dict(
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
        # return_token_type_ids=False  # can be switched off explicitly for DistilBERT
    )
    encoded = tokenizer(texts, **tokenizer_kwargs)
    label_tensor = torch.tensor(labels, dtype=torch.long)
    return EncodedDataset(encoded, label_tensor)

def build_model(num_labels: int):
    """Load the ``MODEL_NAME`` checkpoint as a sequence classifier with ``num_labels`` outputs."""
    return AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
    )


In [19]:
def precision_at_k(logits: torch.Tensor, y: torch.Tensor, k: int = 3) -> float:
    """
    Fraction of rows whose true label is among the k highest-scoring classes.

    With a single true label per row this is a top-k hit rate, not a
    per-position precision; the name is kept for existing callers.

    Args:
        logits: [B, C] class scores.
        y: [B] integer class labels.
        k: cutoff; clamped to C so that k larger than the number of classes
           (e.g. k=3 on a binary task) does not make ``topk`` raise.

    Returns:
        The hit rate as a Python float.
    """
    k = min(k, logits.size(-1))
    top_idx = logits.topk(k, dim=-1).indices  # [B, k]
    hits = (top_idx == y.view(-1, 1)).any(dim=-1)  # label column broadcasts over k
    return hits.float().mean().item()

def ndcg_at_k(logits: torch.Tensor, y: torch.Tensor, k: int = 3) -> float:
    """
    Hard NDCG@k averaged over the batch, with one-hot relevance at the true class.

    logits: [B, C] class scores; y: [B] integer labels; returns a Python float.
    """
    num_rows, num_classes = logits.shape

    # One-hot relevance: gain 1 for the true class, 0 elsewhere.
    relevance = torch.zeros_like(logits)
    relevance[torch.arange(num_rows), y] = 1.0

    # Position discounts 1 / log2(1 + rank), truncated to the first k positions.
    positions = torch.arange(1, num_classes + 1, device=logits.device).float()
    head_discount = (1.0 / torch.log2(positions + 1.0))[:k]

    # Relevance re-ordered by descending score, and the ideal (sorted) ordering.
    _, order = torch.sort(logits, dim=-1, descending=True)
    ranked_relevance = torch.gather(relevance, dim=-1, index=order)
    ideal_relevance = torch.sort(relevance, dim=-1, descending=True).values

    dcg = (ranked_relevance[:, :k] * head_discount).sum(dim=-1)
    idcg = (ideal_relevance[:, :k] * head_discount).sum(dim=-1) + 1e-8
    return (dcg / idcg).mean().item()


In [20]:
def train_one(task="sentiment_en", use_neural_ndcg=True, seed=42):
    """
    Fine-tune a classifier on ``task`` and print evaluation metrics after every epoch.

    Args:
        task: key of DATASETS.
        use_neural_ndcg: if True, optimise ``CE + LAMBDA_NDCG * NeuralNDCG``;
            otherwise plain cross-entropy.
        seed: seed applied to torch, numpy and ``random`` before anything else.

    Returns:
        The trained model.
    """
    torch.manual_seed(seed); np.random.seed(seed); random.seed(seed)

    (Xtr, ytr), (Xte, yte), C = load_text_classification(task, limit_train=None, limit_eval=None)
    ds_tr = make_encoded_dataset(Xtr, ytr)
    ds_te = make_encoded_dataset(Xte, yte)
    tr_loader = DataLoader(ds_tr, batch_size=BATCH, shuffle=True)
    te_loader = DataLoader(ds_te, batch_size=BATCH)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model  = build_model(C).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=LR)
    num_steps = EPOCHS * len(tr_loader)
    # Linear warmup over the first 10% of steps, then linear decay.
    sch = get_linear_schedule_with_warmup(opt, int(0.1*num_steps), num_steps)

    ce = nn.CrossEntropyLoss()
    ndcg_loss = NeuralNDCG(tau=1.0, topk=TOPK)

    # Guarded denominator for the epoch averages (same guard as train_one_with_history).
    n = max(1, len(ds_tr))

    for ep in range(1, EPOCHS+1):
        model.train()
        # Sample-weighted running sums, so the epoch averages are per example.
        tr_loss, tr_ce, tr_ndcg = 0.0, 0.0, 0.0
        for batch in tr_loader:
            batch = {k: v.to(device) for k,v in batch.items()}
            y = batch.pop("labels")
            logits = model(**batch).logits  # [B, C]

            loss_ce = ce(logits, y)
            if use_neural_ndcg:
                # One-hot gains, shape [B, C].
                gains = torch.zeros_like(logits).scatter_(1, y.unsqueeze(1), 1.0)
                loss_ndcg = ndcg_loss(logits, gains)
                loss = loss_ce + LAMBDA_NDCG * loss_ndcg
                tr_ndcg += loss_ndcg.item() * y.size(0)
            else:
                loss = loss_ce

            tr_ce   += loss_ce.item() * y.size(0)
            tr_loss += loss.item()   * y.size(0)

            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step(); sch.step()

        # Evaluation: collect labels and logits on the CPU, then score once.
        model.eval()
        all_y, all_logits = [], []
        with torch.no_grad():
            for batch in te_loader:
                labels = batch.pop("labels")
                batch = {k: v.to(device) for k,v in batch.items()}
                all_y.append(labels)
                all_logits.append(model(**batch).logits.detach().cpu())
        y_true = torch.cat(all_y)
        logits = torch.cat(all_logits)
        y_pred = logits.argmax(dim=-1)

        acc = accuracy_score(y_true, y_pred)
        f1  = f1_score(y_true, y_pred, average="macro")
        p_at_k = precision_at_k(logits, y_true, k=TOPK)
        ndcg_k = ndcg_at_k(logits, y_true, k=TOPK)

        msg = (f"[{task}] epoch {ep}/{EPOCHS} | "
               f"train_loss={tr_loss/n:.4f} (ce={tr_ce/n:.4f}, ndcg={tr_ndcg/n:.4f}) | "
               f"Acc={acc:.4f} F1={f1:.4f} P@{TOPK}={p_at_k:.4f} NDCG@{TOPK}={ndcg_k:.4f}")
        print(msg)

    return model


In [21]:
import torch.nn as nn
from transformers import AutoModel

class SimpleCLSHead(nn.Module):
    """
    Encoder backbone followed by dropout and a linear head on the first token.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes.
        dropout: dropout probability applied to the first-token embedding.
    """
    def __init__(self, model_name: str, num_labels: int, dropout: float = 0.1):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden, num_labels)
        # Cached once: which keyword arguments the backbone's forward accepts.
        self._backbone_args = self._forward_arg_names(self.backbone)

    @staticmethod
    def _forward_arg_names(module: nn.Module) -> frozenset:
        """
        Return the parameter names of ``module.forward``.

        ``inspect.signature`` follows ``functools.wraps`` chains, so decorated
        forwards report their real parameters. ``__code__.co_varnames`` would
        report the wrapper's arguments and also include local variable names.
        """
        return frozenset(inspect.signature(module.forward).parameters)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, **kwargs):
        """
        Run the backbone and classify the first-token representation.

        Only tensors that were actually provided are forwarded; extra ``kwargs``
        are accepted and ignored. Returns an object exposing ``.logits`` ([B, C])
        so callers written for Hugging Face outputs keep working.
        """
        inputs = {}
        if input_ids is not None: inputs["input_ids"] = input_ids
        if attention_mask is not None: inputs["attention_mask"] = attention_mask
        # Some backbones (e.g. DistilBERT) take no token_type_ids, so skip it for them.
        if token_type_ids is not None and "token_type_ids" in self._backbone_args:
            inputs["token_type_ids"] = token_type_ids

        out = self.backbone(**inputs)                    # last_hidden_state: [B, T, H]
        cls = out.last_hidden_state[:, 0]               # first token (CLS)
        logits = self.classifier(self.dropout(cls))     # [B, C]

        # Lightweight result object instead of building a new class on every call.
        return SimpleNamespace(logits=logits)

def build_model(num_labels: int):
    """Build a SimpleCLSHead on top of the ``MODEL_NAME`` backbone with ``num_labels`` outputs."""
    return SimpleCLSHead(model_name=MODEL_NAME, num_labels=num_labels)


In [None]:
# ===== ХЕЛПЕР: сохраняем историю метрик из train_one =====
def train_one_with_history(task="sentiment_en", use_neural_ndcg=True, seed=42):
    """
    Train like ``train_one`` but also collect one metrics record per epoch.

    Args:
        task: key of DATASETS.
        use_neural_ndcg: add ``LAMBDA_NDCG * NeuralNDCG`` to the cross-entropy loss when True.
        seed: seed applied to torch, numpy and ``random``.

    Returns:
        ``(model, history)`` where ``history`` is a list of per-epoch dicts holding
        the task, mode, epoch, averaged train losses and the evaluation metrics.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)

    train_split, eval_split, num_classes = load_text_classification(
        task, limit_train=None, limit_eval=None
    )
    train_ds = make_encoded_dataset(*train_split)
    eval_ds = make_encoded_dataset(*eval_split)
    train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
    eval_loader = DataLoader(eval_ds, batch_size=BATCH)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = build_model(num_classes).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    total_steps = EPOCHS * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer, int(0.1 * total_steps), total_steps)

    ce_criterion = nn.CrossEntropyLoss()
    ndcg_criterion = NeuralNDCG(tau=1.0, topk=TOPK)

    mode_name = "CE+NeuralNDCG" if use_neural_ndcg else "CE"
    mode_tag = "ND" if use_neural_ndcg else "CE"
    history = []  # one metrics dict per epoch

    for epoch in range(1, EPOCHS + 1):
        # ---- training pass: sample-weighted running sums ----
        model.train()
        sum_loss = sum_ce = sum_ndcg = 0.0
        seen = 0

        for batch in train_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            targets = batch.pop("labels")
            logits = model(**batch).logits
            batch_size = targets.size(0)

            ce_value = ce_criterion(logits, targets)
            total = ce_value
            if use_neural_ndcg:
                one_hot = torch.zeros_like(logits).scatter_(1, targets.unsqueeze(1), 1.0)
                ndcg_value = ndcg_criterion(logits, one_hot)
                total = ce_value + LAMBDA_NDCG * ndcg_value
                sum_ndcg += ndcg_value.item() * batch_size

            sum_ce += ce_value.item() * batch_size
            sum_loss += total.item() * batch_size
            seen += batch_size

            optimizer.zero_grad()
            total.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # ---- evaluation pass: gather labels and logits on the CPU ----
        model.eval()
        label_chunks, logit_chunks = [], []
        with torch.no_grad():
            for batch in eval_loader:
                label_chunks.append(batch.pop("labels"))
                inputs = {name: tensor.to(device) for name, tensor in batch.items()}
                logit_chunks.append(model(**inputs).logits.detach().cpu())

        y_true = torch.cat(label_chunks)
        logits = torch.cat(logit_chunks)
        acc = accuracy_score(y_true, logits.argmax(dim=-1))
        f1 = f1_score(y_true, logits.argmax(dim=-1), average="macro")
        p_at_k = precision_at_k(logits, y_true, k=TOPK)
        ndcg_k = ndcg_at_k(logits, y_true, k=TOPK)

        denom = max(1, seen)  # avoid dividing by zero on an empty training set
        record = {
            "task": task,
            "mode": mode_name,
            "epoch": epoch,
            "train_loss": sum_loss / denom,
            "train_ce": sum_ce / denom,
            "train_ndcg": sum_ndcg / denom if use_neural_ndcg else 0.0,
            "acc": acc,
            "f1": f1,
            f"P@{TOPK}": p_at_k,
            f"NDCG@{TOPK}": ndcg_k,
        }
        history.append(record)

        print(f"[{task}][{mode_tag}] "
              f"epoch {epoch}/{EPOCHS} | "
              f"loss={record['train_loss']:.4f} "
              f"(ce={record['train_ce']:.4f}, nd={record['train_ndcg']:.4f}) | "
              f"Acc={acc:.4f} F1={f1:.4f} P@{TOPK}={p_at_k:.4f} NDCG@{TOPK}={ndcg_k:.4f}")

    return model, history


# ===== RUN EXPERIMENTS =====
# Trains every (task, mode) combination and accumulates the per-epoch records in all_hist.
all_hist = []

print("=== BASELINE (CrossEntropy) ===")
_, h1 = train_one_with_history("sentiment_en", use_neural_ndcg=False)
_, h2 = train_one_with_history("topic_en",     use_neural_ndcg=False)
all_hist += h1 + h2

print("\n=== CE + NeuralNDCG (λ=%.2f, topk=%d) ===" % (LAMBDA_NDCG, TOPK))
_, h3 = train_one_with_history("sentiment_en", use_neural_ndcg=True)
_, h4 = train_one_with_history("topic_en",     use_neural_ndcg=True)
all_hist += h3 + h4

# ===== SUMMARY TABLE =====
import pandas as pd
df_hist = pd.DataFrame(all_hist)
display(df_hist)

# Best epoch for each (task, mode): rows are sorted by f1 descending within a group,
# then the first entry of every group is taken.
summary = (df_hist
           .sort_values(["task","mode","f1"], ascending=[True, True, False])
           .groupby(["task","mode"], as_index=False)
           .first()[["task","mode","epoch","acc","f1",f"P@{TOPK}",f"NDCG@{TOPK}"]])
print("\n==== SUMMARY (best epoch per task/mode) ====")
display(summary)

# ===== PLOTS (per-epoch dynamics) =====
import matplotlib.pyplot as plt

# One figure per task: F1 (solid) and NDCG@TOPK (dashed) for both training modes.
for task in df_hist["task"].unique():
    sub = df_hist[df_hist["task"]==task]
    plt.figure(figsize=(8,4))
    for mode in ["CE", "CE+NeuralNDCG"]:
        sm = sub[sub["mode"]==mode]
        plt.plot(sm["epoch"], sm["f1"], marker="o", label=f"{mode} F1")
        plt.plot(sm["epoch"], sm[f"NDCG@{TOPK}"], marker="s", linestyle="--", label=f"{mode} NDCG@{TOPK}")
    plt.title(f"{task}: F1 & NDCG@{TOPK} by epoch")
    plt.xlabel("epoch"); plt.grid(True); plt.legend(); plt.show()


=== BASELINE (CrossEntropy) ===


ValueError: Could not find BertModel neither in <module 'transformers.models.bert' from '/Users/anpalmak/vscode/.venv/lib/python3.9/site-packages/transformers/models/bert/__init__.py'> nor in <module 'transformers' from '/Users/anpalmak/vscode/.venv/lib/python3.9/site-packages/transformers/__init__.py'>!