В ноутбуке представлен 2 этап решения - тренировка LLM 

In [2]:
!pip -q install -U transformers datasets accelerate scikit-learn torchmetrics

In [1]:
import os, time, random, re, warnings
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report
import torch
from torchmetrics.functional import calibration_error
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)

# Пути
DATA_DIR = "/content"
LLM_PSEUDO = "/content/pseudo_labeled_llm.csv"   # результат LLM
CATS_TXT = f"{DATA_DIR}/categories.txt"
TEST_CSV = f"{DATA_DIR}/test.csv"
OUT_DIR = "./out_cls"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)


Загрузка категорий и данных

In [5]:
# категории
with open(CATS_TXT, "r", encoding="utf-8") as f:
    CATEGORIES = [l.strip() for l in f if l.strip()]
if "нет товара" not in CATEGORIES:
    CATEGORIES.append("нет товара")
cat2id = {c:i for i,c in enumerate(CATEGORIES)}
id2cat = {i:c for c,i in cat2id.items()}

# псевдолейблы (LLM → fallback)
df = pd.read_csv(LLM_PSEUDO)
assert {"text","category"}.issubset(df.columns)
print("Loaded LLM pseudo labels:", df.shape)

# очистка и кодировка
df["text"] = df["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
df = df[df["text"].str.len() > 0].reset_index(drop=True)
df = df[df["category"].isin(CATEGORIES)].copy()
df["label"] = df["category"].map(cat2id)

# train/val
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)
print("train:", train_df.shape, "val:", val_df.shape)
print("val dist:\n", val_df["category"].value_counts())


Loaded LLM pseudo labels: (1818, 3)
train: (1454, 4) val: (364, 4)
val dist:
 category
одежда                    165
нет товара                 86
текстиль                   45
обувь                      40
электроника                10
товары для детей            8
украшения и аксессуары      7
бытовая техника             2
посуда                      1
Name: count, dtype: int64


Для начала попробуем обучить microsoft/mdeberta-v3-base

In [None]:
MODEL_NAME = "microsoft/mdeberta-v3-base" 
MAX_LEN = 192

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tok_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN, padding=False)

ds_train = Dataset.from_pandas(train_df[["text","label"]])
ds_val   = Dataset.from_pandas(val_df[["text","label"]])
ds_train = ds_train.map(tok_batch, batched=True, remove_columns=["text"])
ds_val   = ds_val.map(tok_batch,   batched=True, remove_columns=["text"])

collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



Map:   0%|          | 0/1454 [00:00<?, ? examples/s]

Map:   0%|          | 0/364 [00:00<?, ? examples/s]

In [11]:
# class weights
classes = np.array(sorted(cat2id.values()))
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=train_df["label"].values
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to("cuda" if torch.cuda.is_available() else "cpu")

# модель
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(CATEGORIES)
)

# кастомный loss
from torch.nn import CrossEntropyLoss
def custom_ce_loss(outputs, labels):
    logits = outputs
    loss_fct = CrossEntropyLoss(weight=class_weights, label_smoothing=0.05)
    return loss_fct(logits, labels)

# метрики
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    return {"weighted_f1": f1}


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Обучение

In [None]:

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir=OUT_DIR,                
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=25,
    eval_strategy="epoch",     
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="no",              
    load_best_model_at_end=False,      
    save_total_limit=0,                
    fp16=torch.cuda.is_available(),
    report_to="none"
)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = custom_ce_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = WeightedTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Weighted F1
1,No log,3.012653,1.5e-05
2,3.290000,2.886813,0.05577
3,3.100700,2.794524,0.180407
4,2.918700,2.708902,0.237462
5,2.788800,2.649888,0.296052
6,2.559100,2.623199,0.280444
7,2.523600,2.60533,0.325909
8,2.310800,2.625452,0.325448
9,2.197600,2.554307,0.342684
10,2.076100,2.611024,0.398743


TrainOutput(global_step=1150, training_loss=2.06171735017196, metrics={'train_runtime': 299.2415, 'train_samples_per_second': 121.474, 'train_steps_per_second': 3.843, 'total_flos': 1393482933478656.0, 'train_loss': 2.06171735017196, 'epoch': 25.0})

Валидация и калибровка «нет товара»

In [13]:
raw = trainer.predict(ds_val)
val_logits = raw.predictions
val_probs = torch.softmax(torch.tensor(val_logits), dim=-1).numpy()
val_preds = val_probs.argmax(axis=1)
val_true  = val_df["label"].to_numpy()

print("\nValidation report:")
print(classification_report(val_true, val_preds, target_names=CATEGORIES, digits=4))

# подбор tau
none_id = cat2id["нет товара"]
probs_max = val_probs.max(axis=1)
base_f1 = f1_score(val_true, val_preds, average="weighted")

best_tau, best_f1 = 0.0, base_f1
for tau in np.linspace(0.40, 0.95, 12):
    adj_preds = val_preds.copy()
    adj_preds[probs_max < tau] = none_id
    f1 = f1_score(val_true, adj_preds, average="weighted")
    if f1 > best_f1:
        best_f1, best_tau = f1, tau

print(f"\nBase F1={base_f1:.4f} | Best F1={best_f1:.4f} at tau={best_tau:.2f}")



Validation report:
                        precision    recall  f1-score   support

       бытовая техника     0.0000    0.0000    0.0000         2
                 обувь     0.6800    0.4250    0.5231        40
                одежда     0.9231    0.0727    0.1348       165
                посуда     0.0051    1.0000    0.0102         1
              текстиль     0.7838    0.6444    0.7073        45
      товары для детей     0.2000    0.2500    0.2222         8
украшения и аксессуары     0.2857    0.2857    0.2857         7
           электроника     0.5000    0.2000    0.2857        10
            нет товара     0.9429    0.7674    0.8462        86

              accuracy                         0.3599       364
             macro avg     0.4801    0.4050    0.3350       364
          weighted avg     0.8365    0.3599    0.4242       364


Base F1=0.4242 | Best F1=0.4242 at tau=0.00


Модель показала себя не очень хорошо. Теперь попробуем с моделью SBERT

In [None]:
import os, time, re, warnings
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModel,
    DataCollatorWithPadding, TrainingArguments, Trainer
)

# Модель
BASE_MODEL = "ai-forever/sbert_large_nlu_ru"

# Режим fine-tune: "head_only" | "last_n" | "all"
FINETUNE_MODE = "all"

# Токенизация/батчи
MAX_LEN = 160                
TRAIN_BS = 16                 # уменьшить, если тесно по VRAM
EVAL_BS  = 64

# Обучение
EPOCHS   = 25
LR_HEAD  = 2e-4               # голова побольше шаг
LR_ENC   = 1e-5               # энкодер поменьше
WEIGHT_DECAY = 0.01
LABEL_SMOOTH = 0.05

# Производительность/память
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.backends.cuda.matmul.allow_tf32 = True
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cuda


In [16]:
# категории
with open(CATS_TXT, "r", encoding="utf-8") as f:
    CATEGORIES = [l.strip() for l in f if l.strip()]
if "нет товара" not in CATEGORIES:
    CATEGORIES.append("нет товара")

cat2id = {c:i for i,c in enumerate(CATEGORIES)}
id2cat = {i:c for c,i in cat2id.items()}

# псевдо-лейблы: LLM → fallback
df = pd.read_csv(LLM_PSEUDO)
assert {"text","category"}.issubset(df.columns), "Ожидаю колонки text, category"
print("Loaded LLM pseudo labels:", df.shape)


# очистка
df["text"] = df["text"].astype(str).str.replace(r"\s+"," ", regex=True).str.strip()
df = df[df["text"].str.len() > 0]
df = df[df["category"].isin(CATEGORIES)].copy()

df["label"] = df["category"].map(cat2id)
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)
print("train:", train_df.shape, "val:", val_df.shape)
print("val dist:\n", val_df["category"].value_counts())


Loaded LLM pseudo labels: (1818, 3)
train: (1454, 4) val: (364, 4)
val dist:
 category
одежда                    165
нет товара                 86
текстиль                   45
обувь                      40
электроника                10
товары для детей            8
украшения и аксессуары      7
бытовая техника             2
посуда                      1
Name: count, dtype: int64


In [5]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

def tok_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN, padding=False)

ds_train = Dataset.from_pandas(train_df[["text","label"]])
ds_val   = Dataset.from_pandas(val_df[["text","label"]])
ds_train = ds_train.map(tok_batch, batched=True, remove_columns=["text"])
ds_val   = ds_val.map(tok_batch,   batched=True, remove_columns=["text"])

collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1454 [00:00<?, ? examples/s]

Map:   0%|          | 0/364 [00:00<?, ? examples/s]

In [6]:
class MeanPooling(nn.Module):
    def forward(self, token_embeddings, attention_mask):
        # token_embeddings: (B, T, H)
        mask = attention_mask.unsqueeze(-1).type_as(token_embeddings)  # (B,T,1)
        summed = (token_embeddings * mask).sum(dim=1)                  # (B,H)
        counts = mask.sum(dim=1).clamp(min=1e-9)                       # (B,1)->(B,H) broadcast
        return summed / counts

class SbertMeanPoolClassifier(nn.Module):
    def __init__(self, base_model_name, num_labels, dropout=0.2):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        hid = self.encoder.config.hidden_size
        self.pool = MeanPooling()
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hid, num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        enc_out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # enc_out.last_hidden_state: (B,T,H)
        pooled = self.pool(enc_out.last_hidden_state, attention_mask)  # (B,H)
        logits = self.classifier(self.dropout(pooled))                 # (B,C)
        return {"logits": logits}


In [7]:
def set_finetune_policy(model: SbertMeanPoolClassifier, mode: str = "last_n", last_n: int = 4):
    # Всегда обучаем классификатор
    for p in model.classifier.parameters():
        p.requires_grad = True

    # По умолчанию всё заморозим
    for p in model.encoder.parameters():
        p.requires_grad = False

    if mode == "head_only":
        print("FT policy: head_only (только классификатор).")
        return

    if mode == "all":
        for p in model.encoder.parameters():
            p.requires_grad = True
        print("FT policy: all layers (полный FT).")
        return

    # last_n
    try:
        encoder_layers = model.encoder.encoder.layer  # BERT-совместимо
    except AttributeError:
        # иногда структуры отличаются; подстрахуемся
        encoder_layers = None
    if encoder_layers is not None:
        for layer in encoder_layers[-last_n:]:
            for p in layer.parameters():
                p.requires_grad = True
        # также разрешим grad в pooler, если он есть
        if hasattr(model.encoder, "pooler") and model.encoder.pooler is not None:
            for p in model.encoder.pooler.parameters():
                p.requires_grad = True
        print(f"FT policy: last_n (разморожены последние {last_n} слоёв).")
    else:
        # если не нашли encoder.layer — разморозим всё как fallback
        for p in model.encoder.parameters():
            p.requires_grad = True
        print("Не удалось адресовать encoder.layer; FT policy fallback: all layers.")

model = SbertMeanPoolClassifier(BASE_MODEL, num_labels=len(CATEGORIES)).to(device)
set_finetune_policy(model, FINETUNE_MODE, UNFREEZE_LAST_N)

# Память: градиентный чекпоинтинг (снижает VRAM, медленнее)
model.encoder.gradient_checkpointing_enable()


FT policy: all layers (полный FT).


In [8]:
# class weights (для дисбаланса)
classes = np.arange(len(CATEGORIES))
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=train_df["label"].values
)
class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

loss_fct = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=LABEL_SMOOTH)

def compute_loss_from_logits(logits, labels):
    return loss_fct(logits, labels)


In [9]:
# Патчим класс модели: добавляем методы с поддержкой kwargs
def _gc_enable(self, gradient_checkpointing_kwargs=None, **kwargs):
    enc = getattr(self, "encoder", None)
    if enc is None:
        return
    # Trainer ожидает, что use_cache выключен при GC
    if hasattr(enc, "config") and getattr(enc.config, "use_cache", None) is not None:
        enc.config.use_cache = False
    # Включаем gradient checkpointing у энкодера с пробросом kwargs
    if hasattr(enc, "gradient_checkpointing_enable"):
        try:
            enc.gradient_checkpointing_enable(**(gradient_checkpointing_kwargs or {}))
        except TypeError:
            # На случай старых сигнатур просто вызываем без аргументов
            enc.gradient_checkpointing_enable()

def _gc_disable(self, **kwargs):
    enc = getattr(self, "encoder", None)
    if enc is None:
        return
    if hasattr(enc, "gradient_checkpointing_disable"):
        try:
            enc.gradient_checkpointing_disable(**kwargs)
        except TypeError:
            enc.gradient_checkpointing_disable()

# навесим методы на наш класс
SbertMeanPoolClassifier.gradient_checkpointing_enable  = _gc_enable
SbertMeanPoolClassifier.gradient_checkpointing_disable = _gc_disable

# Можно сразу вручную включить GC (не обязательно, Trainer сделает сам)
try:
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    print("Gradient checkpointing enabled (patched).")
except Exception as e:
    print("GC enable warning:", e)


Gradient checkpointing enabled (patched).


In [None]:

import gc, torch
from transformers import TrainingArguments, Trainer

# освободим предыдущий trainer/кэш
try:
    del trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
except NameError:
    pass

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    from sklearn.metrics import f1_score
    return {"weighted_f1": f1_score(labels, preds, average="weighted")}

# группы параметров 
head_params = list(model.classifier.parameters())
enc_params  = [p for n,p in model.named_parameters() if p.requires_grad and ("classifier" not in n)]
optim_groups = [
    {"params": enc_params,  "lr": LR_ENC,  "weight_decay": WEIGHT_DECAY},
    {"params": head_params, "lr": LR_HEAD, "weight_decay": WEIGHT_DECAY},
]

args = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=EPOCHS,
    eval_strategy="epoch",
    save_strategy="epoch",           # сохраняем раз в эпоху
    save_total_limit=1,              # держим только один чекпоинт
    load_best_model_at_end=True,     # в конце подгружаем лучший по метрике
    metric_for_best_model="weighted_f1",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    report_to="none",
)

class SbertTrainer(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            self.optimizer = torch.optim.AdamW(optim_groups, betas=(0.9, 0.999))
        return self.optimizer


    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)           
        logits  = outputs["logits"]
        loss = compute_loss_from_logits(logits, labels)  
        return (loss, outputs) if return_outputs else loss


trainer = SbertTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    processing_class=tokenizer,   
    data_collator=collator,
    compute_metrics=compute_metrics,
)


trainer.train()


Epoch,Training Loss,Validation Loss,Weighted F1
1,3.2916,2.589093,0.379805
2,2.4907,2.408216,0.450249
3,2.2293,2.361682,0.504203
4,2.0533,2.371959,0.546119
5,1.8541,2.497634,0.400316
6,1.7445,2.487984,0.381856
7,1.762,2.478459,0.73008
8,1.6503,2.498127,0.429576
9,1.8053,2.485314,0.402872
10,1.7213,2.489939,0.375788


Epoch,Training Loss,Validation Loss,Weighted F1
1,3.2916,2.589093,0.379805
2,2.4907,2.408216,0.450249
3,2.2293,2.361682,0.504203
4,2.0533,2.371959,0.546119
5,1.8541,2.497634,0.400316
6,1.7445,2.487984,0.381856
7,1.762,2.478459,0.73008
8,1.6503,2.498127,0.429576
9,1.8053,2.485314,0.402872
10,1.7213,2.489939,0.375788


TrainOutput(global_step=2275, training_loss=1.8151969389863067, metrics={'train_runtime': 2104.4132, 'train_samples_per_second': 17.273, 'train_steps_per_second': 1.081, 'total_flos': 0.0, 'train_loss': 1.8151969389863067, 'epoch': 25.0})

In [12]:
raw = trainer.predict(ds_val)
val_logits = raw.predictions
val_probs = torch.softmax(torch.tensor(val_logits), dim=-1).numpy()
val_preds = val_probs.argmax(axis=1)
val_true  = val_df["label"].to_numpy()

print("\nValidation report:")
print(classification_report(val_true, val_preds, target_names=CATEGORIES, digits=4))

# Порог для "нет товара" по max softmax
none_id = cat2id["нет товара"]
probs_max = val_probs.max(axis=1)
base_f1 = f1_score(val_true, val_preds, average="weighted")

best_tau, best_f1 = 0.0, base_f1
for tau in np.linspace(0.40, 0.95, 12):
    adj = val_preds.copy()
    adj[probs_max < tau] = none_id
    f1 = f1_score(val_true, adj, average="weighted")
    if f1 > best_f1:
        best_f1, best_tau = f1, tau

print(f"\nBase F1={base_f1:.4f} | Best F1={best_f1:.4f} at tau={best_tau:.2f}")



Validation report:
                        precision    recall  f1-score   support

       бытовая техника     0.1667    0.5000    0.2500         2
                 обувь     0.7500    0.4500    0.5625        40
                одежда     0.9469    0.6485    0.7698       165
                посуда     0.0097    1.0000    0.0192         1
              текстиль     0.8250    0.7333    0.7765        45
      товары для детей     0.2222    0.2500    0.2353         8
украшения и аксессуары     1.0000    0.1429    0.2500         7
           электроника     0.8333    0.5000    0.6250        10
            нет товара     0.9839    0.7093    0.8243        86

              accuracy                         0.6291       364
             macro avg     0.6375    0.5482    0.4792       364
          weighted avg     0.8940    0.6291    0.7301       364


Base F1=0.7301 | Best F1=0.7301 at tau=0.00


Разультат намного лучше

In [13]:
test_df = pd.read_csv(TEST_CSV)
test_texts = test_df["text"].astype(str).str.replace(r"\s+"," ", regex=True).str.strip().tolist()

def predict_texts(texts: List[str], batch_size=256):
    mdl = trainer.model.eval()
    dev = next(mdl.parameters()).device
    all_probs = []
    with torch.inference_mode(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", truncation=True, max_length=MAX_LEN,
                            padding=True, pad_to_multiple_of=8).to(dev)
            logits = mdl(**enc)["logits"]
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            all_probs.append(probs)
    return np.vstack(all_probs)

test_probs = predict_texts(test_texts, batch_size=256)
test_preds = test_probs.argmax(axis=1)

# применим tau для "нет товара"
test_max = test_probs.max(axis=1)
test_preds[test_max < best_tau] = none_id

submission = pd.DataFrame({
    "id": np.arange(len(test_df)),
    "category": [id2cat[i] for i in test_preds]
})
sub_path = f"{OUT_DIR}/submission_sbert_ft.csv"
submission.to_csv(sub_path, index=False)
print("Saved submission:", sub_path)


  with torch.inference_mode(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


Saved submission: ./out_cls/submission_sbert_ft.csv


Пробуем ещё чуть-чуть улучшить метрику

In [None]:
# --- апгрейд гиперпараметров ---
MAX_LEN = 256          # было 160 — длиннее контекст часто помогает
TRAIN_BS = 8           # уменьшили батч для VRAM при MAX_LEN=256
EVAL_BS  = 64

FINETUNE_MODE   = "all"


set_finetune_policy(model, FINETUNE_MODE, UNFREEZE_LAST_N)

from datasets import Dataset
from transformers import DataCollatorWithPadding

# Если по какой-то причине train_df/val_df утеряны, раскомментируй блок ниже, чтобы восстановить их
# import pandas as pd
# df = pd.read_csv("./pseudo_labeled_llm_zeroshot_fast.csv")  # или твой файл с псевдолейблами
# df = df[df["text"].astype(str).str.strip().ne("")]
# df["label"] = df["category"].map(cat2id)
# from sklearn.model_selection import train_test_split
# train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

ds_train = Dataset.from_pandas(train_df[["text", "label"]].reset_index(drop=True))
ds_val   = Dataset.from_pandas(val_df[["text", "label"]].reset_index(drop=True))

# Токенизация с новым MAX_LEN
def tok_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN, padding=False)

ds_train = ds_train.map(tok_batch, batched=True, remove_columns=["text"])
ds_val   = ds_val.map(tok_batch,   batched=True, remove_columns=["text"])

#  Коллатор и sanity-check
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)
print("train columns:", ds_train.column_names)
print("val columns:", ds_val.column_names)
# должны увидеть что-то вроде: ['label', 'input_ids', 'token_type_ids', 'attention_mask']



FT policy: all layers.


Map:   0%|          | 0/1454 [00:00<?, ? examples/s]

Map:   0%|          | 0/364 [00:00<?, ? examples/s]

train columns: ['label', 'input_ids', 'token_type_ids', 'attention_mask']
val columns: ['label', 'input_ids', 'token_type_ids', 'attention_mask']


FocalLoss (вместо CrossEntropy) + class weights

In [None]:
import torch.nn as nn
import torch

# class weights из данных
from sklearn.utils.class_weight import compute_class_weight
classes = np.arange(len(CATEGORIES))
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=train_df["label"].values
)
class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=1.5, label_smoothing=0.02):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ls = label_smoothing
    def forward(self, logits, target, sample_weight=None):
        ce = nn.functional.cross_entropy(
            logits, target, weight=self.alpha, reduction="none", label_smoothing=self.ls
        )
        pt = torch.softmax(logits, dim=-1)[torch.arange(len(target), device=logits.device), target]
        loss = ((1 - pt) ** self.gamma) * ce
        if sample_weight is not None:
            loss = loss * sample_weight.to(loss.dtype)
        return loss.mean()

loss_fct = FocalLoss(alpha=class_weights, gamma=1.5, label_smoothing=0.02)

def compute_loss_from_logits(logits, labels, sample_weight=None):
    return loss_fct(logits, labels, sample_weight=sample_weight)


In [None]:
from transformers import TrainingArguments, Trainer
import torch, gc

def get_llrd_params(model, base_lr_enc=1e-5, head_lr=2e-4, decay=0.9, weight_decay=0.01):
    groups = []
    # encoder layers
    layers = getattr(getattr(model.encoder, "encoder", None), "layer", None)
    if layers is not None:
        lr = base_lr_enc
        for layer in reversed(layers):     # глубже — меньше lr
            groups.append({"params": layer.parameters(), "lr": lr, "weight_decay": weight_decay})
            lr *= decay
        # embeddings и pooler
        groups.append({"params": model.encoder.embeddings.parameters(), "lr": lr, "weight_decay": weight_decay})
        if hasattr(model.encoder, "pooler") and model.encoder.pooler is not None:
            groups.append({"params": model.encoder.pooler.parameters(), "lr": base_lr_enc, "weight_decay": weight_decay})
    else:
        groups.append({"params": model.encoder.parameters(), "lr": base_lr_enc, "weight_decay": weight_decay})
    # head
    groups.append({"params": model.classifier.parameters(), "lr": head_lr, "weight_decay": weight_decay})
    return groups

optim_groups = get_llrd_params(model, base_lr_enc=1e-5, head_lr=2e-4, decay=0.9, weight_decay=0.01)

# чистим старый trainer из памяти
try:
    del trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
except NameError:
    pass

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    from sklearn.metrics import f1_score
    return {"weighted_f1": f1_score(labels, preds, average="weighted")}

args = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=7,                     
    eval_strategy="epoch",
    save_strategy="epoch",                   # сохраняем раз в эпоху
    save_total_limit=1,                      # храним только лучший чекпоинт
    load_best_model_at_end=True,             # подгружаем лучший по метрике
    metric_for_best_model="weighted_f1",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=False,            # чтобы избежать коллизий с кастомной моделью
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    report_to="none",
)

class SbertTrainer(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            self.optimizer = torch.optim.AdamW(optim_groups, betas=(0.9, 0.999))
        return self.optimizer
    
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        sample_weight = inputs.pop("sample_weight", None) if "sample_weight" in inputs else None
        outputs = model(**inputs)   
        logits  = outputs["logits"]
        loss = compute_loss_from_logits(logits, labels, sample_weight=sample_weight)
        return (loss, outputs) if return_outputs else loss

trainer = SbertTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    processing_class=tokenizer,  
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.0582,2.549911,0.767551


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.0582,2.549911,0.767551
2,0.009,2.707064,0.771813
3,0.003,3.098678,0.769319
4,0.0036,3.093747,0.775874
5,0.0002,3.239947,0.768843
6,0.0002,3.273902,0.768843
7,0.0001,3.285834,0.766041


TrainOutput(global_step=1274, training_loss=0.012041866613750812, metrics={'train_runtime': 687.4828, 'train_samples_per_second': 14.805, 'train_steps_per_second': 1.853, 'total_flos': 0.0, 'train_loss': 0.012041866613750812, 'epoch': 7.0})

In [24]:
from sklearn.metrics import f1_score, classification_report

raw = trainer.predict(ds_val)
val_logits = raw.predictions
val_probs = torch.softmax(torch.tensor(val_logits), dim=-1).numpy()
val_true  = val_df["label"].to_numpy()
base_pred = val_probs.argmax(1)

print("\nBase validation report:")
print(classification_report(val_true, base_pred, target_names=CATEGORIES, digits=4))

K = len(CATEGORIES)
best_tau_per_class = np.zeros(K)
best_pred = base_pred.copy()
best_f1 = f1_score(val_true, base_pred, average="weighted")
none_id = cat2id["нет товара"]

for k in range(K):
    best_k_tau, best_k_f1 = 0.0, best_f1
    for tau in np.linspace(0.30, 0.95, 14):
        pred = base_pred.copy()
        # для тех, у кого предсказан класс k, а уверенность по k < tau → переводим в "нет товара"
        mask = (pred == k) & (val_probs[np.arange(len(val_probs)), k] < tau)
        pred[mask] = none_id
        f1 = f1_score(val_true, pred, average="weighted")
        if f1 > best_k_f1:
            best_k_f1, best_k_tau, best_pred = f1, tau, pred
    best_tau_per_class[k] = best_k_tau

print(f"\nF1 after per-class thresholds: {f1_score(val_true, best_pred, average='weighted'):.4f}")
print("Per-class tau:", {CATEGORIES[i]: round(float(t), 2) for i,t in enumerate(best_tau_per_class)})

def apply_per_class_thresholds(probs: np.ndarray, base_pred: np.ndarray, tau_pc: np.ndarray, none_idx: int):
    pred = base_pred.copy()
    for k in range(probs.shape[1]):
        mask = (pred == k) & (probs[np.arange(len(probs)), k] < tau_pc[k])
        pred[mask] = none_idx
    return pred



Base validation report:
                        precision    recall  f1-score   support

       бытовая техника     0.0000    0.0000    0.0000         2
                 обувь     0.5385    0.5250    0.5316        40
                одежда     0.8315    0.8970    0.8630       165
                посуда     0.0000    0.0000    0.0000         1
              текстиль     0.7755    0.8444    0.8085        45
      товары для детей     0.5000    0.2500    0.3333         8
украшения и аксессуары     1.0000    0.1429    0.2500         7
           электроника     0.4286    0.3000    0.3529        10
            нет товара     0.8706    0.8605    0.8655        86

              accuracy                         0.7885       364
             macro avg     0.5494    0.4244    0.4450       364
          weighted avg     0.7796    0.7885    0.7759       364



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



F1 after per-class thresholds: 0.7771
Per-class tau: {'бытовая техника': 0.0, 'обувь': 0.5, 'одежда': 0.0, 'посуда': 0.0, 'текстиль': 0.0, 'товары для детей': 0.0, 'украшения и аксессуары': 0.0, 'электроника': 0.8, 'нет товара': 0.0}


In [None]:
# --- self-training: берём уверенные примеры и даём им больший вес ---
import numpy as np
import torch
from typing import List
from datasets import Dataset

def predict_probs_texts(texts: List[str], batch_size=256):
    mdl = trainer.model.eval()
    dev = next(mdl.parameters()).device
    out = []
    with torch.inference_mode(), torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(
                batch, return_tensors="pt",
                truncation=True, max_length=MAX_LEN,
                padding=True, pad_to_multiple_of=8
            ).to(dev)
            logits = mdl(**enc)["logits"]
            out.append(torch.softmax(logits, dim=-1).cpu().numpy())
    return np.vstack(out)

# 1) Прогоняем текущую модель по TRAIN-части (без валидации)
train_texts = train_df["text"].astype(str).tolist()
train_probs = predict_probs_texts(train_texts, batch_size=256)
train_conf  = train_probs.max(1)

# 2) Формируем веса: уверенным примерам больше вес
weights = np.where(train_conf >= 0.80, 1.0, 0.6).astype("float32")

# 3) Добавляем веса прямо в pandas и строим HF Dataset
train_df_st = train_df.reset_index(drop=True).copy()
train_df_st["sample_weight"] = weights[:len(train_df_st)]

ds_train_st = Dataset.from_pandas(train_df_st[["text","label","sample_weight"]])

def tok_batch_with_weight(batch):
    enc = tokenizer(
        batch["text"], truncation=True, max_length=MAX_LEN,
        padding=False
    )
    # протаскиваем веса как есть (HF сам склеит батч)
    enc["sample_weight"] = batch["sample_weight"]
    return enc

ds_train_st = ds_train_st.map(
    tok_batch_with_weight, batched=True, remove_columns=["text"]
)

print("train_st columns:", ds_train_st.column_names)
# должно быть: ['label','sample_weight','input_ids','token_type_ids','attention_mask']

# 4) Дообучение ещё 2 эпохи на взвешенном датасете
from transformers import TrainingArguments, Trainer
import gc

args_st = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    num_train_epochs=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="weighted_f1",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=False,   # оставим false для надёжности
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    report_to="none",
)

class SbertTrainerST(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            # используем те же optim_groups, что и раньше (LLRD/голова)
            self.optimizer = torch.optim.AdamW(optim_groups, betas=(0.9, 0.999))
        return self.optimizer


    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        sample_weight = inputs.pop("sample_weight") if "sample_weight" in inputs else None
        outputs = model(**inputs)      # {"logits": ...}
        logits  = outputs["logits"]
        loss = compute_loss_from_logits(logits, labels, sample_weight=sample_weight)
        return (loss, outputs) if return_outputs else loss

# подчистим старый тренер
try:
    del trainer_st
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
except:
    pass

trainer_st = SbertTrainerST(
    model=trainer.model,                # продолжаем обучать текущую лучшую
    args=args_st,
    train_dataset=ds_train_st,
    eval_dataset=ds_val,                # вал не трогаем
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer_st.train()

# обновим основной trainer на дообученную модель
trainer = trainer_st


Map:   0%|          | 0/1454 [00:00<?, ? examples/s]

train_st columns: ['label', 'sample_weight', 'input_ids', 'token_type_ids', 'attention_mask']


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.0002,3.607557,0.789523


Epoch,Training Loss,Validation Loss,Weighted F1
1,0.0002,3.607557,0.789523
2,0.0,3.775112,0.777546


In [28]:
# пересчитаем метрики и пороги (могли измениться после self-training)
raw = trainer.predict(ds_val)
val_logits = raw.predictions
val_probs = torch.softmax(torch.tensor(val_logits), dim=-1).numpy()
val_true  = val_df["label"].to_numpy()
base_pred = val_probs.argmax(1)

# пер-классовые пороги (повтор как в ячейке 4)
K = len(CATEGORIES)
best_tau_per_class = np.zeros(K)
best_pred = base_pred.copy()
best_f1 = f1_score(val_true, base_pred, average="weighted")
none_id = cat2id["нет товара"]

for k in range(K):
    best_k_tau, best_k_f1 = 0.0, best_f1
    for tau in np.linspace(0.30, 0.95, 14):
        pred = base_pred.copy()
        mask = (pred == k) & (val_probs[np.arange(len(val_probs)), k] < tau)
        pred[mask] = none_id
        f1 = f1_score(val_true, pred, average="weighted")
        if f1 > best_k_f1:
            best_k_f1, best_k_tau, best_pred = f1, tau, pred
    best_tau_per_class[k] = best_k_tau

print(f"Final F1 on val: {f1_score(val_true, best_pred, average='weighted'):.4f}")
print("Final per-class tau:", {CATEGORIES[i]: round(float(t), 2) for i,t in enumerate(best_tau_per_class)})

# --- инференс на test ---
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
test_texts = test_df["text"].astype(str).str.replace(r"\s+"," ", regex=True).str.strip().tolist()

def predict_probs_texts_batch(texts: List[str], batch_size=256):
    mdl = trainer.model.eval()
    dev = next(mdl.parameters()).device
    outs = []
    with torch.inference_mode(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", truncation=True, max_length=MAX_LEN,
                            padding=True, pad_to_multiple_of=8).to(dev)
            logits = mdl(**enc)["logits"]
            outs.append(torch.softmax(logits, dim=-1).cpu().numpy())
    return np.vstack(outs)

test_probs = predict_probs_texts_batch(test_texts, batch_size=256)
test_base  = test_probs.argmax(1)
test_pred  = apply_per_class_thresholds(test_probs, test_base, best_tau_per_class, none_id)

submission = pd.DataFrame({
    "id": np.arange(len(test_df)),
    "category": [CATEGORIES[i] for i in test_pred]
})
sub_path = f"{OUT_DIR}/submission_sbert_ft_plus.csv"
submission.to_csv(sub_path, index=False)
print("Saved:", sub_path)


Final F1 on val: 0.7895
Final per-class tau: {'бытовая техника': 0.0, 'обувь': 0.0, 'одежда': 0.0, 'посуда': 0.0, 'текстиль': 0.0, 'товары для детей': 0.0, 'украшения и аксессуары': 0.0, 'электроника': 0.0, 'нет товара': 0.0}


  with torch.inference_mode(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


Saved: ./out_cls/submission_sbert_ft_plus.csv


In [None]:
import pandas as pd

BAD_PATH = "/content/out_cls/submission_sbert_ft_plus.csv"  
GOOD_PATH = "/content/submission.csv"

sub_bad = pd.read_csv(BAD_PATH)

# если есть колонка id — отсортируем и удалим
if "id" in sub_bad.columns:
    sub_bad = sub_bad.sort_values("id")

submission = sub_bad[["category"]].copy()
assert list(submission.columns) == ["category"], "Ожидаю одну колонку 'category'"

submission.to_csv(GOOD_PATH, index=False)
print("Saved fixed submission to:", GOOD_PATH, "| shape:", submission.shape)


Saved fixed submission to: /content/submission.csv | shape: (7276, 1)
