In [1]:
import os

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from src.category_tree.category_tree import CategoryTree
from loguru import logger

INFO:datasets:PyTorch version 2.6.0 available.


In [2]:
def seed_everything(seed: int):
    """Seed every RNG used in this notebook and request deterministic cuDNN.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for reproducible kernel selection.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN autotune and pick different (possibly
    # non-deterministic) algorithms between runs, which defeats the line above.
    torch.backends.cudnn.benchmark = False

In [10]:
# --- Input locations -------------------------------------------------------
CAT_TREE_PATH = "../data/category_tree.csv"
DATASET_PATH = "../data/dataset_v1/dataset_for_experiments.parquet"

# --- Column names as stored in the raw dataset -----------------------------
TITLE_COL = "source_name"
CAT_ID_COL = "cat_id"
PART_COL = "part"            # split marker column
PART_TYPE_COL = "part_type"

# --- Column names used once the frame is prepared for the model ------------
TITLE_MODEL_COL = "text"
CAT_ID_MODEL_COL = "label"

# --- Model / training settings ---------------------------------------------
MODEL = "cointegrated/rubert-tiny2"
NUM_EPOCHS = 1

# Load dataset

In [17]:
# Use the paths from the config cell: the LOCAL_* names referenced here before
# are not defined anywhere in this notebook and fail on a fresh kernel.
category_tree = CategoryTree(category_tree_path=CAT_TREE_PATH)
df = pd.read_parquet(DATASET_PATH)

# Switch to the model-facing column names (TITLE_MODEL_COL / CAT_ID_MODEL_COL)
# and replace raw category ids with the category tree's encoded labels.
df = df.rename(columns={TITLE_COL: TITLE_MODEL_COL, CAT_ID_COL: CAT_ID_MODEL_COL})
df[CAT_ID_MODEL_COL] = category_tree.label_encoder.transform(df[CAT_ID_MODEL_COL])

In [18]:
# Preview the first rows after the column rename and label encoding.
df.head()

Unnamed: 0,text,label,part_type,part
0,"Сетевой кабель, патч корд Rj45 5 метров CAT5E,...",627,is,train
1,Фильтр-заглушка сливного насоса стиральной маш...,649,is,train
2,Умные часы CheckME Smart CMSKC06SS с калькулят...,38,is,train
3,Силиконовый чехол Mcover для беспроводных науш...,56,is,train
4,Зарядное устройство Panasonic Basic BQ-CC51 + ...,307,is,train


# Tokenization

In [26]:
from datasets import Dataset, DatasetDict

# Tokenizer matching the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def tokenize_function(examples):
    """Tokenize the title column of a batch, truncating over-long inputs.

    Args:
        examples: Mapping that holds the titles under ``TITLE_MODEL_COL``.

    Returns:
        Whatever the tokenizer returns for those titles.
    """
    titles = examples[TITLE_MODEL_COL]
    encoded = tokenizer(titles, truncation=True)
    return encoded

def load_experiment_dataset(df: pd.DataFrame, parts=("train", "val")):
    """Build a tokenized ``DatasetDict`` with one split per requested part.

    Args:
        df: Frame containing ``TITLE_MODEL_COL``, ``CAT_ID_MODEL_COL`` and the
            split marker column ``PART_COL``.
        parts: Values of ``PART_COL`` to turn into splits. Defaults to the
            train/validation pair.

    Returns:
        A ``DatasetDict`` keyed by part name, tokenized with
        ``tokenize_function``.
    """
    model_columns = [TITLE_MODEL_COL, CAT_ID_MODEL_COL]

    parts_datasets = {
        part: Dataset.from_pandas(
            df.loc[df[PART_COL] == part, model_columns],
            split=part,
            # The filtered frame no longer has a plain RangeIndex; without this
            # the index is stored as an extra `__index_level_0__` column.
            preserve_index=False,
        )
        for part in parts
    }

    dataset = DatasetDict(parts_datasets)
    return dataset.map(tokenize_function, batched=True)

def load_full_dataset(df: pd.DataFrame):
    """Tokenize every row of ``df`` as one ``Dataset``, ignoring the split column.

    Args:
        df: Frame containing ``TITLE_MODEL_COL`` and ``CAT_ID_MODEL_COL``.

    Returns:
        The dataset after mapping ``tokenize_function`` over it in batches.
    """
    model_frame = df[[TITLE_MODEL_COL, CAT_ID_MODEL_COL]]
    full_dataset = Dataset.from_pandas(model_frame)
    return full_dataset.map(tokenize_function, batched=True)

In [36]:
# Tokenized train/val splits used by the validation experiments.
tokenized_datasets = load_experiment_dataset(df)

Map:   0%|          | 0/491736 [00:00<?, ? examples/s]

Map:   0%|          | 0/122980 [00:00<?, ? examples/s]

# Label Smoothing
Идея: для каждого листа дерева категорий перенесём долю вероятности (`smoothing`) с истинной метки на соседние листья с тем же родителем — это должно уменьшать влияние ошибок разметки.

In [28]:
import torch
import torch.nn as nn
from transformers import Trainer
from torch.nn import CrossEntropyLoss
import torch
from torch import Tensor
from functools import partial

class LabelSmoothingCrossEntropyLoss(nn.Module):
    """Cross-entropy loss with category-tree-aware label smoothing.

    Instead of spreading the smoothing mass over all classes, a ``smoothing``
    share of the target probability is split equally among the leaf siblings
    of the true label, i.e. leaves that share its parent in
    ``category_tree.inverted_edge_dict``. Labels without leaf siblings keep a
    one-hot target.

    Args:
        category_tree: Object exposing ``label_encoder`` (with ``transform`` and
            ``inverse_transform``), ``leaf_nodes`` and ``inverted_edge_dict``
            (a node -> parent mapping).
        smoothing: Probability mass moved from the true label to its siblings.
        reduction: Forwarded to ``torch.nn.CrossEntropyLoss``.

    Raises:
        ValueError: If ``smoothing`` is outside ``[0, 1]``.
    """

    def __init__(self, category_tree: CategoryTree, smoothing: float = 0.2, reduction: str = "mean"):
        super().__init__()

        if not 0.0 <= smoothing <= 1.0:
            raise ValueError(f"smoothing must be within [0, 1], got {smoothing}")

        self.smoothing = smoothing
        self.category_tree = category_tree

        self.label_encoder = self.category_tree.label_encoder
        self.leaf_nodes = set(self.category_tree.leaf_nodes)
        self.category_tree_edges = self.category_tree.inverted_edge_dict

        self.nearest_neighbors = self._precompute_nearest_neighbors()
        # encoded label -> list of encoded sibling labels, filled lazily so the
        # label encoder is consulted once per class instead of once per sample.
        # NOTE(review): assumes the label encoder is not refitted after init.
        self._neighbor_index_cache = {}

        self.loss_fct = CrossEntropyLoss(reduction=reduction)

    def forward(self, input, target):
        """Return the cross-entropy between ``input`` and the smoothed targets.

        Args:
            input: Logits; the size of the last dimension is taken as the
                number of classes.
            target: 1-D tensor of encoded class labels.
        """
        num_classes = input.shape[-1]

        true_dist = self._smooth_labels(target=target, num_classes=num_classes)
        return self.loss_fct(input, true_dist)

    def _smooth_labels(self, target, num_classes):
        """Build the ``(len(target), num_classes)`` smoothed target distribution."""
        device = target.device
        true_dist = torch.zeros(target.size(0), num_classes, device=device)

        # Collect all (row, column, probability) triples on the host and write
        # them with a single indexed assignment.
        rows, cols, probs = [], [], []
        for row, label in enumerate(target.tolist()):
            neighbors = self._encoded_neighbors(label)

            if not neighbors:
                # No leaf siblings: keep a one-hot target.
                rows.append(row)
                cols.append(label)
                probs.append(1.0)
                continue

            rows.append(row)
            cols.append(label)
            probs.append(1.0 - self.smoothing)

            neighbor_prob = self.smoothing / len(neighbors)
            for neighbor in neighbors:
                rows.append(row)
                cols.append(neighbor)
                probs.append(neighbor_prob)

        if rows:
            row_idx = torch.tensor(rows, dtype=torch.long, device=device)
            col_idx = torch.tensor(cols, dtype=torch.long, device=device)
            values = torch.tensor(probs, dtype=true_dist.dtype, device=device)
            true_dist[row_idx, col_idx] = values

        return true_dist

    def _encoded_neighbors(self, label: int) -> list:
        """Return the encoded labels of ``label``'s leaf siblings (cached)."""
        cached = self._neighbor_index_cache.get(label)
        if cached is None:
            node = self.label_encoder.inverse_transform([label])[0]
            siblings = self.nearest_neighbors[node]
            cached = [int(idx) for idx in self.label_encoder.transform(siblings)] if siblings else []
            self._neighbor_index_cache[label] = cached
        return cached

    def _precompute_nearest_neighbors(self):
        """Map every leaf node to the other leaf nodes sharing its parent.

        Leaves are grouped by parent in one pass over the edge mapping, rather
        than rescanning all edges for every leaf.
        """
        leaves_by_parent = dict()
        for node, parent in self.category_tree_edges.items():
            # `parent == parent` skips NaN-like parents, which never compare
            # equal to anything and therefore never have siblings.
            if node in self.leaf_nodes and parent == parent:
                leaves_by_parent.setdefault(parent, []).append(node)

        neighbors = dict()
        for label in self.leaf_nodes:
            target_parent = self.category_tree_edges[label]
            siblings = leaves_by_parent.get(target_parent, [])
            neighbors[label] = [node for node in siblings if node != label]

        return neighbors

class LabelSmoothingCrossEntropyLossTrainer(Trainer):
    """``Trainer`` that optimises the tree-aware label-smoothing loss.

    While the model is in training mode the loss is
    ``LabelSmoothingCrossEntropyLoss``; otherwise a plain ``CrossEntropyLoss``
    on the hard labels is used.

    Args:
        category_tree: Category tree forwarded to the smoothing loss.
        smoothing: Smoothing mass forwarded to the smoothing loss.
        reduction: Reduction used by both losses.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, category_tree: CategoryTree, smoothing: float = 0.2, reduction: str = "mean", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = LabelSmoothingCrossEntropyLoss(category_tree=category_tree, smoothing=smoothing, reduction=reduction)
        self.ce_loss = CrossEntropyLoss(reduction=reduction)

    # Parameter order and defaults follow Trainer.compute_loss so positional
    # calls written against the base class bind to the intended arguments.
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose
                ``logits``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with ``Trainer``;
                not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(**inputs)
        logits, labels = outputs.logits, inputs["labels"]

        # Smooth only while training; evaluation uses plain cross-entropy.
        criterion = self.loss_fct if model.training else self.ce_loss
        loss = criterion(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Обучение модели (train + val)

Оценим качество такого подхода на валидационной выборке

In [29]:
from src.metrics.transformers_metrics import hierarchical_accuracy

## 1. Обучение с Label Smoothing

In [31]:
# Reseed before creating the model so the newly initialised classifier head
# is reproducible.
seed_everything(42)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(category_tree.leaf_nodes),
)

training_args = TrainingArguments(
    output_dir="label_smoothing_rubert_trainer",
    eval_strategy="steps",
    eval_steps=500,
    num_train_epochs=NUM_EPOCHS,  # taken from the config cell instead of a literal
    per_device_train_batch_size=128,
    per_device_eval_batch_size=512,
    report_to="none",  # disable wandb
)

trainer = LabelSmoothingCrossEntropyLossTrainer(
    model=model,
    category_tree=category_tree,
    smoothing=0.2,
    reduction="mean",
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    processing_class=tokenizer,  # Automatic DataCollatorWithPadding
    compute_metrics=partial(hierarchical_accuracy, category_tree=category_tree.inverted_edge_dict),
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Hierarchical Accuracy
500,4.4484,5.13758,0.288843
1000,2.5906,4.564746,0.393297
1500,1.9228,4.331824,0.414706
2000,1.6168,4.247887,0.423817
2500,1.45,4.116574,0.435737
3000,1.3531,4.051841,0.440705
3500,1.3054,4.06018,0.442233


TrainOutput(global_step=3842, training_loss=2.0252213008946147, metrics={'train_runtime': 498.2579, 'train_samples_per_second': 986.911, 'train_steps_per_second': 7.711, 'total_flos': 676148946465600.0, 'train_loss': 2.0252213008946147, 'epoch': 1.0})

## 2. Обучение без Label Smoothing

In [37]:
# Same seed as the label-smoothing run so both start from the same
# classifier-head initialisation.
seed_everything(42)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(category_tree.leaf_nodes),
)

training_args = TrainingArguments(
    output_dir="rubert_trainer",
    eval_strategy="steps",
    eval_steps=500,
    num_train_epochs=NUM_EPOCHS,  # taken from the config cell instead of a literal
    per_device_train_batch_size=128,
    per_device_eval_batch_size=512,
    report_to="none",  # disable wandb
)

# Baseline: stock Trainer, i.e. no tree-aware label smoothing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    processing_class=tokenizer,  # Automatic DataCollatorWithPadding
    compute_metrics=partial(hierarchical_accuracy, category_tree=category_tree.inverted_edge_dict),
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Hierarchical Accuracy
500,4.0979,5.958326,0.27353
1000,2.0725,5.773002,0.393524
1500,1.3716,5.673862,0.423695
2000,1.061,5.76571,0.429899
2500,0.8884,5.773645,0.432419
3000,0.7848,5.809369,0.434596
3500,0.7385,5.849384,0.436265


TrainOutput(global_step=3842, training_loss=1.4966335822864474, metrics={'train_runtime': 311.3373, 'train_samples_per_second': 1579.432, 'train_steps_per_second': 12.34, 'total_flos': 676148946465600.0, 'train_loss': 1.4966335822864474, 'epoch': 1.0})

# Обучение модели (full)

In [32]:
# NOTE(review): this rebinds `tokenized_datasets` from the train/val DatasetDict
# to a single Dataset over all rows; cells that index it by split name
# (["train"] / ["val"]) will not work if re-run after this one.
tokenized_datasets = load_full_dataset(df)

Map:   0%|          | 0/614716 [00:00<?, ? examples/s]

In [34]:
# Reseed before creating the model so the newly initialised classifier head
# is reproducible.
seed_everything(42)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(category_tree.leaf_nodes),
)

# No evaluation settings here: no eval dataset is passed to this run.
training_args = TrainingArguments(
    output_dir="label_smoothing_rubert_full_trainer",
    num_train_epochs=NUM_EPOCHS,  # taken from the config cell instead of a literal
    per_device_train_batch_size=128,
    report_to="none",  # disable wandb
)

trainer = LabelSmoothingCrossEntropyLossTrainer(
    model=model,
    category_tree=category_tree,
    smoothing=0.2,
    reduction="mean",
    args=training_args,
    train_dataset=tokenized_datasets,
    processing_class=tokenizer,  # Automatic DataCollatorWithPadding
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,4.582
1000,2.6536
1500,1.9567
2000,1.6131
2500,1.425
3000,1.3225
3500,1.2474
4000,1.2127
4500,1.1841


TrainOutput(global_step=4803, training_loss=1.8641801175290833, metrics={'train_runtime': 420.0021, 'train_samples_per_second': 1463.602, 'train_steps_per_second': 11.436, 'total_flos': 866965952696160.0, 'train_loss': 1.8641801175290833, 'epoch': 1.0})