In [1]:
! pip install optuna seqeval evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 1

In [31]:
import pandas as pd
import ast
from tqdm import tqdm
from collections import Counter
import datasets
from datasets import (Dataset, Features, Sequence, Value, ClassLabel, load_dataset,
                    load_from_disk, concatenate_datasets, DatasetDict)
from sklearn.model_selection import KFold
from transformers import (AutoTokenizer, AutoModel, AutoModelForTokenClassification,
                         pipeline, TrainingArguments, Trainer,
                         DataCollatorForTokenClassification, EarlyStoppingCallback)
import torch
import optuna
import os
os.environ['WANDB_DISABLED'] = 'true'
import pickle
import numpy as np
import seqeval
import evaluate
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

seqeval = evaluate.load("seqeval")

In [3]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [4]:
TRAIN_PATH_COLAB = "/content/drive/MyDrive/Hackatons/X5_ner_2025/train.csv"
SUBMISSION_PATH_COLAB = "/content/drive/MyDrive/Hackatons/X5_ner_2025/submission.csv"
TRAIN_PATH_KAGGLE = "/kaggle/input/x5-ner-train/train.csv"
TRAIN_AUGMENTED = "/kaggle/input/train-augmented-col-perc/train_with_augmented_volume_percent.csv"

## IN

## Подготовка датасета

In [5]:
train_df_raw = pd.read_csv(TRAIN_AUGMENTED, sep=';')
train_df_raw

Unnamed: 0,sample,annotation
0,aa,"[(0, 2, 'O')]"
1,aala,"[(0, 4, 'O')]"
2,aarcca,"[(0, 6, 'O')]"
3,abon,"[(0, 4, 'O')]"
4,abso,"[(0, 4, 'B-BRAND')]"
...,...,...
27547,ветчина 300 гр нарезка,"[(0, 7, 'B-TYPE'), (8, 11, 'B-VOLUME'), (12, 1..."
27548,кукуруза 400 г mikado,"[(0, 8, 'B-TYPE'), (9, 12, 'B-VOLUME'), (13, 1..."
27549,кукуруза 340 гр,"[(0, 8, 'B-TYPE'), (9, 12, 'B-VOLUME'), (13, 1..."
27550,хлеб 350 г 5 злаков,"[(0, 4, 'B-TYPE'), (5, 8, 'B-VOLUME'), (9, 10,..."


In [6]:
train_df_raw['annotation'] = train_df_raw['annotation'].str.replace("\'0\'", "O")

In [7]:
train_df_raw['annotation'] = train_df_raw['annotation'].apply(lambda x: ast.literal_eval(str(x)))

In [8]:
all_words = []
all_tags = []
for i, row in tqdm(train_df_raw.iterrows()):
    words_sample = []
    entities_sample = []
    for ent in row['annotation']:
        word = row['sample'][ent[0]:ent[1]]
        words_sample.append(word)
        entities_sample.append(ent[2])
    all_words.append(words_sample)
    all_tags.append(entities_sample)
    # print(words_sample, entities_sample)

27552it [00:01, 26338.87it/s]


In [9]:
assert len(all_words) == len(all_tags), 'different lengths'

In [10]:
lbls_in_dataset = [
 'O',
 'B-BRAND',
 'B-PERCENT',
 'B-TYPE',
 'B-VOLUME',
 'I-BRAND',
 'I-PERCENT',
 'I-TYPE',
 'I-VOLUME']
label2id = {v:i for i, v in enumerate(lbls_in_dataset)}
id2label = {i:v for i, v in enumerate(lbls_in_dataset)}

In [11]:
features=Features(
    {
        "id": Value(dtype='int32', id=None),
        "tokens": Sequence(feature=Value(dtype='string', id=None)),
        "ner_tags": Sequence(feature=ClassLabel(num_classes=len(lbls_in_dataset), names=list(lbls_in_dataset)), id=None)
    }
)

In [12]:
ds = Dataset.from_dict({"id": list(range(len(all_words))),
                        "tokens": all_words,
                        "ner_tags": all_tags},
                       features=features)

In [None]:
ds_splitted = ds.train_test_split(
    test_size=0.25, shuffle=True, seed=42,
    # stratify_by_column='ner_tags'
)

In [43]:
Counter([x for l in ds['ner_tags'] for x in l])

Counter({0: 5407,
         1: 7252,
         3: 24845,
         5: 490,
         7: 4703,
         2: 180,
         4: 207,
         8: 163,
         6: 15})

In [None]:
Counter([x for l in ds_splitted['test']['ner_tags'] for x in l])

Counter({3: 6129, 1: 1774, 7: 1133, 0: 1358, 5: 122, 2: 9, 4: 18, 8: 8, 6: 2})

## Загрузка модели

In [16]:
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/rut5_large_ft_colab_2509/pytorch/default/1/ruT5_large_250925',
                                          use_fast=True,
                                          add_prefix_space=True)

model = AutoModelForTokenClassification.from_pretrained('/kaggle/input/rut5_large_ft_colab_2509/pytorch/default/1/ruT5_large_250925',
                                                        num_labels=len(lbls_in_dataset),
                                                        id2label=id2label,
                                                        label2id=label2id).to("cuda")

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [None]:
s = '''
сироп топинамбура
'''
r = tokenizer(s)
[tokenizer.decode(x) for x in r.input_ids][:5]

['сироп', 'то', 'пина', 'м', 'бур']

## Токенизация и подготовка

In [16]:
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_index = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [None]:
tokenized_ds_train = ds_splitted['train'].map(tokenize_and_align_labels,
                                              batched=True,
                                              fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/20438 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
tokenized_ds_test = ds_splitted['test'].map(tokenize_and_align_labels,
                                              batched=True,
                                              fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/6813 [00:00<?, ? examples/s]

## Обучение

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

print("Compiling model for faster training...")
torch.set_float32_matmul_precision('high')
model.to(device)

Using device: cuda:0
Compiling model for faster training...


T5ForTokenClassification(
  (transformer): T5EncoderModel(
    (shared): Embedding(32128, 1024)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=1024, out_features=1024, bias=False)
                (k): Linear(in_features=1024, out_features=1024, bias=False)
                (v): Linear(in_features=1024, out_features=1024, bias=False)
                (o): Linear(in_features=1024, out_features=1024, bias=False)
                (relative_attention_bias): Embedding(32, 16)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=1024, out_features=4096, bias=False)
                (wo):

In [None]:
# функция метрики
def compute_metrics_custom(p):
    labels_list = list(id2label.values())
    predictions, labels = p
    if not os.path.exists('/content/test/p_trainer.pkl'):
        os.makedirs('/content/test', exist_ok=True)
        with open('/content/test/p_trainer.pkl', 'wb') as f:
            pickle.dump(p, f)
    predictions = np.argmax(predictions, axis=2)


    true_predictions = [
        [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    report_dict = classification_report(true_labels, true_predictions, digits=4, output_dict=True)
    report = classification_report(true_labels, true_predictions, digits=4)
    macro_f1 = report_dict["macro avg"]["f1-score"]
    print("=== seqeval classification_report ===")
    print(report)
    CLASS_REPORT_PATH = '/content/logs/last_classification_report.txt'
    try:
        with open(CLASS_REPORT_PATH, "w", encoding="utf-8") as f:
            f.write(report)
    except Exception as e:
        print(f"Warning: failed to write classification report to {CLASS_REPORT_PATH}: {e}")

    return {
        "f1_macro": macro_f1,
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
import optuna.logging
optuna.logging.set_verbosity(optuna.logging.INFO)

best_f1 = 0

GDRIVE_DIR = '/content/drive/MyDrive/Hackatons/X5_ner_2025/ruT5_large_250925'

def printer(s):
    print('*'*150, end='\n\n')
    print(s, end='\n\n')
    print('*'*150, end='\n\n')

In [None]:
early_stop_cb = EarlyStoppingCallback(early_stopping_patience=2)

In [None]:
def objective(trial: optuna.Trial, model=model):
    global best_f1
    model = model
    model_name = 'ruT5_large_250925_optuna_v1'
    trial_check_dir = f"./checkpoints_trial"

    os.makedirs(trial_check_dir, exist_ok=True)
    args = TrainingArguments(
        # f"{model_name}-finetuned-ner",
        output_dir=trial_check_dir,
        overwrite_output_dir=True,
        eval_strategy = "epoch",
        torch_compile=True,
        # 64\
        per_device_train_batch_size=256,
        per_device_eval_batch_size=256,
        learning_rate=trial.suggest_loguniform('learning_rate', low=1e-5, high=5e-4),
        weight_decay=trial.suggest_loguniform('weight_decay', 1e-4, 0.05),
        num_train_epochs=trial.suggest_int('num_train_epochs', low = 3, high = 10),
        seed=42,
        data_seed=24,
        gradient_accumulation_steps=2,
        warmup_ratio=0.1,
        report_to=None,
        logging_dir="./logs",
        logging_steps=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1_macro",
        # greater_is_better=False,
        save_total_limit=1,
        save_strategy="epoch",  # Changed to match evaluation_strategy
    )

    # early_stopping = EarlyStoppingCallback(
    #     early_stopping_patience=1,  # Stop if F1 decreases for 1 consecutive epoch
    #     early_stopping_threshold=0.001
    # )

    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_ds_train,
        eval_dataset=tokenized_ds_test,
        # train_dataset=small_dataset_train,
        # eval_dataset=small_dataset_test,
        data_collator=data_collator,
        compute_metrics=compute_metrics_custom,
        tokenizer=tokenizer,
        callbacks=[early_stop_cb],
    )

    trainer.train()

    # Evaluate and save best model globally
    eval_metrics = trainer.evaluate()
    current_f1 = eval_metrics["eval_f1_macro"]

    if current_f1 > best_f1:
        best_f1 = current_f1
        trainer.save_model("./best_model")
        trainer.save_model(GDRIVE_DIR)
        printer(f"New best model saved with F1: {best_f1:.4f}")
    return current_f1

In [None]:
study = optuna.create_study(study_name='test_optuna', direction='maximize')
study.optimize(func=objective, n_trials=5)

[I 2025-09-25 15:47:56,064] A new study created in memory with name: test_optuna
  learning_rate=trial.suggest_loguniform('learning_rate', low=1e-5, high=5e-4),
  weight_decay=trial.suggest_loguniform('weight_decay', 1e-4, 0.05),
The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
W0925 15:48:54.194000 16270 torch/_inductor/utils.py:1436] [0/0] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,F1 Macro,Precision,Recall,F1,Accuracy
1,0.7423,0.537095,0.400311,0.803884,0.892531,0.845891,0.843418
2,0.3624,0.358328,0.450653,0.905844,0.943039,0.924067,0.915001
3,0.1586,0.302577,0.579675,0.930021,0.954029,0.941872,0.930194
4,0.1584,0.325737,0.720133,0.916179,0.951784,0.933642,0.924071
5,0.0339,0.308182,0.823673,0.938384,0.958659,0.948413,0.937526
6,0.0416,0.37998,0.770415,0.939387,0.954543,0.946905,0.936732
7,0.036,0.425532,0.874428,0.938848,0.96067,0.949634,0.938206
8,0.0148,0.478993,0.876659,0.943126,0.959313,0.951151,0.938773
9,0.0042,0.514241,0.876177,0.942816,0.958425,0.950557,0.938093


  _warn_prf(average, modifier, msg_start, len(result))


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.6379    0.7954    0.7080      5043
     PERCENT     0.0000    0.0000    0.0000        14
        TYPE     0.8637    0.9250    0.8933     16297
      VOLUME     0.0000    0.0000    0.0000        29

   micro avg     0.8039    0.8925    0.8459     21383
   macro avg     0.3754    0.4301    0.4003     21383
weighted avg     0.8087    0.8925    0.8478     21383



  _warn_prf(average, modifier, msg_start, len(result))


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8601    0.8547    0.8574      5043
     PERCENT     0.0000    0.0000    0.0000        14
        TYPE     0.9191    0.9729    0.9452     16297
      VOLUME     0.0000    0.0000    0.0000        29

   micro avg     0.9058    0.9430    0.9241     21383
   macro avg     0.4448    0.4569    0.4507     21383
weighted avg     0.9034    0.9430    0.9226     21383

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8873    0.9011    0.8941      5043
     PERCENT     1.0000    0.0714    0.1333        14
        TYPE     0.9439    0.9723    0.9579     16297
      VOLUME     0.3600    0.3103    0.3333        29

   micro avg     0.9300    0.9540    0.9419     21383
   macro avg     0.7978    0.5638    0.5797     21383
weighted avg     0.9298    0.9540    0.9415     21383

=== seqeval classification_report ===
              pr

There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9096    0.9139    0.9118      5043
     PERCENT     0.9000    0.6429    0.7500        14
        TYPE     0.9535    0.9737    0.9635     16297
      VOLUME     0.8667    0.8966    0.8814        29

   micro avg     0.9431    0.9593    0.9512     21383
   macro avg     0.9074    0.8568    0.8767     21383
weighted avg     0.9430    0.9593    0.9511     21383



[I 2025-09-25 16:21:06,317] Trial 0 finished with value: 0.8766589668718043 and parameters: {'learning_rate': 0.0004683284028374025, 'weight_decay': 0.0020733902422178014, 'num_train_epochs': 9}. Best is trial 0 with value: 0.8766589668718043.
  learning_rate=trial.suggest_loguniform('learning_rate', low=1e-5, high=5e-4),
  weight_decay=trial.suggest_loguniform('weight_decay', 1e-4, 0.05),
The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


******************************************************************************************************************************************************

New best model saved with F1: 0.8767

******************************************************************************************************************************************************



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,Precision,Recall,F1,Accuracy
1,0.0029,0.546093,0.872007,0.943423,0.959968,0.951624,0.939189
2,0.0009,0.623692,0.854504,0.941767,0.960529,0.951056,0.939076
3,0.0001,0.644883,0.872071,0.94294,0.960623,0.951699,0.939453
4,0.001,0.610403,0.871778,0.944058,0.958893,0.951418,0.939302
5,0.0002,0.615621,0.872327,0.94305,0.961044,0.951962,0.940134
6,0.0002,0.625413,0.872305,0.942818,0.960763,0.951706,0.939869
7,0.0001,0.628055,0.872243,0.942769,0.96067,0.951635,0.939907


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9143    0.9096    0.9119      5043
     PERCENT     0.9000    0.6429    0.7500        14
        TYPE     0.9523    0.9760    0.9640     16297
      VOLUME     0.8621    0.8621    0.8621        29

   micro avg     0.9434    0.9600    0.9516     21383
   macro avg     0.9072    0.8476    0.8720     21383
weighted avg     0.9432    0.9600    0.9515     21383

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9089    0.9137    0.9113      5043
     PERCENT     0.8889    0.5714    0.6957        14
        TYPE     0.9520    0.9755    0.9636     16297
      VOLUME     0.8333    0.8621    0.8475        29

   micro avg     0.9418    0.9605    0.9511     21383
   macro avg     0.8958    0.8307    0.8545     21383
weighted avg     0.9416    0.9605    0.9509     21383

=== seqeval classification_report ===
              pr

There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


[I 2025-09-25 16:41:59,096] Trial 1 finished with value: 0.8723269404322582 and parameters: {'learning_rate': 1.6956919779303677e-05, 'weight_decay': 0.0004587134033578368, 'num_train_epochs': 7}. Best is trial 0 with value: 0.8766589668718043.
  learning_rate=trial.suggest_loguniform('learning_rate', low=1e-5, high=5e-4),
  weight_decay=trial.suggest_loguniform('weight_decay', 1e-4, 0.05),
The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9085    0.9175    0.9130      5043
     PERCENT     0.9000    0.6429    0.7500        14
        TYPE     0.9538    0.9750    0.9643     16297
      VOLUME     0.8621    0.8621    0.8621        29

   micro avg     0.9430    0.9610    0.9520     21383
   macro avg     0.9061    0.8494    0.8723     21383
weighted avg     0.9429    0.9610    0.9519     21383



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,Precision,Recall,F1,Accuracy
1,0.0,1.07341,0.871447,0.943406,0.958098,0.950695,0.938017
2,0.0015,0.843991,0.870619,0.942651,0.957022,0.949782,0.937866
3,0.0002,1.071172,0.892632,0.940048,0.962821,0.951298,0.939151
4,0.0,1.03439,0.883332,0.941954,0.960015,0.950899,0.939
5,0.0006,0.740452,0.887223,0.94166,0.959407,0.950451,0.939265


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9087    0.9120    0.9103      5043
     PERCENT     0.9000    0.6429    0.7500        14
        TYPE     0.9541    0.9728    0.9634     16297
      VOLUME     0.8621    0.8621    0.8621        29

   micro avg     0.9434    0.9581    0.9507     21383
   macro avg     0.9062    0.8474    0.8714     21383
weighted avg     0.9433    0.9581    0.9506     21383

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9020    0.9177    0.9098      5043
     PERCENT     1.0000    0.6429    0.7826        14
        TYPE     0.9554    0.9697    0.9625     16297
      VOLUME     0.8276    0.8276    0.8276        29

   micro avg     0.9427    0.9570    0.9498     21383
   macro avg     0.9212    0.8395    0.8706     21383
weighted avg     0.9427    0.9570    0.9498     21383

=== seqeval classification_report ===
              pr

There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9099    0.9129    0.9114      5043
     PERCENT     1.0000    0.7143    0.8333        14
        TYPE     0.9492    0.9786    0.9637     16297
      VOLUME     0.8621    0.8621    0.8621        29

   micro avg     0.9400    0.9628    0.9513     21383
   macro avg     0.9303    0.8670    0.8926     21383
weighted avg     0.9399    0.9628    0.9512     21383



[I 2025-09-25 16:56:43,952] Trial 2 finished with value: 0.8926324166140165 and parameters: {'learning_rate': 6.44944805049113e-05, 'weight_decay': 0.004066034568680406, 'num_train_epochs': 9}. Best is trial 2 with value: 0.8926324166140165.
  learning_rate=trial.suggest_loguniform('learning_rate', low=1e-5, high=5e-4),
  weight_decay=trial.suggest_loguniform('weight_decay', 1e-4, 0.05),
The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


******************************************************************************************************************************************************

New best model saved with F1: 0.8926

******************************************************************************************************************************************************



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,Precision,Recall,F1,Accuracy
1,0.137,0.558386,0.858002,0.93688,0.950288,0.943536,0.931932
2,0.0319,0.395097,0.795494,0.938484,0.951036,0.944718,0.93197
3,0.0089,0.435523,0.804443,0.941481,0.952532,0.946974,0.934805


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8983    0.8937    0.8960      5043
     PERCENT     0.9000    0.6429    0.7500        14
        TYPE     0.9487    0.9683    0.9584     16297
      VOLUME     0.8276    0.8276    0.8276        29

   micro avg     0.9369    0.9503    0.9435     21383
   macro avg     0.8937    0.8331    0.8580     21383
weighted avg     0.9366    0.9503    0.9434     21383

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8909    0.9100    0.9003      5043
     PERCENT     0.8333    0.7143    0.7692        14
        TYPE     0.9537    0.9648    0.9592     16297
      VOLUME     0.7222    0.4483    0.5532        29

   micro avg     0.9385    0.9510    0.9447     21383
   macro avg     0.8500    0.7593    0.7955     21383
weighted avg     0.9385    0.9510    0.9447     21383

=== seqeval classification_report ===
              pr

There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


[I 2025-09-25 17:05:50,476] Trial 3 finished with value: 0.8580016656503588 and parameters: {'learning_rate': 0.00048392903640124505, 'weight_decay': 0.0038169848188184135, 'num_train_epochs': 9}. Best is trial 2 with value: 0.8926324166140165.
  learning_rate=trial.suggest_loguniform('learning_rate', low=1e-5, high=5e-4),
  weight_decay=trial.suggest_loguniform('weight_decay', 1e-4, 0.05),
The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8983    0.8937    0.8960      5043
     PERCENT     0.9000    0.6429    0.7500        14
        TYPE     0.9487    0.9683    0.9584     16297
      VOLUME     0.8276    0.8276    0.8276        29

   micro avg     0.9369    0.9503    0.9435     21383
   macro avg     0.8937    0.8331    0.8580     21383
weighted avg     0.9366    0.9503    0.9434     21383



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,Precision,Recall,F1,Accuracy
1,0.0096,0.497267,0.858961,0.939611,0.957583,0.948512,0.936846
2,0.0017,0.623256,0.883508,0.939863,0.956741,0.948227,0.936392
3,0.0002,0.633704,0.87954,0.94054,0.958706,0.949536,0.937866


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9097    0.9014    0.9056      5043
     PERCENT     1.0000    0.6429    0.7826        14
        TYPE     0.9487    0.9756    0.9619     16297
      VOLUME     0.8148    0.7586    0.7857        29

   micro avg     0.9396    0.9576    0.9485     21383
   macro avg     0.9183    0.8196    0.8590     21383
weighted avg     0.9394    0.9576    0.9483     21383

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9054    0.9032    0.9043      5043
     PERCENT     1.0000    0.6429    0.7826        14
        TYPE     0.9504    0.9736    0.9619     16297
      VOLUME     0.8438    0.9310    0.8852        29

   micro avg     0.9399    0.9567    0.9482     21383
   macro avg     0.9249    0.8627    0.8835     21383
weighted avg     0.9397    0.9567    0.9481     21383

=== seqeval classification_report ===
              pr

There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


[I 2025-09-25 17:14:40,644] Trial 4 finished with value: 0.8835080653340189 and parameters: {'learning_rate': 0.00012973534241957156, 'weight_decay': 0.0006306410098490446, 'num_train_epochs': 3}. Best is trial 2 with value: 0.8926324166140165.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9054    0.9032    0.9043      5043
     PERCENT     1.0000    0.6429    0.7826        14
        TYPE     0.9504    0.9736    0.9619     16297
      VOLUME     0.8438    0.9310    0.8852        29

   micro avg     0.9399    0.9567    0.9482     21383
   macro avg     0.9249    0.8627    0.8835     21383
weighted avg     0.9397    0.9567    0.9481     21383



In [None]:
best_trial = study.best_trial
printer(f"Final best F1: {best_trial.value}")
printer(f"Final params: {best_trial.params}")

******************************************************************************************************************************************************

Final best F1: 0.8926324166140165

******************************************************************************************************************************************************

******************************************************************************************************************************************************

Final params: {'learning_rate': 6.44944805049113e-05, 'weight_decay': 0.004066034568680406, 'num_train_epochs': 9}

******************************************************************************************************************************************************



## тест

In [None]:
df_test = pd.read_csv('submission.csv', sep=';', usecols=['sample'])

In [None]:
best_model_path = '/content/drive/MyDrive/Hackatons/X5_ner_2025/ruT5_large_250925'
final_tokenizer = AutoTokenizer.from_pretrained(best_model_path, use_fast=True, add_prefix_space=True)
final_model = AutoModelForTokenClassification.from_pretrained(best_model_path)


In [None]:
final_model.to('cuda')

T5ForTokenClassification(
  (transformer): T5EncoderModel(
    (shared): Embedding(32128, 1024)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=1024, out_features=1024, bias=False)
                (k): Linear(in_features=1024, out_features=1024, bias=False)
                (v): Linear(in_features=1024, out_features=1024, bias=False)
                (o): Linear(in_features=1024, out_features=1024, bias=False)
                (relative_attention_bias): Embedding(32, 16)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=1024, out_features=4096, bias=False)
                (wo):

In [None]:
token_classifier = pipeline(
    "token-classification", model=final_model, aggregation_strategy="first", tokenizer=final_tokenizer
)
for s in samples:
    print(f'sample: {s}')
    res = token_classifier(s)
    for i, r in enumerate(res):
        # print('Entity: '+ r['entity_group'] + '   Word: ' + r['word'])
        print('Entity: '+ r['entity_group'] + '   Word: ' + r['word'] + '. Probs:  ' + str(round(r['score'], 4)))
    print('#'*40)

Device set to use cuda:0


NameError: name 'samples' is not defined

In [None]:
s = 'очиститель для унитаза'
res = token_classifier(s)
for i, r in enumerate(res):
    # print('Entity: '+ r['entity_group'] + '   Word: ' + r['word'])
    print('Entity: '+ r['entity_group'] + '   Word: ' + r['word'] + '. Probs:  ' + str(round(r['score'], 4)))
print(res)

Entity: TYPE   Word: очиститель. Probs:  0.9998
[{'entity_group': 'TYPE', 'score': np.float32(0.9997584), 'word': 'очиститель', 'start': 0, 'end': 10}]


In [None]:
lbls_in_dataset = [
 'O',
 'B-BRAND',
 'B-PERCENT',
 'B-TYPE',
 'B-VOLUME',
 'I-BRAND',
 'I-PERCENT',
 'I-TYPE',
 'I-VOLUME']
label2id = {v:i for i, v in enumerate(lbls_in_dataset)}
id2label = {i:v for i, v in enumerate(lbls_in_dataset)}

In [21]:
import torch
import torch.nn.functional as F
import re

def predict_all_entities(text: str, model, tokenizer, id2label, device=None, debug=False):
    """
    Word-level inference with original character spans (start_idx, end_idx, ENTITY).
    Returns a list of tuples for each word (including 'O').
    """

    model.eval()
    if device is None:
        device = next(model.parameters()).device

    # --- find words and their char spans in original text ---
    words = []
    spans = []
    for match in re.finditer(r"\S+", text):
        words.append(match.group())
        spans.append(match.span())  # (start_idx, end_idx)

    # encode with word-level info
    enc = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    )

    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    word_ids = enc.word_ids(batch_index=0)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits[0]               # (seq_len, num_labels)
        probs = F.softmax(logits, dim=-1)        # (seq_len, num_labels)
    # print()
    # print(probs)
    results = []
    prev_word_idx = None
    # print(word_ids)
    for token_idx, word_idx in enumerate(word_ids):
        if debug:
            print(token_idx, word_idx, logits[token_idx])
        if word_idx is None:
            prev_word_idx = None
            continue

        # only take first subtoken per word
        if word_idx != prev_word_idx:
            label_id = int(torch.argmax(logits[token_idx]).cpu().numpy())
            label = id2label[label_id]

            start_idx, end_idx = spans[word_idx]
            results.append((start_idx, end_idx, label))

        prev_word_idx = word_idx

    return results


# -------------------------
# Example usage
# -------------------------
s = "сыр натура сливочный"

# Example id2label (replace with your mapping)
# id2label = {0: "O", 1: "B-TYPE", 2: "I-TYPE", ...}

res = predict_all_entities(s, trained_model, tokenizer, id2label)
print(res)


[(0, 3, 'B-TYPE'), (4, 10, 'B-BRAND'), (11, 20, 'I-BRAND')]


In [None]:
annotations = []
for s in tqdm(df_test['sample'].tolist()):
    r = predict_all_entities(s, final_model, final_tokenizer, id2label)
    annotations.append(r)

100%|██████████| 5000/5000 [01:59<00:00, 41.73it/s]


In [None]:
df_test['annotation'] = annotations

In [None]:
df_test.sample(20)

Unnamed: 0,sample,annotation
1501,куркум,"[(0, 6, B-TYPE)]"
2586,вялены,"[(0, 6, B-TYPE)]"
2653,exponetto,"[(0, 9, B-BRAND)]"
1055,моцарелла шарики,"[(0, 9, B-TYPE), (10, 16, I-TYPE)]"
705,галеты petra,"[(0, 6, B-TYPE), (7, 12, B-BRAND)]"
106,ванилик,"[(0, 7, B-TYPE)]"
589,нектар j,"[(0, 6, B-TYPE), (7, 8, B-BRAND)]"
2468,тнк,"[(0, 3, O)]"
2413,рыбный фарш,"[(0, 6, B-TYPE), (7, 11, I-TYPE)]"
1600,сгущенные молочн,"[(0, 9, B-TYPE), (10, 16, I-TYPE)]"


In [None]:
df_test.to_csv('submission_ruT5_large_250925.csv', index=False, sep=';')

## k-fold

In [96]:
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Create a list to hold each fold (train/val split)
folds = []

for train_idx, val_idx in kf.split(ds):
    train_split = ds.select(train_idx)
    val_split = ds.select(val_idx)
    folds.append(DatasetDict({"train": train_split, "validation": val_split}))

In [18]:
# tokenizer = AutoTokenizer.from_pretrained('ai-forever/ruT5-large',
#                                           use_fast=True,
#                                           add_prefix_space=True)

# model = AutoModelForTokenClassification.from_pretrained('ai-forever/ruT5-large',
#                                                         num_labels=len(lbls_in_dataset),
#                                                         id2label=id2label,
#                                                         label2id=label2id).to("cuda")

In [19]:
# lbls_in_dataset = [
#  'O',
#  'B-BRAND',
#  'B-PERCENT',
#  'B-TYPE',
#  'B-VOLUME',
#  'I-BRAND',
#  'I-PERCENT',
#  'I-TYPE',
#  'I-VOLUME']
# label2id = {v:i for i, v in enumerate(lbls_in_dataset)}
# id2label = {i:v for i, v in enumerate(lbls_in_dataset)}

In [97]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
df_all_valid_docs = pd.DataFrame()

for fold in folds:
    print(f"Start training for fold {fold}")
    ds_train = fold['train'].map(tokenize_and_align_labels,
                                              batched=True,
                                              fn_kwargs={'tokenizer': tokenizer})
    ds_validation = fold['validation'].map(tokenize_and_align_labels,
                                              batched=True,
                                              fn_kwargs={'tokenizer': tokenizer})
    training_args = TrainingArguments(
        eval_strategy="no",  # No evaluation during training
        torch_compile=True,
        per_device_train_batch_size=256,
        learning_rate=6.44944805049113e-05,          # fixed value (was suggested by Optuna before)
        weight_decay=0.004066034568680406,
        num_train_epochs=3,          # fixed
        seed=42,
        data_seed=24,
        gradient_accumulation_steps=2,
        warmup_ratio=0.1,
        report_to=None,
        logging_dir="./logs",
        logging_steps=20,
        )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        data_collator=data_collator,
        tokenizer=tokenizer,

    )

    # Start training
    print("Starting training without evaluation...")
    trainer.train()
    trained_model = trainer.model

    # Optional: Save final model
    print(f"Training completed for fold {fold}")

    df_valid = ds_validation.to_pandas()
    predicted_ner_tags = []
    for doc in df_valid['tokens'].tolist():
        predicted_doc_ner = predict_all_entities(doc[0], trained_model, tokenizer, id2label)
        predicted_ner_tags.append(predicted_doc_ner)
    df_valid['predicted_ner_tags'] = predicted_ner_tags

    if df_all_valid_docs.empty:
        df_all_valid_docs = df_valid
    else:
        df_all_valid_docs = pd.concat([df_all_valid_docs, df_valid])

Start training for fold DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
})


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting training without evaluation...


  trainer = Trainer(


Step,Training Loss


Training completed for fold DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
})
Start training for fold DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
})


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting training without evaluation...


  trainer = Trainer(


Step,Training Loss


Training completed for fold DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
})
Start training for fold DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 100
    })
})


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting training without evaluation...


  trainer = Trainer(


Step,Training Loss


In [24]:
doc[0]

'abon'

In [18]:
df_all_valid_docs

Unnamed: 0,id,tokens,ner_tags,input_ids,attention_mask,labels,predicted_ner_tags
0,3,[abon],[0],"[8, 4877, 1146, 2]","[1, 1, 1, 1]","[0, 0, 0, -100]","[(0, 3, B-TYPE), (4, 10, I-TYPE), (11, 20, I-T..."
1,6,"[abtoys, игрушк]","[1, 3]","[8, 4877, 2769, 448, 125, 5112, 319, 189, 2]","[1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 3, 3, 3, -100]","[(0, 3, B-TYPE), (4, 10, I-TYPE), (11, 20, I-T..."
2,17,[active],[1],"[8, 4128, 2496, 2]","[1, 1, 1, 1]","[1, 1, 1, -100]","[(0, 3, B-TYPE), (4, 10, I-TYPE), (11, 20, I-T..."
3,29,[agata],[1],"[8, 6084, 12625, 2]","[1, 1, 1, 1]","[1, 1, 1, -100]","[(0, 3, B-TYPE), (4, 10, I-TYPE), (11, 20, I-T..."
4,30,"[agnesi, пше]","[1, 3]","[8, 6084, 16805, 633, 454, 682, 2]","[1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 3, 3, -100]","[(0, 3, B-TYPE), (4, 10, I-TYPE), (11, 20, I-T..."
...,...,...,...,...,...,...,...
5445,27232,[яыц],[3],"[35, 19, 557, 2]","[1, 1, 1, 1]","[3, 3, 3, -100]","[(0, 3, B-TYPE), (4, 10, B-BRAND), (11, 20, I-..."
5446,27233,[яыца],[3],"[35, 19, 518, 2]","[1, 1, 1, 1]","[3, 3, 3, -100]","[(0, 3, B-TYPE), (4, 10, B-BRAND), (11, 20, I-..."
5447,27240,"[№1, газе]","[1, 3]","[545, 471, 3762, 13, 2]","[1, 1, 1, 1, 1]","[1, 1, 3, 3, -100]","[(0, 3, B-TYPE), (4, 10, B-BRAND), (11, 20, I-..."
5448,27247,"[№1, кофейник]","[1, 3]","[545, 471, 18465, 588, 2]","[1, 1, 1, 1, 1]","[1, 1, 3, 3, -100]","[(0, 3, B-TYPE), (4, 10, B-BRAND), (11, 20, I-..."


In [19]:
df_all_valid_docs.iloc[-1]['predicted_ner_tags']

[(0, 3, 'B-TYPE'), (4, 10, 'I-TYPE'), (11, 20, 'I-TYPE')]

## где ошибка в трейне

In [None]:
df_valid_final[df_valid_final.annotation.astype(str) != df_valid_final.ann.astype(str)].to_csv('comparison_valid.csv', index=False)

In [None]:
folds[0]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 21800
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5451
    })
})

In [None]:
dd = pd.DataFrame()
dd.empty

True

## итоговая тренировка

In [13]:
tokenizer = AutoTokenizer.from_pretrained('ai-forever/ruT5-large',
                                          use_fast=True,
                                          add_prefix_space=True)

model = AutoModelForTokenClassification.from_pretrained('ai-forever/ruT5-large',
                                                        num_labels=len(lbls_in_dataset),
                                                        id2label=id2label,
                                                        label2id=label2id).to("cuda")

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at ai-forever/ruT5-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [17]:

ds_train = ds.map(tokenize_and_align_labels,
                                          batched=True,
                                          fn_kwargs={'tokenizer': tokenizer})

training_args = TrainingArguments(
    eval_strategy="no",  # No evaluation during training
    torch_compile=True,
    per_device_train_batch_size=256,
    learning_rate=0.00019727099511884864,          # fixed value (was suggested by Optuna before)
    weight_decay=0.000514766062249604,
    num_train_epochs=8,          # fixed
    seed=42,
    data_seed=24,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    report_to=None,
    logging_dir="./logs",
    logging_steps=20,
    )

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    data_collator=data_collator,
    tokenizer=tokenizer,

)

# Start training
print("Starting training without evaluation...")
trainer.train()
trained_model = trainer.model




Map:   0%|          | 0/27552 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Starting training without evaluation...




Step,Training Loss
20,3.1108
40,0.8228
60,0.526
80,0.3522
100,0.2528
120,0.1884
140,0.1541
160,0.125
180,0.1
200,0.0896


In [18]:
ds_train.select(range(5)).to_pandas()

Unnamed: 0,id,tokens,ner_tags,input_ids,attention_mask,labels
0,0,[aa],[0],"[351, 500, 2]","[1, 1, 1]","[0, 0, -100]"
1,1,[aala],[0],"[351, 700, 500, 2]","[1, 1, 1, 1]","[0, 0, 0, -100]"
2,2,[aarcca],[0],"[351, 1228, 679, 4701, 2]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, -100]"
3,3,[abon],[0],"[8, 4877, 1146, 2]","[1, 1, 1, 1]","[0, 0, 0, -100]"
4,4,[abso],[1],"[8, 4877, 12364, 2]","[1, 1, 1, 1]","[1, 1, 1, -100]"


In [23]:
df_validation = pd.read_csv("/kaggle/input/submission/submission.csv", sep=";", usecols=['sample'])

In [24]:
predicted_ner_tags = []
for doc in tqdm(df_validation['sample'].tolist()):
    predicted_doc_ner = predict_all_entities(doc, trained_model, tokenizer, id2label)
    predicted_ner_tags.append(predicted_doc_ner)
df_validation['annotation'] = predicted_ner_tags

100%|██████████| 5000/5000 [01:39<00:00, 50.21it/s]


In [29]:
trained_model

T5ForTokenClassification(
  (transformer): T5EncoderModel(
    (shared): Embedding(32128, 1024)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=1024, out_features=1024, bias=False)
                (k): Linear(in_features=1024, out_features=1024, bias=False)
                (v): Linear(in_features=1024, out_features=1024, bias=False)
                (o): Linear(in_features=1024, out_features=1024, bias=False)
                (relative_attention_bias): Embedding(32, 16)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=1024, out_features=4096, bias=False)
                (wo):

In [25]:
df_validation.sample(20)

Unnamed: 0,sample,annotation
1501,куркум,"[(0, 6, B-TYPE)]"
2586,вялены,"[(0, 6, B-TYPE)]"
2653,exponetto,"[(0, 9, B-BRAND)]"
1055,моцарелла шарики,"[(0, 9, B-TYPE), (10, 16, I-TYPE)]"
705,галеты petra,"[(0, 6, B-TYPE), (7, 12, B-BRAND)]"
106,ванилик,"[(0, 7, B-TYPE)]"
589,нектар j,"[(0, 6, B-TYPE), (7, 8, B-BRAND)]"
2468,тнк,"[(0, 3, O)]"
2413,рыбный фарш,"[(0, 6, B-TYPE), (7, 11, I-TYPE)]"
1600,сгущенные молочн,"[(0, 9, B-TYPE), (10, 16, I-TYPE)]"


In [26]:
df_validation.to_csv('submission_ruT5_large_ft_augmented_new_hyperparams_260925.csv', index=False, sep=';')

In [22]:
predict_all_entities("garner ructi", trained_model, tokenizer, id2label, debug=True)

0 0 tensor([ 6.5906, 10.1643, -2.2721,  3.4366, -4.5278,  2.5592, -2.8466, -6.9580,
        -3.9529], device='cuda:0')
1 0 tensor([ 7.7309, 10.9714, -3.8946,  3.1987, -4.7314,  3.2854, -1.5130, -5.1659,
        -4.1112], device='cuda:0')
2 0 tensor([ 7.2921, 10.1729, -5.3683,  2.7412, -4.5518,  4.3491, -3.4186, -4.6163,
        -5.3117], device='cuda:0')
3 1 tensor([ 5.6538,  3.8720, -0.9463,  3.5795, -3.4112,  5.8417, -3.4994, -3.0971,
        -2.3954], device='cuda:0')
4 1 tensor([ 6.2706,  6.4004, -1.8007,  6.0896, -2.7667,  3.7546, -2.8255, -1.3013,
        -5.2245], device='cuda:0')
5 1 tensor([ 6.1525,  4.2112, -3.2930,  6.1238, -4.6684,  3.9089, -2.6030, -0.5535,
        -2.8735], device='cuda:0')
6 1 tensor([ 6.2605,  5.1263, -3.0759,  4.7641, -2.6424,  4.7713, -2.8157, -2.3314,
        -4.1695], device='cuda:0')
7 None tensor([ 1.7911, -0.1701,  0.5656, -0.5731,  1.6106, -0.8561,  2.0464,  2.4553,
         1.5692], device='cuda:0')


[(0, 6, 'B-BRAND'), (7, 12, 'I-BRAND')]

In [30]:
id2label

{0: 'O',
 1: 'B-BRAND',
 2: 'B-PERCENT',
 3: 'B-TYPE',
 4: 'B-VOLUME',
 5: 'I-BRAND',
 6: 'I-PERCENT',
 7: 'I-TYPE',
 8: 'I-VOLUME'}

In [31]:
tokenizer.tokenize(" garner ructi")

['▁', 'gar', 'ner', '▁', 'ru', 'c', 'ti']

In [82]:
# молоко 1,5 % -- не находит процент
# стейк говядина --  проверить усреднение
# сок 2 литра яблочный -- не находит volume 

In [37]:
predict_all_entities("schwartau со", trained_model, tokenizer, id2label, debug=True)

0 0 tensor([ 2.4230e+00,  1.3040e+01, -6.8861e-03,  9.9319e-01, -1.6785e+00,
        -1.4810e+00, -8.7815e-01, -2.5692e-01, -1.3998e+00], device='cuda:0')
1 0 tensor([ 2.2394, 12.3609,  0.4930,  1.3229, -1.7823, -0.0829, -1.3356, -0.2234,
        -0.8064], device='cuda:0')
2 0 tensor([ 2.6557e+00,  1.3508e+01,  1.0873e-02,  1.7207e+00, -2.1920e+00,
         4.2342e-01, -7.9274e-01, -9.1873e-02, -1.4111e+00], device='cuda:0')
3 0 tensor([ 2.6026, 13.6531, -0.9042,  1.6521, -2.1215,  0.8590, -0.8216, -1.2197,
        -0.8692], device='cuda:0')
4 0 tensor([ 3.3646, 14.1260,  0.9424,  1.7963, -0.5018,  1.6216, -1.6133, -2.3228,
        -1.5213], device='cuda:0')
5 1 tensor([ 6.6762, -1.6012, -2.8019, 11.6132, -3.1620,  1.1287, -1.9783,  0.8047,
        -2.5529], device='cuda:0')
6 None tensor([ 6.3755, -2.4741, -3.1129, 14.5177, -3.3633, -0.2019, -3.0747,  1.3072,
        -4.7479], device='cuda:0')


[(0, 9, 'B-BRAND'), (10, 12, 'B-TYPE')]

In [36]:
tokenizer.tokenize("крем jundo")

['▁крем', '▁j', 'und', 'o']

In [32]:
tokenizer2 = AutoTokenizer.from_pretrained('ai-forever/ruT5-large',
                                          use_fast=True,
                                          add_prefix_space=True)

model2 = AutoModel.from_pretrained('ai-forever/ruT5-large').to("cuda")

In [36]:
model2.encoder

T5Stack(
  (embed_tokens): Embedding(32128, 1024)
  (block): ModuleList(
    (0): T5Block(
      (layer): ModuleList(
        (0): T5LayerSelfAttention(
          (SelfAttention): T5Attention(
            (q): Linear(in_features=1024, out_features=1024, bias=False)
            (k): Linear(in_features=1024, out_features=1024, bias=False)
            (v): Linear(in_features=1024, out_features=1024, bias=False)
            (o): Linear(in_features=1024, out_features=1024, bias=False)
            (relative_attention_bias): Embedding(32, 16)
          )
          (layer_norm): T5LayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (1): T5LayerFF(
          (DenseReluDense): T5DenseActDense(
            (wi): Linear(in_features=1024, out_features=4096, bias=False)
            (wo): Linear(in_features=4096, out_features=1024, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
            (act): ReLU()
          )
          (layer_norm): T5LayerNorm()
 

In [39]:
trained_model

T5ForTokenClassification(
  (transformer): T5EncoderModel(
    (shared): Embedding(32128, 1024)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=1024, out_features=1024, bias=False)
                (k): Linear(in_features=1024, out_features=1024, bias=False)
                (v): Linear(in_features=1024, out_features=1024, bias=False)
                (o): Linear(in_features=1024, out_features=1024, bias=False)
                (relative_attention_bias): Embedding(32, 16)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=1024, out_features=4096, bias=False)
                (wo):

In [None]:
trained_model.save_model("/kaggle/working/ruT5_large_260925_ft250925")