In [None]:
# %pip install optuna seqeval evaluate -q

In [1]:
import pandas as pd
import ast
from tqdm import tqdm
from collections import Counter
import datasets
from datasets import (Dataset, Features, Sequence, Value, ClassLabel, load_dataset,
                    load_from_disk, concatenate_datasets, DatasetDict)
from sklearn.model_selection import KFold
from transformers import (AutoTokenizer, AutoModel, AutoModelForTokenClassification,
                         pipeline, TrainingArguments, Trainer,
                         DataCollatorForTokenClassification, EarlyStoppingCallback, BertForMaskedLM,
                        AutoConfig, AutoModelForMaskedLM)
import torch
import optuna
from optuna.pruners import MedianPruner
import os
os.environ['WANDB_DISABLED'] = 'true'
import pickle
import numpy as np
import seqeval
import evaluate
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

seqeval = evaluate.load("seqeval")

2025-10-02 07:50:53.208683: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-02 07:50:53.257393: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading builder script: 6.34kB [00:00, 10.8MB/s]


## Загрузка

In [3]:
TRAIN_PATH = "/home/jupyter/datasphere/project/mikhail/train_with_augmented_volume_percent.csv"
SUBMISSION_PATH = "/home/jupyter/datasphere/project/mikhail/submission.csv"

In [4]:
train_df_raw = pd.read_csv(TRAIN_PATH, sep=';')
train_df_raw

Unnamed: 0,sample,annotation
0,aa,"[(0, 2, 'O')]"
1,aala,"[(0, 4, 'O')]"
2,aarcca,"[(0, 6, 'O')]"
3,abon,"[(0, 4, 'O')]"
4,abso,"[(0, 4, 'B-BRAND')]"
...,...,...
27547,ветчина 300 гр нарезка,"[(0, 7, 'B-TYPE'), (8, 11, 'B-VOLUME'), (12, 1..."
27548,кукуруза 400 г mikado,"[(0, 8, 'B-TYPE'), (9, 12, 'B-VOLUME'), (13, 1..."
27549,кукуруза 340 гр,"[(0, 8, 'B-TYPE'), (9, 12, 'B-VOLUME'), (13, 1..."
27550,хлеб 350 г 5 злаков,"[(0, 4, 'B-TYPE'), (5, 8, 'B-VOLUME'), (9, 10,..."


In [5]:
train_df_raw['annotation'] = train_df_raw['annotation'].str.replace("\'0\'", "O")

In [6]:
train_df_raw['annotation'] = train_df_raw['annotation'].apply(lambda x: ast.literal_eval(str(x)))

In [7]:
all_words = []
all_tags = []
for i, row in tqdm(train_df_raw.iterrows()):
    words_sample = []
    entities_sample = []
    for ent in row['annotation']:
        word = row['sample'][ent[0]:ent[1]]
        words_sample.append(word)
        entities_sample.append(ent[2])
    all_words.append(words_sample)
    all_tags.append(entities_sample)
    # print(words_sample, entities_sample)

27552it [00:01, 18988.87it/s]


In [8]:
lbls_in_dataset = [
 'O',
 'B-BRAND',
 'B-PERCENT',
 'B-TYPE',
 'B-VOLUME',
 'I-BRAND',
 'I-PERCENT',
 'I-TYPE',
 'I-VOLUME']
label2id = {v:i for i, v in enumerate(lbls_in_dataset)}
id2label = {i:v for i, v in enumerate(lbls_in_dataset)}

In [9]:
features=Features(
    {
        "id": Value(dtype='int32', id=None),
        "tokens": Sequence(feature=Value(dtype='string', id=None)),
        "ner_tags": Sequence(feature=ClassLabel(num_classes=len(lbls_in_dataset), names=list(lbls_in_dataset)), id=None)
    }
)

In [10]:
ds = Dataset.from_dict({"id": list(range(len(all_words))),
                        "tokens": all_words,
                        "ner_tags": all_tags},
                       features=features)

In [11]:
ds_splitted = ds.train_test_split(
    test_size=0.2, shuffle=True, seed=42,
    # stratify_by_column='ner_tags'
)

In [12]:
model_path = "Dersty/distilrubert_X5_ner_MLM"

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          use_fast=True,
                                          add_prefix_space=True)

model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(lbls_in_dataset),
    id2label=id2label,
    label2id=label2id,
).to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Подготовка данных

In [15]:
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True,
        padding="max_length",   
        max_length=16, 
                                )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_index = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [16]:
tokenized_ds_train = ds_splitted['train'].map(tokenize_and_align_labels,
                                              batched=True,
                                              fn_kwargs={'tokenizer': tokenizer})

Map: 100%|██████████| 22041/22041 [00:01<00:00, 17073.30 examples/s]


In [17]:
tokenized_ds_test = ds_splitted['test'].map(tokenize_and_align_labels,
                                              batched=True,
                                              fn_kwargs={'tokenizer': tokenizer})

Map: 100%|██████████| 5511/5511 [00:00<00:00, 21783.94 examples/s]


In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer)

## Поиск гиперпараметров с помощью OPTUNA

In [19]:
# функция метрики
def compute_metrics_custom(p):
    labels_list = list(id2label.values())

    if hasattr(p, 'predictions'):
        predictions = p.predictions
        labels = p.label_ids
    else:
        predictions, labels = p
        
    print(f"DEBUG: predictions type: {type(predictions)}, shape: {getattr(predictions, 'shape', 'No shape')}")
    print(f"DEBUG: labels type: {type(labels)}, shape: {getattr(labels, 'shape', 'No shape')}")
    

    if isinstance(predictions, tuple):
        predictions = predictions[0]  # Take logits
    
    # Convert to numpy if they're tensors
    if hasattr(predictions, 'cpu'):
        predictions = predictions.cpu().numpy()
    if hasattr(labels, 'cpu'):
        labels = labels.cpu().numpy()
    
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    report_dict = classification_report(true_labels, true_predictions, digits=4, output_dict=True)
    report = classification_report(true_labels, true_predictions, digits=4)
    macro_f1 = report_dict["macro avg"]["f1-score"]
    print("=== seqeval classification_report ===")
    print(report)
    CLASS_REPORT_PATH = 'logs/last_classification_report2.txt'
    try:
        with open(CLASS_REPORT_PATH, "w", encoding="utf-8") as f:
            f.write(report)
    except Exception as e:
        print(f"Warning: failed to write classification report to {CLASS_REPORT_PATH}: {e}")

    return {
        "f1_macro": macro_f1,
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [20]:
early_stop_cb = EarlyStoppingCallback(early_stopping_patience=2)

In [21]:
def objective(trial: optuna.Trial):
    print(f"\n=== Trial {trial.number} ===")
    print(trial.params)
    
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 5e-4, log=True)
    weight_decay = trial.suggest_float('weight_decay', 0.001, 0.1, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 4, 14)

    print(f"\n=== Current parameters: {trial.params} ===")
    

    model = AutoModelForTokenClassification.from_pretrained(
        model_path, 
        num_labels=len(lbls_in_dataset),
        id2label=id2label,
        label2id=label2id
    ).to(device)
    print(model_path)

    trial_check_dir = f"./distilrubert_X5_ner_MLM_ft_0210/checkpoints_trial_{trial.number}"
    os.makedirs(trial_check_dir, exist_ok=True)

    args = TrainingArguments(
        output_dir=trial_check_dir,
        overwrite_output_dir=True,
        eval_strategy="epoch",
        torch_compile=False, # Для Яндекса Датасферы
        per_device_train_batch_size=1024,   
        per_device_eval_batch_size=512,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_train_epochs=num_train_epochs,
        seed=42     ,          
        data_seed=24,
        gradient_accumulation_steps=1,   
        
        lr_scheduler_type="cosine",
        
        warmup_ratio=0.1,
        report_to=None,
        logging_dir="./logs",
        logging_steps=100,
        load_best_model_at_end=False,
        metric_for_best_model="eval_f1_macro",
        save_total_limit=1,
        save_strategy="no",
        disable_tqdm=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_ds_train,
        eval_dataset=tokenized_ds_test,
        data_collator=data_collator,
        compute_metrics=compute_metrics_custom,
        tokenizer=tokenizer,
        callbacks=[early_stop_cb] if 'early_stop_cb' in globals() else [],
    )

    trainer.train()

    eval_metrics = trainer.evaluate()
    f1_score = eval_metrics["eval_f1_macro"]

    trial.report(f1_score, step=0)

    if trial.should_prune():
        del model, trainer
        torch.cuda.empty_cache()
        raise optuna.TrialPruned()

    print(f"✅ Trial {trial.number} finished | F1: {f1_score:.4f}")
    return f1_score


study = optuna.create_study(
    study_name='test_optuna',
    direction='maximize',
    pruner=MedianPruner(
        n_startup_trials=2,
        n_warmup_steps=1,     
        interval_steps=1     
    )
)

study.optimize(objective, n_trials=25)

print("Best trial:")
print(f"  Value: {study.best_value:.4f}")
print("  Params:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

[I 2025-10-02 07:51:31,474] A new study created in memory with name: test_optuna



=== Trial 0 ===
{}

=== Current parameters: {'learning_rate': 0.00011410644661724481, 'weight_decay': 0.007328327054148027, 'num_train_epochs': 14} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  7%|▋         | 22/308 [00:24<04:37,  1.03it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.48it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.81it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.84it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.40it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.92it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.82it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.74it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.70it/s][A
100%|██████████| 11/11 [00:02<00:00,  5.04it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                
  7%|▋         | 22/308 [00:29<04:37,  1.03it/s]
100%|██████████| 11/11 [00:04<00:00,  5.04it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.6592    0.8092    0.7265      3475
     PERCENT     0.0000    0.0000    0.0000        83
        TYPE     0.8360    0.8711    0.8532     10485
      VOLUME     0.0000    0.0000    0.0000        52

   micro avg     0.7863    0.8475    0.8158     14095
   macro avg     0.3738    0.4201    0.3949     14095
weighted avg     0.7844    0.8475    0.8138     14095

{'eval_loss': 0.6176654100418091, 'eval_f1_macro': 0.39493185428285105, 'eval_precision': 0.7863349131121643, 'eval_recall': 0.847534586732884, 'eval_f1': 0.8157885751357258, 'eval_accuracy': 0.8267487600478878, 'eval_runtime': 4.9451, 'eval_samples_per_second': 1114.431, 'eval_steps_per_second': 2.224, 'epoch': 1.0}


 14%|█▍        | 44/308 [00:54<04:19,  1.02it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.82it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.85it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.90it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.43it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.16it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.97it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.85it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.77it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.71it/s][A
100%|██████████| 11/11 [00:02<00:00,  5.06it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


                                                
 14%|█▍        | 44/308 [00:58<04:19,  1.02it/s]
100%|██████████| 11/11 [00:04<00:00,  5.06it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8593    0.9013    0.8798      3475
     PERCENT     0.7900    0.9518    0.8634        83
        TYPE     0.9377    0.9710    0.9541     10485
      VOLUME     0.7143    0.4808    0.5747        52

   micro avg     0.9166    0.9519    0.9339     14095
   macro avg     0.8253    0.8262    0.8180     14095
weighted avg     0.9167    0.9519    0.9338     14095

{'eval_loss': 0.23832497000694275, 'eval_f1_macro': 0.8179892643125214, 'eval_precision': 0.9166495866639338, 'eval_recall': 0.9518978361120964, 'eval_f1': 0.9339412501740221, 'eval_accuracy': 0.929650533036885, 'eval_runtime': 4.8345, 'eval_samples_per_second': 1139.921, 'eval_steps_per_second': 2.275, 'epoch': 2.0}


 21%|██▏       | 66/308 [01:24<04:05,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.72it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.79it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.85it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.41it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.12it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.93it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.83it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.75it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.68it/s][A
100%|██████████| 11/11 [00:02<00:00,  5.03it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===

                                                
 21%|██▏       | 66/308 [01:28<04:05,  1.01s/it]
100%|██████████| 11/11 [00:04<00:00,  5.03it/s][A
                                               [A


              precision    recall  f1-score   support

       BRAND     0.9035    0.9237    0.9135      3475
     PERCENT     0.9756    0.9639    0.9697        83
        TYPE     0.9520    0.9715    0.9616     10485
      VOLUME     0.9000    0.8654    0.8824        52

   micro avg     0.9399    0.9593    0.9495     14095
   macro avg     0.9328    0.9311    0.9318     14095
weighted avg     0.9400    0.9593    0.9495     14095

{'eval_loss': 0.19580543041229248, 'eval_f1_macro': 0.9317906507069748, 'eval_precision': 0.9399374348279458, 'eval_recall': 0.9592763391273501, 'eval_f1': 0.9495084269662921, 'eval_accuracy': 0.9436748189954962, 'eval_runtime': 4.8723, 'eval_samples_per_second': 1131.092, 'eval_steps_per_second': 2.258, 'epoch': 3.0}


 29%|██▊       | 88/308 [01:53<03:38,  1.01it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:01,  7.87it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.21it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.54it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.14it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  4.98it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.97it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9167    0.9249    0.9208      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9539    0.9727    0.9632     10485
      VOLUME     0.9231    0.9231    0.9231        52

   micro avg     0.9447    0.9607    0.9527     14095
   macro avg     0.9394    0.9461    0.9427     14095
weighted avg     0.9447    0.9607    0.9526     14095



                                                
 29%|██▊       | 88/308 [01:58<03:38,  1.01it/s]
100%|██████████| 11/11 [00:04<00:00,  4.97it/s][A
                                               [A

{'eval_loss': 0.20267938077449799, 'eval_f1_macro': 0.9427330352766099, 'eval_precision': 0.9447429009976976, 'eval_recall': 0.960695282014899, 'eval_f1': 0.9526523146193894, 'eval_accuracy': 0.9461832278661422, 'eval_runtime': 4.9301, 'eval_samples_per_second': 1117.83, 'eval_steps_per_second': 2.231, 'epoch': 4.0}


 32%|███▏      | 100/308 [02:12<04:05,  1.18s/it]

{'loss': 0.4461, 'grad_norm': 0.8242065906524658, 'learning_rate': 9.796380439543023e-05, 'epoch': 4.55}


 36%|███▌      | 110/308 [02:23<03:17,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.71it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.82it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.85it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.38it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.09it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.70it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.65it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.99it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


                                                 
 36%|███▌      | 110/308 [02:28<03:17,  1.00it/s]
100%|██████████| 11/11 [00:04<00:00,  4.99it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9022    0.9263    0.9141      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9590    0.9649    0.9619     10485
      VOLUME     0.9231    0.9231    0.9231        52

   micro avg     0.9446    0.9553    0.9499     14095
   macro avg     0.9371    0.9476    0.9423     14095
weighted avg     0.9449    0.9553    0.9500     14095

{'eval_loss': 0.21567752957344055, 'eval_f1_macro': 0.9422891293302222, 'eval_precision': 0.9446471165988495, 'eval_recall': 0.9553032990422136, 'eval_f1': 0.9499453243500653, 'eval_accuracy': 0.9424206145601733, 'eval_runtime': 4.8857, 'eval_samples_per_second': 1127.977, 'eval_steps_per_second': 2.251, 'epoch': 5.0}


 43%|████▎     | 132/308 [02:54<02:55,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.71it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.79it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.85it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.38it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.09it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.92it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.79it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.70it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.97it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


                                                 
 43%|████▎     | 132/308 [02:59<02:55,  1.00it/s]
100%|██████████| 11/11 [00:04<00:00,  4.97it/s][A
 43%|████▎     | 132/308 [02:59<03:58,  1.36s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9251    0.8955    0.9101      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9500    0.9744    0.9621     10485
      VOLUME     0.9231    0.9231    0.9231        52

   micro avg     0.9441    0.9548    0.9494     14095
   macro avg     0.9406    0.9422    0.9413     14095
weighted avg     0.9438    0.9548    0.9491     14095

{'eval_loss': 0.24276717007160187, 'eval_f1_macro': 0.9413160265797837, 'eval_precision': 0.9440897930550683, 'eval_recall': 0.9548066690315715, 'eval_f1': 0.9494179894179895, 'eval_accuracy': 0.942819679607776, 'eval_runtime': 4.8951, 'eval_samples_per_second': 1125.83, 'eval_steps_per_second': 2.247, 'epoch': 6.0}
{'train_runtime': 179.1027, 'train_samples_per_second': 1722.888, 'train_steps_per_second': 1.72, 'train_loss': 0.3500657352534207, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.99it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.36it/s]
[I 2025-10-02 07:54:36,266] Trial 0 finished with value: 0.9413160265797837 and parameters: {'learning_rate': 0.00011410644661724481, 'weight_decay': 0.007328327054148027, 'num_train_epochs': 14}. Best is trial 0 with value: 0.9413160265797837.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9251    0.8955    0.9101      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9500    0.9744    0.9621     10485
      VOLUME     0.9231    0.9231    0.9231        52

   micro avg     0.9441    0.9548    0.9494     14095
   macro avg     0.9406    0.9422    0.9413     14095
weighted avg     0.9438    0.9548    0.9491     14095

✅ Trial 0 finished | F1: 0.9413

=== Trial 1 ===
{}

=== Current parameters: {'learning_rate': 0.00022071470022688497, 'weight_decay': 0.006233252128617817, 'num_train_epochs': 11} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  9%|▉         | 22/242 [00:24<03:38,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.54it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.98it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

                                                
  9%|▉         | 22/242 [00:29<03:38,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8098    0.8236    0.8167      3475
     PERCENT     1.0000    0.2410    0.3883        83
        TYPE     0.9088    0.9423    0.9253     10485
      VOLUME     0.0000    0.0000    0.0000        52

   micro avg     0.8847    0.9054    0.8950     14095
   macro avg     0.6797    0.5017    0.5326     14095
weighted avg     0.8816    0.9054    0.8919     14095

{'eval_loss': 0.3668540418148041, 'eval_f1_macro': 0.5325701768123192, 'eval_precision': 0.8847140381282496, 'eval_recall': 0.9054274565448741, 'eval_f1': 0.8949509116409538, 'eval_accuracy': 0.8896300096915797, 'eval_runtime': 4.8792, 'eval_samples_per_second': 1129.498, 'eval_steps_per_second': 2.254, 'epoch': 1.0}


 18%|█▊        | 44/242 [00:54<03:18,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.97it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8599    0.9258    0.8916      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9455    0.9631    0.9542     10485
      VOLUME     0.8667    0.7500    0.8041        52

   micro avg     0.9234    0.9531    0.9380     14095
   macro avg     0.9090    0.9007    0.9035     14095
weighted avg     0.9242    0.9531    0.9383     14095




                                                
 18%|█▊        | 44/242 [00:59<03:18,  1.00s/it][A
                                               [A

{'eval_loss': 0.22473075985908508, 'eval_f1_macro': 0.903456428062124, 'eval_precision': 0.9233624304075881, 'eval_recall': 0.9531039375665129, 'eval_f1': 0.937997486384583, 'eval_accuracy': 0.9324439883701043, 'eval_runtime': 4.9044, 'eval_samples_per_second': 1123.692, 'eval_steps_per_second': 2.243, 'epoch': 2.0}


 27%|██▋       | 66/242 [01:25<02:56,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 27%|██▋       | 66/242 [01:30<02:56,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9174    0.9145    0.9160      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9536    0.9778    0.9655     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9449    0.9619    0.9533     14095
   macro avg     0.9438    0.9430    0.9433     14095
weighted avg     0.9447    0.9619    0.9532     14095

{'eval_loss': 0.1910022795200348, 'eval_f1_macro': 0.9432852173406157, 'eval_precision': 0.9448742072618301, 'eval_recall': 0.9619013834693153, 'eval_f1': 0.953311770496414, 'eval_accuracy': 0.9458411721110541, 'eval_runtime': 5.1599, 'eval_samples_per_second': 1068.041, 'eval_steps_per_second': 2.132, 'epoch': 3.0}


 36%|███▋      | 88/242 [01:55<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 36%|███▋      | 88/242 [02:00<02:34,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9238    0.9142    0.9190      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9598    0.9627    0.9612     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9510    0.9507    0.9508     14095
   macro avg     0.9471    0.9422    0.9445     14095
weighted avg     0.9509    0.9507    0.9508     14095

{'eval_loss': 0.21190528571605682, 'eval_f1_macro': 0.9445014519241395, 'eval_precision': 0.9509616066993116, 'eval_recall': 0.95069173465768, 'eval_f1': 0.9508266515291279, 'eval_accuracy': 0.9415084658799384, 'eval_runtime': 4.9165, 'eval_samples_per_second': 1120.908, 'eval_steps_per_second': 2.237, 'epoch': 4.0}


 41%|████▏     | 100/242 [02:14<02:49,  1.20s/it]

{'loss': 0.3507, 'grad_norm': 0.5459311008453369, 'learning_rate': 0.0001632141260303895, 'epoch': 4.55}


 45%|████▌     | 110/242 [02:25<02:12,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.69it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 45%|████▌     | 110/242 [02:30<02:12,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9117    0.9183    0.9150      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9558    0.9693    0.9625     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9450    0.9566    0.9508     14095
   macro avg     0.9479    0.9466    0.9472     14095
weighted avg     0.9450    0.9566    0.9507     14095

{'eval_loss': 0.24058693647384644, 'eval_f1_macro': 0.9471787195520314, 'eval_precision': 0.9450480128968949, 'eval_recall': 0.9565803476410074, 'eval_f1': 0.9507792116211832, 'eval_accuracy': 0.9420215495125706, 'eval_runtime': 4.9533, 'eval_samples_per_second': 1112.599, 'eval_steps_per_second': 2.221, 'epoch': 5.0}


 55%|█████▍    | 132/242 [02:56<01:51,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 11/11 [00:04<00:00,  4.96it/s]

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9280    0.9019    0.9148      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9519    0.9748    0.9632     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9463    0.9567    0.9515     14095
   macro avg     0.9510    0.9439    0.9473     14095
weighted avg     0.9461    0.9567    0.9512     14095

{'eval_loss': 0.2803444266319275, 'eval_f1_macro': 0.9473015156946281, 'eval_precision': 0.9463120218962734, 'eval_recall': 0.9566512947853849, 'eval_f1': 0.9514535704205476, 'eval_accuracy': 0.9443589305056724, 'eval_runtime': 4.9359, 'eval_samples_per_second': 1116.508, 'eval_steps_per_second': 2.229, 'epoch': 6.0}


 55%|█████▍    | 132/242 [03:01<01:51,  1.01s/it]
 64%|██████▎   | 154/242 [03:26<01:28,  1.00s/it]A
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 64%|██████▎   | 154/242 [03:31<01:28,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9097    0.9214    0.9155      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9557    0.9702    0.9629     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9445    0.9581    0.9513     14095
   macro avg     0.9475    0.9507    0.9490     14095
weighted avg     0.9445    0.9581    0.9512     14095

{'eval_loss': 0.3033178150653839, 'eval_f1_macro': 0.9489549885691786, 'eval_precision': 0.9444716413735226, 'eval_recall': 0.9581411848173111, 'eval_f1': 0.951257307881947, 'eval_accuracy': 0.9433327632404082, 'eval_runtime': 4.9344, 'eval_samples_per_second': 1116.843, 'eval_steps_per_second': 2.229, 'epoch': 7.0}


 73%|███████▎  | 176/242 [03:56<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 73%|███████▎  | 176/242 [04:01<01:06,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9059    0.9255    0.9156      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9585    0.9639    0.9611     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9454    0.9544    0.9499     14095
   macro avg     0.9473    0.9501    0.9485     14095
weighted avg     0.9455    0.9544    0.9499     14095

{'eval_loss': 0.3310803771018982, 'eval_f1_macro': 0.9485257562548548, 'eval_precision': 0.94539321104786, 'eval_recall': 0.9543809861653069, 'eval_f1': 0.9498658381584522, 'eval_accuracy': 0.941052391539821, 'eval_runtime': 5.1789, 'eval_samples_per_second': 1064.13, 'eval_steps_per_second': 2.124, 'epoch': 8.0}


 82%|████████▏ | 198/242 [04:27<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.11it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.56it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.70it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.28it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



[A                                            

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9084    0.9160    0.9122      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9558    0.9689    0.9623     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9442    0.9558    0.9500     14095
   macro avg     0.9472    0.9490    0.9480     14095
weighted avg     0.9442    0.9558    0.9499     14095



                                                 
 82%|████████▏ | 198/242 [04:32<00:44,  1.01s/it]A
 82%|████████▏ | 198/242 [04:32<01:00,  1.37s/it]A

{'eval_loss': 0.3248503506183624, 'eval_f1_macro': 0.9479579773027251, 'eval_precision': 0.944210821418559, 'eval_recall': 0.9557999290528556, 'eval_f1': 0.9499700313789091, 'eval_accuracy': 0.9425916424377173, 'eval_runtime': 4.9244, 'eval_samples_per_second': 1119.127, 'eval_steps_per_second': 2.234, 'epoch': 9.0}
{'train_runtime': 272.1277, 'train_samples_per_second': 890.946, 'train_steps_per_second': 0.889, 'train_loss': 0.18512607945336235, 'epoch': 9.0}



100%|██████████| 11/11 [00:02<00:00,  4.97it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.38it/s]

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9084    0.9160    0.9122      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9558    0.9689    0.9623     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9442    0.9558    0.9500     14095
   macro avg     0.9472    0.9490    0.9480     14095
weighted avg     0.9442    0.9558    0.9499     14095




[I 2025-10-02 07:59:14,134] Trial 1 finished with value: 0.9479579773027251 and parameters: {'learning_rate': 0.00022071470022688497, 'weight_decay': 0.006233252128617817, 'num_train_epochs': 11}. Best is trial 1 with value: 0.9479579773027251.


✅ Trial 1 finished | F1: 0.9480

=== Trial 2 ===
{}

=== Current parameters: {'learning_rate': 0.00018101727452685053, 'weight_decay': 0.009533146794213108, 'num_train_epochs': 12} ===


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f3d72c1e-4b05-4c71-b72a-07b137dfc500)')' thrown while requesting HEAD https://huggingface.co/Dersty/distilrubert_X5_ner_MLM/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  8%|▊         | 22/264 [00:24<03:59,  1.01it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.71it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.80it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.86it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.38it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.09it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.93it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.81it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.73it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.66it/s][A
100%|██████████| 11/11 [00:02<00:00,  5.00it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

                                                
  8%|▊         | 22/264 [00:29<03:59,  1.01it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8345    0.7632    0.7972      3475
     PERCENT     1.0000    0.0964    0.1758        83
        TYPE     0.8832    0.9371    0.9094     10485
      VOLUME     0.0000    0.0000    0.0000        52

   micro avg     0.8725    0.8858    0.8791     14095
   macro avg     0.6794    0.4492    0.4706     14095
weighted avg     0.8686    0.8858    0.8741     14095

{'eval_loss': 0.39577600359916687, 'eval_f1_macro': 0.47061307634212535, 'eval_precision': 0.8724757179791769, 'eval_recall': 0.8858460446967009, 'eval_f1': 0.8791100471731325, 'eval_accuracy': 0.8791402998688786, 'eval_runtime': 4.9135, 'eval_samples_per_second': 1121.603, 'eval_steps_per_second': 2.239, 'epoch': 1.0}


 17%|█▋        | 44/264 [00:54<03:39,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.36it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.09it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.91it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.80it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.71it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.67it/s][A
100%|██████████| 11/11 [00:02<00:00,  5.01it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 17%|█▋        | 44/264 [00:59<03:39,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8733    0.9108    0.8917      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9433    0.9719    0.9574     10485
      VOLUME     0.7778    0.6731    0.7216        52

   micro avg     0.9255    0.9557    0.9404     14095
   macro avg     0.8897    0.8829    0.8852     14095
weighted avg     0.9256    0.9557    0.9404     14095

{'eval_loss': 0.2121114879846573, 'eval_f1_macro': 0.8851940209886364, 'eval_precision': 0.9255238749570595, 'eval_recall': 0.9557289819084782, 'eval_f1': 0.9403839441535777, 'eval_accuracy': 0.9346673507781769, 'eval_runtime': 4.863, 'eval_samples_per_second': 1133.256, 'eval_steps_per_second': 2.262, 'epoch': 2.0}


 25%|██▌       | 66/264 [01:24<03:17,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.69it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.77it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.82it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.36it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.08it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 25%|██▌       | 66/264 [01:29<03:17,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9220    0.9111    0.9165      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9529    0.9765    0.9646     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9456    0.9602    0.9528     14095
   macro avg     0.9497    0.9449    0.9470     14095
weighted avg     0.9454    0.9602    0.9527     14095

{'eval_loss': 0.1859140396118164, 'eval_f1_macro': 0.9469869288713657, 'eval_precision': 0.9455739537483406, 'eval_recall': 0.9601986520042568, 'eval_f1': 0.9528301886792453, 'eval_accuracy': 0.9467533207912889, 'eval_runtime': 5.1014, 'eval_samples_per_second': 1080.288, 'eval_steps_per_second': 2.156, 'epoch': 3.0}


 33%|███▎      | 88/264 [01:55<02:56,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 88/264 [01:59<02:56,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9253    0.9119    0.9186      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9566    0.9723    0.9644     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9491    0.9571    0.9531     14095
   macro avg     0.9510    0.9344    0.9421     14095
weighted avg     0.9490    0.9571    0.9530     14095

{'eval_loss': 0.2079065889120102, 'eval_f1_macro': 0.9420657750911563, 'eval_precision': 0.9491346559729844, 'eval_recall': 0.957147924796027, 'eval_f1': 0.9531244480553888, 'eval_accuracy': 0.9448150048457898, 'eval_runtime': 4.8542, 'eval_samples_per_second': 1135.3, 'eval_steps_per_second': 2.266, 'epoch': 4.0}


 38%|███▊      | 100/264 [02:14<03:15,  1.19s/it]

{'loss': 0.3776, 'grad_norm': 0.787873387336731, 'learning_rate': 0.00014283095641666403, 'epoch': 4.55}


 42%|████▏     | 110/264 [02:25<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.69it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.81it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.84it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.38it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 42%|████▏     | 110/264 [02:30<02:34,  1.00s/it]A
 42%|████▏     | 110/264 [02:30<03:30,  1.36s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9110    0.9281    0.9195      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9553    0.9749    0.9650     10485
      VOLUME     0.9375    0.8654    0.9000        52

   micro avg     0.9444    0.9630    0.9536     14095
   macro avg     0.9420    0.9361    0.9386     14095
weighted avg     0.9444    0.9630    0.9536     14095

{'eval_loss': 0.238160640001297, 'eval_f1_macro': 0.9386351513498131, 'eval_precision': 0.9444057890342332, 'eval_recall': 0.9629655906349769, 'eval_f1': 0.9535953911546702, 'eval_accuracy': 0.9452710791859072, 'eval_runtime': 4.862, 'eval_samples_per_second': 1133.488, 'eval_steps_per_second': 2.262, 'epoch': 5.0}
{'train_runtime': 150.1034, 'train_samples_per_second': 1762.065, 'train_steps_per_second': 1.759, 'train_loss': 0.3475639974529093, 'epoch': 5.0}



100%|██████████| 11/11 [00:02<00:00,  4.98it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.26it/s]
[I 2025-10-02 08:02:01,361] Trial 2 finished with value: 0.9386351513498131 and parameters: {'learning_rate': 0.00018101727452685053, 'weight_decay': 0.009533146794213108, 'num_train_epochs': 12}. Best is trial 1 with value: 0.9479579773027251.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9110    0.9281    0.9195      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9553    0.9749    0.9650     10485
      VOLUME     0.9375    0.8654    0.9000        52

   micro avg     0.9444    0.9630    0.9536     14095
   macro avg     0.9420    0.9361    0.9386     14095
weighted avg     0.9444    0.9630    0.9536     14095

✅ Trial 2 finished | F1: 0.9386

=== Trial 3 ===
{}

=== Current parameters: {'learning_rate': 0.000364991586158588, 'weight_decay': 0.003092662398243915, 'num_train_epochs': 7} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 14%|█▍        | 22/154 [00:25<02:12,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.70it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.76it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.97it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 14%|█▍        | 22/154 [00:29<02:12,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8482    0.8650    0.8565      3475
     PERCENT     0.8723    0.9880    0.9266        83
        TYPE     0.9311    0.9582    0.9444     10485
      VOLUME     0.8000    0.6154    0.6957        52

   micro avg     0.9100    0.9342    0.9219     14095
   macro avg     0.8629    0.8566    0.8558     14095
weighted avg     0.9098    0.9342    0.9217     14095

{'eval_loss': 0.3015751242637634, 'eval_f1_macro': 0.8557956400569849, 'eval_precision': 0.9100145137880987, 'eval_recall': 0.9341610500177367, 'eval_f1': 0.9219297017224478, 'eval_accuracy': 0.9160253121258766, 'eval_runtime': 4.8517, 'eval_samples_per_second': 1135.889, 'eval_steps_per_second': 2.267, 'epoch': 1.0}


 29%|██▊       | 44/154 [00:55<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.66it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.77it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 29%|██▊       | 44/154 [00:59<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8765    0.9329    0.9038      3475
     PERCENT     0.9756    0.9639    0.9697        83
        TYPE     0.9465    0.9627    0.9546     10485
      VOLUME     0.9048    0.7308    0.8085        52

   micro avg     0.9287    0.9545    0.9414     14095
   macro avg     0.9258    0.8976    0.9091     14095
weighted avg     0.9293    0.9545    0.9416     14095

{'eval_loss': 0.20617720484733582, 'eval_f1_macro': 0.9091468637383754, 'eval_precision': 0.9286946917926416, 'eval_recall': 0.9545228804540618, 'eval_f1': 0.9414316702819956, 'eval_accuracy': 0.9364916481386466, 'eval_runtime': 4.8584, 'eval_samples_per_second': 1134.321, 'eval_steps_per_second': 2.264, 'epoch': 2.0}


 43%|████▎     | 66/154 [01:25<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.22it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.59it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.74it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 43%|████▎     | 66/154 [01:30<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9307    0.8999    0.9150      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9497    0.9782    0.9637     10485
      VOLUME     0.9583    0.8846    0.9200        52

   micro avg     0.9454    0.9585    0.9519     14095
   macro avg     0.9507    0.9346    0.9422     14095
weighted avg     0.9451    0.9585    0.9516     14095

{'eval_loss': 0.21927155554294586, 'eval_f1_macro': 0.9421962112586088, 'eval_precision': 0.9453502204184452, 'eval_recall': 0.9584959205391983, 'eval_f1': 0.9518776861833298, 'eval_accuracy': 0.9444729490907018, 'eval_runtime': 5.1429, 'eval_samples_per_second': 1071.564, 'eval_steps_per_second': 2.139, 'epoch': 3.0}


 57%|█████▋    | 88/154 [01:55<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.54it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 57%|█████▋    | 88/154 [02:00<01:06,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9137    0.9226    0.9181      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9550    0.9682    0.9616     10485
      VOLUME     0.9412    0.9231    0.9320        52

   micro avg     0.9449    0.9569    0.9509     14095
   macro avg     0.9436    0.9505    0.9470     14095
weighted avg     0.9448    0.9569    0.9508     14095

{'eval_loss': 0.23100069165229797, 'eval_f1_macro': 0.9469797769599299, 'eval_precision': 0.9448686514886164, 'eval_recall': 0.9569350833628947, 'eval_f1': 0.9508635882974974, 'eval_accuracy': 0.9431617353628642, 'eval_runtime': 4.9095, 'eval_samples_per_second': 1122.509, 'eval_steps_per_second': 2.241, 'epoch': 4.0}


 65%|██████▍   | 100/154 [02:14<01:04,  1.20s/it]

{'loss': 0.2971, 'grad_norm': 0.34760212898254395, 'learning_rate': 0.00012531183713528, 'epoch': 4.55}


 71%|███████▏  | 110/154 [02:25<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 71%|███████▏  | 110/154 [02:30<00:44,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9115    0.9183    0.9149      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9562    0.9684    0.9623     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9453    0.9560    0.9506     14095
   macro avg     0.9481    0.9494    0.9486     14095
weighted avg     0.9452    0.9560    0.9506     14095

{'eval_loss': 0.27924808859825134, 'eval_f1_macro': 0.9486249740130521, 'eval_precision': 0.9452823570676956, 'eval_recall': 0.956012770485988, 'eval_f1': 0.9506172839506174, 'eval_accuracy': 0.9424206145601733, 'eval_runtime': 4.9063, 'eval_samples_per_second': 1123.253, 'eval_steps_per_second': 2.242, 'epoch': 5.0}


 86%|████████▌ | 132/154 [02:56<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.79it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.69it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 86%|████████▌ | 132/154 [03:01<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9098    0.9203    0.9150      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9548    0.9684    0.9616     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9438    0.9565    0.9501     14095
   macro avg     0.9473    0.9499    0.9485     14095
weighted avg     0.9438    0.9565    0.9501     14095

{'eval_loss': 0.2846280634403229, 'eval_f1_macro': 0.9484853590773024, 'eval_precision': 0.9437871893594679, 'eval_recall': 0.95650940049663, 'eval_f1': 0.9501057082452431, 'eval_accuracy': 0.942477623852688, 'eval_runtime': 4.8548, 'eval_samples_per_second': 1135.163, 'eval_steps_per_second': 2.266, 'epoch': 6.0}


100%|██████████| 154/154 [03:26<00:00,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 154/154 [03:31<00:00,  1.00s/it]A
100%|██████████| 154/154 [03:31<00:00,  1.37s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9124    0.9203    0.9163      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9556    0.9699    0.9627     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9450    0.9576    0.9513     14095
   macro avg     0.9482    0.9503    0.9491     14095
weighted avg     0.9450    0.9576    0.9512     14095

{'eval_loss': 0.29016607999801636, 'eval_f1_macro': 0.9490884370630193, 'eval_precision': 0.9450357092844139, 'eval_recall': 0.9575736076622916, 'eval_f1': 0.951263347076858, 'eval_accuracy': 0.9432757539478935, 'eval_runtime': 4.8965, 'eval_samples_per_second': 1125.498, 'eval_steps_per_second': 2.247, 'epoch': 7.0}
{'train_runtime': 211.5066, 'train_samples_per_second': 729.467, 'train_steps_per_second': 0.728, 'train_loss': 0.19865755407841174, 'epoch': 7.0}



100%|██████████| 11/11 [00:02<00:00,  4.95it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.40it/s]

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9124    0.9203    0.9163      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9556    0.9699    0.9627     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9450    0.9576    0.9513     14095
   macro avg     0.9482    0.9503    0.9491     14095
weighted avg     0.9450    0.9576    0.9512     14095




[I 2025-10-02 08:05:38,615] Trial 3 finished with value: 0.9490884370630193 and parameters: {'learning_rate': 0.000364991586158588, 'weight_decay': 0.003092662398243915, 'num_train_epochs': 7}. Best is trial 3 with value: 0.9490884370630193.


✅ Trial 3 finished | F1: 0.9491

=== Trial 4 ===
{}

=== Current parameters: {'learning_rate': 0.00044174775154295215, 'weight_decay': 0.0011060715876380523, 'num_train_epochs': 13} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  8%|▊         | 22/286 [00:25<04:24,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.66it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8615    0.8432    0.8522      3475
     PERCENT     0.5816    0.9880    0.7321        83
        TYPE     0.9255    0.9657    0.9452     10485
      VOLUME     0.4286    0.1154    0.1818        52

   micro avg     0.9067    0.9325    0.9194     14095
   macro avg     0.6993    0.7280    0.6778     14095
weighted avg     0.9059    0.9325    0.9182     14095




                                                
  8%|▊         | 22/286 [00:29<04:24,  1.00s/it][A
                                               [A

{'eval_loss': 0.3091735243797302, 'eval_f1_macro': 0.6778395598611544, 'eval_precision': 0.9066639072847682, 'eval_recall': 0.9324583185526782, 'eval_f1': 0.9193802245461858, 'eval_accuracy': 0.9142010147654067, 'eval_runtime': 4.8679, 'eval_samples_per_second': 1132.117, 'eval_steps_per_second': 2.26, 'epoch': 1.0}


 15%|█▌        | 44/286 [00:55<04:03,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 15%|█▌        | 44/286 [01:00<04:03,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8645    0.9364    0.8990      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9529    0.9628    0.9578     10485
      VOLUME     0.8636    0.7308    0.7917        52

   micro avg     0.9297    0.9556    0.9424     14095
   macro avg     0.9086    0.9045    0.9047     14095
weighted avg     0.9308    0.9556    0.9428     14095

{'eval_loss': 0.20687761902809143, 'eval_f1_macro': 0.9047313482948092, 'eval_precision': 0.9296659304251794, 'eval_recall': 0.9555870876197233, 'eval_f1': 0.9424483084350839, 'eval_accuracy': 0.9372897782338521, 'eval_runtime': 4.9516, 'eval_samples_per_second': 1112.963, 'eval_steps_per_second': 2.221, 'epoch': 2.0}


 23%|██▎       | 66/286 [01:25<03:41,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.67it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 23%|██▎       | 66/286 [01:30<03:41,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8596    0.9462    0.9008      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9553    0.9569    0.9561     10485
      VOLUME     0.9362    0.8462    0.8889        52

   micro avg     0.9300    0.9539    0.9418     14095
   macro avg     0.9287    0.9283    0.9274     14095
weighted avg     0.9317    0.9539    0.9423     14095

{'eval_loss': 0.23344796895980835, 'eval_f1_macro': 0.9274204797623763, 'eval_precision': 0.9299993082935603, 'eval_recall': 0.9538843561546648, 'eval_f1': 0.9417904174838891, 'eval_accuracy': 0.9359215552134998, 'eval_runtime': 5.0985, 'eval_samples_per_second': 1080.907, 'eval_steps_per_second': 2.157, 'epoch': 3.0}


 31%|███       | 88/286 [01:55<03:19,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.67it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 31%|███       | 88/286 [02:00<03:19,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9036    0.9220    0.9127      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9549    0.9643    0.9596     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9422    0.9537    0.9479     14095
   macro avg     0.9404    0.9367    0.9383     14095
weighted avg     0.9423    0.9537    0.9479     14095

{'eval_loss': 0.2471286505460739, 'eval_f1_macro': 0.9383145100612015, 'eval_precision': 0.9421742482652274, 'eval_recall': 0.9536715147215324, 'eval_f1': 0.9478880191805936, 'eval_accuracy': 0.9399692149820421, 'eval_runtime': 4.8721, 'eval_samples_per_second': 1131.126, 'eval_steps_per_second': 2.258, 'epoch': 4.0}


 35%|███▍      | 100/286 [02:15<03:43,  1.20s/it]

{'loss': 0.3265, 'grad_norm': 0.43463703989982605, 'learning_rate': 0.00036570091584890176, 'epoch': 4.55}


 38%|███▊      | 110/286 [02:26<02:56,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.73it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 38%|███▊      | 110/286 [02:31<02:56,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8901    0.9327    0.9109      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9518    0.9655    0.9586     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9362    0.9571    0.9465     14095
   macro avg     0.9381    0.9349    0.9357     14095
weighted avg     0.9366    0.9571    0.9467     14095

{'eval_loss': 0.2855836749076843, 'eval_f1_macro': 0.9357134158740077, 'eval_precision': 0.936220417794434, 'eval_recall': 0.9570769776516496, 'eval_f1': 0.9465338198147628, 'eval_accuracy': 0.9383729547916311, 'eval_runtime': 4.8754, 'eval_samples_per_second': 1130.38, 'eval_steps_per_second': 2.256, 'epoch': 5.0}


 46%|████▌     | 132/286 [02:56<02:35,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.51it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 46%|████▌     | 132/286 [03:01<02:35,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9247    0.9114    0.9180      3475
     PERCENT     0.9405    0.9518    0.9461        83
        TYPE     0.9486    0.9735    0.9609     10485
      VOLUME     0.9792    0.9038    0.9400        52

   micro avg     0.9429    0.9578    0.9503     14095
   macro avg     0.9482    0.9351    0.9412     14095
weighted avg     0.9428    0.9578    0.9501     14095

{'eval_loss': 0.29935574531555176, 'eval_f1_macro': 0.9412409282553315, 'eval_precision': 0.9429349724104211, 'eval_recall': 0.9577864490954239, 'eval_f1': 0.9503026890046459, 'eval_accuracy': 0.9415084658799384, 'eval_runtime': 4.8499, 'eval_samples_per_second': 1136.319, 'eval_steps_per_second': 2.268, 'epoch': 6.0}


 54%|█████▍    | 154/286 [03:26<02:13,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.67it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.74it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.29it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 54%|█████▍    | 154/286 [03:31<02:13,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9050    0.9243    0.9146      3475
     PERCENT     0.9412    0.9639    0.9524        83
        TYPE     0.9563    0.9614    0.9588     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9434    0.9519    0.9476     14095
   macro avg     0.9400    0.9287    0.9337     14095
weighted avg     0.9436    0.9519    0.9477     14095

{'eval_loss': 0.31298649311065674, 'eval_f1_macro': 0.9337158368718871, 'eval_precision': 0.943397553086767, 'eval_recall': 0.9518978361120964, 'eval_f1': 0.9476286329766571, 'eval_accuracy': 0.9386580012542044, 'eval_runtime': 4.9207, 'eval_samples_per_second': 1119.972, 'eval_steps_per_second': 2.235, 'epoch': 7.0}


 62%|██████▏   | 176/286 [03:57<01:50,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.51it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.74it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 62%|██████▏   | 176/286 [04:02<01:50,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9078    0.9177    0.9127      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9543    0.9665    0.9604     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9429    0.9542    0.9485     14095
   macro avg     0.9433    0.9344    0.9381     14095
weighted avg     0.9429    0.9542    0.9485     14095

{'eval_loss': 0.35150495171546936, 'eval_f1_macro': 0.9381498163503288, 'eval_precision': 0.9428671573781984, 'eval_recall': 0.954239091876552, 'eval_f1': 0.9485190409026798, 'eval_accuracy': 0.9397411778119834, 'eval_runtime': 4.8681, 'eval_samples_per_second': 1132.075, 'eval_steps_per_second': 2.26, 'epoch': 8.0}


 62%|██████▏   | 176/286 [04:02<02:31,  1.38s/it]

{'train_runtime': 242.2008, 'train_samples_per_second': 1183.039, 'train_steps_per_second': 1.181, 'train_loss': 0.19361952895467932, 'epoch': 8.0}



100%|██████████| 11/11 [00:02<00:00,  4.76it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.26it/s]
[I 2025-10-02 08:09:46,768] Trial 4 finished with value: 0.9381498163503288 and parameters: {'learning_rate': 0.00044174775154295215, 'weight_decay': 0.0011060715876380523, 'num_train_epochs': 13}. Best is trial 3 with value: 0.9490884370630193.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9078    0.9177    0.9127      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9543    0.9665    0.9604     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9429    0.9542    0.9485     14095
   macro avg     0.9433    0.9344    0.9381     14095
weighted avg     0.9429    0.9542    0.9485     14095

✅ Trial 4 finished | F1: 0.9381

=== Trial 5 ===
{}

=== Current parameters: {'learning_rate': 0.0004384618142333153, 'weight_decay': 0.007825198064126099, 'num_train_epochs': 10} ===


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 75938dd8-b239-4a9b-ac32-f38338a5c546)')' thrown while requesting HEAD https://huggingface.co/Dersty/distilrubert_X5_ner_MLM/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 10%|█         | 22/220 [00:24<03:16,  1.01it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.77it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.82it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.86it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.38it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.09it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.91it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.80it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.99it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 10%|█         | 22/220 [00:29<03:16,  1.01it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8710    0.8242    0.8470      3475
     PERCENT     0.5896    0.9518    0.7281        83
        TYPE     0.9170    0.9663    0.9410     10485
      VOLUME     1.0000    0.1346    0.2373        52

   micro avg     0.9036    0.9281    0.9157     14095
   macro avg     0.8444    0.7192    0.6883     14095
weighted avg     0.9041    0.9281    0.9140     14095

{'eval_loss': 0.3071883022785187, 'eval_f1_macro': 0.6883459100138531, 'eval_precision': 0.9035778422434038, 'eval_recall': 0.9281305427456545, 'eval_f1': 0.9156896370699612, 'eval_accuracy': 0.9098112992417764, 'eval_runtime': 4.8456, 'eval_samples_per_second': 1137.317, 'eval_steps_per_second': 2.27, 'epoch': 1.0}


 20%|██        | 44/220 [00:54<02:55,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.76it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.82it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.91it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.69it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.97it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 20%|██        | 44/220 [00:59<02:55,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8674    0.9298    0.8975      3475
     PERCENT     0.9630    0.9398    0.9512        83
        TYPE     0.9485    0.9648    0.9566     10485
      VOLUME     0.8478    0.7500    0.7959        52

   micro avg     0.9275    0.9552    0.9411     14095
   macro avg     0.9067    0.8961    0.9003     14095
weighted avg     0.9282    0.9552    0.9414     14095

{'eval_loss': 0.21187564730644226, 'eval_f1_macro': 0.900308406055728, 'eval_precision': 0.9274643521388717, 'eval_recall': 0.9552323518978361, 'eval_f1': 0.9411435761219069, 'eval_accuracy': 0.9348953879482356, 'eval_runtime': 4.8313, 'eval_samples_per_second': 1140.693, 'eval_steps_per_second': 2.277, 'epoch': 2.0}


 30%|███       | 66/220 [01:24<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.59it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 30%|███       | 66/220 [01:29<02:34,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8571    0.9387    0.8960      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9578    0.9582    0.9580     10485
      VOLUME     0.9583    0.8846    0.9200        52

   micro avg     0.9312    0.9532    0.9421     14095
   macro avg     0.9344    0.9394    0.9360     14095
weighted avg     0.9330    0.9532    0.9427     14095

{'eval_loss': 0.23819518089294434, 'eval_f1_macro': 0.9360220653635254, 'eval_precision': 0.9312448017743277, 'eval_recall': 0.9532458318552678, 'eval_f1': 0.9421168881253724, 'eval_accuracy': 0.9358645459209851, 'eval_runtime': 5.1199, 'eval_samples_per_second': 1076.386, 'eval_steps_per_second': 2.148, 'epoch': 3.0}


 40%|████      | 88/220 [01:55<02:12,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.67it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 40%|████      | 88/220 [02:00<02:12,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9142    0.9137    0.9139      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9526    0.9685    0.9605     10485
      VOLUME     0.9583    0.8846    0.9200        52

   micro avg     0.9434    0.9547    0.9490     14095
   macro avg     0.9474    0.9357    0.9411     14095
weighted avg     0.9432    0.9547    0.9489     14095

{'eval_loss': 0.25153765082359314, 'eval_f1_macro': 0.941125676516505, 'eval_precision': 0.9433578689099194, 'eval_recall': 0.954735721887194, 'eval_f1': 0.9490126939351199, 'eval_accuracy': 0.9403112707371302, 'eval_runtime': 4.882, 'eval_samples_per_second': 1128.831, 'eval_steps_per_second': 2.253, 'epoch': 4.0}


 45%|████▌     | 100/220 [02:14<02:23,  1.20s/it]

{'loss': 0.3085, 'grad_norm': 0.49321016669273376, 'learning_rate': 0.0002942122933901133, 'epoch': 4.55}


 50%|█████     | 110/220 [02:25<01:50,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 50%|█████     | 110/220 [02:30<01:50,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9201    0.9145    0.9173      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9513    0.9729    0.9620     10485
      VOLUME     0.9412    0.9231    0.9320        52

   micro avg     0.9438    0.9583    0.9510     14095
   macro avg     0.9441    0.9436    0.9438     14095
weighted avg     0.9437    0.9583    0.9509     14095

{'eval_loss': 0.29002121090888977, 'eval_f1_macro': 0.9437984535819779, 'eval_precision': 0.9438194395919223, 'eval_recall': 0.958283079106066, 'eval_f1': 0.9509962683940012, 'eval_accuracy': 0.9420215495125706, 'eval_runtime': 4.8958, 'eval_samples_per_second': 1125.648, 'eval_steps_per_second': 2.247, 'epoch': 5.0}


 60%|██████    | 132/220 [02:55<01:29,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 60%|██████    | 132/220 [03:00<01:29,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9087    0.9188    0.9137      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9535    0.9690    0.9612     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9426    0.9563    0.9494     14095
   macro avg     0.9460    0.9323    0.9385     14095
weighted avg     0.9426    0.9563    0.9494     14095

{'eval_loss': 0.33347654342651367, 'eval_f1_macro': 0.9385208315604311, 'eval_precision': 0.9425874125874126, 'eval_recall': 0.9562965590634976, 'eval_f1': 0.9493924986793449, 'eval_accuracy': 0.9402542614446154, 'eval_runtime': 4.9553, 'eval_samples_per_second': 1112.147, 'eval_steps_per_second': 2.22, 'epoch': 6.0}


 70%|███████   | 154/220 [03:26<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.52it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.67it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.73it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.29it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.61it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 70%|███████   | 154/220 [03:31<01:06,  1.01s/it]A
 70%|███████   | 154/220 [03:31<01:06,  1.01s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9116    0.9168    0.9142      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9563    0.9665    0.9614     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9454    0.9539    0.9496     14095
   macro avg     0.9473    0.9281    0.9371     14095
weighted avg     0.9453    0.9539    0.9496     14095

{'eval_loss': 0.32808563113212585, 'eval_f1_macro': 0.9371347308967535, 'eval_precision': 0.9453663338489664, 'eval_recall': 0.9538843561546648, 'eval_f1': 0.9496062435992514, 'eval_accuracy': 0.940710335784733, 'eval_runtime': 4.964, 'eval_samples_per_second': 1110.197, 'eval_steps_per_second': 2.216, 'epoch': 7.0}
{'train_runtime': 211.4036, 'train_samples_per_second': 1042.603, 'train_steps_per_second': 1.041, 'train_loss': 0.20715248042886908, 'epoch': 7.0}


 70%|███████   | 154/220 [03:31<01:30,  1.37s/it]
100%|██████████| 11/11 [00:02<00:00,  4.94it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.36it/s]

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9116    0.9168    0.9142      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9563    0.9665    0.9614     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9454    0.9539    0.9496     14095
   macro avg     0.9473    0.9281    0.9371     14095
weighted avg     0.9453    0.9539    0.9496     14095




[I 2025-10-02 08:13:35,138] Trial 5 finished with value: 0.9371347308967535 and parameters: {'learning_rate': 0.0004384618142333153, 'weight_decay': 0.007825198064126099, 'num_train_epochs': 10}. Best is trial 3 with value: 0.9490884370630193.


✅ Trial 5 finished | F1: 0.9371

=== Trial 6 ===
{}

=== Current parameters: {'learning_rate': 0.0004026606979829003, 'weight_decay': 0.008132177642386032, 'num_train_epochs': 6} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 17%|█▋        | 22/132 [00:25<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.83it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 17%|█▋        | 22/132 [00:30<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8645    0.8391    0.8516      3475
     PERCENT     0.8351    0.9759    0.9000        83
        TYPE     0.9399    0.9533    0.9465     10485
      VOLUME     0.3542    0.3269    0.3400        52

   micro avg     0.9192    0.9230    0.9211     14095
   macro avg     0.7484    0.7738    0.7595     14095
weighted avg     0.9185    0.9230    0.9206     14095

{'eval_loss': 0.2961544096469879, 'eval_f1_macro': 0.7595441358558732, 'eval_precision': 0.9192340305257207, 'eval_recall': 0.9229514012061014, 'eval_f1': 0.9210889651998442, 'eval_accuracy': 0.9150561541531269, 'eval_runtime': 4.8829, 'eval_samples_per_second': 1128.642, 'eval_steps_per_second': 2.253, 'epoch': 1.0}


 33%|███▎      | 44/132 [00:55<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 44/132 [01:00<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8614    0.9390    0.8985      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9463    0.9636    0.9549     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9244    0.9573    0.9405     14095
   macro avg     0.9297    0.9390    0.9332     14095
weighted avg     0.9255    0.9573    0.9409     14095

{'eval_loss': 0.21482785046100616, 'eval_f1_macro': 0.9332254243767643, 'eval_precision': 0.9243680208261972, 'eval_recall': 0.9572898190847818, 'eval_f1': 0.9405409173288721, 'eval_accuracy': 0.935693518043441, 'eval_runtime': 4.893, 'eval_samples_per_second': 1126.307, 'eval_steps_per_second': 2.248, 'epoch': 2.0}


 50%|█████     | 66/132 [01:25<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 66/132 [01:30<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9375    0.8941    0.9153      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9457    0.9793    0.9622     10485
      VOLUME     0.9583    0.8846    0.9200        52

   micro avg     0.9439    0.9579    0.9509     14095
   macro avg     0.9515    0.9335    0.9419     14095
weighted avg     0.9438    0.9579    0.9505     14095

{'eval_loss': 0.20581591129302979, 'eval_f1_macro': 0.9418882642132759, 'eval_precision': 0.9439317673378076, 'eval_recall': 0.9579283433841788, 'eval_f1': 0.9508785520616923, 'eval_accuracy': 0.9439598654580696, 'eval_runtime': 5.1849, 'eval_samples_per_second': 1062.9, 'eval_steps_per_second': 2.122, 'epoch': 3.0}


 67%|██████▋   | 88/132 [01:56<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 67%|██████▋   | 88/132 [02:00<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9109    0.9235    0.9171      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9536    0.9676    0.9605     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9431    0.9567    0.9498     14095
   macro avg     0.9473    0.9505    0.9488     14095
weighted avg     0.9431    0.9567    0.9498     14095

{'eval_loss': 0.23477418720722198, 'eval_f1_macro': 0.9487512412037433, 'eval_precision': 0.9431349234105058, 'eval_recall': 0.9566512947853849, 'eval_f1': 0.9498450267681038, 'eval_accuracy': 0.942648651730232, 'eval_runtime': 4.8916, 'eval_samples_per_second': 1126.616, 'eval_steps_per_second': 2.249, 'epoch': 4.0}


 76%|███████▌  | 100/132 [02:15<00:38,  1.20s/it]

{'loss': 0.2806, 'grad_norm': 0.2335769385099411, 'learning_rate': 7.283242475647538e-05, 'epoch': 4.55}


 83%|████████▎ | 110/132 [02:26<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 83%|████████▎ | 110/132 [02:31<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9150    0.9295    0.9222      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9597    0.9662    0.9630     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9487    0.9571    0.9529     14095
   macro avg     0.9499    0.9517    0.9506     14095
weighted avg     0.9487    0.9571    0.9529     14095

{'eval_loss': 0.25955837965011597, 'eval_f1_macro': 0.9506356053222624, 'eval_precision': 0.948667463610154, 'eval_recall': 0.957147924796027, 'eval_f1': 0.9528888261053821, 'eval_accuracy': 0.9446439769682459, 'eval_runtime': 4.8982, 'eval_samples_per_second': 1125.108, 'eval_steps_per_second': 2.246, 'epoch': 5.0}


100%|██████████| 132/132 [02:56<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.69it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.77it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.83it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.24it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 132/132 [03:01<00:00,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9190    0.9275    0.9232      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9599    0.9687    0.9643     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9499    0.9585    0.9542     14095
   macro avg     0.9509    0.9518    0.9512     14095
weighted avg     0.9499    0.9585    0.9542     14095

{'eval_loss': 0.2663465738296509, 'eval_f1_macro': 0.9512251951099278, 'eval_precision': 0.9498699289882585, 'eval_recall': 0.9584959205391983, 'eval_f1': 0.954163429620736, 'eval_accuracy': 0.9457271535260248, 'eval_runtime': 5.0966, 'eval_samples_per_second': 1081.317, 'eval_steps_per_second': 2.158, 'epoch': 6.0}


100%|██████████| 132/132 [03:01<00:00,  1.38s/it]

{'train_runtime': 181.7676, 'train_samples_per_second': 727.555, 'train_steps_per_second': 0.726, 'train_loss': 0.2174458201184417, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.39it/s]
[I 2025-10-02 08:16:42,622] Trial 6 finished with value: 0.9512251951099278 and parameters: {'learning_rate': 0.0004026606979829003, 'weight_decay': 0.008132177642386032, 'num_train_epochs': 6}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9190    0.9275    0.9232      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9599    0.9687    0.9643     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9499    0.9585    0.9542     14095
   macro avg     0.9509    0.9518    0.9512     14095
weighted avg     0.9499    0.9585    0.9542     14095

✅ Trial 6 finished | F1: 0.9512

=== Trial 7 ===
{}

=== Current parameters: {'learning_rate': 0.00028124600848212724, 'weight_decay': 0.0031393924968630784, 'num_train_epochs': 11} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
  9%|▉         | 22/242 [00:25<03:40,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.75it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.78it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.83it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
  9%|▉         | 22/242 [00:29<03:40,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8254    0.8204    0.8229      3475
     PERCENT     0.7838    0.6988    0.7389        83
        TYPE     0.9186    0.9412    0.9298     10485
      VOLUME     0.3636    0.0769    0.1270        52

   micro avg     0.8950    0.9068    0.9009     14095
   macro avg     0.7229    0.6343    0.6546     14095
weighted avg     0.8928    0.9068    0.8994     14095

{'eval_loss': 0.3427426815032959, 'eval_f1_macro': 0.6546413713750542, 'eval_precision': 0.894972692900154, 'eval_recall': 0.9068463994324228, 'eval_f1': 0.9008704232300807, 'eval_accuracy': 0.8982384128612964, 'eval_runtime': 4.8429, 'eval_samples_per_second': 1137.961, 'eval_steps_per_second': 2.271, 'epoch': 1.0}


 18%|█▊        | 44/242 [00:55<03:18,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 18%|█▊        | 44/242 [01:00<03:18,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8734    0.9309    0.9012      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9484    0.9673    0.9577     10485
      VOLUME     0.8913    0.7885    0.8367        52

   micro avg     0.9292    0.9578    0.9433     14095
   macro avg     0.9194    0.9187    0.9180     14095
weighted avg     0.9298    0.9578    0.9435     14095

{'eval_loss': 0.20029370486736298, 'eval_f1_macro': 0.9179765139361086, 'eval_precision': 0.9291761304976255, 'eval_recall': 0.9577864490954239, 'eval_f1': 0.9432643935159306, 'eval_accuracy': 0.9384869733766604, 'eval_runtime': 4.8305, 'eval_samples_per_second': 1140.87, 'eval_steps_per_second': 2.277, 'epoch': 2.0}


 27%|██▋       | 66/242 [01:25<02:56,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 27%|██▋       | 66/242 [01:30<02:56,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9148    0.9240    0.9194      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9532    0.9754    0.9642     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9439    0.9625    0.9531     14095
   macro avg     0.9479    0.9448    0.9461     14095
weighted avg     0.9438    0.9625    0.9530     14095

{'eval_loss': 0.19654330611228943, 'eval_f1_macro': 0.9460815656873212, 'eval_precision': 0.943918730865572, 'eval_recall': 0.9624689606243348, 'eval_f1': 0.9531035936347351, 'eval_accuracy': 0.9456131349409954, 'eval_runtime': 5.1376, 'eval_samples_per_second': 1072.68, 'eval_steps_per_second': 2.141, 'epoch': 3.0}


 36%|███▋      | 88/242 [01:55<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 36%|███▋      | 88/242 [02:00<02:34,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8871    0.9318    0.9089      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9581    0.9528    0.9554     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9399    0.9477    0.9438     14095
   macro avg     0.9425    0.9489    0.9454     14095
weighted avg     0.9406    0.9477    0.9440     14095

{'eval_loss': 0.23760628700256348, 'eval_f1_macro': 0.945427878790545, 'eval_precision': 0.9399099352659724, 'eval_recall': 0.9477119545938276, 'eval_f1': 0.9437948210689936, 'eval_accuracy': 0.936605666723676, 'eval_runtime': 4.8548, 'eval_samples_per_second': 1135.165, 'eval_steps_per_second': 2.266, 'epoch': 4.0}


 41%|████▏     | 100/242 [02:14<02:49,  1.20s/it]

{'loss': 0.3302, 'grad_norm': 0.5064418315887451, 'learning_rate': 0.00020797582320869126, 'epoch': 4.55}


 45%|████▌     | 110/242 [02:25<02:12,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 45%|████▌     | 110/242 [02:30<02:12,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9259    0.9137    0.9198      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9519    0.9749    0.9632     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9457    0.9596    0.9526     14095
   macro avg     0.9505    0.9469    0.9486     14095
weighted avg     0.9456    0.9596    0.9525     14095

{'eval_loss': 0.2443336397409439, 'eval_f1_macro': 0.9485605413838583, 'eval_precision': 0.9457418542861138, 'eval_recall': 0.9596310748492373, 'eval_f1': 0.9526358418142761, 'eval_accuracy': 0.9461832278661422, 'eval_runtime': 4.9329, 'eval_samples_per_second': 1117.204, 'eval_steps_per_second': 2.23, 'epoch': 5.0}


 55%|█████▍    | 132/242 [02:56<01:51,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.61it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 55%|█████▍    | 132/242 [03:01<01:51,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9343    0.8999    0.9167      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9479    0.9751    0.9613     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9448    0.9563    0.9505     14095
   macro avg     0.9514    0.9387    0.9447     14095
weighted avg     0.9447    0.9563    0.9503     14095

{'eval_loss': 0.2818219065666199, 'eval_f1_macro': 0.9447004331808849, 'eval_precision': 0.9448338707416234, 'eval_recall': 0.9562965590634976, 'eval_f1': 0.9505306582983675, 'eval_accuracy': 0.9443589305056724, 'eval_runtime': 4.8731, 'eval_samples_per_second': 1130.909, 'eval_steps_per_second': 2.257, 'epoch': 6.0}


 64%|██████▎   | 154/242 [03:26<01:28,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.49it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 64%|██████▎   | 154/242 [03:31<01:28,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9080    0.9312    0.9194      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9592    0.9692    0.9642     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9465    0.9597    0.9530     14095
   macro avg     0.9479    0.9498    0.9487     14095
weighted avg     0.9466    0.9597    0.9531     14095

{'eval_loss': 0.29228124022483826, 'eval_f1_macro': 0.9487168744396723, 'eval_precision': 0.9464735516372796, 'eval_recall': 0.9597020219936148, 'eval_f1': 0.9530418853700637, 'eval_accuracy': 0.945328088478422, 'eval_runtime': 4.9312, 'eval_samples_per_second': 1117.584, 'eval_steps_per_second': 2.231, 'epoch': 7.0}


 73%|███████▎  | 176/242 [03:56<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.52it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 73%|███████▎  | 176/242 [04:02<01:06,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9179    0.9232    0.9205      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9576    0.9669    0.9622     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9478    0.9559    0.9519     14095
   macro avg     0.9469    0.9425    0.9444     14095
weighted avg     0.9478    0.9559    0.9518     14095

{'eval_loss': 0.30975666642189026, 'eval_f1_macro': 0.944430009025579, 'eval_precision': 0.9478052898142938, 'eval_recall': 0.9559418233416105, 'eval_f1': 0.951856168980255, 'eval_accuracy': 0.9431617353628642, 'eval_runtime': 5.1494, 'eval_samples_per_second': 1070.213, 'eval_steps_per_second': 2.136, 'epoch': 8.0}


 82%|████████▏ | 198/242 [04:27<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.53it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  4.98it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.56it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9168    0.9223    0.9195      3475
     PERCENT     0.9518    0.9518    0.9518        83
        TYPE     0.9567    0.9701    0.9633     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9469    0.9580    0.9524     14095
   macro avg     0.9463    0.9418    0.9440     14095
weighted avg     0.9469    0.9580    0.9524     14095




                                                 
 82%|████████▏ | 198/242 [04:32<00:44,  1.01s/it]A
 82%|████████▏ | 198/242 [04:32<01:00,  1.38s/it]A

{'eval_loss': 0.31489208340644836, 'eval_f1_macro': 0.9439631928181316, 'eval_precision': 0.946914446002805, 'eval_recall': 0.9579992905285563, 'eval_f1': 0.9524246164697584, 'eval_accuracy': 0.9447009862607605, 'eval_runtime': 4.8868, 'eval_samples_per_second': 1127.737, 'eval_steps_per_second': 2.251, 'epoch': 9.0}
{'train_runtime': 272.36, 'train_samples_per_second': 890.186, 'train_steps_per_second': 0.889, 'train_loss': 0.17445094898493604, 'epoch': 9.0}



100%|██████████| 11/11 [00:02<00:00,  4.94it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9168    0.9223    0.9195      3475
     PERCENT     0.9518    0.9518    0.9518        83
        TYPE     0.9567    0.9701    0.9633     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9469    0.9580    0.9524     14095
   macro avg     0.9463    0.9418    0.9440     14095
weighted avg     0.9469    0.9580    0.9524     14095



100%|██████████| 11/11 [00:04<00:00,  2.38it/s]
[I 2025-10-02 08:21:20,751] Trial 7 finished with value: 0.9439631928181316 and parameters: {'learning_rate': 0.00028124600848212724, 'weight_decay': 0.0031393924968630784, 'num_train_epochs': 11}. Best is trial 6 with value: 0.9512251951099278.


✅ Trial 7 finished | F1: 0.9440

=== Trial 8 ===
{}

=== Current parameters: {'learning_rate': 0.0001552014152658641, 'weight_decay': 0.0012095956363889111, 'num_train_epochs': 7} ===


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 50ecd54a-0c9d-46d4-9e70-b99e5f133490)')' thrown while requesting HEAD https://huggingface.co/Dersty/distilrubert_X5_ner_MLM/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 14%|█▍        | 22/154 [00:24<02:11,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.76it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.83it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.86it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.40it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.09it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.79it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.71it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.66it/s][A
100%|██████████| 11/11 [00:02<00:00,  5.00it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 14%|█▍        | 22/154 [00:29<02:11,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8287    0.8187    0.8237      3475
     PERCENT     0.7800    0.4699    0.5865        83
        TYPE     0.8918    0.9499    0.9199     10485
      VOLUME     0.0000    0.0000    0.0000        52

   micro avg     0.8765    0.9112    0.8936     14095
   macro avg     0.6251    0.5596    0.5825     14095
weighted avg     0.8723    0.9112    0.8908     14095

{'eval_loss': 0.362575501203537, 'eval_f1_macro': 0.5825178170838283, 'eval_precision': 0.8765440524124752, 'eval_recall': 0.911245122383824, 'eval_f1': 0.8935578127174064, 'eval_accuracy': 0.8923094464397697, 'eval_runtime': 4.8939, 'eval_samples_per_second': 1126.088, 'eval_steps_per_second': 2.248, 'epoch': 1.0}


 29%|██▊       | 44/154 [00:54<01:49,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.76it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.82it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.69it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.98it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 29%|██▊       | 44/154 [00:59<01:49,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8888    0.9134    0.9009      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9467    0.9707    0.9586     10485
      VOLUME     0.8409    0.7115    0.7708        52

   micro avg     0.9322    0.9557    0.9438     14095
   macro avg     0.9102    0.8929    0.9001     14095
weighted avg     0.9321    0.9557    0.9437     14095

{'eval_loss': 0.20709198713302612, 'eval_f1_macro': 0.900097712377534, 'eval_precision': 0.9321799307958477, 'eval_recall': 0.9556580347641007, 'eval_f1': 0.9437729900157645, 'eval_accuracy': 0.9375748246964255, 'eval_runtime': 4.8849, 'eval_samples_per_second': 1128.18, 'eval_steps_per_second': 2.252, 'epoch': 2.0}


 43%|████▎     | 66/154 [01:24<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9004    0.9237    0.9119      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9564    0.9682    0.9623     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9425    0.9571    0.9498     14095
   macro avg     0.9452    0.9459    0.9453     14095
weighted avg     0.9426    0.9571    0.9498     14095




                                                
 43%|████▎     | 66/154 [01:30<01:28,  1.00s/it][A
                                               [A

{'eval_loss': 0.196588933467865, 'eval_f1_macro': 0.9452725612989532, 'eval_precision': 0.9425038423920638, 'eval_recall': 0.957147924796027, 'eval_f1': 0.9497694392622056, 'eval_accuracy': 0.9427626703152614, 'eval_runtime': 5.2024, 'eval_samples_per_second': 1059.329, 'eval_steps_per_second': 2.114, 'epoch': 3.0}


 57%|█████▋    | 88/154 [01:55<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9218    0.9197    0.9208      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9576    0.9721    0.9648     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9489    0.9590    0.9539     14095
   macro avg     0.9460    0.9459    0.9458     14095
weighted avg     0.9488    0.9590    0.9538     14095




                                                
 57%|█████▋    | 88/154 [02:00<01:06,  1.00s/it][A
                                               [A

{'eval_loss': 0.2077416330575943, 'eval_f1_macro': 0.9458293203272348, 'eval_precision': 0.9488943488943489, 'eval_recall': 0.9589925505498403, 'eval_f1': 0.9539167254763584, 'eval_accuracy': 0.9468673393763183, 'eval_runtime': 4.8795, 'eval_samples_per_second': 1129.425, 'eval_steps_per_second': 2.254, 'epoch': 4.0}


 65%|██████▍   | 100/154 [02:14<01:04,  1.19s/it]

{'loss': 0.3516, 'grad_norm': 0.7113385796546936, 'learning_rate': 5.328499398479438e-05, 'epoch': 4.55}


 71%|███████▏  | 110/154 [02:25<00:44,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.61it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 71%|███████▏  | 110/154 [02:30<00:44,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9125    0.9240    0.9182      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9580    0.9715    0.9647     10485
      VOLUME     0.9412    0.9231    0.9320        52

   micro avg     0.9467    0.9597    0.9532     14095
   macro avg     0.9441    0.9516    0.9478     14095
weighted avg     0.9467    0.9597    0.9532     14095

{'eval_loss': 0.2260747104883194, 'eval_f1_macro': 0.947779902406897, 'eval_precision': 0.9467385218365062, 'eval_recall': 0.9597020219936148, 'eval_f1': 0.9531761970193426, 'eval_accuracy': 0.9458411721110541, 'eval_runtime': 4.9222, 'eval_samples_per_second': 1119.613, 'eval_steps_per_second': 2.235, 'epoch': 5.0}


 86%|████████▌ | 132/154 [02:55<00:22,  1.02s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.55it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 86%|████████▌ | 132/154 [03:00<00:22,  1.02s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9052    0.9229    0.9139      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9551    0.9708    0.9629     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9428    0.9589    0.9508     14095
   macro avg     0.9413    0.9464    0.9437     14095
weighted avg     0.9428    0.9589    0.9508     14095

{'eval_loss': 0.23362095654010773, 'eval_f1_macro': 0.943653028349684, 'eval_precision': 0.9427973491454482, 'eval_recall': 0.9588506562610855, 'eval_f1': 0.9507562434048541, 'eval_accuracy': 0.9443589305056724, 'eval_runtime': 4.8589, 'eval_samples_per_second': 1134.215, 'eval_steps_per_second': 2.264, 'epoch': 6.0}


100%|██████████| 154/154 [03:26<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 154/154 [03:31<00:00,  1.01s/it]A
100%|██████████| 154/154 [03:31<00:00,  1.37s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9076    0.9243    0.9159      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9584    0.9679    0.9631     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9458    0.9570    0.9514     14095
   macro avg     0.9427    0.9460    0.9442     14095
weighted avg     0.9459    0.9570    0.9514     14095

{'eval_loss': 0.23438744246959686, 'eval_f1_macro': 0.9441924494516356, 'eval_precision': 0.9458000280465573, 'eval_recall': 0.9570060305072721, 'eval_f1': 0.9513700320908418, 'eval_accuracy': 0.9441308933356137, 'eval_runtime': 4.9089, 'eval_samples_per_second': 1122.651, 'eval_steps_per_second': 2.241, 'epoch': 7.0}
{'train_runtime': 211.0896, 'train_samples_per_second': 730.907, 'train_steps_per_second': 0.73, 'train_loss': 0.24075895160823674, 'epoch': 7.0}



100%|██████████| 11/11 [00:02<00:00,  4.97it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.38it/s]
[I 2025-10-02 08:25:08,735] Trial 8 finished with value: 0.9441924494516356 and parameters: {'learning_rate': 0.0001552014152658641, 'weight_decay': 0.0012095956363889111, 'num_train_epochs': 7}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9076    0.9243    0.9159      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9584    0.9679    0.9631     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9458    0.9570    0.9514     14095
   macro avg     0.9427    0.9460    0.9442     14095
weighted avg     0.9459    0.9570    0.9514     14095

✅ Trial 8 finished | F1: 0.9442

=== Trial 9 ===
{}

=== Current parameters: {'learning_rate': 0.00024514200567817094, 'weight_decay': 0.0017748690777374889, 'num_train_epochs': 5} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 20%|██        | 22/110 [00:25<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.66it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 20%|██        | 22/110 [00:30<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8802    0.8391    0.8592      3475
     PERCENT     0.6212    0.9880    0.7628        83
        TYPE     0.9351    0.9587    0.9467     10485
      VOLUME     0.6111    0.2115    0.3143        52

   micro avg     0.9189    0.9266    0.9228     14095
   macro avg     0.7619    0.7493    0.7207     14095
weighted avg     0.9185    0.9266    0.9217     14095

{'eval_loss': 0.2924019396305084, 'eval_f1_macro': 0.7207446289219572, 'eval_precision': 0.9189474424822346, 'eval_recall': 0.9266406527137283, 'eval_f1': 0.9227780132824643, 'eval_accuracy': 0.9148851262755829, 'eval_runtime': 4.9259, 'eval_samples_per_second': 1118.769, 'eval_steps_per_second': 2.233, 'epoch': 1.0}


 40%|████      | 44/110 [00:55<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 40%|████      | 44/110 [01:00<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8733    0.9344    0.9028      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9493    0.9654    0.9573     10485
      VOLUME     0.8723    0.7885    0.8283        52

   micro avg     0.9296    0.9571    0.9432     14095
   macro avg     0.9120    0.9160    0.9132     14095
weighted avg     0.9303    0.9571    0.9434     14095

{'eval_loss': 0.20250673592090607, 'eval_f1_macro': 0.9131610929533375, 'eval_precision': 0.9295803762144285, 'eval_recall': 0.957147924796027, 'eval_f1': 0.9431627516778524, 'eval_accuracy': 0.938543982669175, 'eval_runtime': 4.9281, 'eval_samples_per_second': 1118.284, 'eval_steps_per_second': 2.232, 'epoch': 2.0}


 60%|██████    | 66/110 [01:25<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.24it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.61it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.56it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 60%|██████    | 66/110 [01:30<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9227    0.9036    0.9131      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9483    0.9770    0.9624     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9423    0.9587    0.9504     14095
   macro avg     0.9487    0.9431    0.9456     14095
weighted avg     0.9421    0.9587    0.9502     14095

{'eval_loss': 0.19493918120861053, 'eval_f1_macro': 0.9455899523165653, 'eval_precision': 0.9423291492329149, 'eval_recall': 0.9587087619723306, 'eval_f1': 0.9504483910673466, 'eval_accuracy': 0.9435608004104669, 'eval_runtime': 5.197, 'eval_samples_per_second': 1060.421, 'eval_steps_per_second': 2.117, 'epoch': 3.0}


 80%|████████  | 88/110 [01:56<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.52it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 80%|████████  | 88/110 [02:01<00:22,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9035    0.9298    0.9165      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9575    0.9692    0.9633     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9440    0.9593    0.9516     14095
   macro avg     0.9434    0.9477    0.9452     14095
weighted avg     0.9442    0.9593    0.9517     14095

{'eval_loss': 0.22161827981472015, 'eval_f1_macro': 0.945221769558501, 'eval_precision': 0.944010053057805, 'eval_recall': 0.9593472862717276, 'eval_f1': 0.9516168760336394, 'eval_accuracy': 0.9452140698933926, 'eval_runtime': 4.9221, 'eval_samples_per_second': 1119.648, 'eval_steps_per_second': 2.235, 'epoch': 4.0}


 91%|█████████ | 100/110 [02:15<00:11,  1.20s/it]

{'loss': 0.3057, 'grad_norm': 0.4284382462501526, 'learning_rate': 7.391935948868236e-06, 'epoch': 4.55}


100%|██████████| 110/110 [02:26<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 110/110 [02:31<00:00,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9211    0.9177    0.9194      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9569    0.9701    0.9635     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9482    0.9571    0.9526     14095
   macro avg     0.9477    0.9449    0.9460     14095
weighted avg     0.9481    0.9571    0.9525     14095

{'eval_loss': 0.21902528405189514, 'eval_f1_macro': 0.946002639966516, 'eval_precision': 0.9481970900400647, 'eval_recall': 0.9570769776516496, 'eval_f1': 0.9526163406539085, 'eval_accuracy': 0.9448720141383046, 'eval_runtime': 4.9369, 'eval_samples_per_second': 1116.29, 'eval_steps_per_second': 2.228, 'epoch': 5.0}


100%|██████████| 110/110 [02:31<00:00,  1.38s/it]

{'train_runtime': 151.5342, 'train_samples_per_second': 727.262, 'train_steps_per_second': 0.726, 'train_loss': 0.28170608227903193, 'epoch': 5.0}



100%|██████████| 11/11 [00:02<00:00,  4.92it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.22it/s]

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9211    0.9177    0.9194      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9569    0.9701    0.9635     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9482    0.9571    0.9526     14095
   macro avg     0.9477    0.9449    0.9460     14095
weighted avg     0.9481    0.9571    0.9525     14095




[I 2025-10-02 08:27:46,490] Trial 9 finished with value: 0.946002639966516 and parameters: {'learning_rate': 0.00024514200567817094, 'weight_decay': 0.0017748690777374889, 'num_train_epochs': 5}. Best is trial 6 with value: 0.9512251951099278.


✅ Trial 9 finished | F1: 0.9460

=== Trial 10 ===
{}

=== Current parameters: {'learning_rate': 0.0003387875479681483, 'weight_decay': 0.05510363114841351, 'num_train_epochs': 5} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dersty/distilrubert_X5_ner_MLM


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 20%|██        | 22/110 [00:25<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.53it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 20%|██        | 22/110 [00:30<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8764    0.8734    0.8749      3475
     PERCENT     0.7009    0.9880    0.8200        83
        TYPE     0.9377    0.9624    0.9499     10485
      VOLUME     0.7500    0.3462    0.4737        52

   micro avg     0.9206    0.9383    0.9294     14095
   macro avg     0.8162    0.7925    0.7796     14095
weighted avg     0.9205    0.9383    0.9289     14095

{'eval_loss': 0.26023852825164795, 'eval_f1_macro': 0.7796128466797116, 'eval_precision': 0.9206459696505638, 'eval_recall': 0.9383469315360057, 'eval_f1': 0.9294121780682336, 'eval_accuracy': 0.9222963343024914, 'eval_runtime': 4.9165, 'eval_samples_per_second': 1120.912, 'eval_steps_per_second': 2.237, 'epoch': 1.0}


 40%|████      | 44/110 [00:55<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.61it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)





=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8756    0.9396    0.9064      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9537    0.9608    0.9572     10485
      VOLUME     0.8936    0.8077    0.8485        52

   micro avg     0.9333    0.9551    0.9441     14095
   macro avg     0.9190    0.9210    0.9191     14095
weighted avg     0.9342    0.9551    0.9444     14095



                                                
 40%|████      | 44/110 [01:00<01:06,  1.00s/it][A
                                               [A

{'eval_loss': 0.19935567677021027, 'eval_f1_macro': 0.9191130054716151, 'eval_precision': 0.9333056017748197, 'eval_recall': 0.9550904576090813, 'eval_f1': 0.9440723728040954, 'eval_accuracy': 0.9390000570092926, 'eval_runtime': 4.9347, 'eval_samples_per_second': 1116.776, 'eval_steps_per_second': 2.229, 'epoch': 2.0}


 60%|██████    | 66/110 [01:25<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.52it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 60%|██████    | 66/110 [01:30<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9313    0.9050    0.9180      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9539    0.9728    0.9633     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9485    0.9558    0.9522     14095
   macro avg     0.9471    0.9346    0.9405     14095
weighted avg     0.9483    0.9558    0.9519     14095

{'eval_loss': 0.20588253438472748, 'eval_f1_macro': 0.9405487213531561, 'eval_precision': 0.9485320002816306, 'eval_recall': 0.9557999290528556, 'eval_f1': 0.9521520955544561, 'eval_accuracy': 0.9440168747505844, 'eval_runtime': 5.1571, 'eval_samples_per_second': 1068.628, 'eval_steps_per_second': 2.133, 'epoch': 3.0}


 80%|████████  | 88/110 [01:56<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.68it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.76it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9133    0.9281    0.9206      3475
     PERCENT     0.9524    0.9639    0.9581        83
        TYPE     0.9588    0.9668    0.9628     10485
      VOLUME     0.9216    0.9038    0.9126        52

   micro avg     0.9473    0.9570    0.9521     14095
   macro avg     0.9365    0.9406    0.9385     14095
weighted avg     0.9474    0.9570    0.9522     14095




 80%|████████  | 88/110 [02:00<00:22,  1.01s/it][A
                                               [A

{'eval_loss': 0.22922122478485107, 'eval_f1_macro': 0.9385285343090224, 'eval_precision': 0.947327761781024, 'eval_recall': 0.9570060305072721, 'eval_f1': 0.952142302534058, 'eval_accuracy': 0.9450430420158485, 'eval_runtime': 4.8261, 'eval_samples_per_second': 1141.918, 'eval_steps_per_second': 2.279, 'epoch': 4.0}


 91%|█████████ | 100/110 [02:15<00:11,  1.20s/it]

{'loss': 0.2754, 'grad_norm': 0.39223045110702515, 'learning_rate': 1.0215694564163697e-05, 'epoch': 4.55}


100%|██████████| 110/110 [02:26<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 110/110 [02:31<00:00,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9122    0.9240    0.9181      3475
     PERCENT     0.9524    0.9639    0.9581        83
        TYPE     0.9576    0.9683    0.9630     10485
      VOLUME     0.9216    0.9038    0.9126        52

   micro avg     0.9463    0.9571    0.9517     14095
   macro avg     0.9360    0.9400    0.9379     14095
weighted avg     0.9463    0.9571    0.9517     14095

{'eval_loss': 0.23484596610069275, 'eval_f1_macro': 0.9379381251207485, 'eval_precision': 0.9462720067335344, 'eval_recall': 0.957147924796027, 'eval_f1': 0.9516788939051918, 'eval_accuracy': 0.9448720141383046, 'eval_runtime': 4.8761, 'eval_samples_per_second': 1130.211, 'eval_steps_per_second': 2.256, 'epoch': 5.0}


100%|██████████| 110/110 [02:31<00:00,  1.37s/it]

{'train_runtime': 151.2302, 'train_samples_per_second': 728.724, 'train_steps_per_second': 0.727, 'train_loss': 0.253460588238456, 'epoch': 5.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.25it/s]
[I 2025-10-02 08:30:23,793] Trial 10 finished with value: 0.9379381251207485 and parameters: {'learning_rate': 0.0003387875479681483, 'weight_decay': 0.05510363114841351, 'num_train_epochs': 5}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9122    0.9240    0.9181      3475
     PERCENT     0.9524    0.9639    0.9581        83
        TYPE     0.9576    0.9683    0.9630     10485
      VOLUME     0.9216    0.9038    0.9126        52

   micro avg     0.9463    0.9571    0.9517     14095
   macro avg     0.9360    0.9400    0.9379     14095
weighted avg     0.9463    0.9571    0.9517     14095

✅ Trial 10 finished | F1: 0.9379

=== Trial 11 ===
{}

=== Current parameters: {'learning_rate': 0.00035152800439207485, 'weight_decay': 0.036527343851493276, 'num_train_epochs': 8} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 12%|█▎        | 22/176 [00:25<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.72it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.77it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.36it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8556    0.8763    0.8658      3475
     PERCENT     0.7321    0.9880    0.8410        83
        TYPE     0.9351    0.9615    0.9481     10485
      VOLUME     0.6875    0.4231    0.5238        52

   micro avg     0.9134    0.9386    0.9259     14095
   macro avg     0.8026    0.8122    0.7947     14095
weighted avg     0.9134    0.9386    0.9256     14095




                                                
 12%|█▎        | 22/176 [00:29<02:34,  1.00s/it][A
                                               [A

{'eval_loss': 0.27829116582870483, 'eval_f1_macro': 0.794679005784521, 'eval_precision': 0.9134217067108533, 'eval_recall': 0.9386307201135154, 'eval_f1': 0.9258546485181427, 'eval_accuracy': 0.9185337209965224, 'eval_runtime': 4.8589, 'eval_samples_per_second': 1134.212, 'eval_steps_per_second': 2.264, 'epoch': 1.0}


 25%|██▌       | 44/176 [00:55<02:12,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.76it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.64it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 25%|██▌       | 44/176 [01:00<02:12,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8917    0.9283    0.9096      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9502    0.9703    0.9602     10485
      VOLUME     0.9333    0.8077    0.8660        52

   micro avg     0.9356    0.9594    0.9474     14095
   macro avg     0.9349    0.9206    0.9265     14095
weighted avg     0.9358    0.9594    0.9474     14095

{'eval_loss': 0.19990204274654388, 'eval_f1_macro': 0.9264605310040004, 'eval_precision': 0.9355887643558877, 'eval_recall': 0.959418233416105, 'eval_f1': 0.947353672633017, 'eval_accuracy': 0.9416794937574825, 'eval_runtime': 4.8484, 'eval_samples_per_second': 1136.656, 'eval_steps_per_second': 2.269, 'epoch': 2.0}


 38%|███▊      | 66/176 [01:25<01:50,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.66it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.73it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.83it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 38%|███▊      | 66/176 [01:30<01:50,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9311    0.9016    0.9161      3475
     PERCENT     0.9634    0.9518    0.9576        83
        TYPE     0.9483    0.9791    0.9634     10485
      VOLUME     0.9412    0.9231    0.9320        52

   micro avg     0.9443    0.9596    0.9519     14095
   macro avg     0.9460    0.9389    0.9423     14095
weighted avg     0.9441    0.9596    0.9516     14095

{'eval_loss': 0.21237270534038544, 'eval_f1_macro': 0.9422856428790457, 'eval_precision': 0.9442893046635018, 'eval_recall': 0.9596310748492373, 'eval_f1': 0.9518983778458073, 'eval_accuracy': 0.9441308933356137, 'eval_runtime': 5.1216, 'eval_samples_per_second': 1076.024, 'eval_steps_per_second': 2.148, 'epoch': 3.0}


 50%|█████     | 88/176 [01:55<01:28,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.52it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 88/176 [02:00<01:28,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9085    0.9283    0.9183      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9536    0.9700    0.9617     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9425    0.9596    0.9510     14095
   macro avg     0.9466    0.9493    0.9478     14095
weighted avg     0.9426    0.9596    0.9510     14095

{'eval_loss': 0.24055540561676025, 'eval_f1_macro': 0.9478104807092222, 'eval_precision': 0.9425087108013938, 'eval_recall': 0.9595601277048599, 'eval_f1': 0.9509579891017753, 'eval_accuracy': 0.9432187446553788, 'eval_runtime': 4.8737, 'eval_samples_per_second': 1130.763, 'eval_steps_per_second': 2.257, 'epoch': 4.0}


 57%|█████▋    | 100/176 [02:15<01:31,  1.20s/it]

{'loss': 0.2934, 'grad_norm': 0.3279023766517639, 'learning_rate': 0.00016877623798923461, 'epoch': 4.55}


 62%|██████▎   | 110/176 [02:26<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.61it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 62%|██████▎   | 110/176 [02:30<01:06,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9163    0.9108    0.9136      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9546    0.9693    0.9619     10485
      VOLUME     0.9796    0.9231    0.9505        52

   micro avg     0.9455    0.9547    0.9501     14095
   macro avg     0.9537    0.9448    0.9490     14095
weighted avg     0.9453    0.9547    0.9500     14095

{'eval_loss': 0.2950842082500458, 'eval_f1_macro': 0.9490027450034275, 'eval_precision': 0.9454788168341179, 'eval_recall': 0.954735721887194, 'eval_f1': 0.9500847218299915, 'eval_accuracy': 0.9408243543697623, 'eval_runtime': 4.8512, 'eval_samples_per_second': 1136.006, 'eval_steps_per_second': 2.267, 'epoch': 5.0}


 75%|███████▌  | 132/176 [02:56<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 75%|███████▌  | 132/176 [03:01<00:44,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9107    0.9214    0.9160      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9534    0.9691    0.9612     10485
      VOLUME     0.9796    0.9231    0.9505        52

   micro avg     0.9430    0.9573    0.9501     14095
   macro avg     0.9521    0.9504    0.9510     14095
weighted avg     0.9430    0.9573    0.9501     14095

{'eval_loss': 0.2946520447731018, 'eval_f1_macro': 0.9509724022504052, 'eval_precision': 0.9430388593793682, 'eval_recall': 0.9572898190847818, 'eval_f1': 0.9501109037777699, 'eval_accuracy': 0.9424206145601733, 'eval_runtime': 4.8369, 'eval_samples_per_second': 1139.359, 'eval_steps_per_second': 2.274, 'epoch': 6.0}


 88%|████████▊ | 154/176 [03:26<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.51it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 88%|████████▊ | 154/176 [03:31<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9180    0.9217    0.9199      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9535    0.9710    0.9622     10485
      VOLUME     0.9796    0.9231    0.9505        52

   micro avg     0.9450    0.9587    0.9518     14095
   macro avg     0.9539    0.9479    0.9507     14095
weighted avg     0.9449    0.9587    0.9518     14095

{'eval_loss': 0.3096170127391815, 'eval_f1_macro': 0.9506562388352908, 'eval_precision': 0.9450311210574166, 'eval_recall': 0.9587087619723306, 'eval_f1': 0.9518208072127914, 'eval_accuracy': 0.9433327632404082, 'eval_runtime': 4.8773, 'eval_samples_per_second': 1129.922, 'eval_steps_per_second': 2.255, 'epoch': 7.0}


100%|██████████| 176/176 [03:57<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 176/176 [04:02<00:00,  1.01s/it]A
100%|██████████| 176/176 [04:02<00:00,  1.38s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9172    0.9214    0.9193      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9539    0.9700    0.9619     10485
      VOLUME     0.9796    0.9231    0.9505        52

   micro avg     0.9451    0.9579    0.9514     14095
   macro avg     0.9538    0.9476    0.9504     14095
weighted avg     0.9450    0.9579    0.9514     14095

{'eval_loss': 0.31022611260414124, 'eval_f1_macro': 0.9504403493016386, 'eval_precision': 0.9451172558627932, 'eval_recall': 0.9578573962398014, 'eval_f1': 0.9514446793516561, 'eval_accuracy': 0.9431617353628642, 'eval_runtime': 5.1335, 'eval_samples_per_second': 1073.534, 'eval_steps_per_second': 2.143, 'epoch': 8.0}
{'train_runtime': 242.2709, 'train_samples_per_second': 727.814, 'train_steps_per_second': 0.726, 'train_loss': 0.1726350635290146, 'epoch': 8.0}



100%|██████████| 11/11 [00:02<00:00,  4.97it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.39it/s]
[I 2025-10-02 08:34:31,868] Trial 11 finished with value: 0.9504403493016386 and parameters: {'learning_rate': 0.00035152800439207485, 'weight_decay': 0.036527343851493276, 'num_train_epochs': 8}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9172    0.9214    0.9193      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9539    0.9700    0.9619     10485
      VOLUME     0.9796    0.9231    0.9505        52

   micro avg     0.9451    0.9579    0.9514     14095
   macro avg     0.9538    0.9476    0.9504     14095
weighted avg     0.9450    0.9579    0.9514     14095

✅ Trial 11 finished | F1: 0.9504

=== Trial 12 ===
{}

=== Current parameters: {'learning_rate': 0.0003278965254073193, 'weight_decay': 0.031106055098595146, 'num_train_epochs': 8} ===


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 37d6bc8f-64be-4c94-896f-04f7956913c4)')' thrown while requesting HEAD https://huggingface.co/Dersty/distilrubert_X5_ner_MLM/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 12%|█▎        | 22/176 [00:24<02:32,  1.01it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.75it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.81it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.84it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.39it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.08it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.92it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.80it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.97it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 12%|█▎        | 22/176 [00:29<02:32,  1.01it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8199    0.8696    0.8440      3475
     PERCENT     0.6504    0.9639    0.7767        83
        TYPE     0.9373    0.9499    0.9436     10485
      VOLUME     0.3704    0.1923    0.2532        52

   micro avg     0.9039    0.9274    0.9155     14095
   macro avg     0.6945    0.7439    0.7044     14095
weighted avg     0.9046    0.9274    0.9155     14095

{'eval_loss': 0.2960531711578369, 'eval_f1_macro': 0.7043659246356732, 'eval_precision': 0.9038860461900152, 'eval_recall': 0.9274210713018801, 'eval_f1': 0.9155023286759814, 'eval_accuracy': 0.9106664386294966, 'eval_runtime': 4.8379, 'eval_samples_per_second': 1139.12, 'eval_steps_per_second': 2.274, 'epoch': 1.0}


 25%|██▌       | 44/176 [00:54<02:11,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.77it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.84it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.36it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 25%|██▌       | 44/176 [00:59<02:11,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8829    0.9304    0.9060      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9544    0.9611    0.9577     10485
      VOLUME     0.8889    0.7692    0.8247        52

   micro avg     0.9360    0.9529    0.9443     14095
   macro avg     0.9226    0.9091    0.9146     14095
weighted avg     0.9365    0.9529    0.9445     14095

{'eval_loss': 0.203060120344162, 'eval_f1_macro': 0.9146231786010806, 'eval_precision': 0.9359581881533101, 'eval_recall': 0.9528910961333806, 'eval_f1': 0.9443487431886097, 'eval_accuracy': 0.9375178154039109, 'eval_runtime': 4.8545, 'eval_samples_per_second': 1135.239, 'eval_steps_per_second': 2.266, 'epoch': 2.0}


 38%|███▊      | 66/176 [01:24<01:49,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.71it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.79it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.82it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.37it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 38%|███▊      | 66/176 [01:29<01:49,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9229    0.9160    0.9194      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9540    0.9716    0.9627     10485
      VOLUME     0.9583    0.8846    0.9200        52

   micro avg     0.9466    0.9576    0.9520     14095
   macro avg     0.9499    0.9370    0.9430     14095
weighted avg     0.9464    0.9576    0.9519     14095

{'eval_loss': 0.20814327895641327, 'eval_f1_macro': 0.9430471441536581, 'eval_precision': 0.9465600673258995, 'eval_recall': 0.9575736076622916, 'eval_f1': 0.952034986245327, 'eval_accuracy': 0.9448150048457898, 'eval_runtime': 5.0834, 'eval_samples_per_second': 1084.11, 'eval_steps_per_second': 2.164, 'epoch': 3.0}


 50%|█████     | 88/176 [01:55<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 88/176 [01:59<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9141    0.9243    0.9192      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9546    0.9697    0.9621     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9448    0.9584    0.9515     14095
   macro avg     0.9484    0.9513    0.9497     14095
weighted avg     0.9447    0.9584    0.9515     14095

{'eval_loss': 0.23129570484161377, 'eval_f1_macro': 0.9496566013491725, 'eval_precision': 0.9447513812154696, 'eval_recall': 0.9584249733948209, 'eval_f1': 0.9515390575473691, 'eval_accuracy': 0.9441308933356137, 'eval_runtime': 4.8532, 'eval_samples_per_second': 1135.533, 'eval_steps_per_second': 2.267, 'epoch': 4.0}


 57%|█████▋    | 100/176 [02:14<01:30,  1.19s/it]

{'loss': 0.3069, 'grad_norm': 0.28412893414497375, 'learning_rate': 0.00015743025112236692, 'epoch': 4.55}


 62%|██████▎   | 110/176 [02:25<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.67it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.74it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 62%|██████▎   | 110/176 [02:30<01:06,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9296    0.9117    0.9205      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9555    0.9662    0.9608     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9493    0.9527    0.9510     14095
   macro avg     0.9524    0.9472    0.9497     14095
weighted avg     0.9492    0.9527    0.9509     14095

{'eval_loss': 0.26528090238571167, 'eval_f1_macro': 0.949681647549135, 'eval_precision': 0.9493142937932985, 'eval_recall': 0.9527492018446257, 'eval_f1': 0.9510286462943947, 'eval_accuracy': 0.9417935123425119, 'eval_runtime': 4.8542, 'eval_samples_per_second': 1135.3, 'eval_steps_per_second': 2.266, 'epoch': 5.0}


 75%|███████▌  | 132/176 [02:55<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 75%|███████▌  | 132/176 [03:00<00:44,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9167    0.9217    0.9192      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9537    0.9714    0.9625     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9448    0.9591    0.9519     14095
   macro avg     0.9488    0.9510    0.9498     14095
weighted avg     0.9447    0.9591    0.9518     14095

{'eval_loss': 0.30187422037124634, 'eval_f1_macro': 0.9497660174396647, 'eval_precision': 0.9447861336315349, 'eval_recall': 0.9590634976942178, 'eval_f1': 0.9518712812026898, 'eval_accuracy': 0.9429907074853201, 'eval_runtime': 4.8271, 'eval_samples_per_second': 1141.679, 'eval_steps_per_second': 2.279, 'epoch': 6.0}


 88%|████████▊ | 154/176 [03:25<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 88%|████████▊ | 154/176 [03:30<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9207    0.9194    0.9201      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9551    0.9698    0.9624     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9468    0.9573    0.9520     14095
   macro avg     0.9501    0.9501    0.9500     14095
weighted avg     0.9467    0.9573    0.9520     14095

{'eval_loss': 0.30454710125923157, 'eval_f1_macro': 0.949957722359746, 'eval_precision': 0.9468107501227984, 'eval_recall': 0.9572898190847818, 'eval_f1': 0.9520214492344599, 'eval_accuracy': 0.9436748189954962, 'eval_runtime': 4.8291, 'eval_samples_per_second': 1141.204, 'eval_steps_per_second': 2.278, 'epoch': 7.0}


100%|██████████| 176/176 [03:55<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.66it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 176/176 [04:01<00:00,  1.01s/it]A
100%|██████████| 176/176 [04:01<00:00,  1.37s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9181    0.9223    0.9202      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9556    0.9689    0.9622     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9465    0.9574    0.9519     14095
   macro avg     0.9496    0.9506    0.9499     14095
weighted avg     0.9464    0.9574    0.9519     14095

{'eval_loss': 0.30770745873451233, 'eval_f1_macro': 0.949939862019334, 'eval_precision': 0.9464824296836641, 'eval_recall': 0.9573607662291592, 'eval_f1': 0.9518905191873589, 'eval_accuracy': 0.9432757539478935, 'eval_runtime': 5.1396, 'eval_samples_per_second': 1072.265, 'eval_steps_per_second': 2.14, 'epoch': 8.0}
{'train_runtime': 241.1914, 'train_samples_per_second': 731.071, 'train_steps_per_second': 0.73, 'train_loss': 0.18124265630136838, 'epoch': 8.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.36it/s]
[I 2025-10-02 08:38:50,114] Trial 12 finished with value: 0.949939862019334 and parameters: {'learning_rate': 0.0003278965254073193, 'weight_decay': 0.031106055098595146, 'num_train_epochs': 8}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9181    0.9223    0.9202      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9556    0.9689    0.9622     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9465    0.9574    0.9519     14095
   macro avg     0.9496    0.9506    0.9499     14095
weighted avg     0.9464    0.9574    0.9519     14095

✅ Trial 12 finished | F1: 0.9499

=== Trial 13 ===
{}

=== Current parameters: {'learning_rate': 0.00048497763563738093, 'weight_decay': 0.022005472171740544, 'num_train_epochs': 4} ===


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: e5200e3b-fd36-4d7c-ae1b-37d6e364f6c9)')' thrown while requesting HEAD https://huggingface.co/Dersty/distilrubert_X5_ner_MLM/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 25%|██▌       | 22/88 [00:25<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.51it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.06it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
 25%|██▌       | 22/88 [00:29<01:06,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8773    0.8578    0.8675      3475
     PERCENT     0.8764    0.9398    0.9070        83
        TYPE     0.9336    0.9681    0.9506     10485
      VOLUME     0.2951    0.3462    0.3186        52

   micro avg     0.9173    0.9385    0.9278     14095
   macro avg     0.7456    0.7780    0.7609     14095
weighted avg     0.9170    0.9385    0.9275     14095

{'eval_loss': 0.2665961682796478, 'eval_f1_macro': 0.7608925832577924, 'eval_precision': 0.917273420705915, 'eval_recall': 0.9384888258247606, 'eval_f1': 0.9277598541169869, 'eval_accuracy': 0.9232084829827262, 'eval_runtime': 4.8909, 'eval_samples_per_second': 1126.781, 'eval_steps_per_second': 2.249, 'epoch': 1.0}


 50%|█████     | 44/88 [00:55<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
 50%|█████     | 44/88 [01:00<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8895    0.9263    0.9075      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9523    0.9711    0.9616     10485
      VOLUME     0.8636    0.7308    0.7917        52

   micro avg     0.9363    0.9593    0.9476     14095
   macro avg     0.9147    0.9040    0.9078     14095
weighted avg     0.9365    0.9593    0.9477     14095

{'eval_loss': 0.20266874134540558, 'eval_f1_macro': 0.9078044122422869, 'eval_precision': 0.9362925005193546, 'eval_recall': 0.9592763391273501, 'eval_f1': 0.9476450798990749, 'eval_accuracy': 0.9411664101248504, 'eval_runtime': 4.8787, 'eval_samples_per_second': 1129.596, 'eval_steps_per_second': 2.255, 'epoch': 2.0}


 75%|███████▌  | 66/88 [01:25<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.52it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
 75%|███████▌  | 66/88 [01:30<00:22,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9140    0.9235    0.9187      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9559    0.9711    0.9634     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9457    0.9592    0.9524     14095
   macro avg     0.9484    0.9466    0.9473     14095
weighted avg     0.9456    0.9592    0.9524     14095

{'eval_loss': 0.20086199045181274, 'eval_f1_macro': 0.9472517838670642, 'eval_precision': 0.9456529341819963, 'eval_recall': 0.9592053919829727, 'eval_f1': 0.9523809523809524, 'eval_accuracy': 0.9461262185736276, 'eval_runtime': 5.1342, 'eval_samples_per_second': 1073.399, 'eval_steps_per_second': 2.143, 'epoch': 3.0}


100%|██████████| 88/88 [01:55<00:00,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.53it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
100%|██████████| 88/88 [02:00<00:00,  1.00s/it][A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9207    0.9183    0.9195      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9568    0.9699    0.9633     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9481    0.9570    0.9525     14095
   macro avg     0.9503    0.9450    0.9474     14095
weighted avg     0.9480    0.9570    0.9524     14095

{'eval_loss': 0.2117178589105606, 'eval_f1_macro': 0.9474100894211659, 'eval_precision': 0.9480601630587574, 'eval_recall': 0.9570060305072721, 'eval_f1': 0.952512092645553, 'eval_accuracy': 0.9453850977709367, 'eval_runtime': 4.8961, 'eval_samples_per_second': 1125.6, 'eval_steps_per_second': 2.247, 'epoch': 4.0}



100%|██████████| 88/88 [02:00<00:00,  1.37s/it][A

{'train_runtime': 120.8491, 'train_samples_per_second': 729.538, 'train_steps_per_second': 0.728, 'train_loss': 0.31268568472428754, 'epoch': 4.0}



100%|██████████| 11/11 [00:02<00:00,  4.95it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.36it/s]

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9207    0.9183    0.9195      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9568    0.9699    0.9633     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9481    0.9570    0.9525     14095
   macro avg     0.9503    0.9450    0.9474     14095
weighted avg     0.9480    0.9570    0.9524     14095




[I 2025-10-02 08:40:58,200] Trial 13 finished with value: 0.9474100894211659 and parameters: {'learning_rate': 0.00048497763563738093, 'weight_decay': 0.022005472171740544, 'num_train_epochs': 4}. Best is trial 6 with value: 0.9512251951099278.


✅ Trial 13 finished | F1: 0.9474

=== Trial 14 ===
{}

=== Current parameters: {'learning_rate': 0.00036853625668387634, 'weight_decay': 0.020596759076493557, 'num_train_epochs': 9} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dersty/distilrubert_X5_ner_MLM


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 11%|█         | 22/198 [00:25<02:56,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===


                                                
 11%|█         | 22/198 [00:29<02:56,  1.00s/it][A
                                               [A


              precision    recall  f1-score   support

       BRAND     0.8683    0.8388    0.8533      3475
     PERCENT     0.8100    0.9759    0.8852        83
        TYPE     0.9338    0.9665    0.9499     10485
      VOLUME     0.7742    0.4615    0.5783        52

   micro avg     0.9173    0.9332    0.9252     14095
   macro avg     0.8466    0.8107    0.8167     14095
weighted avg     0.9164    0.9332    0.9243     14095

{'eval_loss': 0.27870631217956543, 'eval_f1_macro': 0.8166989068135377, 'eval_precision': 0.9172942817294282, 'eval_recall': 0.93323873714083, 'eval_f1': 0.9251978195885352, 'eval_accuracy': 0.9179066187788609, 'eval_runtime': 4.8898, 'eval_samples_per_second': 1127.032, 'eval_steps_per_second': 2.25, 'epoch': 1.0}


 22%|██▏       | 44/198 [00:55<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.69it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.79it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.85it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.39it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.97it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 22%|██▏       | 44/198 [01:00<02:34,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8326    0.9447    0.8851      3475
     PERCENT     0.9419    0.9759    0.9586        83
        TYPE     0.9482    0.9515    0.9498     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9170    0.9498    0.9331     14095
   macro avg     0.9205    0.9440    0.9311     14095
weighted avg     0.9197    0.9498    0.9339     14095

{'eval_loss': 0.23149654269218445, 'eval_f1_macro': 0.9310602636361915, 'eval_precision': 0.916980615110624, 'eval_recall': 0.9497694217807733, 'eval_f1': 0.9330870565274971, 'eval_accuracy': 0.9286243657716208, 'eval_runtime': 4.8751, 'eval_samples_per_second': 1130.444, 'eval_steps_per_second': 2.256, 'epoch': 2.0}


 33%|███▎      | 66/198 [01:25<02:12,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.29it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9187    0.9203    0.9195      3475
     PERCENT     0.9634    0.9518    0.9576        83
        TYPE     0.9508    0.9775    0.9640     10485
      VOLUME     0.9608    0.9423    0.9515        52

   micro avg     0.9432    0.9631    0.9530     14095
   macro avg     0.9484    0.9480    0.9481     14095
weighted avg     0.9430    0.9631    0.9529     14095




                                                
 33%|███▎      | 66/198 [01:30<02:12,  1.00s/it][A
                                               [A

{'eval_loss': 0.22721423208713531, 'eval_f1_macro': 0.948125676123411, 'eval_precision': 0.9431668172028069, 'eval_recall': 0.9631074849237318, 'eval_f1': 0.9530328559393428, 'eval_accuracy': 0.9452710791859072, 'eval_runtime': 5.2285, 'eval_samples_per_second': 1054.029, 'eval_steps_per_second': 2.104, 'epoch': 3.0}


 44%|████▍     | 88/198 [01:55<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 44%|████▍     | 88/198 [02:00<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9043    0.9275    0.9158      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9550    0.9663    0.9607     10485
      VOLUME     0.9412    0.9231    0.9320        52

   micro avg     0.9424    0.9567    0.9495     14095
   macro avg     0.9412    0.9482    0.9446     14095
weighted avg     0.9425    0.9567    0.9495     14095

{'eval_loss': 0.24635204672813416, 'eval_f1_macro': 0.9446265280580379, 'eval_precision': 0.9424098406485882, 'eval_recall': 0.9566512947853849, 'eval_f1': 0.9494771679047989, 'eval_accuracy': 0.942306595975144, 'eval_runtime': 4.888, 'eval_samples_per_second': 1127.444, 'eval_steps_per_second': 2.25, 'epoch': 4.0}


 51%|█████     | 100/198 [02:15<01:57,  1.20s/it]

{'loss': 0.2944, 'grad_norm': 0.40607115626335144, 'learning_rate': 0.00021662175936326965, 'epoch': 4.55}


 56%|█████▌    | 110/198 [02:26<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.59it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 56%|█████▌    | 110/198 [02:30<01:28,  1.00s/it]A
 56%|█████▌    | 110/198 [02:31<01:28,  1.00s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9218    0.9157    0.9187      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9524    0.9671    0.9597     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9450    0.9542    0.9496     14095
   macro avg     0.9445    0.9376    0.9410     14095
weighted avg     0.9449    0.9542    0.9495     14095

{'eval_loss': 0.27849626541137695, 'eval_f1_macro': 0.9409574705684249, 'eval_precision': 0.9449831365935919, 'eval_recall': 0.9541681447321745, 'eval_f1': 0.9495534295901437, 'eval_accuracy': 0.9419645402200558, 'eval_runtime': 4.8433, 'eval_samples_per_second': 1137.872, 'eval_steps_per_second': 2.271, 'epoch': 5.0}
{'train_runtime': 151.0376, 'train_samples_per_second': 1313.375, 'train_steps_per_second': 1.311, 'train_loss': 0.270517283948985, 'epoch': 5.0}


 56%|█████▌    | 110/198 [02:31<02:00,  1.37s/it]
100%|██████████| 11/11 [00:02<00:00,  4.97it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.38it/s]
[I 2025-10-02 08:43:35,063] Trial 14 finished with value: 0.9409574705684249 and parameters: {'learning_rate': 0.00036853625668387634, 'weight_decay': 0.020596759076493557, 'num_train_epochs': 9}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9218    0.9157    0.9187      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9524    0.9671    0.9597     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9450    0.9542    0.9496     14095
   macro avg     0.9445    0.9376    0.9410     14095
weighted avg     0.9449    0.9542    0.9495     14095

✅ Trial 14 finished | F1: 0.9410

=== Trial 15 ===
{}

=== Current parameters: {'learning_rate': 0.0002869130027170793, 'weight_decay': 0.08281393077633224, 'num_train_epochs': 8} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dersty/distilrubert_X5_ner_MLM


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 12%|█▎        | 22/176 [00:25<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.82it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.36it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 12%|█▎        | 22/176 [00:29<02:34,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.7959    0.8866    0.8388      3475
     PERCENT     0.7297    0.9759    0.8351        83
        TYPE     0.9238    0.9390    0.9313     10485
      VOLUME     0.8182    0.3462    0.4865        52

   micro avg     0.8884    0.9241    0.9059     14095
   macro avg     0.8169    0.7869    0.7729     14095
weighted avg     0.8907    0.9241    0.9063     14095

{'eval_loss': 0.32658645510673523, 'eval_f1_macro': 0.7729208556637011, 'eval_precision': 0.8884114316895164, 'eval_recall': 0.9240865555161405, 'eval_f1': 0.9058978995687857, 'eval_accuracy': 0.9011458867795451, 'eval_runtime': 4.86, 'eval_samples_per_second': 1133.95, 'eval_steps_per_second': 2.263, 'epoch': 1.0}


 25%|██▌       | 44/176 [00:55<02:12,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.70it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 25%|██▌       | 44/176 [01:00<02:12,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8792    0.9338    0.9057      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9521    0.9639    0.9580     10485
      VOLUME     0.8810    0.7115    0.7872        52

   micro avg     0.9333    0.9557    0.9444     14095
   macro avg     0.9164    0.8993    0.9053     14095
weighted avg     0.9339    0.9557    0.9445     14095

{'eval_loss': 0.20527750253677368, 'eval_f1_macro': 0.9053308414313166, 'eval_precision': 0.9332825273659415, 'eval_recall': 0.9557289819084782, 'eval_f1': 0.9443723930036103, 'eval_accuracy': 0.938201926914087, 'eval_runtime': 4.8715, 'eval_samples_per_second': 1131.283, 'eval_steps_per_second': 2.258, 'epoch': 2.0}


 38%|███▊      | 66/176 [01:25<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 38%|███▊      | 66/176 [01:30<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9312    0.9076    0.9193      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9514    0.9767    0.9639     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9467    0.9594    0.9530     14095
   macro avg     0.9515    0.9410    0.9460     14095
weighted avg     0.9465    0.9594    0.9528     14095

{'eval_loss': 0.20424184203147888, 'eval_f1_macro': 0.9459806633823074, 'eval_precision': 0.94672360683282, 'eval_recall': 0.959418233416105, 'eval_f1': 0.9530286479439023, 'eval_accuracy': 0.946924348668833, 'eval_runtime': 5.1624, 'eval_samples_per_second': 1067.533, 'eval_steps_per_second': 2.131, 'epoch': 3.0}


 50%|█████     | 88/176 [01:55<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.55it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 88/176 [02:00<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9186    0.9223    0.9204      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9551    0.9711    0.9630     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9462    0.9590    0.9526     14095
   macro avg     0.9496    0.9511    0.9502     14095
weighted avg     0.9462    0.9590    0.9525     14095

{'eval_loss': 0.2254972904920578, 'eval_f1_macro': 0.9502084939616382, 'eval_precision': 0.9462373118655932, 'eval_recall': 0.9589925505498403, 'eval_f1': 0.9525722339675828, 'eval_accuracy': 0.9458411721110541, 'eval_runtime': 4.884, 'eval_samples_per_second': 1128.382, 'eval_steps_per_second': 2.252, 'epoch': 4.0}


 57%|█████▋    | 100/176 [02:14<01:30,  1.19s/it]

{'loss': 0.3127, 'grad_norm': 0.5069321990013123, 'learning_rate': 0.00013775317079652675, 'epoch': 4.55}


 62%|██████▎   | 110/176 [02:25<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.53it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 62%|██████▎   | 110/176 [02:30<01:06,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9141    0.9223    0.9182      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9548    0.9700    0.9623     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9449    0.9580    0.9514     14095
   macro avg     0.9481    0.9430    0.9453     14095
weighted avg     0.9449    0.9580    0.9514     14095

{'eval_loss': 0.27264508605003357, 'eval_f1_macro': 0.9453246618071309, 'eval_precision': 0.944926522043387, 'eval_recall': 0.9579992905285563, 'eval_f1': 0.9514180024660913, 'eval_accuracy': 0.9435037911179522, 'eval_runtime': 4.8699, 'eval_samples_per_second': 1131.643, 'eval_steps_per_second': 2.259, 'epoch': 5.0}


 75%|███████▌  | 132/176 [02:56<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 75%|███████▌  | 132/176 [03:01<00:44,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9037    0.9258    0.9146      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9556    0.9678    0.9616     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9427    0.9573    0.9499     14095
   macro avg     0.9459    0.9481    0.9469     14095
weighted avg     0.9428    0.9573    0.9500     14095

{'eval_loss': 0.28666916489601135, 'eval_f1_macro': 0.9468562521069686, 'eval_precision': 0.9427094249982534, 'eval_recall': 0.9572898190847818, 'eval_f1': 0.9499436778372289, 'eval_accuracy': 0.942306595975144, 'eval_runtime': 4.8273, 'eval_samples_per_second': 1141.629, 'eval_steps_per_second': 2.279, 'epoch': 6.0}


 75%|███████▌  | 132/176 [03:01<01:00,  1.37s/it]

{'train_runtime': 181.2272, 'train_samples_per_second': 972.967, 'train_steps_per_second': 0.971, 'train_loss': 0.24253654118740198, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.97it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.41it/s]
[I 2025-10-02 08:46:42,082] Trial 15 finished with value: 0.9468562521069686 and parameters: {'learning_rate': 0.0002869130027170793, 'weight_decay': 0.08281393077633224, 'num_train_epochs': 8}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9037    0.9258    0.9146      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9556    0.9678    0.9616     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9427    0.9573    0.9499     14095
   macro avg     0.9459    0.9481    0.9469     14095
weighted avg     0.9428    0.9573    0.9500     14095

✅ Trial 15 finished | F1: 0.9469

=== Trial 16 ===
{}

=== Current parameters: {'learning_rate': 0.0002189109785385183, 'weight_decay': 0.013898461131290469, 'num_train_epochs': 6} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 17%|█▋        | 22/132 [00:25<01:49,  1.00it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.92it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.69it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 17%|█▋        | 22/132 [00:29<01:49,  1.00it/s][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.7860    0.8929    0.8361      3475
     PERCENT     0.7551    0.8916    0.8177        83
        TYPE     0.9354    0.9403    0.9378     10485
      VOLUME     0.2500    0.0769    0.1176        52

   micro avg     0.8930    0.9252    0.9088     14095
   macro avg     0.6816    0.7004    0.6773     14095
weighted avg     0.8950    0.9252    0.9090     14095

{'eval_loss': 0.3138595521450043, 'eval_f1_macro': 0.6773031602437323, 'eval_precision': 0.8930283522805095, 'eval_recall': 0.9251507626818021, 'eval_f1': 0.9088057985155243, 'eval_accuracy': 0.9053075651331167, 'eval_runtime': 4.8119, 'eval_samples_per_second': 1145.281, 'eval_steps_per_second': 2.286, 'epoch': 1.0}


 33%|███▎      | 44/132 [00:55<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.76it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.83it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.39it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.08it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.90it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 44/132 [00:59<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8633    0.9381    0.8992      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9527    0.9595    0.9560     10485
      VOLUME     0.8864    0.7500    0.8125        52

   micro avg     0.9292    0.9536    0.9412     14095
   macro avg     0.9168    0.9089    0.9110     14095
weighted avg     0.9305    0.9536    0.9416     14095

{'eval_loss': 0.2050071507692337, 'eval_f1_macro': 0.9109808405498416, 'eval_precision': 0.9292084341514, 'eval_recall': 0.9536005675771551, 'eval_f1': 0.9412464985994399, 'eval_accuracy': 0.936605666723676, 'eval_runtime': 4.8056, 'eval_samples_per_second': 1146.783, 'eval_steps_per_second': 2.289, 'epoch': 2.0}


 50%|█████     | 66/132 [01:25<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.68it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.08it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.91it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.78it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.69it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 66/132 [01:30<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9156    0.9246    0.9201      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9558    0.9764    0.9660     10485
      VOLUME     0.9000    0.8654    0.8824        52

   micro avg     0.9458    0.9633    0.9545     14095
   macro avg     0.9340    0.9386    0.9362     14095
weighted avg     0.9457    0.9633    0.9544     14095

{'eval_loss': 0.18971821665763855, 'eval_f1_macro': 0.9361580657128264, 'eval_precision': 0.9458066313736417, 'eval_recall': 0.9633203263568642, 'eval_f1': 0.9544831464623388, 'eval_accuracy': 0.9474944415939798, 'eval_runtime': 5.0737, 'eval_samples_per_second': 1086.189, 'eval_steps_per_second': 2.168, 'epoch': 3.0}


 67%|██████▋   | 88/132 [01:55<00:44,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9181    0.9223    0.9202      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9582    0.9697    0.9639     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9484    0.9579    0.9532     14095
   macro avg     0.9503    0.9508    0.9504     14095
weighted avg     0.9484    0.9579    0.9531     14095




                                                
 67%|██████▋   | 88/132 [02:00<00:44,  1.00s/it][A
                                               [A

{'eval_loss': 0.20941251516342163, 'eval_f1_macro': 0.9503689492886491, 'eval_precision': 0.9484405731947176, 'eval_recall': 0.9579283433841788, 'eval_f1': 0.953160848540468, 'eval_accuracy': 0.9457271535260248, 'eval_runtime': 4.901, 'eval_samples_per_second': 1124.469, 'eval_steps_per_second': 2.244, 'epoch': 4.0}


 76%|███████▌  | 100/132 [02:14<00:38,  1.20s/it]

{'loss': 0.3077, 'grad_norm': 0.5296498537063599, 'learning_rate': 3.959615987515651e-05, 'epoch': 4.55}


 83%|████████▎ | 110/132 [02:25<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.61it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 83%|████████▎ | 110/132 [02:30<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9203    0.9272    0.9237      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9607    0.9668    0.9637     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9507    0.9570    0.9539     14095
   macro avg     0.9514    0.9513    0.9512     14095
weighted avg     0.9507    0.9570    0.9539     14095

{'eval_loss': 0.2221229076385498, 'eval_f1_macro': 0.9512090122092474, 'eval_precision': 0.9507330138144912, 'eval_recall': 0.9570060305072721, 'eval_f1': 0.9538592087119471, 'eval_accuracy': 0.9452140698933926, 'eval_runtime': 4.9107, 'eval_samples_per_second': 1122.243, 'eval_steps_per_second': 2.24, 'epoch': 5.0}


100%|██████████| 132/132 [02:56<00:00,  1.02s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.66it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.74it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 132/132 [03:01<00:00,  1.02s/it]A
100%|██████████| 132/132 [03:01<00:00,  1.37s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9202    0.9260    0.9231      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9604    0.9675    0.9639     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9505    0.9572    0.9539     14095
   macro avg     0.9513    0.9511    0.9511     14095
weighted avg     0.9505    0.9572    0.9539     14095

{'eval_loss': 0.2240985482931137, 'eval_f1_macro': 0.9511064639514164, 'eval_precision': 0.9505424827391856, 'eval_recall': 0.9572188719404044, 'eval_f1': 0.9538689950157306, 'eval_accuracy': 0.9456131349409954, 'eval_runtime': 4.9369, 'eval_samples_per_second': 1116.287, 'eval_steps_per_second': 2.228, 'epoch': 6.0}
{'train_runtime': 181.2082, 'train_samples_per_second': 729.801, 'train_steps_per_second': 0.728, 'train_loss': 0.24111207687493527, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.98it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.35it/s]
[I 2025-10-02 08:49:49,118] Trial 16 finished with value: 0.9511064639514164 and parameters: {'learning_rate': 0.0002189109785385183, 'weight_decay': 0.013898461131290469, 'num_train_epochs': 6}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9202    0.9260    0.9231      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9604    0.9675    0.9639     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9505    0.9572    0.9539     14095
   macro avg     0.9513    0.9511    0.9511     14095
weighted avg     0.9505    0.9572    0.9539     14095

✅ Trial 16 finished | F1: 0.9511

=== Trial 17 ===
{}

=== Current parameters: {'learning_rate': 0.00018629576835296982, 'weight_decay': 0.014565594107530442, 'num_train_epochs': 6} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 17%|█▋        | 22/132 [00:25<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.69it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 17%|█▋        | 22/132 [00:30<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8188    0.8164    0.8176      3475
     PERCENT     0.8026    0.7349    0.7673        83
        TYPE     0.9115    0.9397    0.9254     10485
      VOLUME     0.2667    0.0769    0.1194        52

   micro avg     0.8879    0.9049    0.8963     14095
   macro avg     0.6999    0.6420    0.6574     14095
weighted avg     0.8856    0.9049    0.8949     14095

{'eval_loss': 0.3521401882171631, 'eval_f1_macro': 0.6574148445607428, 'eval_precision': 0.8878602255325073, 'eval_recall': 0.904930826534232, 'eval_f1': 0.8963142545940059, 'eval_accuracy': 0.893677669460122, 'eval_runtime': 4.9333, 'eval_samples_per_second': 1117.109, 'eval_steps_per_second': 2.23, 'epoch': 1.0}


 33%|███▎      | 44/132 [00:55<01:28,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.50it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 44/132 [01:00<01:28,  1.01s/it][A


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8858    0.9220    0.9036      3475
     PERCENT     0.9634    0.9518    0.9576        83
        TYPE     0.9485    0.9700    0.9591     10485
      VOLUME     0.8667    0.7500    0.8041        52

   micro avg     0.9327    0.9572    0.9448     14095
   macro avg     0.9161    0.8984    0.9061     14095
weighted avg     0.9328    0.9572    0.9448     14095

{'eval_loss': 0.20346739888191223, 'eval_f1_macro': 0.906092510251755, 'eval_precision': 0.9326697082814877, 'eval_recall': 0.9572188719404044, 'eval_f1': 0.9447848464689612, 'eval_accuracy': 0.9388860384242631, 'eval_runtime': 4.9328, 'eval_samples_per_second': 1117.226, 'eval_steps_per_second': 2.23, 'epoch': 2.0}


 50%|█████     | 66/132 [01:25<01:06,  1.01s/it][A
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.53it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 66/132 [01:30<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9251    0.9137    0.9194      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9545    0.9756    0.9649     10485
      VOLUME     0.8913    0.7885    0.8367        52

   micro avg     0.9472    0.9597    0.9534     14095
   macro avg     0.9311    0.9164    0.9229     14095
weighted avg     0.9470    0.9597    0.9532     14095

{'eval_loss': 0.17915239930152893, 'eval_f1_macro': 0.9228537599485896, 'eval_precision': 0.9472025768503606, 'eval_recall': 0.9597020219936148, 'eval_f1': 0.9534113335212856, 'eval_accuracy': 0.9480645345191266, 'eval_runtime': 5.1604, 'eval_samples_per_second': 1067.95, 'eval_steps_per_second': 2.132, 'epoch': 3.0}


 67%|██████▋   | 88/132 [01:56<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.56it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.89it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 67%|██████▋   | 88/132 [02:01<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9262    0.9212    0.9237      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9579    0.9722    0.9650     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9502    0.9594    0.9548     14095
   macro avg     0.9521    0.9481    0.9500     14095
weighted avg     0.9501    0.9594    0.9547     14095

{'eval_loss': 0.20231491327285767, 'eval_f1_macro': 0.9499711650877258, 'eval_precision': 0.9502494554142366, 'eval_recall': 0.959418233416105, 'eval_f1': 0.9548118336510627, 'eval_accuracy': 0.9477794880565532, 'eval_runtime': 4.9518, 'eval_samples_per_second': 1112.923, 'eval_steps_per_second': 2.221, 'epoch': 4.0}


 76%|███████▌  | 100/132 [02:15<00:38,  1.20s/it]

{'loss': 0.3259, 'grad_norm': 0.5850766897201538, 'learning_rate': 3.369678888202207e-05, 'epoch': 4.55}


 83%|████████▎ | 110/132 [02:26<00:22,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.83it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.89it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 83%|████████▎ | 110/132 [02:31<00:22,  1.00s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9173    0.9223    0.9198      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9615    0.9686    0.9650     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9506    0.9571    0.9539     14095
   macro avg     0.9509    0.9505    0.9505     14095
weighted avg     0.9506    0.9571    0.9539     14095

{'eval_loss': 0.21373490989208221, 'eval_f1_macro': 0.9505467212058205, 'eval_precision': 0.9506059751972943, 'eval_recall': 0.957147924796027, 'eval_f1': 0.9538657333757556, 'eval_accuracy': 0.9467533207912889, 'eval_runtime': 4.9496, 'eval_samples_per_second': 1113.425, 'eval_steps_per_second': 2.222, 'epoch': 5.0}


100%|██████████| 132/132 [02:56<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 132/132 [03:01<00:00,  1.01s/it]A
100%|██████████| 132/132 [03:01<00:00,  1.38s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9145    0.9235    0.9190      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9613    0.9686    0.9649     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9497    0.9574    0.9535     14095
   macro avg     0.9500    0.9478    0.9488     14095
weighted avg     0.9498    0.9574    0.9535     14095

{'eval_loss': 0.21703693270683289, 'eval_f1_macro': 0.9487836464540449, 'eval_precision': 0.9497466216216216, 'eval_recall': 0.9573607662291592, 'eval_f1': 0.9535384941525633, 'eval_accuracy': 0.9464112650362009, 'eval_runtime': 4.9023, 'eval_samples_per_second': 1124.176, 'eval_steps_per_second': 2.244, 'epoch': 6.0}
{'train_runtime': 181.9812, 'train_samples_per_second': 726.701, 'train_steps_per_second': 0.725, 'train_loss': 0.2561102816552827, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.37it/s]
[I 2025-10-02 08:52:56,930] Trial 17 finished with value: 0.9487836464540449 and parameters: {'learning_rate': 0.00018629576835296982, 'weight_decay': 0.014565594107530442, 'num_train_epochs': 6}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9145    0.9235    0.9190      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9613    0.9686    0.9649     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9497    0.9574    0.9535     14095
   macro avg     0.9500    0.9478    0.9488     14095
weighted avg     0.9498    0.9574    0.9535     14095

✅ Trial 17 finished | F1: 0.9488

=== Trial 18 ===
{}

=== Current parameters: {'learning_rate': 0.00013040896062839577, 'weight_decay': 0.004290161322084861, 'num_train_epochs': 4} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 25%|██▌       | 22/88 [00:25<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.51it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.67it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
 25%|██▌       | 22/88 [00:30<01:06,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8183    0.8256    0.8219      3475
     PERCENT     0.8000    0.3855    0.5203        83
        TYPE     0.9135    0.9355    0.9244     10485
      VOLUME     0.3333    0.0385    0.0690        52

   micro avg     0.8896    0.9019    0.8957     14095
   macro avg     0.7163    0.5463    0.5839     14095
weighted avg     0.8872    0.9019    0.8936     14095

{'eval_loss': 0.3527527451515198, 'eval_f1_macro': 0.583902624975787, 'eval_precision': 0.8895731280615815, 'eval_recall': 0.9018800993260021, 'eval_f1': 0.8956843403205919, 'eval_accuracy': 0.8927655207798871, 'eval_runtime': 4.941, 'eval_samples_per_second': 1115.358, 'eval_steps_per_second': 2.226, 'epoch': 1.0}


 50%|█████     | 44/88 [00:55<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
 50%|█████     | 44/88 [01:00<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8742    0.9275    0.9000      3475
     PERCENT     0.9524    0.9639    0.9581        83
        TYPE     0.9524    0.9658    0.9590     10485
      VOLUME     0.8125    0.7500    0.7800        52

   micro avg     0.9320    0.9555    0.9436     14095
   macro avg     0.8979    0.9018    0.8993     14095
weighted avg     0.9326    0.9555    0.9438     14095

{'eval_loss': 0.20583850145339966, 'eval_f1_macro': 0.8992873748998544, 'eval_precision': 0.9319770258113625, 'eval_recall': 0.9555161404753458, 'eval_f1': 0.9435998038254045, 'eval_accuracy': 0.9379738897440283, 'eval_runtime': 4.9539, 'eval_samples_per_second': 1112.447, 'eval_steps_per_second': 2.22, 'epoch': 2.0}


 75%|███████▌  | 66/88 [01:25<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.52it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.67it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.73it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.83it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.70it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
 75%|███████▌  | 66/88 [01:30<00:22,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9096    0.9203    0.9149      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9548    0.9730    0.9638     10485
      VOLUME     0.8837    0.7308    0.8000        52

   micro avg     0.9435    0.9591    0.9513     14095
   macro avg     0.9281    0.9000    0.9122     14095
weighted avg     0.9434    0.9591    0.9512     14095

{'eval_loss': 0.1853184998035431, 'eval_f1_macro': 0.9121917939025601, 'eval_precision': 0.943537130094919, 'eval_recall': 0.9591344448385952, 'eval_f1': 0.9512718572986665, 'eval_accuracy': 0.9447009862607605, 'eval_runtime': 5.2265, 'eval_samples_per_second': 1054.441, 'eval_steps_per_second': 2.105, 'epoch': 3.0}


100%|██████████| 88/88 [01:56<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.54it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                               
100%|██████████| 88/88 [02:01<00:00,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9114    0.9177    0.9145      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9552    0.9727    0.9639     10485
      VOLUME     0.8837    0.7308    0.8000        52

   micro avg     0.9444    0.9584    0.9513     14095
   macro avg     0.9288    0.9023    0.9137     14095
weighted avg     0.9442    0.9584    0.9512     14095

{'eval_loss': 0.18475991487503052, 'eval_f1_macro': 0.9136569368407088, 'eval_precision': 0.944351230425056, 'eval_recall': 0.9583540262504434, 'eval_f1': 0.9513011021514842, 'eval_accuracy': 0.9449290234308192, 'eval_runtime': 4.9623, 'eval_samples_per_second': 1110.58, 'eval_steps_per_second': 2.217, 'epoch': 4.0}


100%|██████████| 88/88 [02:01<00:00,  1.38s/it]

{'train_runtime': 121.2877, 'train_samples_per_second': 726.9, 'train_steps_per_second': 0.726, 'train_loss': 0.38385907086459076, 'epoch': 4.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9114    0.9177    0.9145      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9552    0.9727    0.9639     10485
      VOLUME     0.8837    0.7308    0.8000        52

   micro avg     0.9444    0.9584    0.9513     14095
   macro avg     0.9288    0.9023    0.9137     14095
weighted avg     0.9442    0.9584    0.9512     14095



100%|██████████| 11/11 [00:04<00:00,  2.35it/s]
[I 2025-10-02 08:55:04,077] Trial 18 finished with value: 0.9136569368407088 and parameters: {'learning_rate': 0.00013040896062839577, 'weight_decay': 0.004290161322084861, 'num_train_epochs': 4}. Best is trial 6 with value: 0.9512251951099278.


✅ Trial 18 finished | F1: 0.9137

=== Trial 19 ===
{}

=== Current parameters: {'learning_rate': 0.00021547720704945532, 'weight_decay': 0.014577954742540832, 'num_train_epochs': 6} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 17%|█▋        | 22/132 [00:25<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.54it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 17%|█▋        | 22/132 [00:30<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.7330    0.8878    0.8030      3475
     PERCENT     0.6000    0.9759    0.7431        83
        TYPE     0.9371    0.8959    0.9160     10485
      VOLUME     0.2353    0.0769    0.1159        52

   micro avg     0.8733    0.8914    0.8823     14095
   macro avg     0.6263    0.7091    0.6445     14095
weighted avg     0.8822    0.8914    0.8842     14095

{'eval_loss': 0.38856926560401917, 'eval_f1_macro': 0.644517363813225, 'eval_precision': 0.8733490893924649, 'eval_recall': 0.8913799219581412, 'eval_f1': 0.8822723921210631, 'eval_accuracy': 0.8795963742089961, 'eval_runtime': 4.9453, 'eval_samples_per_second': 1114.389, 'eval_steps_per_second': 2.224, 'epoch': 1.0}


 33%|███▎      | 44/132 [00:55<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.82it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 44/132 [01:00<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8748    0.9306    0.9018      3475
     PERCENT     0.9634    0.9518    0.9576        83
        TYPE     0.9511    0.9614    0.9562     10485
      VOLUME     0.8800    0.8462    0.8627        52

   micro avg     0.9314    0.9533    0.9422     14095
   macro avg     0.9173    0.9225    0.9196     14095
weighted avg     0.9321    0.9533    0.9425     14095

{'eval_loss': 0.21130192279815674, 'eval_f1_macro': 0.9195954995306069, 'eval_precision': 0.9313786650031192, 'eval_recall': 0.9533167789996453, 'eval_f1': 0.9422200406703597, 'eval_accuracy': 0.9371187503563081, 'eval_runtime': 4.9574, 'eval_samples_per_second': 1111.668, 'eval_steps_per_second': 2.219, 'epoch': 2.0}


 50%|█████     | 66/132 [01:25<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 66/132 [01:30<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9253    0.9088    0.9170      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9509    0.9766    0.9636     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9448    0.9596    0.9522     14095
   macro avg     0.9451    0.9413    0.9430     14095
weighted avg     0.9446    0.9596    0.9520     14095

{'eval_loss': 0.19424022734165192, 'eval_f1_macro': 0.9430422149738245, 'eval_precision': 0.9448169879854708, 'eval_recall': 0.9596310748492373, 'eval_f1': 0.952166414416951, 'eval_accuracy': 0.9461262185736276, 'eval_runtime': 5.157, 'eval_samples_per_second': 1068.644, 'eval_steps_per_second': 2.133, 'epoch': 3.0}


 67%|██████▋   | 88/132 [01:56<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.55it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.89it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 67%|██████▋   | 88/132 [02:01<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9146    0.9246    0.9196      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9581    0.9685    0.9633     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9474    0.9575    0.9524     14095
   macro avg     0.9442    0.9432    0.9436     14095
weighted avg     0.9474    0.9575    0.9524     14095

{'eval_loss': 0.209930881857872, 'eval_f1_macro': 0.9436236556885822, 'eval_precision': 0.9473536431278955, 'eval_recall': 0.9575026605179141, 'eval_f1': 0.9524011149924139, 'eval_accuracy': 0.9457841628185394, 'eval_runtime': 4.9395, 'eval_samples_per_second': 1115.698, 'eval_steps_per_second': 2.227, 'epoch': 4.0}


 76%|███████▌  | 100/132 [02:15<00:38,  1.20s/it]

{'loss': 0.3129, 'grad_norm': 0.43861374258995056, 'learning_rate': 3.897506647105495e-05, 'epoch': 4.55}


 83%|████████▎ | 110/132 [02:26<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 83%|████████▎ | 110/132 [02:31<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9238    0.9212    0.9225      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9570    0.9716    0.9642     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9488    0.9589    0.9538     14095
   macro avg     0.9434    0.9431    0.9431     14095
weighted avg     0.9487    0.9589    0.9538     14095

{'eval_loss': 0.22950325906276703, 'eval_f1_macro': 0.9431385534856624, 'eval_precision': 0.9488241488241488, 'eval_recall': 0.958921603405463, 'eval_f1': 0.9538461538461539, 'eval_accuracy': 0.9477224787640386, 'eval_runtime': 4.9271, 'eval_samples_per_second': 1118.515, 'eval_steps_per_second': 2.233, 'epoch': 5.0}


100%|██████████| 132/132 [02:57<00:00,  1.03s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.59it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 132/132 [03:01<00:00,  1.03s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9156    0.9275    0.9215      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9595    0.9666    0.9630     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9485    0.9568    0.9526     14095
   macro avg     0.9420    0.9435    0.9426     14095
weighted avg     0.9486    0.9568    0.9527     14095

{'eval_loss': 0.2326624095439911, 'eval_f1_macro': 0.9426016444919412, 'eval_precision': 0.9485159656773104, 'eval_recall': 0.9567931890741398, 'eval_f1': 0.9526365980291739, 'eval_accuracy': 0.9457841628185394, 'eval_runtime': 4.9359, 'eval_samples_per_second': 1116.515, 'eval_steps_per_second': 2.229, 'epoch': 6.0}


100%|██████████| 132/132 [03:02<00:00,  1.38s/it]

{'train_runtime': 182.0474, 'train_samples_per_second': 726.437, 'train_steps_per_second': 0.725, 'train_loss': 0.24551113265933414, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.93it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.35it/s]

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9156    0.9275    0.9215      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9595    0.9666    0.9630     10485
      VOLUME     0.9400    0.9038    0.9216        52

   micro avg     0.9485    0.9568    0.9526     14095
   macro avg     0.9420    0.9435    0.9426     14095
weighted avg     0.9486    0.9568    0.9527     14095




[I 2025-10-02 08:58:11,962] Trial 19 finished with value: 0.9426016444919412 and parameters: {'learning_rate': 0.00021547720704945532, 'weight_decay': 0.014577954742540832, 'num_train_epochs': 6}. Best is trial 6 with value: 0.9512251951099278.


✅ Trial 19 finished | F1: 0.9426

=== Trial 20 ===
{}

=== Current parameters: {'learning_rate': 0.0001604426060060793, 'weight_decay': 0.014131885174846657, 'num_train_epochs': 6} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 17%|█▋        | 22/132 [00:25<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.61it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 17%|█▋        | 22/132 [00:30<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8177    0.8337    0.8256      3475
     PERCENT     0.7971    0.6627    0.7237        83
        TYPE     0.9183    0.9406    0.9293     10485
      VOLUME     0.4615    0.1154    0.1846        52

   micro avg     0.8925    0.9095    0.9009     14095
   macro avg     0.7487    0.6381    0.6658     14095
weighted avg     0.8911    0.9095    0.8998     14095

{'eval_loss': 0.3432011604309082, 'eval_f1_macro': 0.6658040559567375, 'eval_precision': 0.8925090504037873, 'eval_recall': 0.9095423909187655, 'eval_f1': 0.9009452194384905, 'eval_accuracy': 0.8977253292286643, 'eval_runtime': 4.9468, 'eval_samples_per_second': 1114.048, 'eval_steps_per_second': 2.224, 'epoch': 1.0}


 33%|███▎      | 44/132 [00:55<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.96it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 44/132 [01:00<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8782    0.9298    0.9033      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9512    0.9680    0.9595     10485
      VOLUME     0.8667    0.7500    0.8041        52

   micro avg     0.9325    0.9579    0.9450     14095
   macro avg     0.9151    0.9059    0.9092     14095
weighted avg     0.9330    0.9579    0.9452     14095

{'eval_loss': 0.2083366960287094, 'eval_f1_macro': 0.9092482885334692, 'eval_precision': 0.9324538987499137, 'eval_recall': 0.9578573962398014, 'eval_f1': 0.9449849513543782, 'eval_accuracy': 0.9377458525739696, 'eval_runtime': 4.9415, 'eval_samples_per_second': 1115.252, 'eval_steps_per_second': 2.226, 'epoch': 2.0}


 50%|█████     | 66/132 [01:25<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.56it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.66it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 66/132 [01:30<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9183    0.9249    0.9216      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9551    0.9749    0.9649     10485
      VOLUME     0.8936    0.8077    0.8485        52

   micro avg     0.9460    0.9620    0.9540     14095
   macro avg     0.9329    0.9239    0.9278     14095
weighted avg     0.9459    0.9620    0.9539     14095

{'eval_loss': 0.18423134088516235, 'eval_f1_macro': 0.9277959278550558, 'eval_precision': 0.9460025115110925, 'eval_recall': 0.9620432777580702, 'eval_f1': 0.9539554680080199, 'eval_accuracy': 0.9478935066415826, 'eval_runtime': 5.154, 'eval_samples_per_second': 1069.258, 'eval_steps_per_second': 2.134, 'epoch': 3.0}


 67%|██████▋   | 88/132 [01:56<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.02it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 67%|██████▋   | 88/132 [02:01<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9184    0.9258    0.9220      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9594    0.9717    0.9655     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9493    0.9601    0.9547     14095
   macro avg     0.9453    0.9425    0.9437     14095
weighted avg     0.9492    0.9601    0.9547     14095

{'eval_loss': 0.20411249995231628, 'eval_f1_macro': 0.9436566086164494, 'eval_precision': 0.9492845117845118, 'eval_recall': 0.9601277048598794, 'eval_f1': 0.9546753200945293, 'eval_accuracy': 0.9473234137164358, 'eval_runtime': 4.9194, 'eval_samples_per_second': 1120.269, 'eval_steps_per_second': 2.236, 'epoch': 4.0}


 76%|███████▌  | 100/132 [02:15<00:38,  1.20s/it]

{'loss': 0.3345, 'grad_norm': 0.677898108959198, 'learning_rate': 2.9020522956940877e-05, 'epoch': 4.55}


 83%|████████▎ | 110/132 [02:26<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.80it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 83%|████████▎ | 110/132 [02:31<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9196    0.9217    0.9207      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9598    0.9682    0.9640     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9500    0.9567    0.9533     14095
   macro avg     0.9510    0.9502    0.9505     14095
weighted avg     0.9499    0.9567    0.9533     14095

{'eval_loss': 0.20950765907764435, 'eval_f1_macro': 0.9505112050424014, 'eval_precision': 0.9499823881648468, 'eval_recall': 0.9567222419297623, 'eval_f1': 0.953340402969247, 'eval_accuracy': 0.94567014423351, 'eval_runtime': 4.9313, 'eval_samples_per_second': 1117.547, 'eval_steps_per_second': 2.231, 'epoch': 5.0}


100%|██████████| 132/132 [02:56<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.50it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.67it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.70it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
100%|██████████| 132/132 [03:01<00:00,  1.01s/it]A
100%|██████████| 132/132 [03:02<00:00,  1.38s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9194    0.9223    0.9208      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9598    0.9695    0.9646     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9499    0.9578    0.9538     14095
   macro avg     0.9510    0.9507    0.9507     14095
weighted avg     0.9499    0.9578    0.9538     14095

{'eval_loss': 0.21088004112243652, 'eval_f1_macro': 0.9507039855595067, 'eval_precision': 0.9499014916971573, 'eval_recall': 0.9577864490954239, 'eval_f1': 0.9538276751333591, 'eval_accuracy': 0.946582292913745, 'eval_runtime': 4.9429, 'eval_samples_per_second': 1114.934, 'eval_steps_per_second': 2.225, 'epoch': 6.0}
{'train_runtime': 181.999, 'train_samples_per_second': 726.63, 'train_steps_per_second': 0.725, 'train_loss': 0.2640026067242478, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.98it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.36it/s]
[I 2025-10-02 09:01:19,800] Trial 20 finished with value: 0.9507039855595067 and parameters: {'learning_rate': 0.0001604426060060793, 'weight_decay': 0.014131885174846657, 'num_train_epochs': 6}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9194    0.9223    0.9208      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9598    0.9695    0.9646     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9499    0.9578    0.9538     14095
   macro avg     0.9510    0.9507    0.9507     14095
weighted avg     0.9499    0.9578    0.9538     14095

✅ Trial 20 finished | F1: 0.9507

=== Trial 21 ===
{}

=== Current parameters: {'learning_rate': 0.00015743577526467078, 'weight_decay': 0.012583681171932263, 'num_train_epochs': 6} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 17%|█▋        | 22/132 [00:25<01:50,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.83it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 17%|█▋        | 22/132 [00:30<01:50,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8159    0.8314    0.8235      3475
     PERCENT     0.7966    0.5663    0.6620        83
        TYPE     0.9174    0.9403    0.9287     10485
      VOLUME     0.4444    0.0769    0.1311        52

   micro avg     0.8915    0.9081    0.8997     14095
   macro avg     0.7436    0.6037    0.6363     14095
weighted avg     0.8899    0.9081    0.8983     14095

{'eval_loss': 0.3451519310474396, 'eval_f1_macro': 0.6363395229230203, 'eval_precision': 0.8915436054611312, 'eval_recall': 0.9080525008868393, 'eval_f1': 0.8997223296193455, 'eval_accuracy': 0.8964711247933413, 'eval_runtime': 4.917, 'eval_samples_per_second': 1120.812, 'eval_steps_per_second': 2.237, 'epoch': 1.0}


 33%|███▎      | 44/132 [00:55<01:28,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.59it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 44/132 [01:00<01:28,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8771    0.9283    0.9020      3475
     PERCENT     0.9639    0.9639    0.9639        83
        TYPE     0.9494    0.9679    0.9585     10485
      VOLUME     0.8478    0.7500    0.7959        52

   micro avg     0.9308    0.9573    0.9439     14095
   macro avg     0.9095    0.9025    0.9051     14095
weighted avg     0.9313    0.9573    0.9440     14095

{'eval_loss': 0.20792727172374725, 'eval_f1_macro': 0.9050767503549141, 'eval_precision': 0.9308084988962473, 'eval_recall': 0.9572898190847818, 'eval_f1': 0.943863453534329, 'eval_accuracy': 0.9375178154039109, 'eval_runtime': 4.9508, 'eval_samples_per_second': 1113.148, 'eval_steps_per_second': 2.222, 'epoch': 2.0}


 50%|█████     | 66/132 [01:25<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.59it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.67it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.73it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.28it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.00it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.91it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 50%|█████     | 66/132 [01:30<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9118    0.9283    0.9200      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9551    0.9738    0.9643     10485
      VOLUME     0.8936    0.8077    0.8485        52

   micro avg     0.9443    0.9620    0.9531     14095
   macro avg     0.9313    0.9244    0.9273     14095
weighted avg     0.9443    0.9620    0.9531     14095

{'eval_loss': 0.18422995507717133, 'eval_f1_macro': 0.9272564436328485, 'eval_precision': 0.9442896935933147, 'eval_recall': 0.9620432777580702, 'eval_f1': 0.9530838165524511, 'eval_accuracy': 0.9469813579613477, 'eval_runtime': 5.214, 'eval_samples_per_second': 1056.954, 'eval_steps_per_second': 2.11, 'epoch': 3.0}


 67%|██████▋   | 88/132 [01:56<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 67%|██████▋   | 88/132 [02:01<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9212    0.9246    0.9229      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9591    0.9719    0.9654     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9497    0.9600    0.9548     14095
   macro avg     0.9459    0.9423    0.9438     14095
weighted avg     0.9497    0.9600    0.9548     14095

{'eval_loss': 0.20279061794281006, 'eval_f1_macro': 0.9438447169220192, 'eval_precision': 0.9497438057134835, 'eval_recall': 0.9599858105711245, 'eval_f1': 0.9548373438712864, 'eval_accuracy': 0.9477224787640386, 'eval_runtime': 4.9294, 'eval_samples_per_second': 1117.977, 'eval_steps_per_second': 2.231, 'epoch': 4.0}


 76%|███████▌  | 100/132 [02:15<00:38,  1.20s/it]

{'loss': 0.3368, 'grad_norm': 0.6846614480018616, 'learning_rate': 2.847665370219083e-05, 'epoch': 4.55}


 83%|████████▎ | 110/132 [02:26<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.68it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.74it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.60it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.55it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 83%|████████▎ | 110/132 [02:31<00:22,  1.01s/it]A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9195    0.9237    0.9216      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9604    0.9683    0.9643     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9503    0.9571    0.9537     14095
   macro avg     0.9458    0.9412    0.9433     14095
weighted avg     0.9502    0.9571    0.9537     14095

{'eval_loss': 0.20912165939807892, 'eval_f1_macro': 0.9432589398799621, 'eval_precision': 0.950271184052969, 'eval_recall': 0.957147924796027, 'eval_f1': 0.9536971582072671, 'eval_accuracy': 0.9462402371586569, 'eval_runtime': 4.9776, 'eval_samples_per_second': 1107.167, 'eval_steps_per_second': 2.21, 'epoch': 5.0}


100%|██████████| 132/132 [02:57<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.30it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.89it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===


                                                 
100%|██████████| 132/132 [03:02<00:00,  1.01s/it]A
100%|██████████| 132/132 [03:02<00:00,  1.38s/it]A


              precision    recall  f1-score   support

       BRAND     0.9201    0.9240    0.9220      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9600    0.9702    0.9651     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9501    0.9586    0.9544     14095
   macro avg     0.9459    0.9417    0.9436     14095
weighted avg     0.9501    0.9586    0.9543     14095

{'eval_loss': 0.21057216823101044, 'eval_f1_macro': 0.9435522579539931, 'eval_precision': 0.9501441530131496, 'eval_recall': 0.9586378148279532, 'eval_f1': 0.9543720864528888, 'eval_accuracy': 0.9469813579613477, 'eval_runtime': 4.9556, 'eval_samples_per_second': 1112.075, 'eval_steps_per_second': 2.22, 'epoch': 6.0}
{'train_runtime': 182.1178, 'train_samples_per_second': 726.156, 'train_steps_per_second': 0.725, 'train_loss': 0.266014118086208, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.35it/s]
[I 2025-10-02 09:04:27,774] Trial 21 finished with value: 0.9435522579539931 and parameters: {'learning_rate': 0.00015743577526467078, 'weight_decay': 0.012583681171932263, 'num_train_epochs': 6}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9201    0.9240    0.9220      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9600    0.9702    0.9651     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9501    0.9586    0.9544     14095
   macro avg     0.9459    0.9417    0.9436     14095
weighted avg     0.9501    0.9586    0.9543     14095

✅ Trial 21 finished | F1: 0.9436

=== Trial 22 ===
{}

=== Current parameters: {'learning_rate': 0.00018621858997258596, 'weight_decay': 0.004938557538335691, 'num_train_epochs': 5} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 20%|██        | 22/110 [00:25<01:28,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.75it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.07it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.63it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 20%|██        | 22/110 [00:30<01:28,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8208    0.8527    0.8364      3475
     PERCENT     0.7957    0.8916    0.8409        83
        TYPE     0.9221    0.9486    0.9352     10485
      VOLUME     0.4762    0.1923    0.2740        52

   micro avg     0.8955    0.9218    0.9084     14095
   macro avg     0.7537    0.7213    0.7216     14095
weighted avg     0.8947    0.9218    0.9078     14095

{'eval_loss': 0.31237950921058655, 'eval_f1_macro': 0.7216166511327977, 'eval_precision': 0.8954514128187457, 'eval_recall': 0.9218162468960625, 'eval_f1': 0.9084425799685371, 'eval_accuracy': 0.9059916766432928, 'eval_runtime': 4.8722, 'eval_samples_per_second': 1131.109, 'eval_steps_per_second': 2.258, 'epoch': 1.0}


 40%|████      | 44/110 [00:55<01:06,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.70it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.76it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.82it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.34it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 40%|████      | 44/110 [01:00<01:06,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8814    0.9301    0.9051      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9527    0.9676    0.9601     10485
      VOLUME     0.8750    0.8077    0.8400        52

   micro avg     0.9344    0.9578    0.9459     14095
   macro avg     0.9183    0.9203    0.9188     14095
weighted avg     0.9349    0.9578    0.9461     14095

{'eval_loss': 0.2015845775604248, 'eval_f1_macro': 0.9187982099387006, 'eval_precision': 0.9343853820598007, 'eval_recall': 0.9577864490954239, 'eval_f1': 0.945941211505448, 'eval_accuracy': 0.9399122056895274, 'eval_runtime': 4.8774, 'eval_samples_per_second': 1129.912, 'eval_steps_per_second': 2.255, 'epoch': 2.0}


 60%|██████    | 66/110 [01:25<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.70it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.77it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.88it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.69it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 60%|██████    | 66/110 [01:30<00:44,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9273    0.9134    0.9203      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9532    0.9771    0.9650     10485
      VOLUME     0.8913    0.7885    0.8367        52

   micro avg     0.9468    0.9608    0.9537     14095
   macro avg     0.9313    0.9167    0.9231     14095
weighted avg     0.9466    0.9608    0.9535     14095

{'eval_loss': 0.18413971364498138, 'eval_f1_macro': 0.9231057347505898, 'eval_precision': 0.94679437880165, 'eval_recall': 0.9607662291592763, 'eval_f1': 0.9537291358546377, 'eval_accuracy': 0.9479505159340973, 'eval_runtime': 5.0994, 'eval_samples_per_second': 1080.707, 'eval_steps_per_second': 2.157, 'epoch': 3.0}


 80%|████████  | 88/110 [01:55<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.77it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.85it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 80%|████████  | 88/110 [02:00<00:22,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9179    0.9272    0.9225      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9597    0.9713    0.9654     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9494    0.9603    0.9549     14095
   macro avg     0.9506    0.9524    0.9513     14095
weighted avg     0.9494    0.9603    0.9548     14095

{'eval_loss': 0.2038535624742508, 'eval_f1_macro': 0.9513401471455831, 'eval_precision': 0.9494283509854808, 'eval_recall': 0.9603405462930117, 'eval_f1': 0.9548532731376975, 'eval_accuracy': 0.9480075252266119, 'eval_runtime': 4.9399, 'eval_samples_per_second': 1115.611, 'eval_steps_per_second': 2.227, 'epoch': 4.0}


 91%|█████████ | 100/110 [02:15<00:11,  1.20s/it]

{'loss': 0.3121, 'grad_norm': 0.5947083830833435, 'learning_rate': 5.615177561095096e-06, 'epoch': 4.55}


100%|██████████| 110/110 [02:26<00:00,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.73it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9237    0.9226    0.9231      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9599    0.9712    0.9655     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9510    0.9590    0.9550     14095
   macro avg     0.9468    0.9416    0.9439     14095
weighted avg     0.9509    0.9590    0.9549     14095




                                                 
100%|██████████| 110/110 [02:31<00:00,  1.01s/it]A
100%|██████████| 110/110 [02:31<00:00,  1.37s/it]A

{'eval_loss': 0.20268280804157257, 'eval_f1_macro': 0.9439340938720883, 'eval_precision': 0.9510307464996834, 'eval_recall': 0.9589925505498403, 'eval_f1': 0.9549950544015825, 'eval_accuracy': 0.9482925716891853, 'eval_runtime': 4.8777, 'eval_samples_per_second': 1129.831, 'eval_steps_per_second': 2.255, 'epoch': 5.0}
{'train_runtime': 151.1974, 'train_samples_per_second': 728.881, 'train_steps_per_second': 0.728, 'train_loss': 0.2886811326850544, 'epoch': 5.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.25it/s]
[I 2025-10-02 09:07:05,036] Trial 22 finished with value: 0.9439340938720883 and parameters: {'learning_rate': 0.00018621858997258596, 'weight_decay': 0.004938557538335691, 'num_train_epochs': 5}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9237    0.9226    0.9231      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9599    0.9712    0.9655     10485
      VOLUME     0.9388    0.8846    0.9109        52

   micro avg     0.9510    0.9590    0.9550     14095
   macro avg     0.9468    0.9416    0.9439     14095
weighted avg     0.9509    0.9590    0.9549     14095

✅ Trial 22 finished | F1: 0.9439

=== Trial 23 ===
{}

=== Current parameters: {'learning_rate': 0.00015609538941172252, 'weight_decay': 0.020478666537567105, 'num_train_epochs': 7} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 14%|█▍        | 22/154 [00:25<02:12,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.68it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.05it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 14%|█▍        | 22/154 [00:30<02:12,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8095    0.8452    0.8270      3475
     PERCENT     0.8448    0.5904    0.6950        83
        TYPE     0.9215    0.9470    0.9341     10485
      VOLUME     0.0769    0.0192    0.0308        52

   micro avg     0.8924    0.9164    0.9042     14095
   macro avg     0.6632    0.6004    0.6217     14095
weighted avg     0.8903    0.9164    0.9029     14095

{'eval_loss': 0.3363797664642334, 'eval_f1_macro': 0.6217084430325777, 'eval_precision': 0.8923587121735526, 'eval_recall': 0.9163533167789997, 'eval_f1': 0.9041968567328222, 'eval_accuracy': 0.9007468217319423, 'eval_runtime': 4.8685, 'eval_samples_per_second': 1131.962, 'eval_steps_per_second': 2.259, 'epoch': 1.0}


 29%|██▊       | 44/154 [00:55<01:50,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.58it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.71it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.76it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.67it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 29%|██▊       | 44/154 [01:00<01:50,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8839    0.9200    0.9016      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9491    0.9719    0.9603     10485
      VOLUME     0.8605    0.7115    0.7789        52

   micro avg     0.9325    0.9582    0.9452     14095
   macro avg     0.9117    0.8978    0.9028     14095
weighted avg     0.9327    0.9582    0.9452     14095

{'eval_loss': 0.2024366557598114, 'eval_f1_macro': 0.9028162514249194, 'eval_precision': 0.9325416004971345, 'eval_recall': 0.9582121319616885, 'eval_f1': 0.9452026034012176, 'eval_accuracy': 0.9399692149820421, 'eval_runtime': 4.8881, 'eval_samples_per_second': 1127.428, 'eval_steps_per_second': 2.25, 'epoch': 2.0}


 43%|████▎     | 66/154 [01:25<01:28,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.62it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 43%|████▎     | 66/154 [01:30<01:28,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9211    0.9177    0.9194      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9503    0.9783    0.9640     10485
      VOLUME     0.9574    0.8654    0.9091        52

   micro avg     0.9433    0.9630    0.9530     14095
   macro avg     0.9456    0.9373    0.9407     14095
weighted avg     0.9431    0.9630    0.9529     14095

{'eval_loss': 0.19032418727874756, 'eval_f1_macro': 0.9407429471035742, 'eval_precision': 0.9432900132045312, 'eval_recall': 0.9629655906349769, 'eval_f1': 0.9530262603566915, 'eval_accuracy': 0.946582292913745, 'eval_runtime': 5.1678, 'eval_samples_per_second': 1066.421, 'eval_steps_per_second': 2.129, 'epoch': 3.0}


 57%|█████▋    | 88/154 [01:56<01:06,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.65it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 57%|█████▋    | 88/154 [02:01<01:06,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9233    0.9255    0.9244      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9560    0.9745    0.9652     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9481    0.9623    0.9551     14095
   macro avg     0.9480    0.9480    0.9477     14095
weighted avg     0.9480    0.9623    0.9550     14095

{'eval_loss': 0.20063818991184235, 'eval_f1_macro': 0.9476755880579155, 'eval_precision': 0.9480637494757445, 'eval_recall': 0.9622561191912026, 'eval_f1': 0.9551072145346995, 'eval_accuracy': 0.9489766831993615, 'eval_runtime': 4.8997, 'eval_samples_per_second': 1124.753, 'eval_steps_per_second': 2.245, 'epoch': 4.0}


 65%|██████▍   | 100/154 [02:15<01:04,  1.20s/it]

{'loss': 0.3551, 'grad_norm': 0.7404721975326538, 'learning_rate': 5.359192035465401e-05, 'epoch': 4.55}


 71%|███████▏  | 110/154 [02:26<00:44,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.61it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.73it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.75it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.60it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)
=== seqeval classification_report ===


                                                 
 71%|███████▏  | 110/154 [02:31<00:44,  1.01s/it]A
                                               [A


              precision    recall  f1-score   support

       BRAND     0.9131    0.9258    0.9194      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9589    0.9698    0.9643     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9476    0.9588    0.9531     14095
   macro avg     0.9462    0.9468    0.9462     14095
weighted avg     0.9476    0.9588    0.9531     14095

{'eval_loss': 0.2267146110534668, 'eval_f1_macro': 0.9462017491831426, 'eval_precision': 0.9475529378768757, 'eval_recall': 0.9587797091167081, 'eval_f1': 0.9531332651549883, 'eval_accuracy': 0.9463542557436863, 'eval_runtime': 4.8793, 'eval_samples_per_second': 1129.475, 'eval_steps_per_second': 2.254, 'epoch': 5.0}


 86%|████████▌ | 132/154 [02:56<00:22,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.57it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.67it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.75it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.31it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  4.99it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.83it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.71it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.63it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.92it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 86%|████████▌ | 132/154 [03:01<00:22,  1.01s/it]A
 86%|████████▌ | 132/154 [03:01<00:30,  1.38s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9123    0.9214    0.9168      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9557    0.9701    0.9628     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9451    0.9579    0.9514     14095
   macro avg     0.9452    0.9458    0.9452     14095
weighted avg     0.9450    0.9579    0.9514     14095

{'eval_loss': 0.23132698237895966, 'eval_f1_macro': 0.9451931964610154, 'eval_precision': 0.945054945054945, 'eval_recall': 0.9579283433841788, 'eval_f1': 0.9514481009090269, 'eval_accuracy': 0.9453850977709367, 'eval_runtime': 4.9399, 'eval_samples_per_second': 1115.621, 'eval_steps_per_second': 2.227, 'epoch': 6.0}
{'train_runtime': 181.8652, 'train_samples_per_second': 848.359, 'train_steps_per_second': 0.847, 'train_loss': 0.2784254930236123, 'epoch': 6.0}



100%|██████████| 11/11 [00:02<00:00,  4.96it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.38it/s]
[I 2025-10-02 09:10:12,682] Trial 23 finished with value: 0.9451931964610154 and parameters: {'learning_rate': 0.00015609538941172252, 'weight_decay': 0.020478666537567105, 'num_train_epochs': 7}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9123    0.9214    0.9168      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9557    0.9701    0.9628     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9451    0.9579    0.9514     14095
   macro avg     0.9452    0.9458    0.9452     14095
weighted avg     0.9450    0.9579    0.9514     14095

✅ Trial 23 finished | F1: 0.9452

=== Trial 24 ===
{}

=== Current parameters: {'learning_rate': 0.00025187860901377916, 'weight_decay': 0.010573122833122712, 'num_train_epochs': 9} ===


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Dersty/distilrubert_X5_ner_MLM


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
 11%|█         | 22/198 [00:25<02:56,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.67it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.74it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.81it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.35it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.76it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.65it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.95it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 11%|█         | 22/198 [00:29<02:56,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8221    0.8098    0.8159      3475
     PERCENT     0.8022    0.8795    0.8391        83
        TYPE     0.9099    0.9433    0.9263     10485
      VOLUME     0.3125    0.0962    0.1471        52

   micro avg     0.8876    0.9069    0.8972     14095
   macro avg     0.7117    0.6822    0.6821     14095
weighted avg     0.8854    0.9069    0.8957     14095

{'eval_loss': 0.34777340292930603, 'eval_f1_macro': 0.6820812515136578, 'eval_precision': 0.8876466912019999, 'eval_recall': 0.9069173465768002, 'eval_f1': 0.8971785513756316, 'eval_accuracy': 0.8951599110655036, 'eval_runtime': 4.8622, 'eval_samples_per_second': 1133.446, 'eval_steps_per_second': 2.262, 'epoch': 1.0}


 22%|██▏       | 44/198 [00:55<02:34,  1.00s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.63it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.69it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.78it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.84it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.64it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.58it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 22%|██▏       | 44/198 [01:00<02:34,  1.00s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.8730    0.9321    0.9016      3475
     PERCENT     0.9535    0.9880    0.9704        83
        TYPE     0.9525    0.9680    0.9602     10485
      VOLUME     0.8889    0.7692    0.8247        52

   micro avg     0.9320    0.9586    0.9451     14095
   macro avg     0.9170    0.9143    0.9142     14095
weighted avg     0.9327    0.9586    0.9453     14095

{'eval_loss': 0.20459848642349243, 'eval_f1_macro': 0.9142441261695115, 'eval_precision': 0.9319859281230599, 'eval_recall': 0.9585668676835757, 'eval_f1': 0.9450895355344151, 'eval_accuracy': 0.9392851034718659, 'eval_runtime': 4.8505, 'eval_samples_per_second': 1136.18, 'eval_steps_per_second': 2.268, 'epoch': 2.0}


 33%|███▎      | 66/198 [01:25<02:12,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.70it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.86it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.62it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.57it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.90it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 33%|███▎      | 66/198 [01:30<02:12,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9340    0.9076    0.9206      3475
     PERCENT     0.9647    0.9880    0.9762        83
        TYPE     0.9519    0.9806    0.9661     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9478    0.9624    0.9550     14095
   macro avg     0.9525    0.9450    0.9484     14095
weighted avg     0.9476    0.9624    0.9548     14095

{'eval_loss': 0.1911604255437851, 'eval_f1_macro': 0.9483929149267567, 'eval_precision': 0.9478060368921185, 'eval_recall': 0.9623980134799575, 'eval_f1': 0.9550462914070474, 'eval_accuracy': 0.9478364973490679, 'eval_runtime': 5.1703, 'eval_samples_per_second': 1065.892, 'eval_steps_per_second': 2.128, 'epoch': 3.0}


 44%|████▍     | 88/198 [01:55<01:50,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.64it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.72it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.79it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.32it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.87it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.66it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.61it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.93it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                
 44%|████▍     | 88/198 [02:00<01:50,  1.01s/it][A
                                               [A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9176    0.9197    0.9187      3475
     PERCENT     0.9643    0.9759    0.9701        83
        TYPE     0.9552    0.9673    0.9612     10485
      VOLUME     0.9600    0.9231    0.9412        52

   micro avg     0.9460    0.9554    0.9507     14095
   macro avg     0.9493    0.9465    0.9478     14095
weighted avg     0.9460    0.9554    0.9507     14095

{'eval_loss': 0.2251274436712265, 'eval_f1_macro': 0.9477703721921009, 'eval_precision': 0.9460484720758693, 'eval_recall': 0.9554451933309684, 'eval_f1': 0.9507236145428873, 'eval_accuracy': 0.9430477167778348, 'eval_runtime': 4.8428, 'eval_samples_per_second': 1137.971, 'eval_steps_per_second': 2.271, 'epoch': 4.0}


 51%|█████     | 100/198 [02:15<01:57,  1.20s/it]

{'loss': 0.321, 'grad_norm': 0.4789588451385498, 'learning_rate': 0.00014805161348708376, 'epoch': 4.55}


 56%|█████▌    | 110/198 [02:26<01:28,  1.01s/it]
  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:00,  9.60it/s][A
 27%|██▋       | 3/11 [00:00<00:01,  6.76it/s][A
 36%|███▋      | 4/11 [00:00<00:01,  5.83it/s][A
 45%|████▌     | 5/11 [00:00<00:01,  5.33it/s][A
 55%|█████▍    | 6/11 [00:01<00:00,  5.04it/s][A
 64%|██████▎   | 7/11 [00:01<00:00,  4.89it/s][A
 73%|███████▎  | 8/11 [00:01<00:00,  4.77it/s][A
 82%|████████▏ | 9/11 [00:01<00:00,  4.68it/s][A
 91%|█████████ | 10/11 [00:01<00:00,  4.62it/s][A
100%|██████████| 11/11 [00:02<00:00,  4.94it/s][A

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)



                                                 
 56%|█████▌    | 110/198 [02:30<01:28,  1.01s/it]A
 56%|█████▌    | 110/198 [02:31<02:00,  1.37s/it]A

=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9100    0.9255    0.9177      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9601    0.9706    0.9653     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9476    0.9593    0.9534     14095
   macro avg     0.9456    0.9440    0.9445     14095
weighted avg     0.9477    0.9593    0.9534     14095

{'eval_loss': 0.24248403310775757, 'eval_f1_macro': 0.9444967148901053, 'eval_precision': 0.9476450798990749, 'eval_recall': 0.9592763391273501, 'eval_f1': 0.9534252371046786, 'eval_accuracy': 0.94567014423351, 'eval_runtime': 4.8304, 'eval_samples_per_second': 1140.907, 'eval_steps_per_second': 2.277, 'epoch': 5.0}
{'train_runtime': 151.05, 'train_samples_per_second': 1313.267, 'train_steps_per_second': 1.311, 'train_loss': 0.2952461768280376, 'epoch': 5.0}



100%|██████████| 11/11 [00:02<00:00,  4.95it/s]

DEBUG: predictions type: <class 'tuple'>, shape: No shape
DEBUG: labels type: <class 'numpy.ndarray'>, shape: (5511, 16)


100%|██████████| 11/11 [00:04<00:00,  2.27it/s]
[I 2025-10-02 09:12:49,738] Trial 24 finished with value: 0.9444967148901053 and parameters: {'learning_rate': 0.00025187860901377916, 'weight_decay': 0.010573122833122712, 'num_train_epochs': 9}. Best is trial 6 with value: 0.9512251951099278.


=== seqeval classification_report ===
              precision    recall  f1-score   support

       BRAND     0.9100    0.9255    0.9177      3475
     PERCENT     0.9529    0.9759    0.9643        83
        TYPE     0.9601    0.9706    0.9653     10485
      VOLUME     0.9592    0.9038    0.9307        52

   micro avg     0.9476    0.9593    0.9534     14095
   macro avg     0.9456    0.9440    0.9445     14095
weighted avg     0.9477    0.9593    0.9534     14095

✅ Trial 24 finished | F1: 0.9445
Best trial:
  Value: 0.9512
  Params:
    learning_rate: 0.0004026606979829003
    weight_decay: 0.008132177642386032
    num_train_epochs: 6


## Обучаем новую модель

In [31]:
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          use_fast=True,
                                          add_prefix_space=True)

model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(lbls_in_dataset),
    id2label=id2label,
    label2id=label2id,
).to(device)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 8de2b58d-6a8b-42ea-b30b-c54abb53e1cb)')' thrown while requesting HEAD https://huggingface.co/Dersty/distilrubert_X5_ner_MLM/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at Dersty/distilrubert_X5_ner_MLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:

ds_tokenized = ds.map(tokenize_and_align_labels,
              batched=True,
              fn_kwargs={'tokenizer': tokenizer})

training_args = TrainingArguments(
    eval_strategy="no", 
    torch_compile=False, # попробую отключить для Я Датасферы
    per_device_train_batch_size=1024,
    learning_rate=0.0004026606979829003,      
    weight_decay= 0.008132177642386032,
    num_train_epochs=6,        
    
    lr_scheduler_type="cosine",
    
    
    seed=42,
    data_seed=24,
    gradient_accumulation_steps=1,
    warmup_ratio=0.1,
    report_to=None,
    logging_dir="./logs",
    logging_steps=20,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,

)

print("Starting training without evaluation...")
trainer.train()
trained_model = trainer.model

Map: 100%|██████████| 27552/27552 [00:01<00:00, 23378.41 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Starting training without evaluation...


 12%|█▏        | 20/162 [00:22<02:38,  1.12s/it]

{'loss': 0.8937, 'grad_norm': 2.991623878479004, 'learning_rate': 0.00040247170963608265, 'epoch': 0.74}


 25%|██▍       | 40/162 [00:44<02:17,  1.12s/it]

{'loss': 0.2463, 'grad_norm': 0.9486817121505737, 'learning_rate': 0.0003802192915249885, 'epoch': 1.48}


 37%|███▋      | 60/162 [01:07<01:55,  1.13s/it]

{'loss': 0.1505, 'grad_norm': 0.48738670349121094, 'learning_rate': 0.0003248994025355572, 'epoch': 2.22}


 49%|████▉     | 80/162 [01:29<01:33,  1.14s/it]

{'loss': 0.0927, 'grad_norm': 0.4525924623012543, 'learning_rate': 0.0002467378777151837, 'epoch': 2.96}


 62%|██████▏   | 100/162 [01:52<01:11,  1.15s/it]

{'loss': 0.0507, 'grad_norm': 0.8014702796936035, 'learning_rate': 0.0001601828093226701, 'epoch': 3.7}


 74%|███████▍  | 120/162 [02:15<00:48,  1.16s/it]

{'loss': 0.033, 'grad_norm': 0.32024821639060974, 'learning_rate': 8.123382910107581e-05, 'epoch': 4.44}


 86%|████████▋ | 140/162 [02:38<00:25,  1.16s/it]

{'loss': 0.022, 'grad_norm': 0.25954264402389526, 'learning_rate': 2.448458977917428e-05, 'epoch': 5.19}


 99%|█████████▉| 160/162 [03:02<00:02,  1.17s/it]

{'loss': 0.016, 'grad_norm': 0.22310884296894073, 'learning_rate': 4.2514061360391687e-07, 'epoch': 5.93}


100%|██████████| 162/162 [03:42<00:00,  1.37s/it]

{'train_runtime': 222.6735, 'train_samples_per_second': 742.396, 'train_steps_per_second': 0.728, 'train_loss': 0.18602542599870098, 'epoch': 6.0}





## Сохраняем дообученную версию

In [52]:
model.save_pretrained("./distilrubert_optuna_ft_09344")
tokenizer.save_pretrained("./distilrubert_optuna_ft_09344")

('./distilrubert_optuna_ft_09344/tokenizer_config.json',
 './distilrubert_optuna_ft_09344/special_tokens_map.json',
 './distilrubert_optuna_ft_09344/vocab.txt',
 './distilrubert_optuna_ft_09344/added_tokens.json',
 './distilrubert_optuna_ft_09344/tokenizer.json')