In [1]:
import pandas as pd
import ast
from tqdm import tqdm
from collections import Counter
import datasets
from datasets import (Dataset, Features, Sequence, Value, ClassLabel, load_dataset,
                    load_from_disk, concatenate_datasets, DatasetDict)
from sklearn.model_selection import KFold
from transformers import (AutoTokenizer, AutoModel, AutoModelForTokenClassification,
                         pipeline, PreTrainedTokenizerFast, TrainingArguments, Trainer,AutoModelForMaskedLM,
                         DataCollatorForTokenClassification, EarlyStoppingCallback,
                        DataCollatorForLanguageModeling, DataCollatorForWholeWordMask)
import torch
import optuna
import os
os.environ['WANDB_DISABLED'] = 'true'
import pickle
import numpy as np

from typing import List, Optional
import random
seed=42
random.seed(seed)

import math


2025-09-30 07:29:02.040712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759217342.361204      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759217342.456681      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the augmented MLM corpus; later cells read its 'original' and 'variant' columns.
df_mlm = pd.read_csv('/kaggle/input/mlm-v1/augmented_dataset_mlm.csv')
# Display (rows, columns) as the cell output.
df_mlm.shape

(188060, 3)

In [3]:
# Keep every 4th row (positions 0, 4, 8, ...), i.e. a quarter of the corpus.
# NOTE(review): presumably each original is stored with 4 consecutive variants, so this keeps
# one row per original — confirm against how the CSV was generated.
df = df_mlm.iloc[::4]
df.shape

(47015, 3)

In [4]:
# Base checkpoint that is fine-tuned with a masked-language-modelling objective.
model_name_or_path = "numind/NuNER-multilingual-v0.1"
# NOTE(review): both objects below are loaded again in later cells (the tokenizer with
# do_lower_case=True, the model right before the collator is built), so when the notebook is
# run top to bottom these two instances are replaced before training starts.
model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

Some weights of BertForMaskedLM were not initialized from the model checkpoint at numind/NuNER-multilingual-v0.1 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [5]:
def mix_and_subsample(originals: List[str], augmented: List[str], mix_ratio: float = 0.75, seed: int = 42) -> List[str]:
    """Build a shuffled corpus that mixes original and augmented texts.

    When both pools are non-empty the result has ``len(originals) + len(augmented)``
    items: ``int(total * mix_ratio)`` are drawn from ``originals`` and the rest from
    ``augmented``. Items are drawn *with replacement*, so a text may appear more
    than once and some texts may not appear at all.

    Args:
        originals: pool of original texts.
        augmented: pool of augmented texts.
        mix_ratio: fraction of the output drawn from ``originals``; must be in [0, 1].
        seed: seed of the private random generator used for sampling and shuffling.

    Returns:
        The sampled texts in shuffled order. An empty pool contributes nothing, and a
        pool whose share is zero contributes nothing either.

    Raises:
        ValueError: if ``mix_ratio`` is outside [0, 1] (NaN included).
    """
    if not 0.0 <= mix_ratio <= 1.0:
        raise ValueError(f"mix_ratio must be in [0, 1], got {mix_ratio!r}")

    # Private generator: produces the same draws as a freshly seeded module-level
    # `random`, but leaves the global RNG state of the notebook untouched.
    rng = random.Random(seed)

    n_total = len(originals) + len(augmented)
    n_from_orig = int(n_total * mix_ratio)
    n_from_aug = n_total - n_from_orig

    # A share of zero really means zero samples (no forced minimum of one item).
    chosen_orig = rng.choices(originals, k=n_from_orig) if originals and n_from_orig > 0 else []
    chosen_aug = rng.choices(augmented, k=n_from_aug) if augmented and n_from_aug > 0 else []

    combined = chosen_orig + chosen_aug
    rng.shuffle(combined)
    return combined

In [6]:
def tokenize_and_group_texts(lines: List[str], tokenizer: PreTrainedTokenizerFast, block_size: int = 32, use_wwm: bool = False, seed=42):
    """Tokenize `lines`, pack the token ids into fixed-size blocks and split train/validation.

    Within each `map` batch the token ids of all lines (special tokens included) are
    concatenated and cut into consecutive chunks of exactly `block_size` ids. The tail
    of a batch that does not fill a whole block is dropped. `labels` starts out as a
    copy of `input_ids`.

    Args:
        lines: raw texts, one example per item.
        tokenizer: tokenizer used to encode the texts.
        block_size: number of token ids per block; must be positive.
        use_wwm: if True, each line is split on whitespace and encoded with
            `is_split_into_words=True`; otherwise the raw string is encoded.
        seed: seed passed to the train/validation split.

    Returns:
        DatasetDict with 'train' and 'validation' splits (test_size=0.01), each with
        the columns 'input_ids' and 'labels'.

    Raises:
        ValueError: if `block_size` is not positive, or if fewer than two blocks are
            produced (nothing to split).
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")

    # Build dataset
    ds = Dataset.from_dict({'text': lines})

    def tokenize_func(examples):
        if use_wwm:
            # Pre-split on whitespace so the tokenizer receives explicit word boundaries.
            words = [t.split() for t in examples['text']]
            return tokenizer(words, is_split_into_words=True, add_special_tokens=True)
        return tokenizer(examples['text'], add_special_tokens=True)

    tokenized = ds.map(tokenize_func, batched=True, remove_columns=['text'])

    def group_texts(examples):
        # Flatten in one linear pass; sum(list_of_lists, []) re-copies the growing
        # list on every addition and is quadratic in the batch size.
        concatenated = [tok for ids in examples['input_ids'] for tok in ids]
        # Largest multiple of block_size that fits; 0 when the batch is too short,
        # in which case the range below is empty and no rows are emitted.
        usable_length = (len(concatenated) // block_size) * block_size
        blocks = [concatenated[i:i + block_size] for i in range(0, usable_length, block_size)]
        return {'input_ids': blocks, 'labels': [list(block) for block in blocks]}

    # Every emitted row holds exactly block_size ids, so no separate pass is needed
    # to drop empty rows.
    lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)

    if len(lm_dataset) < 2:
        raise ValueError(
            f"Only {len(lm_dataset)} block(s) of {block_size} tokens were produced; "
            "at least 2 are required for a train/validation split."
        )

    # train/val split
    split = lm_dataset.train_test_split(test_size=0.01, seed=seed)
    return DatasetDict({'train': split['train'], 'validation': split['test']})

In [7]:
def build_data_collator(tokenizer: PreTrainedTokenizerFast, use_wwm: bool = False, mlm_probability: float = 0.15):
    """Return the MLM data collator matching the chosen masking scheme.

    `use_wwm=True` selects DataCollatorForWholeWordMask, otherwise
    DataCollatorForLanguageModeling is used. Both are created with `mlm=True`
    and the given `mlm_probability`.
    """
    collator_cls = DataCollatorForWholeWordMask if use_wwm else DataCollatorForLanguageModeling
    return collator_cls(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability)

In [8]:
# Run configuration shared by the cells below.
# Whole-word masking switch: passed to both the tokenization helper and the collator factory.
use_wwm = True
# Directory that receives the checkpoints, the final model and the tokenizer.
output_dir = "./results_ft"
# Number of training epochs; also reused as the checkpoint retention limit.
num_epochs=5

In [9]:
os.makedirs(output_dir, exist_ok=True)
# Seed Python's global RNG for this cell.
random.seed(seed)

# Two text pools taken from the subsampled frame.
originals = df['original'].tolist()
augmented = df['variant'].tolist()


# Disabled exploration code. NOTE(review): it refers to `simple_text_stats` and `json`,
# neither of which is defined or imported in the visible part of the notebook.
# # Basic exploration
# print('Exploring data distributions...')
# s_orig = simple_text_stats(originals) if originals else {}
# s_aug = simple_text_stats(augmented) if augmented else {}
# print('Originals stats:', json.dumps(s_orig, ensure_ascii=False, indent=2))
# print('Augmented stats:', json.dumps(s_aug, ensure_ascii=False, indent=2))

# Combine corpus according to mix_ratio (0.75 = three quarters of the samples come from `originals`)
combined = mix_and_subsample(originals, augmented, mix_ratio=0.75, seed=seed)
print(f'Combined corpus size: {len(combined)}')

# Tokenizer
# tokenizer = None
# tokenizer_path = os.path.join(".", 'tokenizer.json')
print(f'Loading tokenizer from pretrained model: "{model_name_or_path}"')
# NOTE(review): this load passes do_lower_case=True, unlike the earlier load of the same
# checkpoint. The decoded sample a few cells below shows 'й' coming back as 'и'
# ('...азированныи'), which suggests accents are stripped as well — confirm that this
# normalization is intended and matches what the downstream model will see.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True)
# ensure mask token exists
# if tokenizer.mask_token is None:
#     tokenizer.add_special_tokens({'mask_token': '[MASK]'})

# Tokenize and group into fixed-size blocks (block_size is left at the helper's default)
tokenized_ds = tokenize_and_group_texts(combined, tokenizer, use_wwm=use_wwm)
print('Train examples (blocks):', len(tokenized_ds['train']))
print('Validation examples (blocks):', len(tokenized_ds['validation']))


Combined corpus size: 94030
Loading tokenizer from pretrained model: "numind/NuNER-multilingual-v0.1"


Map:   0%|          | 0/94030 [00:00<?, ? examples/s]

Map:   0%|          | 0/94030 [00:00<?, ? examples/s]

Filter:   0%|          | 0/66960 [00:00<?, ? examples/s]

Train examples (blocks): 66290
Validation examples (blocks): 670


In [14]:
# Peek at the first 10 packed blocks; at this stage 'labels' is a plain copy of 'input_ids'.
tokenized_ds["train"].select(range(10)).to_pandas()

Unnamed: 0,input_ids,labels
0,"[50298, 40643, 11307, 10191, 13012, 10868, 104...","[50298, 40643, 11307, 10191, 13012, 10868, 104..."
1,"[80062, 36694, 10648, 543, 97744, 54453, 12265...","[80062, 36694, 10648, 543, 97744, 54453, 12265..."
2,"[10123, 10191, 117, 18302, 10241, 10517, 102, ...","[10123, 10191, 117, 18302, 10241, 10517, 102, ..."
3,"[10191, 85710, 10656, 67482, 108276, 117, 122,...","[10191, 85710, 10656, 67482, 108276, 117, 122,..."
4,"[10332, 94383, 14708, 10241, 14208, 557, 13157...","[10332, 94383, 14708, 10241, 14208, 557, 13157..."
5,"[101, 551, 17961, 91680, 10179, 14816, 55399, ...","[101, 551, 17961, 91680, 10179, 14816, 55399, ..."
6,"[30977, 10241, 31399, 88535, 76316, 12868, 103...","[30977, 10241, 31399, 88535, 76316, 12868, 103..."
7,"[50154, 117, 14048, 10823, 102, 101, 10122, 80...","[50154, 117, 14048, 10823, 102, 101, 10122, 80..."
8,"[10757, 10241, 10517, 102, 101, 11279, 40703, ...","[10757, 10241, 10517, 102, 101, 11279, 40703, ..."
9,"[10196, 77202, 31066, 553, 69605, 12202, 10191...","[10196, 77202, 31066, 553, 69605, 12202, 10191..."


In [16]:
# Sanity check: how a short Russian phrase is split into word pieces and whether "[MASK]" stays one token.
tokenizer.tokenize("как делать [MASK]")

['как', 'дела', '##ть', '[MASK]']

In [17]:
# Same phrase as token ids; encode() also adds the special tokens at both ends.
tokenizer.encode("как делать [MASK]")

[101, 10949, 25195, 11258, 103, 102]

In [27]:
# Decode the first training block back to text to see what a packed example looks like.
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

'##азированныи без сахар, 50г [SEP] [CLS] хлебцы take a bieт ккууризhо - рсиоы'

In [10]:
# Model
print('Loading model...')
# Load the checkpoint with a masked-LM head. The load warning in the cell output lists the
# `cls.predictions.*` head weights as newly initialized.
model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)
# Disabled: embedding resize, only relevant if tokens were added to the tokenizer.
# # if tokenizer added tokens, resize
# try:
#     model.resize_token_embeddings(len(tokenizer))
# except Exception:
#     pass

# Data collator: masking probability 0.2 here overrides the factory default of 0.15.
data_collator = build_data_collator(tokenizer, use_wwm=use_wwm, mlm_probability=0.2)


Loading model...


Some weights of BertForMaskedLM were not initialized from the model checkpoint at numind/NuNER-multilingual-v0.1 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# collated_batch = data_collator(tokenized_ds["train"].select(range(10)))
# print(collated_batch['input_ids'])

In [34]:
# Build the batch in this cell so it runs on a fresh kernel: `collated_batch` was otherwise
# only assigned in a commented-out cell, which makes this line fail with a NameError.
# The collator masks at random, so the decoded text differs from run to run.
collated_batch = data_collator([tokenized_ds["train"][i] for i in range(10)])
tokenizer.decode(collated_batch['input_ids'][2])

'omtalt [MASK], 270мл [SEP] [CLS] печенье lotte choco pie [MASK] [MASK] [MASK] [MASK] глазированное, 336г [SEP] [CLS] перчат'

In [12]:

# Training args
# Evaluation and logging happen every 200 optimizer steps, checkpoints are written once per epoch.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=1,
    eval_strategy="steps",    # evaluate by steps
    eval_steps=200,                 # every 200 steps

    logging_strategy="steps",       # log by steps
    logging_steps=200,

    save_strategy="epoch",          # save by epoch
    save_total_limit=num_epochs,    # keep last N checkpoints (can also set smaller)
    learning_rate=3e-5,
    weight_decay=0.01,
    seed=seed,
    fp16=False,
    report_to='none',
)

# Trainer
# The same collator is handed to the Trainer for both datasets.
# NOTE(review): since the collator masks at random, eval_loss presumably varies between
# evaluations of identical weights — keep that in mind when comparing logged values.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
)

# Train
print('Starting training...')
trainer.train()

# Eval
print('Running final evaluation...')
metrics = trainer.evaluate()
loss = metrics.get('eval_loss')
if loss is not None:
    # Perplexity is exp(eval_loss); an overflow is reported as infinity instead of failing.
    try:
        ppl = math.exp(loss)
    except OverflowError:
        ppl = float('inf')
    metrics['perplexity'] = ppl
print('Eval metrics:', metrics)

# Save
# Model and tokenizer are written side by side into output_dir.
print('Saving tokenizer and model...')
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print('Done. Model saved to', output_dir)


Starting training...




Step,Training Loss,Validation Loss
200,5.6894,4.722475
400,4.5865,4.259158
600,4.2805,4.02582
800,4.1225,3.917448
1000,4.0076,3.8221
1200,3.927,3.779876




Running final evaluation...




Eval metrics: {'eval_loss': 3.9210193157196045, 'eval_runtime': 6.1764, 'eval_samples_per_second': 108.478, 'eval_steps_per_second': 6.8, 'epoch': 5.0, 'perplexity': 50.45184493574677}
Saving tokenizer and model...
Done. Model saved to ./results_ft


In [None]:
# Check that the Trainer holds the very same model object as the notebook-level `model`.
model is trainer.model

In [28]:
# Show the 'original' text of one randomly drawn row.
# NOTE(review): sample() is called without random_state, so a re-run will likely show a different row.
df_mlm.sample().original.iloc[0]

'игристое вино мысхако русток феркаль кубань белое полусухое, 750мл'

In [29]:
# Fill-mask smoke test: mask one word of a product name and print the five most likely fillers.
# Take the mask string from the tokenizer so it always matches tokenizer.mask_token_id below.
maska = tokenizer.mask_token
text = f'игристое вино мысхако русток феркаль {maska} белое полусухое, 750мл'

# Put the inputs on whatever device the model lives on instead of assuming CUDA is available.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
model.eval()  # inference mode: dropout off
with torch.no_grad():  # forward pass only, no autograd graph needed
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits (row 0 = the first masked position)
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> игристое вино мысхако русток феркаль село белое полусухое, 750мл'
'>>> игристое вино мысхако русток феркаль ##ное белое полусухое, 750мл'
'>>> игристое вино мысхако русток феркаль , белое полусухое, 750мл'
'>>> игристое вино мысхако русток феркаль select белое полусухое, 750мл'
'>>> игристое вино мысхако русток феркаль ##е белое полусухое, 750мл'


In [None]:
# Save the raw state dict to the Kaggle working directory.
# NOTE(review): the file name says "rubert_tiny2", but `model` in this notebook is loaded from
# numind/NuNER-multilingual-v0.1 — the name looks like a leftover; confirm before relying on it.
torch.save(model.state_dict(), "/kaggle/working/rubert_tiny2_MLM_290925")

In [None]:
# NOTE(review): this serializes the tokenizer object itself through torch.save rather than
# writing its files. The tokenizer is also stored with save_pretrained() in other cells;
# consider relying on those copies instead. The "rubert_tiny2" name has the same mismatch
# with the loaded checkpoint as the state-dict file.
torch.save(tokenizer, "/kaggle/working/rubert_tiny2_MLM_290925_tokenizer")

In [None]:
# Write model and tokenizer together into a local "my_mlm_checkpoint" directory.
# NOTE(review): the training cell already saves both into `output_dir`; confirm this second copy is needed.
model.save_pretrained("my_mlm_checkpoint")
tokenizer.save_pretrained("my_mlm_checkpoint")

In [30]:
from huggingface_hub import login

# Never keep an access token in the notebook source. The token is read from the HF_TOKEN
# environment variable (set it via the platform's secret storage); when it is not set,
# token=None makes login() fall back to its interactive prompt.
# SECURITY: the token that used to be hard-coded in this cell has been exposed and must be
# revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [31]:
# Push the training output to the Hugging Face Hub. The commit message is Russian for
# "MLM on a sample of product names, nuner". Requires the login from the previous cell.
trainer.push_to_hub(commit_message="MLM по выборке названий продуктов nuner")  # Explicitly push if not done automatically

Uploading...:   0%|          | 0.00/712M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Dersty/results_ft/commit/51f2aa01066f9feb30e163ee5c1f981a18197c8c', commit_message='MLM по выборке названий продуктов nuner', commit_description='', oid='51f2aa01066f9feb30e163ee5c1f981a18197c8c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Dersty/results_ft', endpoint='https://huggingface.co', repo_type='model', repo_id='Dersty/results_ft'), pr_revision=None, pr_num=None)