In [1]:
import pandas as pd
import ast
from tqdm import tqdm
from collections import Counter
import datasets
from datasets import (Dataset, Features, Sequence, Value, ClassLabel, load_dataset,
                    load_from_disk, concatenate_datasets, DatasetDict)
from sklearn.model_selection import KFold
from transformers import (AutoTokenizer, AutoModel, AutoModelForTokenClassification,
                         pipeline, PreTrainedTokenizerFast, TrainingArguments, Trainer,AutoModelForMaskedLM,
                         DataCollatorForTokenClassification, EarlyStoppingCallback,
                        DataCollatorForLanguageModeling, DataCollatorForWholeWordMask)
import torch
import optuna
import os
os.environ['WANDB_DISABLED'] = 'true'
import pickle
import numpy as np

from typing import List, Optional
import random
seed=42
random.seed(seed)

import math


2025-10-01 13:23:44.913616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759325025.251002      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759325025.350887      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Загрузка модели и датасета

In [2]:
df_mlm = pd.read_csv('/kaggle/input/mlm-v1/augmented_dataset_mlm.csv')
df_mlm.shape

(188060, 3)

In [3]:
df = df_mlm.iloc[::]
df.shape

(188060, 3)

In [None]:
model_name_or_path = "DeepPavlov/distilrubert-base-cased-conversational"
model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

## Подготовка

In [None]:
def mix_and_subsample(originals: List[str], augmented: List[str], mix_ratio: float = 0.75, seed: int = 42) -> List[str]:
    random.seed(seed)
    n_total = len(originals) + len(augmented)
    n_from_orig = int(n_total * mix_ratio)
    n_from_aug = n_total - n_from_orig

    chosen_orig = random.choices(originals, k=max(1, n_from_orig)) if originals else []
    chosen_aug = random.choices(augmented, k=max(1, n_from_aug)) if augmented else []
    combined = chosen_orig + chosen_aug
    random.shuffle(combined)
    return combined

In [None]:
def tokenize_and_group_texts(lines: List[str], tokenizer: PreTrainedTokenizerFast, block_size: int = 32, use_wwm: bool = False, seed=42):

    ds = Dataset.from_dict({'text': lines})

    def tokenize_func(examples):
        if use_wwm:
            words = [t.split() for t in examples['text']]
            return tokenizer(words, is_split_into_words=True, add_special_tokens=True)
        else:
            return tokenizer(examples['text'], add_special_tokens=True)

    tokenized = ds.map(tokenize_func, batched=True, remove_columns=['text'])

    def group_texts(examples):
        concatenated = sum(examples['input_ids'], [])
        total_length = len(concatenated)
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        else:
            total_length = 0
        result = {}
        if total_length == 0:
            result['input_ids'] = []
            result['labels'] = []
            return result
        result['input_ids'] = [concatenated[i:i+block_size] for i in range(0, total_length, block_size)]
        result['labels'] = [list(ids) for ids in result['input_ids']]
        return result

    lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)
    lm_dataset = lm_dataset.filter(lambda ex: len(ex['input_ids']) > 0)

    split = lm_dataset.train_test_split(test_size=0.01, seed=seed)
    return DatasetDict({'train': split['train'], 'validation': split['test']})

In [None]:
def build_data_collator(tokenizer: PreTrainedTokenizerFast, use_wwm: bool = False, mlm_probability: float = 0.15):
    if use_wwm:
        return DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability)
    else:
        return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability)

In [11]:
use_wwm = True
output_dir = "./results_ft"
num_epochs=4

In [None]:
os.makedirs(output_dir, exist_ok=True)
random.seed(seed)

originals = df['original'].tolist()
augmented = df['variant'].tolist()


combined = mix_and_subsample(originals, augmented, mix_ratio=0.75, seed=seed)
print(f'Combined corpus size: {len(combined)}')

print(f'Loading tokenizer from pretrained model: "{model_name_or_path}"')
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True)

tokenized_ds = tokenize_and_group_texts(combined, tokenizer, use_wwm=use_wwm)
print('Train examples (blocks):', len(tokenized_ds['train']))
print('Validation examples (blocks):', len(tokenized_ds['validation']))


Combined corpus size: 376120
Loading tokenizer from pretrained model: "DeepPavlov/distilrubert-base-cased-conversational"


Map:   0%|          | 0/376120 [00:00<?, ? examples/s]

Map:   0%|          | 0/376120 [00:00<?, ? examples/s]

Filter:   0%|          | 0/206706 [00:00<?, ? examples/s]

Train examples (blocks): 204638
Validation examples (blocks): 2068


In [12]:
tokenized_ds["train"].select(range(10)).to_pandas()

Unnamed: 0,input_ids,labels
0,"[102, 101, 6359, 130, 30396, 949, 26051, 34206...","[102, 101, 6359, 130, 30396, 949, 26051, 34206..."
1,"[336, 58124, 1455, 949, 15752, 22963, 128, 530...","[336, 58124, 1455, 949, 15752, 22963, 128, 530..."
2,"[101, 10206, 28826, 14422, 96300, 1405, 1638, ...","[101, 10206, 28826, 14422, 96300, 1405, 1638, ..."
3,"[4242, 828, 32948, 21595, 17230, 36324, 102, 1...","[4242, 828, 32948, 21595, 17230, 36324, 102, 1..."
4,"[102, 101, 93065, 244, 14932, 12039, 241, 2376...","[102, 101, 93065, 244, 14932, 12039, 241, 2376..."
5,"[1454, 106, 963, 128, 8210, 10906, 102, 101, 3...","[1454, 106, 963, 128, 8210, 10906, 102, 101, 3..."
6,"[58398, 294, 17230, 36324, 102, 101, 60699, 69...","[58398, 294, 17230, 36324, 102, 101, 60699, 69..."
7,"[367, 255, 56986, 333, 39934, 3459, 297, 850, ...","[367, 255, 56986, 333, 39934, 3459, 297, 850, ..."
8,"[11854, 22382, 261, 7386, 355, 323, 14362, 323...","[11854, 22382, 261, 7386, 355, 323, 14362, 323..."
9,"[949, 21595, 20669, 7751, 76870, 141, 102, 101...","[949, 21595, 20669, 7751, 76870, 141, 102, 101..."


In [14]:
tokenizer.tokenize("как делать [MASK]")

['как', 'делать', '[MASK]']

In [13]:
tokenizer.encode("как делать [MASK]")

[101, 879, 1634, 103, 102]

In [27]:
tokenizer.decode(tokenized_ds["train"].select(range(10)).to_pandas()['input_ids'][0])

'##азированныи без сахар, 50г [SEP] [CLS] хлебцы take a bieт ккууризhо - рсиоы'

In [None]:
# Model
print('Loading model...')
model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)
data_collator = build_data_collator(tokenizer, use_wwm=use_wwm, mlm_probability=0.2)


Loading model...


## Обучение

In [None]:

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=1,
    eval_strategy="steps",
    eval_steps=200,   

    logging_strategy="steps",  
    logging_steps=200,

    save_strategy="epoch",   
    save_total_limit=num_epochs,   
    learning_rate=2e-4,
    weight_decay=0.01,
    seed=seed,
    fp16=False,
    report_to='none',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
)

# Train
print('Starting training...')
trainer.train()

# Eval
print('Running final evaluation...')
metrics = trainer.evaluate()
loss = metrics.get('eval_loss')
if loss is not None:
    try:
        ppl = math.exp(loss)
    except OverflowError:
        ppl = float('inf')
    metrics['perplexity'] = ppl
print('Eval metrics:', metrics)

# Save
print('Saving tokenizer and model...')
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print('Done. Model saved to', output_dir)


Starting training...




Step,Training Loss,Validation Loss
200,4.2358,3.551408
400,3.4076,3.140821
600,3.0609,2.88312
800,2.8596,2.725362
1000,2.6537,2.616949
1200,2.5488,2.398414
1400,2.4349,2.391071
1600,2.3377,2.25644
1800,2.2242,2.191037
2000,2.1617,2.148824


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Running final evaluation...




Eval metrics: {'eval_loss': 1.9294145107269287, 'eval_runtime': 15.218, 'eval_samples_per_second': 135.891, 'eval_steps_per_second': 8.542, 'epoch': 4.0, 'perplexity': 6.885477687861801}
Saving tokenizer and model...
Done. Model saved to ./results_ft


In [14]:
model == trainer.model

True

In [15]:
df_mlm.sample().original.iloc[0]

'яблоки джерамин 4шт.'

In [18]:
maska = "[MASK]"
text = f'{maska} джерамин 4шт.'

inputs = tokenizer(text, return_tensors="pt").to("cuda")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> яблоки джерамин 4шт.'
'>>> спички джерамин 4шт.'
'>>> груши джерамин 4шт.'
'>>> губки джерамин 4шт.'
'>>> подгузники джерамин 4шт.'


In [None]:
from huggingface_hub import login
login(token="TOKEN")

In [None]:
trainer.push_to_hub("Dersty/distilbert_rubert_X5_ner_MLM")

Uploading...:   0%|          | 0.00/542M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Dersty/results_ft/commit/12c0960e78472fa3bc6d51676702b6cc0f8b2baf', commit_message='Dersty/distilbert_rubert_X5_ner_MLM', commit_description='', oid='12c0960e78472fa3bc6d51676702b6cc0f8b2baf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Dersty/results_ft', endpoint='https://huggingface.co', repo_type='model', repo_id='Dersty/results_ft'), pr_revision=None, pr_num=None)