In [None]:
!pip install datasets==3.6.0

In [None]:
import logging
import torch
import warnings
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Notebook-wide logging: INFO for our own messages, ERROR-only for chatty libraries.
logging.basicConfig(level=logging.INFO)
for noisy_logger in ("pyngrok", "transformers", "torch"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Suppress Python warnings for the whole session (a single filter is sufficient).
warnings.filterwarnings('ignore')

# One seed for every RNG in use, so the random masking done during
# preprocessing is reproducible after "Restart & Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f'Using device: {device}')
print(f"PyTorch version: {torch.__version__}")
# The device name is only reported when a CUDA device was selected.
if device.type == "cuda":
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
from datasets import load_dataset, Dataset, Audio

# Fetch the thesaurus dataset from the Hugging Face Hub; later cells read
# its 'train' and 'validation' splits.
ds_cy = load_dataset(path="Elormiden/Thesaurus-Cypriot-Greek-Dialect")

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# The tokenizer and the masked-LM model are loaded from the same checkpoint id,
# so the id is written once and shared by both calls.
checkpoint_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForMaskedLM.from_pretrained(checkpoint_name)

In [None]:
"""
Splitting the loaded dataset into its train and validation splits
"""
# 'train' is used for fine-tuning, 'validation' for evaluation during training.
train_cy, val_cy = ds_cy['train'], ds_cy['validation']

In [None]:
def tokenize_for_mlm(batch, max_length=512, mlm_probability=0.15):
    """Build padded, already-masked MLM tensors from a dataset split.

    Every row is rendered as
    ``"{word} - {description} [SEP] {greek_word} - {greek_description}"``,
    encoded with the module-level ``tokenizer``, padded to the longest encoded
    row in ``batch`` and then masked once through ``mask_tokens``. The masking
    is therefore fixed at preprocessing time rather than re-drawn per epoch.

    Args:
        batch: Column-indexable data exposing the ``'word'``, ``'description'``,
            ``'greek_word'`` and ``'greek_description'`` sequences. Rows are
            paired with ``zip``, so iteration stops at the shortest column.
        max_length: Maximum number of token ids kept per row (longer rows are
            truncated). Defaults to 512.
        mlm_probability: Per-token selection probability forwarded to
            ``mask_tokens``. Defaults to 0.15.

    Returns:
        dict with three tensors of identical shape ``(rows, longest_row)``:
        ``"input_ids"`` (ids after masking), ``"attention_mask"`` (1 for real
        tokens, 0 for padding) and ``"labels"`` (as produced by ``mask_tokens``).
    """
    columns = ('word', 'description', 'greek_word', 'greek_description')
    rows = zip(*(batch[name] for name in columns))

    tokenized_texts = []
    for word, description, greek_word, greek_desc in tqdm(
            rows, total=len(batch['word']), desc="Tokenizing for MLM"):
        full_text = f"{word} - {description} [SEP] {greek_word} - {greek_desc}"
        tokenized_texts.append(
            tokenizer.encode(
                full_text,
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,
            )
        )

    input_ids_tensors = [torch.tensor(ids, dtype=torch.long) for ids in tokenized_texts]
    input_ids_padded = pad_sequence(
        input_ids_tensors, batch_first=True, padding_value=tokenizer.pad_token_id
    )

    # Derive the attention mask from the unmasked ids: padding positions are
    # fully determined before any [MASK] substitution takes place.
    attention_mask = (input_ids_padded != tokenizer.pad_token_id).long()

    input_ids_masked, labels = mask_tokens(
        input_ids_padded, tokenizer, mlm_probability=mlm_probability
    )

    return {
        "input_ids": input_ids_masked,  # token ids with [MASK] substitutions applied
        "attention_mask": attention_mask,
        "labels": labels
    }

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Select tokens for masked-language-model training and build the labels.

    Every position is selected independently with probability
    ``mlm_probability``; positions holding the pad, CLS or SEP id are never
    selected. About 80% of the selected positions are overwritten with the
    mask token id; the remaining selected positions keep their original id
    but are still predicted.

    Args:
        inputs: Integer tensor of token ids. It is left unmodified.
        tokenizer: Object exposing ``pad_token_id``, ``cls_token_id``,
            ``sep_token_id`` and ``mask_token_id``.
        mlm_probability: Probability of selecting each non-special token.

    Returns:
        Tuple ``(masked_inputs, labels)`` of tensors shaped like ``inputs``.
        ``labels`` holds the original id at selected positions and -100
        everywhere else.
    """
    ignore_index = -100  # default ignore_index of PyTorch's cross-entropy loss

    # Work on copies so the caller's tensor is never overwritten.
    masked_inputs = inputs.clone()
    labels = inputs.clone()

    # Sample on the inputs' device so the boolean masks can be combined with
    # them; float() keeps torch.full from producing an integer tensor.
    selection_probs = torch.full(
        labels.shape, float(mlm_probability), device=labels.device
    )
    masked_indices = torch.bernoulli(selection_probs).bool()

    special_tokens_mask = torch.zeros_like(labels, dtype=torch.bool)
    for token_id in (tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id):
        if token_id is not None:  # tokenizer may not define every special token
            special_tokens_mask |= labels == token_id

    masked_indices &= ~special_tokens_mask

    # Only selected positions contribute to the loss.
    labels[~masked_indices] = ignore_index

    replace_probs = torch.full(labels.shape, 0.8, device=labels.device)
    indices_replaced = torch.bernoulli(replace_probs).bool() & masked_indices
    masked_inputs[indices_replaced] = tokenizer.mask_token_id

    return masked_inputs, labels

In [None]:
"""
Tokenizing and masking both splits with tokenize_for_mlm
"""
# The training split is processed first, then validation. Masking draws from
# torch's global RNG, so this order determines which tokens get masked.
train_cyprus_tokenized, val_cyprus_tokenized = (
    tokenize_for_mlm(train_cy),
    tokenize_for_mlm(val_cy),
)

In [None]:
"""
Converting the tokenized dicts to Hugging Face Dataset objects
"""
# Wrap each dict of tensors in a `Dataset` so it can be handed to the Trainer.
train_hf, val_hf = (
    Dataset.from_dict(train_cyprus_tokenized),
    Dataset.from_dict(val_cyprus_tokenized),
)

In [None]:
from transformers import TrainingArguments, Trainer

# Only request bf16 mixed precision when a CUDA device that supports it is
# present, so this cell also runs on CPU-only machines and on GPUs without
# bfloat16 (training then falls back to full precision).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./cypriot-corrector-bert-mlm-lr5e-5-batch16",
    num_train_epochs=8,

    # Batch sizing: effective train batch = 16 per device x 1 accumulation step.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,

    # Optimisation.
    learning_rate=5e-5,
    # NOTE(review): 1000 warmup steps may exceed the total number of optimizer
    # steps on a small dataset -- confirm against len(train_hf).
    warmup_steps=1000,
    weight_decay=0.01,

    # Throughput settings (originally tuned for an A100).
    gradient_checkpointing=False,
    bf16=use_bf16,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,

    # Evaluate/log every 50 steps and checkpoint every 200 steps; save_steps is
    # a multiple of eval_steps, so every checkpoint has a matching evaluation.
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=200,
    logging_steps=50,

    # Reload the checkpoint with the lowest eval_loss once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    report_to='wandb',
)

# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` -- switch once the installed version is confirmed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=val_hf,
    tokenizer=tokenizer,
)

In [None]:
# Launch fine-tuning with the configured Trainer: evaluation runs every 50 steps,
# checkpoints are written every 200 steps, and the checkpoint with the lowest
# eval_loss is reloaded when training finishes.
trainer.train()