In [None]:
# Uncomment on a fresh environment (e.g. Colab) to install the pinned dependency:
# %pip install -q datasets==3.6.0

In [None]:
import logging
import torch
import warnings
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Single seed shared by every RNG this notebook touches.
SEED = 42

# Notebook-wide logging: INFO for our own messages, ERROR-only for the
# chattier third-party libraries.
logging.basicConfig(level=logging.INFO)
for noisy_logger in ("pyngrok", "transformers", "torch"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Blanket suppression keeps cell output readable; registering the filter once
# is enough (a second identical call only re-inserts the same filter).
warnings.filterwarnings('ignore')

# Seed both RNG sources imported by this notebook. The assignment to `_`
# keeps the returned Generator object from being echoed as cell output.
_ = torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Start from the CPU and upgrade to the GPU only when CUDA is usable.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

print(f'Using device: {device}')
print(f"PyTorch version: {torch.__version__}")

# The GPU model name is only meaningful when we actually ended up on CUDA.
if device.type == "cuda":
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
from datasets import load_dataset, Dataset, Audio

# Load the Cypriot-Greek thesaurus dataset by its repository id; later cells
# read its 'train' and 'validation' splits.
ds_cy = load_dataset(path="Elormiden/Thesaurus-Cypriot-Greek-Dialect")

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Single source of truth for the checkpoint id, so the tokenizer and the
# model are always loaded from the same checkpoint and cannot drift apart.
MODEL_NAME = "google-bert/bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

In [None]:
# Pull the two splits used below out of the loaded dataset:
# 'train' for fitting and 'validation' for periodic evaluation.
train_cy, val_cy = ds_cy['train'], ds_cy['validation']

In [None]:
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import torch

def tokenize_multilingual_text(text):
    """Convert a piece of text into a list of token ids.

    Example (ids are illustrative): "hello world" -> [101, 1234, 5678, 102]

    Args:
        text: The string to encode with the module-level `tokenizer`.

    Returns:
        The `input_ids` produced by the tokenizer, truncated to at most
        256 tokens.
    """
    encoding = tokenizer(text, max_length=256, truncation=True)
    return encoding['input_ids']

def _pad_to_length(sequences, length, padding_value):
    """Right-pad every id list in `sequences` to `length` and stack them into one long tensor."""
    padded = torch.full((len(sequences), length), padding_value, dtype=torch.long)
    for row, ids in enumerate(sequences):
        padded[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    return padded


def tokenize_text_pairs(batch):
    """Turn a batch of dictionary entries into padded model inputs and labels.

    Steps:
    1. Build the prompts:
         input_text  = f"correct cypriot to greek: {word} - {description}"
         target_text = f"{greek_word} - {greek_desc}"
    2. Tokenize every input/target pair.
    3. Pad inputs and targets to ONE shared sequence length. A masked-LM
       head scores every input position against the label at the same
       position, so `input_ids` and `labels` must have identical shapes;
       padding them independently yields mismatched widths and a loss
       shape error at training time.
    4. Use -100 for label padding so those positions are ignored by the loss.

    Args:
        batch: Mapping with the columns 'word', 'description', 'greek_word'
            and 'greek_description', each an equally long sequence of strings.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a long
        tensor of shape (number of rows, shared padded length).
    """
    tokenized_input_texts = []
    tokenized_target_texts = []

    rows = zip(batch['word'], batch['description'], batch['greek_word'], batch['greek_description'])
    for word, description, greek_word, greek_desc in tqdm(
            rows, total=len(batch['word']), desc="Tokenizing batch"):

        input_text = f"correct cypriot to greek: {word} - {description}"
        target_text = f"{greek_word} - {greek_desc}"

        tokenized_input_texts.append(tokenize_multilingual_text(input_text))
        tokenized_target_texts.append(tokenize_multilingual_text(target_text))

    # Shared width = longest sequence on either side (0 for an empty batch).
    # NOTE(review): when a target is longer than its input, the extra label
    # positions line up with padded (masked-out) input positions — confirm
    # this position-wise alignment is what the training objective intends.
    shared_length = max(
        (len(ids) for ids in tokenized_input_texts + tokenized_target_texts),
        default=0,
    )

    input_ids_padded = _pad_to_length(tokenized_input_texts, shared_length, tokenizer.pad_token_id)
    labels_padded = _pad_to_length(tokenized_target_texts, shared_length, -100)

    attention_mask = (input_ids_padded != tokenizer.pad_token_id).long()
    # Safety net kept from the original: any pad id that still appears in the
    # labels is also excluded from the loss.
    labels_padded[labels_padded == tokenizer.pad_token_id] = -100

    return {
        "input_ids": input_ids_padded,
        "attention_mask": attention_mask,
        "labels": labels_padded
    }

In [None]:
# Run both splits through the prompt-building / tokenizing / padding helper,
# train first, then validation.
train_cyprus_tokenized = tokenize_text_pairs(batch=train_cy)
val_cyprus_tokenized = tokenize_text_pairs(batch=val_cy)

In [None]:
# Wrap the tokenized dicts in Hugging Face `Dataset` objects, the format the
# Trainer is given below.
train_hf = Dataset.from_dict(mapping=train_cyprus_tokenized)
val_hf = Dataset.from_dict(mapping=val_cyprus_tokenized)

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Training configuration. Evaluation runs every 50 steps and checkpoints are
# written every 200 steps; with `load_best_model_at_end=True` the checkpoint
# with the lowest `eval_loss` is restored once training finishes.
training_args = TrainingArguments(
    output_dir="./cypriot-corrector-bert",
    num_train_epochs=8,
    # Batch sizing, about 10 GB of GPU memory (author's estimate).
    # Effective train batch per device = 24 * 2 accumulation steps = 48.
    per_device_train_batch_size=24,
    per_device_eval_batch_size=20,
    gradient_accumulation_steps=2,
    # NOTE(review): 5e-4 is well above the 2e-5..5e-5 range usually used to
    # fine-tune BERT checkpoints — confirm this is intentional.
    learning_rate=5e-4,
    warmup_steps=500,
    # Memory savers: recompute activations during the backward pass, and use
    # half precision only when a CUDA device exists. The notebook falls back
    # to CPU elsewhere, and TrainingArguments rejects fp16 without a GPU.
    gradient_checkpointing=True,
    fp16=torch.cuda.is_available(),
    # save_steps stays a multiple of eval_steps, as load_best_model_at_end
    # requires when both strategies are "steps".
    save_steps=200,
    eval_steps=50,
    weight_decay=0.01,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to='wandb',
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=val_hf,
    # compute_metrics=compute_metrics,  # TODO: enable once a metric function is defined
    # NOTE(review): newer transformers releases deprecate `tokenizer=` in
    # favour of `processing_class=` — switch when the minimum version allows.
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning with the Trainer built in the previous cell. The call is
# left as the cell's last expression so its return value is displayed.
trainer.train()