In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer and masked-LM weights of the "Elormiden/bert-base-cypriot-greek" checkpoint.
# Later cells pass these as `elormiden_tokenizer` / `elormiden_model`.
tokenizer = AutoTokenizer.from_pretrained("Elormiden/bert-base-cypriot-greek")
model = AutoModelForMaskedLM.from_pretrained("Elormiden/bert-base-cypriot-greek")

In [None]:
# Tokenizer and masked-LM weights of the "petros/bert-base-cypriot-uncased-v1" checkpoint,
# the baseline the Elormiden model is compared against. Later cells pass these as
# `petros_tokenizer` / `petros_model`.
his_tokenizer = AutoTokenizer.from_pretrained("petros/bert-base-cypriot-uncased-v1")
his_model = AutoModelForMaskedLM.from_pretrained("petros/bert-base-cypriot-uncased-v1")

In [None]:
# !pip install jiwer

In [None]:
from datasets import load_dataset
import torch
import jiwer
from tqdm import tqdm
import numpy as np
from jiwer import wer, cer

In [None]:
# Load every split of the thesaurus dataset into a module-level variable.
# NOTE(review): the evaluation/analysis functions below call load_dataset themselves,
# so no code visible in this notebook reads this `ds` — confirm whether it is still needed.
ds = load_dataset("Elormiden/Thesaurus-Cypriot-Greek-Dialect")

In [None]:
class JiwerMetricsPipeline:
    """Scores a single hypothesis string against a single reference string.

    Both strings go through the same jiwer transformation chain
    (ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip) before
    the word error rate and character error rate are computed.
    """

    def __init__(self, hypothesis: str, truth: str):
        # Raw, un-normalized strings; normalization happens in compute_metrics.
        self.hypothesis = hypothesis
        self.truth = truth

    def compute_metrics(self) -> tuple[float, float]:
        """Return ``(wer, cer)`` of the normalized hypothesis w.r.t. the normalized truth."""
        normalize = jiwer.Compose([
            jiwer.ToLowerCase(),
            jiwer.RemovePunctuation(),
            jiwer.RemoveMultipleSpaces(),
            jiwer.Strip()
        ])

        # Reference first, hypothesis second — the argument order wer()/cer() are called with.
        reference, candidate = [normalize(raw) for raw in (self.truth, self.hypothesis)]

        return wer(reference, candidate), cer(reference, candidate)

In [None]:
def get_mlm_top1_prediction(model, tokenizer, text):
    """
    Predict the most likely token for the first mask token found in ``text``.

    The text is tokenized, run through ``model`` without gradient tracking, and the
    highest-scoring vocabulary id at the first mask position is decoded.

    Returns:
        The decoded token with surrounding whitespace stripped, or ``""`` when the
        tokenized text contains no mask token.
    """
    encoded = tokenizer(text, return_tensors="pt")

    # torch.where yields (row, column) index tensors; [1] keeps the sequence positions.
    mask_positions = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # Nothing to predict: return an empty string instead of raising.
        return ""
    first_mask = mask_positions[0]

    with torch.no_grad():
        output = model(**encoded)

    # Scores over the vocabulary at the first mask position of the only batch item.
    best = torch.topk(output.logits[0, first_mask, :], 1, dim=-1)
    best_token_id = best.indices.squeeze().item()

    return tokenizer.decode([best_token_id]).strip()

In [None]:
# --- Aggregate lists for the results ---
# One WER list and one CER list per (model, target word) pair.
# NOTE(review): nothing in the visible notebook code appends to or reads these
# module-level lists; evaluate_mlm_with_thesaurus keeps its own local accumulators.
elormiden_cypriot_wer_scores = []
elormiden_cypriot_cer_scores = []
elormiden_greek_wer_scores = []
elormiden_greek_cer_scores = []

petros_cypriot_wer_scores = []
petros_cypriot_cer_scores = []
petros_greek_wer_scores = []
petros_greek_cer_scores = []

In [None]:
def evaluate_mlm_with_thesaurus(
    elormiden_model, elormiden_tokenizer,
    petros_model, petros_tokenizer,
    dataset_name="Elormiden/Thesaurus-Cypriot-Greek-Dialect",
    split="test"
):
    """
    Evaluate two masked-language models (Elormiden and Petros) against the
    Cypriot Greek dialect thesaurus, scoring top-1 mask predictions with WER and CER.

    Each model fills the mask in two fixed prompts ("The word [MASK] is from the
    Cypriot dialect." / "The word [MASK] is used in Greek."). The prediction for
    the first prompt is scored against every entry's ``word`` and the prediction
    for the second against every entry's ``greek_word``.

    The prompts do not depend on the dataset entry, so every prediction is
    computed once up front instead of once per entry.

    Args:
        elormiden_model: Masked-LM model for Elormiden.
        elormiden_tokenizer: Tokenizer matching ``elormiden_model``.
        petros_model: Masked-LM model for Petros.
        petros_tokenizer: Tokenizer matching ``petros_model``.
        dataset_name (str): Name of the dataset to load.
        split (str): Dataset split to iterate over (e.g. 'train', 'test').

    Returns:
        dict: Mean WER and CER per model and target, under the keys
              ``{Elormiden,Petros}_{Cypriot,StandardGreek}_{WER,CER}``.
              Values are NaN when the split contains no usable entry.
    """
    print(f"Загрузка данных из '{dataset_name}' (сплит: '{split}')...")
    dataset = load_dataset(dataset_name)
    print("Данные успешно загружены.")

    # (result-key label, prompt template, dataset column holding the reference word)
    targets = (
        ("Cypriot", "Η λέξη {} είναι από την κυπριακή διάλεκτο.", 'word'),  # "The word {} is from the Cypriot dialect."
        ("StandardGreek", "Η λέξη {} χρησιμοποιείται στα ελληνικά.", 'greek_word'),  # "The word {} is used in Greek."
    )
    models = (
        ("Elormiden", elormiden_model, elormiden_tokenizer),
        ("Petros", petros_model, petros_tokenizer),
    )

    print("\nНачало оценки моделей...")

    # The masked sentence is identical for every entry, so one forward pass per
    # (model, prompt) pair is enough.
    # NOTE(review): assumes the models run deterministically (eval mode) — confirm.
    hypotheses = {}
    for model_label, mlm_model, mlm_tokenizer in models:
        for target_label, template, _column in targets:
            masked_sentence = template.format(mlm_tokenizer.mask_token)
            hypotheses[(model_label, target_label)] = get_mlm_top1_prediction(
                mlm_model, mlm_tokenizer, masked_sentence
            )

    scores = {key: {"WER": [], "CER": []} for key in hypotheses}

    # tqdm shows progress over all records of the chosen split.
    for entry in tqdm(dataset[split], desc="Оценка записей"):
        truths = {target_label: entry[column] for target_label, _template, column in targets}

        # Skip entries where either reference word is missing or blank.
        if not all(truth and truth.strip() for truth in truths.values()):
            continue

        for (model_label, target_label), hypothesis in hypotheses.items():
            wer_score, cer_score = JiwerMetricsPipeline(hypothesis, truths[target_label]).compute_metrics()
            scores[(model_label, target_label)]["WER"].append(wer_score)
            scores[(model_label, target_label)]["CER"].append(cer_score)

    results = {}
    for (model_label, target_label), metric_lists in scores.items():
        for metric_name in ("WER", "CER"):
            values = metric_lists[metric_name]
            # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
            results[f"{model_label}_{target_label}_{metric_name}"] = (
                np.mean(values) if values else np.float64(np.nan)
            )

    return results

In [None]:
# Run the WER/CER evaluation: `model`/`tokenizer` are the Elormiden checkpoint,
# `his_model`/`his_tokenizer` the Petros checkpoint. dataset_name and split keep
# the function defaults.
evaluation_results = evaluate_mlm_with_thesaurus(
        elormiden_model=model,
        elormiden_tokenizer=tokenizer,
        petros_model=his_model,
        petros_tokenizer=his_tokenizer
)

In [None]:
# Note:
# The WER/CER approach does not work for either model, because the models do not produce the exact transcription, only similar ones.

"""
--- Средние значения WER/CER (чем ниже, тем лучше) ---

Модель: Elormiden/bert-base-cypriot-greek
  Предсказание кипрского слова - Средний WER: 0.9983, Средний CER: 0.9034
  Предсказание стандартного греческого слова - Средний WER: 1.0000, Средний CER: 1.0910

Модель: petros/bert-base-cypriot-uncased-v1
  Предсказание кипрского слова - Средний WER: 1.0000, Средний CER: 0.8959
  Предсказание стандартного греческого слова - Средний WER: 0.9995, Средний CER: 0.9155
"""

In [None]:
import random

In [None]:
# --- Helper for MLM predictions (returns the top-K candidates) ---
def get_mlm_top_k_predictions(model, tokenizer, text, top_k=5):
    """
    Given a sentence containing a mask token, return the top-K predicted tokens
    for the first mask together with their scores.

    Args:
        model: Masked-LM model; called as ``model(**inputs)`` and read via ``.logits``.
        tokenizer: Tokenizer providing ``mask_token_id`` and ``decode``.
        text (str): Sentence containing the tokenizer's mask token.
        top_k (int): Maximum number of candidates to return. Values larger than
            the vocabulary dimension are clamped to it.

    Returns:
        list[tuple[str, float]]: ``(token, score)`` pairs ordered from highest to
        lowest score, where ``score`` is the raw logit. Empty when ``text`` has
        no mask token or ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    inputs = tokenizer(text, return_tensors="pt")

    # torch.where yields (row, column) index tensors; [1] keeps the sequence positions.
    mask_token_indices = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_token_indices.numel() == 0:
        return []  # No mask found.

    mask_token_index = mask_token_indices[0]

    with torch.no_grad():
        logits = model(**inputs).logits

    mask_token_logits = logits[0, mask_token_index, :]

    # torch.topk raises when k exceeds the size of the dimension, so clamp it.
    k = min(top_k, mask_token_logits.shape[-1])
    top_k_results = torch.topk(mask_token_logits, k, dim=-1)

    # .tolist() turns ids/scores into plain Python ints/floats, so decode() gets
    # ordinary integers rather than 0-dim tensors.
    return [
        (tokenizer.decode([token_id]).strip(), score)
        for score, token_id in zip(top_k_results.values.tolist(), top_k_results.indices.tolist())
    ]


In [None]:
# --- Helper for random masking ---
# Punctuation stripped from both ends of a word before it is measured or compared.
_MASK_STRIP_CHARS = ".,;!?'\""
# Common Greek articles, conjunctions and prepositions that are not worth masking.
_MASK_STOP_WORDS = frozenset([
    'ο', 'η', 'το', 'του', 'της', 'των', 'και', 'με', 'για', 'από', 'σε', 'ένα', 'μια', 'ένας'
])


def mask_random_word(text, mask_token, min_words=3):
    """
    Mask one randomly chosen word of ``text``.

    Words are obtained by whitespace splitting. Candidates are words that, once
    surrounding punctuation is stripped, are longer than two characters and are
    not in the stop-word list. If no word qualifies, any word may be picked.
    The chosen word is replaced by ``mask_token`` as a whole, including any
    punctuation attached to it.

    Args:
        text (str): Sentence to mask.
        mask_token (str): Token inserted in place of the chosen word.
        min_words (int): Texts with fewer words than this are not masked.

    Returns:
        tuple: ``(masked_text, original_masked_word)``, where the original word is
        punctuation-stripped and lower-cased; ``(None, None)`` when the text has
        fewer than ``min_words`` words.
    """
    words = text.split()
    if len(words) < min_words:  # Not enough context to mask meaningfully.
        return None, None

    # Strip punctuation once and reuse it for the length test, the stop-word test
    # and the returned word, so that e.g. "και," is recognised as the stop word "και".
    stripped = [word.strip(_MASK_STRIP_CHARS) for word in words]

    # Simple heuristic: skip very short words and common function words.
    maskable_indices = [
        i for i, core in enumerate(stripped)
        if len(core) > 2 and core.lower() not in _MASK_STOP_WORDS
    ]

    if maskable_indices:
        mask_idx = random.choice(maskable_indices)
    else:
        # No suitable word found: fall back to any position.
        mask_idx = random.randint(0, len(words) - 1)

    original_masked_word = stripped[mask_idx].lower()
    words[mask_idx] = mask_token
    return " ".join(words), original_masked_word

In [None]:
# --- Qualitative comparison of two MLM models on masked thesaurus descriptions ---
# Tokenizer-independent stand-in for the mask; swapped for each tokenizer's own
# mask token right before prediction so both models see the same masked word.
_MASK_PLACEHOLDER = "\x00MASK\x00"


def _report_masked_predictions(
    model, tokenizer, original_text, masked_with_placeholder, masked_word,
    top_k, heading, locative
):
    """Print the top-K predictions of one model for one already-masked description."""
    if not masked_with_placeholder:
        # mask_random_word returned None: the description was too short to mask.
        print(f"    Недостаточно слов для маскировки в {locative} описании.")
        return

    masked_text = masked_with_placeholder.replace(_MASK_PLACEHOLDER, tokenizer.mask_token)
    predictions = get_mlm_top_k_predictions(model, tokenizer, masked_text, top_k)
    print(f"    Предсказания для {heading} описания (Оригинал: '{original_text}', Маска: '{masked_text}', Истинное замаскированное: '{masked_word}'):")
    for rank, (token, score) in enumerate(predictions):
        print(f"      {rank+1}. '{token}' (Score: {score:.4f})")


def analyze_mlm_predictions_qualitatively(
    elormiden_model, elormiden_tokenizer,
    petros_model, petros_tokenizer,
    dataset_name="Elormiden/Thesaurus-Cypriot-Greek-Dialect",
    split="train",
    num_examples=10, # More examples by default for better qualitative feel
    top_k=10, # Show more top predictions
    seed=None
):
    """
    Print a qualitative side-by-side comparison of two masked-language models.

    Random entries with non-empty Cypriot and Standard Greek descriptions are
    drawn from the dataset. In each description one random word is masked, and
    the top-K predictions of both models for that mask are printed.

    The word to mask is drawn once per description and shared by both models,
    so their predictions refer to the same masked word.

    Args:
        elormiden_model: Masked-LM model for Elormiden.
        elormiden_tokenizer: Tokenizer matching ``elormiden_model``.
        petros_model: Masked-LM model for Petros.
        petros_tokenizer: Tokenizer matching ``petros_model``.
        dataset_name (str): Name of the dataset to load.
        split (str): Dataset split to sample from.
        num_examples (int): Maximum number of entries to show.
        top_k (int): Number of predictions to print per mask.
        seed: Optional seed for the ``random`` module. When given, entry sampling
            and word masking are reproducible. ``None`` leaves the global random
            state untouched.

    Returns:
        None. All results are printed.
    """
    if seed is not None:
        random.seed(seed)

    print(f"\nЗагрузка данных для качественного анализа из '{dataset_name}' (сплит: '{split}')...")
    dataset = load_dataset(dataset_name)
    print("Данные успешно загружены.")

    # Keep only entries where both descriptions are present and non-blank.
    valid_entries = [
        entry for entry in dataset[split]
        if entry['description'] and entry['description'].strip() and \
           entry['greek_description'] and entry['greek_description'].strip()
    ]
    random_examples = random.sample(valid_entries, min(num_examples, len(valid_entries)))

    # Report the number of examples actually drawn, which can be below num_examples.
    print(f"\n--- Качественный Анализ Топ-{top_k} Предсказаний (с маскировкой описаний) для {len(random_examples)} Случайных Примеров ---")

    model_specs = (
        ("\n  **Модель: Elormiden/bert-base-cypriot-greek**", elormiden_model, elormiden_tokenizer),
        ("\n  **Модель: petros/bert-base-cypriot-uncased-v1**", petros_model, petros_tokenizer),
    )

    for i, entry in enumerate(random_examples):
        cypriot_desc = entry['description']
        greek_desc = entry['greek_description']
        cypriot_word = entry['word'] # Shown for reference only; not masked directly
        greek_word = entry['greek_word'] # Shown for reference only

        print(f"\n=== Пример {i+1} ===")
        print(f"  > Оригинальное кипрское слово: '{cypriot_word}'")
        print(f"  > Оригинальное стандартное греческое слово: '{greek_word}'")
        print(f"  > Истинное кипрское описание: '{cypriot_desc}'")
        print(f"  > Истинное стандартное греческое описание: '{greek_desc}'")

        # Mask each description exactly once; both models reuse the same masked word.
        # Each item: (original text, masked text or None, masked word or None,
        #             heading label, label used in the "too short" message)
        masked_descriptions = [
            (text, *mask_random_word(text, _MASK_PLACEHOLDER), heading, locative)
            for text, heading, locative in (
                (cypriot_desc, "КИПРСКОГО", "кипрском"),
                (greek_desc, "СТАНДАРТНОГО ГРЕЧЕСКОГО", "стандартном греческом"),
            )
        ]

        for model_heading, mlm_model, mlm_tokenizer in model_specs:
            print(model_heading)
            for text, masked, masked_word, heading, locative in masked_descriptions:
                _report_masked_predictions(
                    mlm_model, mlm_tokenizer, text, masked, masked_word,
                    top_k, heading, locative
                )

        print("-" * 50) # Separator for readability

In [None]:
# Print the qualitative comparison for 10 random entries, 10 predictions per mask.
# `model`/`tokenizer` are the Elormiden checkpoint, `his_model`/`his_tokenizer`
# the Petros checkpoint. dataset_name and split keep the function defaults.
analyze_mlm_predictions_qualitatively(
        elormiden_model=model,
        elormiden_tokenizer=tokenizer,
        petros_model=his_model,
        petros_tokenizer=his_tokenizer,
        num_examples=10,
        top_k=10
    )