In [1]:
# General imports
import json
import os
from contextlib import redirect_stdout
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader

## Load data

In [2]:
!kaggle datasets download -d andreazenotto/semeval-data
!unzip -q "semeval-data.zip" -d "semeval-data"

Dataset URL: https://www.kaggle.com/datasets/andreazenotto/semeval-data
License(s): MIT
Downloading semeval-data.zip to /kaggle/working
  0%|                                                | 0.00/662k [00:00<?, ?B/s]
100%|█████████████████████████████████████████| 662k/662k [00:00<00:00, 102MB/s]


## Load COMET model

In [3]:
%%capture
!pip install -q unbabel-comet

# Import the comet module for the evaluation
from comet import download_model, load_from_checkpoint

COMET_MODEL_NAME = "Unbabel/wmt22-comet-da"
# Download the model
model_path = download_model(COMET_MODEL_NAME)
# Load the model
model = load_from_checkpoint(model_path)

## Evaluate the predictions

In [4]:
def _read_jsonl_by_id(path, limit=None):
    """Read a JSON Lines file into a dict keyed by each record's "id" field.

    Args:
        path: Path of the .jsonl file to read.
        limit: Maximum number of distinct ids to load; None loads everything.

    Returns:
        dict mapping record["id"] to the full decoded record, in file order.
    """
    records = {}

    # The target languages include non-Latin scripts, so the encoding is fixed
    # explicitly instead of relying on the platform's default.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if limit is not None and len(records) >= limit:
                break
            # Tolerate blank lines (e.g. a trailing empty line).
            if not line.strip():
                continue
            data = json.loads(line)
            records[data["id"]] = data

    return records


def get_data(SYSTEM_NAME, TARGET_LANGUAGE, max_references=50):
    """Load the references and a system's predictions for one target language.

    Args:
        SYSTEM_NAME: Name of the sub-directory of "predictions" holding the
            system's output files.
        TARGET_LANGUAGE: Language code used as the .jsonl file name
            (e.g. "it_IT").
        max_references: Maximum number of references to load (the first ones
            in file order). Defaults to 50; pass None to load all of them.

    Returns:
        A (references, predictions) tuple of dicts, each mapping a record id
        to its decoded JSON record. Predictions are NOT filtered down to the
        loaded references.

    Side effects:
        Prints a message when some loaded references have no prediction.
    """
    DATA_DIR = "semeval-data"
    SPLIT = "validation"

    # The path to the references is formatted as follows:
    # semeval-data/references/{split}/{target_language}.jsonl
    PATH_TO_REFERENCES = os.path.join(
        DATA_DIR,
        "references",
        SPLIT,
        f"{TARGET_LANGUAGE}.jsonl",
    )

    # The path to the predictions is formatted as follows (note: no split
    # component, unlike the references):
    # semeval-data/predictions/{system_name}/{target_language}.jsonl
    PATH_TO_PREDICTIONS = os.path.join(
        DATA_DIR,
        "predictions",
        SYSTEM_NAME,
        f"{TARGET_LANGUAGE}.jsonl",
    )

    references = _read_jsonl_by_id(PATH_TO_REFERENCES, limit=max_references)
    predictions = _read_jsonl_by_id(PATH_TO_PREDICTIONS)

    # Count the loaded references that have no corresponding prediction.
    num_missing_predictions = sum(
        1 for reference_id in references if reference_id not in predictions
    )

    if num_missing_predictions > 0:
        print(f"Missing predictions for {num_missing_predictions} references")

    return references, predictions

In [5]:
def get_instances(references, predictions):
    """Build the COMET input instances, one per distinct source sentence.

    Each reference is paired with the prediction that has the SAME id (not
    the one at the same position), so a prediction file that is ordered
    differently or has gaps cannot cause a translation to be scored against
    the wrong reference. References without a prediction are skipped.

    Instances are de-duplicated on the "src" field. All targets of one
    reference share the same source, so only the FIRST target of a reference
    is kept, and a reference whose source text was already seen contributes
    nothing.

    Args:
        references: dict mapping id -> record with "source" and "targets"
            (a list of dicts carrying a "translation" key).
        predictions: dict mapping id -> record with a "prediction" key.

    Returns:
        (instances, instances_ids) where instances is a list of
        {"src", "ref", "mt"} dicts and instances_ids maps the position of a
        reference in `references` to the [start, end) slice of `instances`
        that belongs to it.
    """
    instances = []
    instances_ids = {}
    seen_sources = set()

    for idx, (reference_id, reference) in enumerate(references.items()):
        # Pair by id; skip references the system produced no output for.
        if reference_id not in predictions:
            continue
        prediction = predictions[reference_id]

        source = reference["source"]
        targets = reference["targets"]

        # Drop duplicates of an already-used source sentence, and references
        # with no target translation to compare against.
        if source in seen_sources or not targets:
            continue
        seen_sources.add(source)

        start = len(instances)
        instances.append(
            {
                "src": source,
                "ref": targets[0]["translation"],
                "mt": prediction["prediction"],
            }
        )
        instances_ids[idx] = [start, start + 1]

    return instances, instances_ids

In [6]:
def get_mentions_from_references(references):
    """Collect the set of distinct entity mentions for every reference.

    Args:
        references: dict mapping an instance id to a record whose "targets"
            list holds dicts with a "mention" key.

    Returns:
        dict mapping each instance id to the set of its targets' mentions.
    """
    return {
        instance_id: {target["mention"] for target in record["targets"]}
        for instance_id, record in references.items()
    }

In [7]:
def compute_m_eta(predictions, mentions, verbose = False):
    """Compute entity-name translation accuracy by substring matching.

    A prediction counts as correct when at least one gold mention occurs,
    case-insensitively (via casefold), as a substring of the predicted
    translation. Instances without a prediction count towards the total but
    never towards the correct count.

    Args:
        predictions: dict mapping instance id -> record with a "prediction" key.
        mentions: dict mapping instance id -> non-empty collection of mentions.
        verbose: When True, print missing predictions and every mismatch.

    Returns:
        dict with "correct", "total" and "accuracy" (0.0 when total is 0).

    Raises:
        AssertionError: if an instance has no mentions.
    """
    n_correct = 0
    n_total = 0

    for instance_id, gold_mentions in mentions.items():
        # Every instance must come with at least one entity mention.
        assert gold_mentions, f"No mentions for instance {instance_id}"

        # Counted before the lookup so missing predictions lower the accuracy.
        n_total += 1

        if instance_id not in predictions:
            if verbose:
                print(
                    f"No prediction for instance {instance_id}. Check that this is expected behavior, as it may affect the evaluation."
                )
            continue

        prediction = predictions[instance_id]["prediction"]
        folded_prediction = prediction.casefold()

        # One matching mention is enough for the prediction to be correct.
        if any(gold.casefold() in folded_prediction for gold in gold_mentions):
            n_correct += 1
        elif verbose:
            # Show the wrong prediction next to the accepted mentions.
            print(f"Prediction: {prediction}")
            print(f"Ground truth mentions: {gold_mentions}")
            print("")

    accuracy = n_correct / n_total if n_total > 0 else 0.0
    return {"correct": n_correct, "total": n_total, "accuracy": accuracy}

In [8]:
def compute_metrics(system_name):
    """Print per-language and average M-ETA and COMET scores for one system.

    For every target language this loads the references and predictions,
    computes M-ETA with compute_m_eta, scores the instances from
    get_instances with the notebook-level COMET `model`, and prints both
    values. The averages over all languages are printed at the end.

    Args:
        system_name: Name of the system whose predictions are evaluated
            (passed to get_data as the predictions sub-directory).

    Returns:
        None. All results are printed.
    """
    print("")
    print("=============================================")
    print(f"Scores for {system_name}")
    print("=============================================")

    NUM_GPUS = 1
    BATCH_SIZE = 32

    languages = [
        "ar_AE",  # Arabic (United Arab Emirates)
        "de_DE",  # German (Germany)
        "es_ES",  # Spanish (Spain)
        "fr_FR",  # French (France)
        "it_IT",  # Italian (Italy)
        "ja_JP",  # Japanese (Japan)
        "ko_KR",  # Korean (South Korea)
        "th_TH",  # Thai (Thailand)
        "tr_TR",  # Turkish (Turkey)
        "zh_TW"   # Chinese (Traditional)
    ]

    comet_score = 0
    m_eta_score = 0

    for lang in languages:
        references, predictions = get_data(system_name, lang)
        instances, instances_ids = get_instances(references, predictions)
        print(f"Language: {lang}")

        mentions = get_mentions_from_references(references)
        m_eta = compute_m_eta(predictions, mentions)
        accuracy = m_eta["accuracy"]
        print(f"\tM-ETA Score: {100.*accuracy:.2f}")
        m_eta_score += accuracy

        if instances:
            # Compute the scores
            outputs = model.predict(instances, batch_size=BATCH_SIZE, gpus=NUM_GPUS, progress_bar=False)
            scores = outputs.scores

            # Keep the best score among the instances of each reference
            max_scores = [
                max(scores[start:end]) for start, end in instances_ids.values()
            ]
        else:
            # Nothing to score for this language
            max_scores = []

        # Average over the references that were actually scored. References
        # without a scored instance are NOT counted as 0 here.
        system_score = sum(max_scores) / len(max_scores) if max_scores else 0.0
        print(f"\tCOMET Score: {100.*system_score:.2f}")
        comet_score += system_score

    # Divide by the real number of languages so the averages stay correct
    # if the list above is edited.
    num_languages = len(languages)
    print(f"\nAverage COMET score: {100.*(comet_score/num_languages):.2f}")
    print(f"Average M-ETA score: {100.*(m_eta_score/num_languages):.2f}")


In [10]:
# Systems to evaluate; each name is passed to compute_metrics as system_name.
SYSTEMS = [
    "m2m100_418M",
    "m2m100_1.2B",
    "qwen2.5_3B-Instruct",
    "qwen2.5_7B-Instruct",
    "llama3.1-8B-Instruct",
    "gemma2-9B-it",
    "gemma2+entity_linking",
    "ner+gemma2+entity_linking",
]

# Print the score report of every system, in the order listed above.
for system_name in SYSTEMS:
    compute_metrics(system_name)


Scores for m2m100_418M
Language: ar_AE
	M-ETA Score: 0.00
	COMET Score: 59.02
Language: de_DE
	M-ETA Score: 12.00
	COMET Score: 71.24
Language: es_ES
	M-ETA Score: 0.00
	COMET Score: 73.53
Language: fr_FR
	M-ETA Score: 12.00
	COMET Score: 75.37
Language: it_IT
	M-ETA Score: 0.00
	COMET Score: 72.15
Language: ja_JP
	M-ETA Score: 0.00
	COMET Score: 61.87
Language: ko_KR
	M-ETA Score: 0.00
	COMET Score: 63.27
Language: th_TH
	M-ETA Score: 0.00
	COMET Score: 55.93
Language: tr_TR
	M-ETA Score: 0.00
	COMET Score: 66.44
Language: zh_TW
	M-ETA Score: 0.00
	COMET Score: 62.02

Average COMET score: 66.08
Average M-ETA score: 2.40

Scores for m2m100_1.2B
Language: ar_AE
	M-ETA Score: 0.00
	COMET Score: 61.22
Language: de_DE
	M-ETA Score: 12.00
	COMET Score: 73.44
Language: es_ES
	M-ETA Score: 0.00
	COMET Score: 76.10
Language: fr_FR
	M-ETA Score: 24.00
	COMET Score: 79.78
Language: it_IT
	M-ETA Score: 0.00
	COMET Score: 75.87
Language: ja_JP
	M-ETA Score: 0.00
	COMET Score: 63.81
Language: ko_K