In [None]:
import json
import re

In [None]:
# Read the whole English→French dataset into memory.
file_path = "english_french_with_entities_fixed.json"
with open(file_path, mode="r", encoding="utf-8") as file:
    # strict=False makes the parser accept raw control characters inside strings.
    data = json.loads(file.read(), strict=False)

In [None]:
# Sanity-check the load: report how many records were read, then pretty-print
# the first one (non-ASCII characters are kept readable, not \u-escaped).
sample_count = len(data)
print(f"Total samples: {sample_count}")
first_entry = data[0]
print("Sample entry:", json.dumps(first_entry, indent=4, ensure_ascii=False))

Total samples: 5531
Sample entry: {
    "id": "a9011ddf",
    "source_locale": "en",
    "target_locale": "fr",
    "source": "What is the seventh tallest mountain in North America?",
    "target": "Quelle est la septième plus haute montagne d’Amérique du Nord ?",
    "entities": [
        {
            "fr": "Amérique du Nord",
            "en": "North America"
        }
    ],
    "from": "mintaka"
}


In [None]:
# Global English→French entity dictionary gathered from every record.
# When the same English entity appears more than once, the pairing seen last
# in iteration order is the one that is kept.
entity_map = {
    entity["en"]: entity["fr"]
    for entry in data
    for entity in entry["entities"]
}

In [None]:
# Number of distinct English entity strings collected as keys of entity_map.
len(entity_map)

1551

In [None]:
def _mask_entities(source_text, entity_list):
    """Replace each English entity in ``source_text`` with a ``<ENT_k>`` placeholder.

    Args:
        source_text: English sentence to mask.
        entity_list: Entity dicts, each carrying an ``"en"`` surface form.
            The entity at 1-based position ``k`` is assigned ``<ENT_k>``.

    Returns:
        A ``(masked_text, mask_map)`` tuple. ``mask_map`` maps every placeholder
        to the English entity it stands for, in list order, whether or not the
        entity was actually found in ``source_text``.
    """
    mask_map = {
        f"<ENT_{position}>": entity["en"]
        for position, entity in enumerate(entity_list, start=1)
    }

    # Substitute longer entities first so that an entity contained in another
    # one (e.g. "York" inside "New York") cannot break up the longer match.
    # sorted() is stable, so equal-length entities keep their list order.
    longest_first = sorted(
        mask_map.items(), key=lambda item: len(item[1]), reverse=True
    )
    for mask_token, en_entity in longest_first:
        if not en_entity:
            # An empty entity would match at arbitrary positions; skip it.
            continue
        # Lookarounds instead of \b: \b needs a word character on one side, so
        # entities that start or end with punctuation ("C++", "U.S.") would
        # never match. This form only requires that the entity is not glued
        # to a neighbouring word character.
        pattern = rf"(?<!\w){re.escape(en_entity)}(?!\w)"
        source_text = re.sub(pattern, mask_token, source_text)

    return source_text, mask_map


masked_data = []

for entry in data:
    masked_source, entity_mask_map = _mask_entities(entry["source"], entry["entities"])

    # Store masked data
    masked_data.append({
        "id": entry["id"],
        "masked_source": masked_source,
        "entity_mask_map": entity_mask_map
    })

# Print sample masked output
print(json.dumps(masked_data[:3], indent=4, ensure_ascii=False))

[
    {
        "id": "a9011ddf",
        "masked_source": "What is the seventh tallest mountain in <ENT_1>?",
        "entity_mask_map": {
            "<ENT_1>": "North America"
        }
    },
    {
        "id": "982450cf",
        "masked_source": "Who is the youngest current US <ENT_1>?",
        "entity_mask_map": {
            "<ENT_1>": "governor"
        }
    },
    {
        "id": "b218d184",
        "masked_source": "Has Bernie Sanders ever been president of the <ENT_1>?",
        "entity_mask_map": {
            "<ENT_1>": "United States"
        }
    }
]


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load pre-trained English-to-French translation model.
# The tokenizer and the model are both loaded from the same checkpoint
# identifier, so changing `model_name` swaps the two together.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate(text):
    """Translate ``text`` using the module-level ``tokenizer`` and ``model``.

    The input is tokenized into PyTorch tensors with padding and truncation
    enabled, passed through ``model.generate``, and decoded with special
    tokens stripped. Only the first decoded sequence is returned.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Translate every masked sentence and store the result on the same record
# under the "masked_translation" key.
for record in masked_data:
    masked_sentence = record["masked_source"]
    record["masked_translation"] = translate(masked_sentence)

# Preview the first three records, now including their masked translations.
preview = masked_data[:3]
print(json.dumps(preview, indent=4, ensure_ascii=False))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

[
    {
        "id": "a9011ddf",
        "masked_source": "What is the seventh tallest mountain in <ENT_1>?",
        "entity_mask_map": {
            "<ENT_1>": "North America"
        },
        "masked_translation": "Quelle est la septième montagne la plus haute de <ENT_1>?"
    },
    {
        "id": "982450cf",
        "masked_source": "Who is the youngest current US <ENT_1>?",
        "entity_mask_map": {
            "<ENT_1>": "governor"
        },
        "masked_translation": "Qui est le plus jeune actuel des États-Unis <ENT_1>?"
    },
    {
        "id": "b218d184",
        "masked_source": "Has Bernie Sanders ever been president of the <ENT_1>?",
        "entity_mask_map": {
            "<ENT_1>": "United States"
        },
        "masked_translation": "Bernie Sanders a-t-elle déjà été présidente du <ENT_1>?"
    }
]


In [None]:
# Index the reference entries by id once instead of rescanning `data` for every
# placeholder. setdefault keeps the first entry if an id occurs more than once.
entries_by_id = {}
for data_entry in data:
    entries_by_id.setdefault(data_entry["id"], data_entry)

final_translations = []

for entry in masked_data:
    translated_text = entry["masked_translation"]
    # Deliberately not named `entity_map`: that name holds the global
    # English→French dictionary and must not be overwritten by this loop.
    mask_map = entry["entity_mask_map"]

    # English→French lookup restricted to this entry's own entities; the first
    # pairing wins when the same English form is listed more than once.
    fr_by_en = {}
    ref_entry = entries_by_id.get(entry["id"])
    if ref_entry is not None:
        for entity in ref_entry["entities"]:
            fr_by_en.setdefault(entity["en"], entity["fr"])

    # Replace <ENT_1>, <ENT_2>, etc., with correct French entities
    for mask_token, en_entity in mask_map.items():
        fr_entity = fr_by_en.get(en_entity)
        if fr_entity is not None:
            translated_text = translated_text.replace(mask_token, fr_entity)

    # Store final translation
    final_translations.append({
        "id": entry["id"],
        "final_translation": translated_text
    })

# Print sample final translations
print(json.dumps(final_translations[:3], indent=4, ensure_ascii=False))


[
    {
        "id": "a9011ddf",
        "final_translation": "Quelle est la septième montagne la plus haute de Amérique du Nord?"
    },
    {
        "id": "982450cf",
        "final_translation": "Qui est le plus jeune actuel des États-Unis gouverneur?"
    },
    {
        "id": "b218d184",
        "final_translation": "Bernie Sanders a-t-elle déjà été présidente du États-Unis?"
    }
]


In [None]:
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Index the reference entries by id once instead of rescanning `data` for every
# hypothesis. setdefault keeps the first entry if an id occurs more than once.
references_by_id = {}
for item in data:
    references_by_id.setdefault(item["id"], item)

# Without smoothing, a sentence with no matching n-gram of some order scores
# (near) zero regardless of its lower-order overlap, and NLTK warns about it.
# method1 adds a small epsilon to zero-count precisions so short sentences
# still receive a meaningful score.
smoothing = SmoothingFunction().method1

bleu_scores = []
correct_entities = 0
total_entities = 0

for entry in final_translations:
    # Get reference translation
    ref_entry = references_by_id[entry["id"]]
    reference = ref_entry["target"]  # Original French text
    hypothesis = entry["final_translation"]  # Our model's output

    # Compute sentence-level BLEU on whitespace tokens
    bleu = sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=smoothing,
    )
    bleu_scores.append(bleu)

    # Entity accuracy: an entity counts as correct when its French form
    # appears verbatim (substring match) in the hypothesis.
    for entity in ref_entry["entities"]:
        total_entities += 1
        if entity["fr"] in hypothesis:
            correct_entities += 1

# Compute average BLEU score and entity accuracy (both fall back to 0 on empty input)
avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
entity_accuracy = correct_entities / total_entities if total_entities > 0 else 0

print(f"Average BLEU Score: {avg_bleu:.4f}")
print(f"Entity Accuracy: {entity_accuracy:.4f}")


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.2629
Entity Accuracy: 0.8223
