In [None]:
import json
import requests
from tqdm import tqdm  

def read_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    The file is decoded as UTF-8 (tolerating an optional leading BOM) instead
    of the platform-default encoding, so non-ASCII text is read the same way
    on every OS. Blank lines are skipped silently. A line that is not valid
    JSON is reported on stdout and skipped, so one bad record does not abort
    the whole load.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list holding one decoded JSON value per valid, non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []
    # 'utf-8-sig' reads plain UTF-8 as well as UTF-8 with a BOM; with plain
    # 'utf-8' the BOM would stay in the first line and json.loads rejects it.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Empty/whitespace-only lines (e.g. a trailing newline) are
                # not records and should not be reported as decode errors.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
    return data

def get_wikidata_label(entity_id, preferred_langs, timeout=10):
    """Fetch the labels of one Wikidata entity in the requested languages.

    Calls the ``wbgetentities`` action of the Wikidata API, asking only for
    the entity's labels.

    Args:
        entity_id: Wikidata entity ID to look up.
        preferred_langs: Iterable of language codes to extract from the
            returned labels.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a single stalled connection would block the
            caller indefinitely.

    Returns:
        A dict mapping each language in ``preferred_langs`` that has a label
        to that label's text. The dict is empty when the entity has labels
        but none in the requested languages. Returns ``None`` when the
        request fails, the body is not JSON, the response carries no
        ``entities`` mapping (e.g. an API error payload), or the entity or
        its labels are missing.
    """
    url = "https://www.wikidata.org/w/api.php"
    # Let requests build and escape the query string instead of interpolating
    # the ID into the URL by hand.
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json",
        "props": "labels",
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # error is not a RequestException subclass.
        print(f"Error fetching Wikidata data: {e}")
        return None

    # An error payload has no "entities" key; treat it as "not found"
    # instead of raising KeyError in the middle of a long batch.
    entities = data.get("entities") if isinstance(data, dict) else None
    if not isinstance(entities, dict):
        return None

    entity = entities.get(entity_id)
    if not isinstance(entity, dict) or "labels" not in entity:
        return None

    labels = entity["labels"]
    return {lang: labels[lang]["value"] for lang in preferred_langs if lang in labels}

train_data = read_jsonl('train.jsonl')

# Successful lookups keyed by (entity ID, target locale, source locale), so an
# entity that appears in many records is fetched over HTTP only once.
# Failed lookups are deliberately not cached: they are retried on the next
# occurrence, as they would be without the cache.
label_cache = {}

for i, item in tqdm(enumerate(train_data), total=len(train_data), desc="Processing train_data"):
    source_locale = item.get('source_locale', 'en')
    target_locale = item.get('target_locale', 'fr')
    # Target language first, then source: this is the key order of each label dict.
    preferred_langs = [target_locale, source_locale]
    new_entities = []

    for entity_id in item.get("entities", []):
        if isinstance(entity_id, dict):
            # This loop overwrites item["entities"] with label dicts. If the
            # cell is re-run on already-processed data, keep those dicts
            # instead of trying to look them up as IDs and dropping them.
            new_entities.append(entity_id)
            continue

        cache_key = (entity_id, target_locale, source_locale)
        labels = label_cache.get(cache_key)
        if labels is None:
            labels = get_wikidata_label(entity_id, preferred_langs)
            if labels:
                label_cache[cache_key] = labels

        if labels:
            # Copy so records never share one mutable dict through the cache.
            new_entities.append(dict(labels))
        else:
            print(f"Could not retrieve label for entity ID: {entity_id} in item {i}")

    # Replace the raw entity IDs with their resolved label dicts.
    item["entities"] = new_entities


Processing train_data: 100%|██████████| 5531/5531 [23:20<00:00,  3.95it/s]


In [45]:
# Persist the enriched records as pretty-printed JSON.
output_file = 'english_french_with_entities.json'

# ensure_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
dump_options = {"ensure_ascii": False, "indent": 4}
with open(output_file, mode='w', encoding='utf-8') as out_handle:
    json.dump(train_data, out_handle, **dump_options)

print(f"Updated data has been saved to {output_file}")

Updated data has been saved to english_french_with_entities.json


In [46]:
# Spot-check the first three records after enrichment.
train_data[0:3]

[{'id': 'a9011ddf',
  'source_locale': 'en',
  'target_locale': 'fr',
  'source': 'What is the seventh tallest mountain in North America?',
  'target': 'Quelle est la septième plus haute montagne d’Amérique du Nord ?',
  'entities': [{'fr': 'Amérique du Nord', 'en': 'North America'}],
  'from': 'mintaka'},
 {'id': '982450cf',
  'source_locale': 'en',
  'target_locale': 'fr',
  'source': 'Who is the youngest current US governor?',
  'target': 'Qui est l’actuel plus jeune gouverneur américain ?',
  'entities': [{'fr': 'gouverneur', 'en': 'governor'}],
  'from': 'mintaka'},
 {'id': 'b218d184',
  'source_locale': 'en',
  'target_locale': 'fr',
  'source': 'Has Bernie Sanders ever been president of the United States?',
  'target': 'Bernie Sanders a-t-il déjà été Président des États-Unis ?',
  'entities': [{'fr': 'États-Unis', 'en': 'United States'}],
  'from': 'mintaka'}]