# Translate.py Notebook
A Jupyter notebook version of `Translate.py` for batch translation using Ollama and LangChain.

## 1. Import Required Libraries
Import all necessary libraries, including langchain, torch, tqdm, json, glob, and os.

In [None]:
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import zipfile
import json
import glob
import sys
import tqdm
import requests
# from comet.models import download_model, load_from_checkpoint
from sentence_transformers import SentenceTransformer, util
import torch
import os

## 2. Define Language Mapping Function
Define the `get_language_name` function to map language short codes to full language names.

In [2]:
def get_language_name(short_code):
    lang_map = {
        'ar': 'Arabic',
        'zh': 'Chinese (Traditional)',
        'fr': 'French',
        'de': 'German',
        'it': 'Italian',
        'ja': 'Japanese',
        'ko': 'Korean',
        'es': 'Spanish',
        'th': 'Thai',
        'tr': 'Turkish',
        'en': 'English',
        # Add more as needed
    }
    return lang_map.get(short_code, short_code)

## 3. Check CUDA Availability
Check if CUDA is available using `torch.cuda.is_available()` and print the result.

In [3]:
gpu_av = torch.cuda.is_available()
print(f"CUDA available: {gpu_av}")

CUDA available: True


## 4. Initialize Model and Prompt
Initialize the Ollama model and create a `PromptTemplate` for translation. Set up the translation chain.

In [None]:
model_name = "llama3.1_8b-q4_K_M_few(3)_shot_prompting"
llm = Ollama(model="llama3.1:8b")

In [5]:
# One-shot examples for each language
one_shot_examples = {
    "French": {
        "text": "How many seasons of The Tiger Brigades have been released?",
        "translation": "Combien de saisons de Les Brigades du Tigre ont été publiées?",
        "entity_types": ["TV series"],
        "entity translation": "Les Brigades du Tigre"
    },
    "German": {
        "text": "How long did it take to build the Stockholm Court House?",
        "translation": "Wie lange hat es gedauert, das Gerichtshaus von Stockholm zu bauen?",
        "entity_types": ["Landmark"],
        "entity translation": "Gerichtshaus von Stockholm"
    },
    "Spanish": {
        "text": "Who played the lead role in the movie Torrente, the dumb arm of the law?",
        "translation": "¿Quién interpretó el papel principal en la película Torrente, el brazo tonto de la ley?",
        "entity_types": ["Movie"],
        "entity translation": "Torrente, el brazo tonto de la ley"
    },
    "Italian": {
        "text": "Is The Three Musketeers: D'Artagnan part of a duology or a trilogy?",
        "translation": "I tre moschettieri - D'Artagnan fa parte di una dilogia o di una trilogia?",
        "entity_types": ["Movie"],
        "entity translation": "I tre moschettieri - D'Artagnan"
    },
    "Japanese": {
        "text": "What is the significance of the Wudang Sect in the overall storyline of wuxia fiction?",
        "translation": "武侠小説の全体的なストーリーラインにおける武当派の重要性は何ですか？",
        "entity_types": ["Fictional entity"],
        "entity translation": "武当派"
    },
    "Korean": {
        "text": "Can visitors enter the Rüstem Pasha Mosque to explore its interior?",
        "translation": "방문객들은 뤼스템 파샤 모스크 내부를 둘러볼 수 있나요?",
        "entity_types": ["Place of worship"],
        "entity translation": "뤼스템 파샤 모스크"
    },
    "Turkish": {
        "text": "Is Clockwork Prince a standalone novel or part of a series?",
        "translation": "Mekanik Prens bir roman mı yoksa bir serinin parçası mı?",
        "entity_types": ["Artwork", "Book"],
        "entity translation": "Mekanik Prens"
    },
    "Arabic": {
        "text": "What is the purpose of Intermediate System to Intermediate System in networking?",
        "translation": "ما هو الغرض من بروتوكول الربط بين الأنظمة الوسيطية في الشبكات؟",
        "entity_types": ["Book"],
        "entity translation": "بروتوكول الربط بين الأنظمة الوسيطية"
    },
    "Chinese (Traditional)": {
        "text": "What are some characteristics of the White Queen's personality in the story?",
        "translation": "故事中白王后的個性有哪些特徵？",
        "entity_types": ["Person", "Fictional entity"],
        "entity translation": "白王后"
    },
    "Thai": {
        "text": "What is the architectural style of the Pagoda of Fogong Temple?",
        "translation": "รูปแบบสถาปัตยกรรมของเจดีย์วัดฝอกงคืออะไร?",
        "entity_types": ["Place of worship"],
        "entity translation": "เจดีย์วัดฝอกง"
    }
}

In [6]:
one_shot_examples.get("French")

{'text': 'How many seasons of The Tiger Brigades have been released?',
 'translation': 'Combien de saisons de Les Brigades du Tigre ont été publiées?',
 'entity_types': ['TV series'],
 'entity translation': 'Les Brigades du Tigre'}

In [7]:
# Map file names to language names (adjust as needed)
lang_map = {
    'fr_FR': 'French',
    'de_DE': 'German',
    'es_ES': 'Spanish',
    'it_IT': 'Italian',
    'ja_JP': 'Japanese',
    'ko_KR': 'Korean',
    'th_TH': 'Thai',
    'tr_TR': 'Turkish',
    'zh_TW': 'Chinese (Traditional)',
    'ar_AE': 'Arabic'
}

In [8]:
few_shot_examples = {}
SAMPLE_DIR = r'd:\Python_projects\DL_project\data\semeval\sample'

for file in glob.glob(os.path.join(SAMPLE_DIR, '*.jsonl')):
    lang_code = os.path.splitext(os.path.basename(file))[0]
    language = lang_map.get(lang_code, lang_code)
    examples = []
    with open(file, encoding='utf-8') as f:
        for line in f:
            if len(examples) >= 3:
                break
            data = json.loads(line)
            # Use the first target as the canonical translation/mention
            if data.get("targets"):
                target = data["targets"][0]
                examples.append({
                    "text": data["source"],
                    "translation": target["translation"],
                    "entity_types": data.get("entity_types", []),
                    "entity translation": target.get("mention", "")
                })
    if examples:
        few_shot_examples[language] = examples

# Print a sample for French
from pprint import pprint
pprint(few_shot_examples)

{'Arabic': [{'entity translation': 'بروتوكول الربط بين الأنظمة الوسيطية',
             'entity_types': ['Book'],
             'text': 'What is the purpose of Intermediate System to '
                     'Intermediate System in networking?',
             'translation': 'ما هو الغرض من بروتوكول الربط بين الأنظمة '
                            'الوسيطية في الشبكات؟'},
            {'entity translation': 'التوقيت الذري العالمي',
             'entity_types': ['Book'],
             'text': 'In which fields of study is International Atomic Time '
                     'particularly important?',
             'translation': 'في أي من مجالات الدراسة يعد التوقيت الذري العالمي '
                            'مهم بشكل خاص؟'},
            {'entity translation': 'التوقيت الذري العالمي',
             'entity_types': ['Book'],
             'text': 'Can International Atomic Time be used as a standard in '
                     'various timekeeping devices?',
             'translation': 'هل يمكن استخدام التو

In [9]:
# prompt = PromptTemplate(
#     input_variables=["text", "language"],
#     template='''Translate the following sentence to {language}.
#     Text: {text}
#     Only output the translated text.
#     Do not include any additional text or explanations.'''
# )

prompt = PromptTemplate(
    input_variables=["text", "language", "few_shot_examples"],
    template='''
You are a professional translator specialized in entity-aware translations from English to {language}.

Translate the following text while adhering to these guidelines:
- Keep named entities (people, organizations, places, dates, titles) in their original form unless a well-known equivalent exists in {language}.
- Ensure the meaning and context remain intact.
- Do not translate media names (books, movies, etc.) unless there is a widely accepted localized version.

Here is are some examples for a text, its translation, entity types, and entity translation:
{few_shot_examples}

In the example entity types and entity translation are provided to help you understand how to handle entities in the text but are not required in the output. Output must have only the translation of the text in plain text without notes, symbols or any markings, nothing else. Here is the text to translate:
text: {text}
translation:
'''
)


chain = prompt | llm

In [10]:
prompt.format(
        text="test_text",
        language="French",
        few_shot_examples=few_shot_examples.get("French")
    )

"\nYou are a professional translator specialized in entity-aware translations from English to French.\n\nTranslate the following text while adhering to these guidelines:\n- Keep named entities (people, organizations, places, dates, titles) in their original form unless a well-known equivalent exists in French.\n- Ensure the meaning and context remain intact.\n- Do not translate media names (books, movies, etc.) unless there is a widely accepted localized version.\n\nHere is are some examples for a text, its translation, entity types, and entity translation:\n[{'text': 'How many seasons of The Tiger Brigades have been released?', 'translation': 'Combien de saisons de Les Brigades du Tigre ont été publiées?', 'entity_types': ['TV series'], 'entity translation': 'Les Brigades du Tigre'}, {'text': 'Who are the main characters in The Tiger Brigades?', 'translation': 'Qui sont les personnages principaux dans Les Brigades du Tigre?', 'entity_types': ['TV series'], 'entity translation': 'Les B

## 5. Prepare Input and Output Paths
Set the input data folder, find all `.jsonl` files, and create the output directory for predictions.

In [11]:
input_data_folder = "data/references/validation/"
jsonl_files = glob.glob(f"{input_data_folder}/*.jsonl")

output_prediction_dir = os.path.join("data/predictions", model_name, "validation")
os.makedirs(output_prediction_dir, exist_ok=True)

## 6. Process and Translate JSONL Files
For each JSONL file, read the data, invoke the translation chain for each record, and collect results.

In [None]:
for file_path in jsonl_files:
    filename = os.path.basename(file_path)
    outfile_path = os.path.join(output_prediction_dir, filename)

    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))

    results = []
    for idx, record in enumerate(data, 1):
        id = record['id']
        source = record['source']
        source_locale = record['source_locale']
        source_language = get_language_name(source_locale)
        target_locale = record['target_locale']
        target_language = get_language_name(target_locale)
        result = chain.invoke({"text": source, "language": target_language, "few_shot_examples": few_shot_examples.get(target_language)})
        model_translation = result.strip()
        results.append({
            "id": id,
            "source_language": source_language,
            "target_language": target_language,
            "text": source,
            "prediction": model_translation,
        })

        if 'pbar' not in locals():
            pbar = tqdm.tqdm(total=len(data))

        pbar.update(1)

        if idx % 10 == 0 or idx == len(data):
            with open(outfile_path, 'w', encoding='utf-8') as f:
                for res in results:
                    f.write(json.dumps(res, ensure_ascii=False) + '\n')

    print(f"Translations saved to {outfile_path}")

## 7. Save Translations to Output Files
Write the translation results to output files in the specified directory, updating progress with tqdm.

In [None]:
if 'pbar' in locals():
    pbar.close()

7278it [1:30:02,  1.35it/s]
