In [None]:
%pip install unbabel-comet
%pip install langchain_community

Collecting unbabel-comet
  Downloading unbabel_comet-2.2.6-py3-none-any.whl.metadata (19 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<2.0.0,>=1.20.0 (from unbabel-comet)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytorch-lightning<3.0.0,>=2.0.0 (from unbabel-comet)
  Downloading pytorch_lightning-2.5.2-py3-none-any.whl.metadata (2

Collecting langchain_community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain_community)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain_community)
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

# 🧪 Multilingual Translation Evaluation Framework with ChatGPT & COMET

This notebook evaluates English-to-multilingual translations using different prompting strategies with GPT-4o. It assesses both fluency (via COMET) and factual accuracy (via named entity preservation).

---

## 🔍 Evaluation Metrics

### 1. COMET Score (Semantic Quality)
COMET (Crosslingual Optimized Metric for Evaluation of Translation) is a neural metric that compares model-generated translations with human references.  
- Model used: `Unbabel/wmt22-comet-da`
- Scores range from 0 to 1 (higher is better)
- Captures fluency and semantic correctness

### 2. Meta Score (Entity Preservation)
A custom metric that checks if named entities (like people, places, products) are preserved correctly.  
- Extracts entity mentions from references
- Compares them against predictions
- Reports accuracy as a percentage

---

## 💬 Prompting Techniques

### Zero-Shot (Generic)
Basic instruction: “Translate the following English sentences into {target_language}.”  
No special guidance or examples.

### Zero-Shot (Entity-Aware)
Adds instructions to carefully translate named entities consistently and accurately.  
Improves factual precision.

### Few-Shot
Includes a few example translations (source + target) in the prompt to guide the model.  
Helps the model mimic correct patterns and handle low-resource scenarios.

### Chain-of-Thought (CoT)
Instructs the model to internally perform reasoning steps:  
1. Identify named entities  
2. Classify their types  
3. Translate entities contextually  
4. Translate the rest  
5. Combine everything  
Returns only final output in clean JSON format. Boosts consistency and control.

---

## ⚙️ Pipeline Summary

1. **Load input `.jsonl` files** from validation set
2. **Choose a prompt type** (zero-shot, few-shot, CoT)
3. **Translate in batches** (50 per request) with retry & rate limiting
4. **Save predictions** in structured JSON
5. **Evaluate** using:
   - `calculate_comet_scores()` for semantic fluency
   - `calculate_meta_score()` for entity translation accuracy
6. **Store results** per file in a `scores/` folder

---

## 📊 Example Evaluation Output

```json
{
  "correct_instances": 45,
  "total_instances": 50,
  "comet_score": 0.872,
  "meta_score": 90.0
}
```


In [None]:
# ✅ ChatGPT-Compatible Framework for Translation Evaluation

import os
import re
import json
from typing import Dict, List, Set
from comet import download_model, load_from_checkpoint
from langchain_core.prompts import ChatPromptTemplate

# === Paths and Configuration ===
# Define training and validation data paths along with COMET model configuration
# NOTE(review): these are absolute Google Drive paths; presumably Drive must be
# mounted at /content/drive before any cell reads them — confirm.
TRAIN_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/semeval.train.v2-e0d1c28b78c8dd4969d25eea5d3bc9cc/semeval/train'
VALIDATION_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation'
COMET_MODEL_NAME = "Unbabel/wmt22-comet-da"  # checkpoint name passed to download_model()
COMET_NUM_GPUS = 1  # forwarded as `gpus=` to model.predict()
COMET_BATCH_SIZE = 32  # forwarded as `batch_size=` to model.predict()

# Entity types to be evaluated in entity-level accuracy
# (_load_references skips records whose `entity_types` contain none of these)
ENTITY_TYPES = [
    "Musical work", "Artwork", "Food", "Animal", "Plant", "Book", "Book series",
    "Fictional entity", "Landmark", "Movie", "Place of worship", "Natural place",
    "TV series", "Person"
]

# === Utilities ===
# Convert language code to full name
def get_language_name(short_code):
    """Return the English display name for a language code.

    A code that is not in the table is returned unchanged.
    """
    names_by_code = dict(
        ar='Arabic', zh='Chinese (Traditional)', fr='French', de='German',
        it='Italian', ja='Japanese', ko='Korean', es='Spanish',
        th='Thai', tr='Turkish', en='English',
    )
    if short_code in names_by_code:
        return names_by_code[short_code]
    return short_code

# Fetch the COMET checkpoint and load it as a model object
def download_comet_model():
    """Download the checkpoint named by COMET_MODEL_NAME and return the loaded model."""
    return load_from_checkpoint(download_model(COMET_MODEL_NAME))

def calculate_comet_scores(model, references_path, predictions_path):
    """Score predictions against references with COMET and return the average.

    Each reference translation of an instance is paired with the prediction for
    that instance; the instance keeps the best of the resulting scores. A
    reference that has at least one target but no prediction counts as a score
    of 0, so leaving predictions out cannot raise the average. References with
    an empty ``targets`` list cannot be scored and are ignored.

    Args:
        model: Object exposing ``predict(samples, batch_size=..., gpus=...)``
            whose result has a ``scores`` sequence aligned with ``samples``.
        references_path: JSONL file whose records carry ``id``, ``source`` and
            ``targets`` (each target having a ``translation``).
        predictions_path: JSONL file whose records carry ``id`` and
            ``prediction``.

    Returns:
        The average of the per-instance best scores (the printed value is this
        number times 100), or 0.0 when there is nothing to score.
    """
    refs = _load_jsonl_data(references_path)
    preds = _load_jsonl_data(predictions_path)

    # Only references with at least one target can be scored; max() over an
    # empty slice of scores would raise.
    scorable_ids = sorted(ref_id for ref_id, ref in refs.items() if ref['targets'])

    instances, score_spans = [], []
    for ref_id in scorable_ids:
        if ref_id not in preds:
            continue  # missing prediction: counted as 0 through the denominator
        start = len(instances)
        for target in refs[ref_id]['targets']:
            instances.append({
                "src": refs[ref_id]['source'],
                "ref": target['translation'],
                "mt": preds[ref_id]['prediction']
            })
        score_spans.append((start, len(instances)))

    print(f"Created {len(instances)} instances")
    if not instances:
        # Nothing to send to the model; also avoids dividing by zero below.
        print("Average COMET score: 0.00")
        return 0.0

    scores = model.predict(instances, batch_size=COMET_BATCH_SIZE, gpus=COMET_NUM_GPUS).scores

    # Best score per instance, averaged over every scorable reference
    # (scored instances plus those with a missing prediction).
    best_scores = [max(scores[start:end]) for start, end in score_spans]
    avg_score = sum(best_scores) / len(scorable_ids)
    print(f"Average COMET score: {100 * avg_score:.2f}")
    return avg_score

# Evaluate entity-level name translation accuracy
def calculate_meta_score(ref_path, pred_path, verbose=False):
    """Compute entity-name translation accuracy for one predictions file.

    Returns a tuple ``(correct, total, accuracy_percent)``.
    """
    references = _load_references(ref_path, ENTITY_TYPES)
    expected_mentions = _get_mentions_from_references(references)
    predictions_by_id = _load_predictions(pred_path)
    result = _compute_entity_name_translation_accuracy(
        predictions_by_id, expected_mentions, verbose
    )
    return (result['correct'], result['total'], result['accuracy'] * 100)

# === Internal Helpers ===

# Load JSONL data with `id` as key
def _load_jsonl_data(path):
    return {json.loads(l)['id']: json.loads(l) for l in open(path, encoding='utf-8') if l.strip()}

# Load references with filtering based on entity type
def _load_references(path: str, types: List[str]) -> List[dict]:
    data = []
    for line in open(path, encoding='utf-8'):
        if not line.strip(): continue
        record = json.loads(line)
        if not record['targets']: continue
        if types and not any(e in record['entity_types'] for e in types): continue
        data.append(record)
    return data

def _load_predictions(path: str) -> Dict[str, str]:
    data = {}
    for line in open(path, encoding='utf-8'):
        if not line.strip(): continue
        record = json.loads(line)
        match = re.match(r"Q[0-9]+_[0-9]", record['id'])
        if not match: raise ValueError(f"Bad ID: {record['id']}")
        data[match.group(0)] = record['prediction']
    return data

# Load model predictions with validation on ID format
def _compute_entity_name_translation_accuracy(preds: Dict[str, str], mentions: Dict[str, Set[str]], verbose=False) -> dict:
    correct = sum(
        1 for k, m in mentions.items()
        if k in preds and any(v.casefold() in preds[k].casefold() for v in m)
    )
    total = len(mentions)
    return {"correct": correct, "total": total, "accuracy": correct / total if total else 0.0}

# Extract entity mentions from reference data
def _get_mentions_from_references(data: List[dict]) -> Dict[str, Set[str]]:
    return {d['id']: set(t['mention'] for t in d['targets']) for d in data}


In [None]:
import json
import glob
import tqdm
import os
import time
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI  # ✅ ChatGPT model
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    retry_if_exception_type
)


# Take the OpenAI API key from the environment and prompt for it only when it
# is missing. An existing key is never overwritten, and the key is never
# written into the notebook, where it would be saved with the file.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

model_name = "gpt-4o"
# Instantiate the ChatGPT model (can switch between gpt-3.5-turbo, gpt-4, gpt-4o).
# model_name also names the predictions output folder, so the model is built
# from the same variable to keep the two in sync.
llm = ChatOpenAI(model_name=model_name, temperature=0)


  llm = ChatOpenAI(model_name="gpt-4o", temperature=0)


In [None]:
# Basic zero-shot translation prompt
# Placeholders filled in at call time: {target_language} and {source_texts_json}
# (the latter is a JSON array of {"id", "text"} objects built from the batch).
ZERO_SHOT_PROMPT_TEMPLATE_STRING_1 = """
Your task is to translate the following English sentences into {target_language}.
Input sentences are provided below as a JSON array of objects, each with an "id" and a "text" field.
Provide the translations as a JSON array of objects, where each object contains the original "id" and its "translation".
Maintain the original order of sentences from the input. Please return the json array without any additional text or formatting.

Input JSON:
{source_texts_json}
"""

# Named-entity-aware zero-shot prompt
# Same placeholders and output contract as the basic prompt, plus one extra
# instruction line about translating named entities.
ZERO_SHOT_PROMPT_TEMPLATE_STRING_2 = """
Your task is to translate the following English sentences into {target_language}.
Input sentences are provided below as a JSON array of objects, each with an "id" and a "text" field.
Ensure that **all named entities** (e.g., people, organizations, locations, product names) are translated **correctly and consistently** into the target language.
Provide the translations as a JSON array of objects, where each object contains the original "id" and its "translation".
Maintain the original order of sentences from the input. Please return the json array without any additional text or formatting.

Input JSON:
{source_texts_json}
"""

In [None]:
# Client-side budget of API requests per minute; the delay below is derived from it.
# NOTE(review): the original "<20 RPM for gpt-4/gpt-4o" figure is not sourced
# here — confirm against the rate limits of the account actually used.
REQUESTS_PER_MINUTE_LIMIT = 15

# Delay to enforce between requests (in seconds)
DELAY_BETWEEN_REQUESTS_SECONDS = 60 / REQUESTS_PER_MINUTE_LIMIT  # 4.0 sec

# Number of input samples to send per prompt
BATCH_SIZE = 50


In [None]:
# Counter for API requests made in this session; a fresh kernel has made none,
# so it starts at zero rather than at a value left over from an earlier run.
api_request_count = 0

## Phase 5: Translation Execution Logic

**What this does:**

- Sends a batch of source texts to the LLM with retry support
- Parses and verifies the returned JSON translations
- Handles malformed output with clear fallback errors

This ensures reliable translation under API limits and occasional model inconsistencies.

In [None]:
@retry(
    wait=wait_random_exponential(multiplier=1, min=DELAY_BETWEEN_REQUESTS_SECONDS, max=60),
    stop=stop_after_attempt(5),
    retry=retry_if_exception_type(Exception)
)
def get_translated_content_with_retries(source_records_batch, target_language, llm_instance, template):
    """Translate one batch of records with a single LLM call.

    The ``@retry`` decorator re-runs the whole function (up to 5 attempts with
    randomized exponential back-off) when an exception escapes it, i.e. when
    building the prompt or invoking the model fails. A response that cannot be
    parsed is NOT retried: the problem is printed and every record of the batch
    gets an ``"ERROR: ..."`` placeholder string instead.

    Args:
        source_records_batch: Records with at least ``id`` and ``source`` keys.
        target_language: Language name substituted into the prompt.
        llm_instance: Object whose ``invoke(prompt)`` returns a response with a
            string ``content`` attribute.
        template: Object whose ``format(source_texts_json=..., target_language=...)``
            returns the prompt text.

    Returns:
        A list with one string per input record, in input order: the
        translation, or an ``"ERROR: ..."`` placeholder when the model output
        was unusable or lacked that record's id.
    """
    global api_request_count

    # Prepare JSON input for the prompt
    input_json_for_prompt = [{"id": rec['id'], "text": rec['source']} for rec in source_records_batch]
    source_texts_json_str = json.dumps(input_json_for_prompt, ensure_ascii=False)

    # Format the prompt using the selected template
    prompt_to_send = template.format(
        source_texts_json=source_texts_json_str,
        target_language=target_language
    )

    # Invoke LLM with the prompt; only completed requests are counted
    response = llm_instance.invoke(prompt_to_send)
    api_request_count += 1

    try:
        # Models often wrap JSON in a Markdown fence, with or without a "json" tag
        cleaned_content = _strip_markdown_code_fence(response.content)

        # Parse model output
        translated_outputs = json.loads(cleaned_content)

        # Validate expected structure
        if not isinstance(translated_outputs, list) or \
           not all(isinstance(item, dict) and 'id' in item and 'translation' in item for item in translated_outputs):
            raise ValueError("Model did not return a valid JSON array of translation objects.")

        # Map translations back to original IDs so the output follows input
        # order even if the model reordered or dropped items
        translated_dict = {item['id']: item['translation'] for item in translated_outputs}
        return [
            translated_dict.get(record['id'], "ERROR: ID not found in JSON output")
            for record in source_records_batch
        ]

    # Handle different error scenarios with fallback error messages.
    # JSONDecodeError subclasses ValueError, so it must be caught first.
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}. Raw response: {response.content.strip()}")
        return ["ERROR: Invalid JSON response from model"] * len(source_records_batch)
    except ValueError as e:
        print(f"Value Error (JSON format issue): {e}. Raw response: {response.content.strip()}")
        return ["ERROR: Invalid JSON structure from model"] * len(source_records_batch)
    except Exception as e:
        print(f"An unexpected error occurred while processing model output: {e}. Raw response: {response.content.strip()}")
        return ["ERROR: Unexpected issue processing model output"] * len(source_records_batch)


def _strip_markdown_code_fence(text):
    """Return ``text`` stripped of whitespace and of one enclosing Markdown code fence.

    Both tagged (```json) and untagged (```) fences are removed; text without
    an enclosing fence is returned stripped but otherwise unchanged.
    """
    fence = "```"
    stripped = text.strip()
    if len(stripped) < 2 * len(fence) or not (stripped.startswith(fence) and stripped.endswith(fence)):
        return stripped
    inner = stripped[len(fence):-len(fence)]
    # Skip an optional language tag such as "json" right after the opening fence.
    tag_end = 0
    while tag_end < len(inner) and inner[tag_end].isalpha():
        tag_end += 1
    return inner[tag_end:].strip()


In [None]:
# Folder containing input `.jsonl` files for validation.
# Single source of truth: reuse VALIDATION_DATA_DIR from the configuration cell.
input_data_folder = VALIDATION_DATA_DIR

# List all .jsonl files in the folder (sorted, because glob returns matches in
# arbitrary order and a stable order makes runs comparable)
jsonl_files = sorted(glob.glob(os.path.join(input_data_folder, "*.jsonl")))

# Create an output folder for saving model predictions
output_prediction_dir = os.path.join("/content/drive/MyDrive/Colab Notebooks", "data/predictions", model_name, "validation")
os.makedirs(output_prediction_dir, exist_ok=True)


In [None]:
# List the validation files in a deterministic (sorted) order and display them;
# glob itself returns matches in arbitrary order.
jsonl_files = sorted(glob.glob(f"{input_data_folder}/*.jsonl"))
jsonl_files

['/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/de_DE.jsonl',
 '/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/fr_FR.jsonl',
 '/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/th_TH.jsonl',
 '/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/ko_KR.jsonl',
 '/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/es_ES.jsonl',
 '/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/it_IT.jsonl',
 '/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/ja_JP.jsonl',
 '/content/drive/MyDrive/Colab Notebooks//semeval.validation.v2-889a1492ba6c3791baa8f4224bc8e685/validation/ar_AE.jsonl',
 '/content/drive/MyDrive

In [None]:
def zero_shot_eval(template, template_id):
    """Translate every validation file with ``template`` and save the predictions.

    For each path in the global ``jsonl_files`` the records are sent to the
    global ``llm`` in batches of ``BATCH_SIZE``. One JSONL file with the same
    name is written under ``output_prediction_dir/<template_id>/``, holding one
    object per input record with the keys ``id``, ``source_language``,
    ``target_language``, ``text`` and ``prediction``. When a batch fails even
    after the retries, its records are written with an ``"ERROR: ..."``
    prediction so the output keeps one line per input record.

    Args:
        template: Prompt template passed to get_translated_content_with_retries.
        template_id: Name of the sub-folder that receives this variant's output.
    """
    # Create subfolder for this specific prompt variant
    variant_output_dir = os.path.join(output_prediction_dir, template_id)
    os.makedirs(variant_output_dir, exist_ok=True)

    overall_pbar = tqdm.tqdm(jsonl_files, desc="Processing files")
    last_file_index = len(jsonl_files) - 1

    for file_index, file_path in enumerate(overall_pbar):
        filename = os.path.basename(file_path)
        outfile_path = os.path.join(variant_output_dir, filename)

        # Read input records from JSONL file; blank lines (e.g. a trailing
        # newline at end of file) are skipped instead of failing to parse
        with open(file_path, 'r', encoding='utf-8') as f:
            data_to_translate = [json.loads(line) for line in f if line.strip()]

        translated_results_for_file = []

        # Process in batches
        for i in tqdm.tqdm(range(0, len(data_to_translate), BATCH_SIZE), desc=f"Translating {filename} in batches", leave=False):
            batch_records = data_to_translate[i : i + BATCH_SIZE]

            # The target language is taken from the first record of the batch
            target_locale = batch_records[0]['target_locale']
            target_language = get_language_name(target_locale)

            try:
                # Run translation with retry support
                translated_texts_batch = get_translated_content_with_retries(
                    batch_records, target_language, llm, template
                )

                # Store results
                for j, record in enumerate(batch_records):
                    translated_text = translated_texts_batch[j] if j < len(translated_texts_batch) else "ERROR: Translation missing"
                    translated_results_for_file.append({
                        "id": record['id'],
                        "source_language": get_language_name(record['source_locale']),
                        "target_language": target_language,
                        "text": record['source'],
                        "prediction": translated_text,
                    })

            except Exception as e:
                print(f"\nCRITICAL ERROR: Failed to translate a batch starting with ID '{batch_records[0]['id']}' after multiple retries. Error: {e}")
                for record in batch_records:
                    translated_results_for_file.append({
                        "id": record['id'],
                        "source_language": get_language_name(record['source_locale']),
                        "target_language": get_language_name(record['target_locale']),
                        "text": record['source'],
                        "prediction": "ERROR: Batch translation failed due to API issues/rate limits.",
                    })

            # Respect API rate limits between requests, but do not sleep after
            # the very last request (last batch of the last file). The file
            # index comes from enumerate because tqdm's own counter stays below
            # the file count for the whole loop.
            is_last_batch = i + BATCH_SIZE >= len(data_to_translate)
            if not (is_last_batch and file_index == last_file_index):
                time.sleep(DELAY_BETWEEN_REQUESTS_SECONDS)

        # Save translated output to file
        with open(outfile_path, 'w', encoding='utf-8') as f:
            for res in translated_results_for_file:
                f.write(json.dumps(res, ensure_ascii=False) + '\n')

        print(f"\nTranslations for {filename} saved to {outfile_path}")

    overall_pbar.close()
    print("All files processed and translations saved.")


In [None]:
# Download and load the COMET model for evaluation
# (calculate_scores() reads this global and passes it to calculate_comet_scores)
comet_model = download_comet_model()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.40k [00:00<?, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [None]:
def calculate_scores(template_id):
    """Score one prompt variant's predictions and write a JSON report per file.

    For every reference file in the global ``jsonl_files``, the prediction file
    of the same name under ``output_prediction_dir/<template_id>/`` is scored
    with COMET (using the global ``comet_model``) and with the entity-level
    meta score. Results go to ``<template_id>/scores/<file stem>.json``.
    """
    variant_dir = os.path.join(output_prediction_dir, template_id)

    # Make sure the folder for the score reports exists
    scores_dir = os.path.join(variant_dir, "scores")
    if not os.path.exists(scores_dir):
        os.makedirs(scores_dir, exist_ok=True)

    for references_path in jsonl_files:
        filename = os.path.basename(references_path)
        predictions_path = os.path.join(variant_dir, filename)

        # Semantic quality (COMET)
        comet_score = calculate_comet_scores(comet_model, references_path, predictions_path)

        # Entity-level accuracy
        meta_result = calculate_meta_score(references_path, predictions_path)
        correct_instances, total_instances, meta_score = meta_result

        # Assemble the report in the documented key order
        evaluation_results = dict(
            correct_instances=correct_instances,
            total_instances=total_instances,
            comet_score=comet_score,
            meta_score=meta_score,
        )

        # One report per input file, named after the file's stem
        stem = os.path.splitext(filename)[0]
        evaluation_output_path = os.path.join(scores_dir, f"{stem}.json")
        with open(evaluation_output_path, 'w', encoding='utf-8') as json_file:
            json.dump(evaluation_results, json_file, ensure_ascii=False, indent=4)


In [None]:
# Identifier of this prompt variant. It names the folder that receives the
# predictions and the scores, so translation and scoring must share one value.
PROMPT_VARIANT_ID = "zero-shot-1"

# Build the zero-shot prompt using template string 1 (basic translation)
zero_shot_prompt_template = PromptTemplate(
    input_variables=["source_texts_json", "target_language"],
    template=ZERO_SHOT_PROMPT_TEMPLATE_STRING_1,
)

# Run evaluation pipeline: translate and save predictions
zero_shot_eval(zero_shot_prompt_template, PROMPT_VARIANT_ID)

# Evaluate predictions using COMET and entity-level accuracy
calculate_scores(PROMPT_VARIANT_ID)


Processing files:   0%|          | 0/10 [00:00<?, ?it/s]
Translating de_DE.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating de_DE.jsonl in batches:   7%|▋         | 1/15 [00:20<04:46, 20.48s/it][A
Translating de_DE.jsonl in batches:  13%|█▎        | 2/15 [00:49<05:31, 25.54s/it][A
Translating de_DE.jsonl in batches:  20%|██        | 3/15 [01:05<04:13, 21.13s/it][A
Translating de_DE.jsonl in batches:  27%|██▋       | 4/15 [01:22<03:33, 19.39s/it][A
Translating de_DE.jsonl in batches:  33%|███▎      | 5/15 [01:50<03:47, 22.71s/it][A
Translating de_DE.jsonl in batches:  40%|████      | 6/15 [02:09<03:10, 21.20s/it][A
Translating de_DE.jsonl in batches:  47%|████▋     | 7/15 [02:38<03:12, 24.02s/it][A
Translating de_DE.jsonl in batches:  53%|█████▎    | 8/15 [03:02<02:48, 24.01s/it][A
Translating de_DE.jsonl in batches:  60%|██████    | 9/15 [03:24<02:19, 23.29s/it][A
Translating de_DE.jsonl in batches:  67%|██████▋   | 10/15 [03:40<01:44, 20.98s/it][A
Tran


Translations for de_DE.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/de_DE.jsonl



Translating fr_FR.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating fr_FR.jsonl in batches:   7%|▋         | 1/15 [00:16<03:46, 16.20s/it][A
Translating fr_FR.jsonl in batches:  13%|█▎        | 2/15 [00:31<03:20, 15.44s/it][A
Translating fr_FR.jsonl in batches:  20%|██        | 3/15 [01:02<04:32, 22.74s/it][A
Translating fr_FR.jsonl in batches:  27%|██▋       | 4/15 [01:29<04:30, 24.60s/it][A
Translating fr_FR.jsonl in batches:  33%|███▎      | 5/15 [01:49<03:47, 22.71s/it][A
Translating fr_FR.jsonl in batches:  40%|████      | 6/15 [02:06<03:06, 20.78s/it][A
Translating fr_FR.jsonl in batches:  47%|████▋     | 7/15 [02:29<02:53, 21.68s/it][A
Translating fr_FR.jsonl in batches:  53%|█████▎    | 8/15 [03:20<03:36, 30.98s/it][A
Translating fr_FR.jsonl in batches:  60%|██████    | 9/15 [03:43<02:49, 28.32s/it][A
Translating fr_FR.jsonl in batches:  67%|██████▋   | 10/15 [04:18<02:32, 30.56s/it][A
Translating fr_FR.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for fr_FR.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/fr_FR.jsonl



Translating th_TH.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating th_TH.jsonl in batches:   7%|▋         | 1/15 [00:28<06:44, 28.88s/it][A
Translating th_TH.jsonl in batches:  13%|█▎        | 2/15 [01:17<08:49, 40.76s/it][A
Translating th_TH.jsonl in batches:  20%|██        | 3/15 [02:04<08:41, 43.49s/it][A
Translating th_TH.jsonl in batches:  27%|██▋       | 4/15 [02:42<07:35, 41.39s/it][A
Translating th_TH.jsonl in batches:  33%|███▎      | 5/15 [03:38<07:45, 46.55s/it][A
Translating th_TH.jsonl in batches:  40%|████      | 6/15 [04:19<06:41, 44.56s/it][A
Translating th_TH.jsonl in batches:  47%|████▋     | 7/15 [04:49<05:19, 39.93s/it][A
Translating th_TH.jsonl in batches:  53%|█████▎    | 8/15 [05:16<04:10, 35.81s/it][A
Translating th_TH.jsonl in batches:  60%|██████    | 9/15 [05:42<03:16, 32.75s/it][A
Translating th_TH.jsonl in batches:  67%|██████▋   | 10/15 [05:59<02:18, 27.73s/it][A
Translating th_TH.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for th_TH.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/th_TH.jsonl



Translating ko_KR.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating ko_KR.jsonl in batches:   7%|▋         | 1/15 [00:26<06:06, 26.18s/it][A
Translating ko_KR.jsonl in batches:  13%|█▎        | 2/15 [00:43<04:35, 21.18s/it][A
Translating ko_KR.jsonl in batches:  20%|██        | 3/15 [01:17<05:25, 27.09s/it][A
Translating ko_KR.jsonl in batches:  27%|██▋       | 4/15 [01:43<04:51, 26.46s/it][A
Translating ko_KR.jsonl in batches:  33%|███▎      | 5/15 [02:09<04:23, 26.34s/it][A
Translating ko_KR.jsonl in batches:  40%|████      | 6/15 [02:36<03:58, 26.49s/it][A
Translating ko_KR.jsonl in batches:  47%|████▋     | 7/15 [02:54<03:10, 23.83s/it][A
Translating ko_KR.jsonl in batches:  53%|█████▎    | 8/15 [03:23<02:57, 25.41s/it][A
Translating ko_KR.jsonl in batches:  60%|██████    | 9/15 [03:51<02:38, 26.34s/it][A
Translating ko_KR.jsonl in batches:  67%|██████▋   | 10/15 [04:26<02:25, 29.01s/it][A
Translating ko_KR.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for ko_KR.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/ko_KR.jsonl



Translating es_ES.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating es_ES.jsonl in batches:   7%|▋         | 1/15 [00:25<05:59, 25.64s/it][A
Translating es_ES.jsonl in batches:  13%|█▎        | 2/15 [00:51<05:38, 26.05s/it][A
Translating es_ES.jsonl in batches:  20%|██        | 3/15 [01:16<05:03, 25.27s/it][A
Translating es_ES.jsonl in batches:  27%|██▋       | 4/15 [01:48<05:07, 27.99s/it][A
Translating es_ES.jsonl in batches:  33%|███▎      | 5/15 [02:16<04:40, 28.09s/it][A
Translating es_ES.jsonl in batches:  40%|████      | 6/15 [02:33<03:38, 24.25s/it][A
Translating es_ES.jsonl in batches:  47%|████▋     | 7/15 [02:51<02:56, 22.06s/it][A
Translating es_ES.jsonl in batches:  53%|█████▎    | 8/15 [03:15<02:39, 22.79s/it][A
Translating es_ES.jsonl in batches:  60%|██████    | 9/15 [03:33<02:07, 21.17s/it][A
Translating es_ES.jsonl in batches:  67%|██████▋   | 10/15 [04:01<01:57, 23.40s/it][A
Translating es_ES.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for es_ES.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/es_ES.jsonl



Translating it_IT.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating it_IT.jsonl in batches:   7%|▋         | 1/15 [00:29<06:54, 29.59s/it][A
Translating it_IT.jsonl in batches:  13%|█▎        | 2/15 [00:56<06:01, 27.83s/it][A
Translating it_IT.jsonl in batches:  20%|██        | 3/15 [01:23<05:30, 27.52s/it][A
Translating it_IT.jsonl in batches:  27%|██▋       | 4/15 [01:42<04:27, 24.29s/it][A
Translating it_IT.jsonl in batches:  33%|███▎      | 5/15 [01:59<03:35, 21.52s/it][A
Translating it_IT.jsonl in batches:  40%|████      | 6/15 [02:20<03:11, 21.31s/it][A
Translating it_IT.jsonl in batches:  47%|████▋     | 7/15 [02:44<02:58, 22.33s/it][A
Translating it_IT.jsonl in batches:  53%|█████▎    | 8/15 [03:02<02:26, 20.98s/it][A
Translating it_IT.jsonl in batches:  60%|██████    | 9/15 [03:21<02:02, 20.35s/it][A
Translating it_IT.jsonl in batches:  67%|██████▋   | 10/15 [03:48<01:51, 22.32s/it][A
Translating it_IT.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for it_IT.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/it_IT.jsonl



Translating ja_JP.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating ja_JP.jsonl in batches:   7%|▋         | 1/15 [00:20<04:49, 20.71s/it][A
Translating ja_JP.jsonl in batches:  13%|█▎        | 2/15 [00:54<06:09, 28.45s/it][A
Translating ja_JP.jsonl in batches:  20%|██        | 3/15 [01:17<05:10, 25.91s/it][A
Translating ja_JP.jsonl in batches:  27%|██▋       | 4/15 [01:46<04:57, 27.06s/it][A
Translating ja_JP.jsonl in batches:  33%|███▎      | 5/15 [02:09<04:15, 25.59s/it][A
Translating ja_JP.jsonl in batches:  40%|████      | 6/15 [02:36<03:55, 26.16s/it][A
Translating ja_JP.jsonl in batches:  47%|████▋     | 7/15 [03:16<04:05, 30.70s/it][A
Translating ja_JP.jsonl in batches:  53%|█████▎    | 8/15 [03:51<03:43, 31.92s/it][A
Translating ja_JP.jsonl in batches:  60%|██████    | 9/15 [04:29<03:23, 33.90s/it][A
Translating ja_JP.jsonl in batches:  67%|██████▋   | 10/15 [04:52<02:33, 30.61s/it][A
Translating ja_JP.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for ja_JP.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/ja_JP.jsonl



Translating ar_AE.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating ar_AE.jsonl in batches:   7%|▋         | 1/15 [00:27<06:26, 27.61s/it][A
Translating ar_AE.jsonl in batches:  13%|█▎        | 2/15 [00:56<06:06, 28.18s/it][A
Translating ar_AE.jsonl in batches:  20%|██        | 3/15 [01:17<04:57, 24.83s/it][A
Translating ar_AE.jsonl in batches:  27%|██▋       | 4/15 [01:36<04:10, 22.77s/it][A
Translating ar_AE.jsonl in batches:  33%|███▎      | 5/15 [02:02<03:59, 23.91s/it][A
Translating ar_AE.jsonl in batches:  40%|████      | 6/15 [02:26<03:35, 23.93s/it][A
Translating ar_AE.jsonl in batches:  47%|████▋     | 7/15 [03:04<03:47, 28.40s/it][A
Translating ar_AE.jsonl in batches:  53%|█████▎    | 8/15 [03:56<04:12, 36.08s/it][A
Translating ar_AE.jsonl in batches:  60%|██████    | 9/15 [04:22<03:17, 32.95s/it][A
Translating ar_AE.jsonl in batches:  67%|██████▋   | 10/15 [04:47<02:32, 30.42s/it][A
Translating ar_AE.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for ar_AE.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/ar_AE.jsonl



Translating tr_TR.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating tr_TR.jsonl in batches:   7%|▋         | 1/15 [00:32<07:38, 32.72s/it][A
Translating tr_TR.jsonl in batches:  13%|█▎        | 2/15 [00:57<06:08, 28.31s/it][A
Translating tr_TR.jsonl in batches:  20%|██        | 3/15 [01:41<07:04, 35.36s/it][A
Translating tr_TR.jsonl in batches:  27%|██▋       | 4/15 [02:06<05:41, 31.02s/it][A
Translating tr_TR.jsonl in batches:  33%|███▎      | 5/15 [02:23<04:20, 26.00s/it][A
Translating tr_TR.jsonl in batches:  40%|████      | 6/15 [02:51<04:02, 26.94s/it][A
Translating tr_TR.jsonl in batches:  47%|████▋     | 7/15 [03:23<03:48, 28.51s/it][A
Translating tr_TR.jsonl in batches:  53%|█████▎    | 8/15 [03:56<03:29, 29.99s/it][A
Translating tr_TR.jsonl in batches:  60%|██████    | 9/15 [04:14<02:37, 26.29s/it][A
Translating tr_TR.jsonl in batches:  67%|██████▋   | 10/15 [04:40<02:10, 26.19s/it][A
Translating tr_TR.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for tr_TR.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/tr_TR.jsonl



Translating zh_TW.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A
Translating zh_TW.jsonl in batches:   7%|▋         | 1/15 [00:35<08:10, 35.02s/it][A
Translating zh_TW.jsonl in batches:  13%|█▎        | 2/15 [00:52<05:17, 24.43s/it][A
Translating zh_TW.jsonl in batches:  20%|██        | 3/15 [01:23<05:33, 27.77s/it][A
Translating zh_TW.jsonl in batches:  27%|██▋       | 4/15 [02:31<08:00, 43.67s/it][A
Translating zh_TW.jsonl in batches:  33%|███▎      | 5/15 [03:07<06:48, 40.87s/it][A
Translating zh_TW.jsonl in batches:  40%|████      | 6/15 [03:34<05:23, 36.00s/it][A
Translating zh_TW.jsonl in batches:  47%|████▋     | 7/15 [04:13<04:55, 36.92s/it][A
Translating zh_TW.jsonl in batches:  53%|█████▎    | 8/15 [04:33<03:40, 31.56s/it][A
Translating zh_TW.jsonl in batches:  60%|██████    | 9/15 [05:03<03:06, 31.15s/it][A
Translating zh_TW.jsonl in batches:  67%|██████▋   | 10/15 [05:21<02:15, 27.14s/it][A
Translating zh_TW.jsonl in batches:  73%|███████▎  | 11/15 [


Translations for zh_TW.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-1/zh_TW.jsonl
All files processed and translations saved.
Created 1260 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [14:37<00:00, 21.94s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 89.74
Created 1316 instances


Predicting DataLoader 0: 100%|██████████| 42/42 [16:24<00:00, 23.44s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 90.08
Created 1654 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [20:05<00:00, 23.19s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 82.99
Created 1660 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [18:44<00:00, 21.63s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 93.44
Created 1229 instances


Predicting DataLoader 0: 100%|██████████| 39/39 [14:23<00:00, 22.15s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 92.24
Created 1268 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [14:03<00:00, 21.08s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 91.40
Created 1409 instances


Predicting DataLoader 0: 100%|██████████| 45/45 [17:20<00:00, 23.11s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 93.64
Created 1177 instances


Predicting DataLoader 0: 100%|██████████| 37/37 [12:04<00:00, 19.59s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 90.86
Created 1260 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [13:04<00:00, 19.60s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 92.65
Created 1544 instances


Predicting DataLoader 0: 100%|██████████| 49/49 [15:53<00:00, 19.45s/it]


Average COMET score: 92.28


In [None]:
# Zero-shot run 2: wrap template string 2 (the entity-preservation variant) in a PromptTemplate.
zero_shot_prompt_template = PromptTemplate(
    template=ZERO_SHOT_PROMPT_TEMPLATE_STRING_2,
    input_variables=["source_texts_json", "target_language"],
)

# Run the translation pipeline with this prompt; predictions are saved under the "zero-shot-2" run name.
zero_shot_eval(zero_shot_prompt_template, "zero-shot-2")

# Score the predictions that were saved for the "zero-shot-2" run.
calculate_scores("zero-shot-2")



Processing files:   0%|          | 0/10 [00:00<?, ?it/s][A

Translating de_DE.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating de_DE.jsonl in batches:   7%|▋         | 1/15 [00:17<04:04, 17.46s/it][A[A

Translating de_DE.jsonl in batches:  13%|█▎        | 2/15 [00:44<04:58, 22.98s/it][A[A

Translating de_DE.jsonl in batches:  20%|██        | 3/15 [01:11<05:00, 25.02s/it][A[A

Translating de_DE.jsonl in batches:  27%|██▋       | 4/15 [01:32<04:16, 23.33s/it][A[A

Translating de_DE.jsonl in batches:  33%|███▎      | 5/15 [01:53<03:45, 22.52s/it][A[A

Translating de_DE.jsonl in batches:  40%|████      | 6/15 [02:13<03:14, 21.58s/it][A[A

Translating de_DE.jsonl in batches:  47%|████▋     | 7/15 [02:32<02:47, 20.95s/it][A[A

Translating de_DE.jsonl in batches:  53%|█████▎    | 8/15 [02:53<02:25, 20.84s/it][A[A

Translating de_DE.jsonl in batches:  60%|██████    | 9/15 [03:13<02:02, 20.42s/it][A[A

Translating de_DE.jsonl in batches:  67%|█████


Translations for de_DE.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/de_DE.jsonl




Translating fr_FR.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating fr_FR.jsonl in batches:   7%|▋         | 1/15 [00:20<04:48, 20.61s/it][A[A

Translating fr_FR.jsonl in batches:  13%|█▎        | 2/15 [00:41<04:33, 21.00s/it][A[A

Translating fr_FR.jsonl in batches:  20%|██        | 3/15 [01:03<04:17, 21.48s/it][A[A

Translating fr_FR.jsonl in batches:  27%|██▋       | 4/15 [01:24<03:53, 21.21s/it][A[A

Translating fr_FR.jsonl in batches:  33%|███▎      | 5/15 [01:44<03:26, 20.62s/it][A[A

Translating fr_FR.jsonl in batches:  40%|████      | 6/15 [02:03<03:00, 20.04s/it][A[A

Translating fr_FR.jsonl in batches:  47%|████▋     | 7/15 [02:28<02:55, 21.89s/it][A[A

Translating fr_FR.jsonl in batches:  53%|█████▎    | 8/15 [02:53<02:38, 22.61s/it][A[A

Translating fr_FR.jsonl in batches:  60%|██████    | 9/15 [03:13<02:11, 21.90s/it][A[A

Translating fr_FR.jsonl in batches:  67%|██████▋   | 10/15 [03:33<01:46, 21.25s/it][A[A

Translating fr_


Translations for fr_FR.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/fr_FR.jsonl




Translating th_TH.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating th_TH.jsonl in batches:   7%|▋         | 1/15 [00:27<06:27, 27.66s/it][A[A

Translating th_TH.jsonl in batches:  13%|█▎        | 2/15 [00:52<05:37, 25.96s/it][A[A

Translating th_TH.jsonl in batches:  20%|██        | 3/15 [01:18<05:11, 25.96s/it][A[A

Translating th_TH.jsonl in batches:  27%|██▋       | 4/15 [01:44<04:47, 26.18s/it][A[A

Translating th_TH.jsonl in batches:  33%|███▎      | 5/15 [02:12<04:27, 26.70s/it][A[A

Translating th_TH.jsonl in batches:  40%|████      | 6/15 [02:36<03:50, 25.60s/it][A[A

Translating th_TH.jsonl in batches:  47%|████▋     | 7/15 [03:01<03:25, 25.63s/it][A[A

Translating th_TH.jsonl in batches:  53%|█████▎    | 8/15 [03:36<03:18, 28.42s/it][A[A

Translating th_TH.jsonl in batches:  60%|██████    | 9/15 [04:02<02:45, 27.65s/it][A[A

Translating th_TH.jsonl in batches:  67%|██████▋   | 10/15 [04:41<02:36, 31.36s/it][A[A

Translating th_


Translations for th_TH.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/th_TH.jsonl




Translating ko_KR.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating ko_KR.jsonl in batches:   7%|▋         | 1/15 [00:30<07:04, 30.29s/it][A[A

Translating ko_KR.jsonl in batches:  13%|█▎        | 2/15 [00:56<06:00, 27.74s/it][A[A

Translating ko_KR.jsonl in batches:  20%|██        | 3/15 [01:22<05:25, 27.10s/it][A[A

Translating ko_KR.jsonl in batches:  27%|██▋       | 4/15 [01:45<04:40, 25.54s/it][A[A

Translating ko_KR.jsonl in batches:  33%|███▎      | 5/15 [02:08<04:04, 24.48s/it][A[A

Translating ko_KR.jsonl in batches:  40%|████      | 6/15 [02:29<03:30, 23.37s/it][A[A

Translating ko_KR.jsonl in batches:  47%|████▋     | 7/15 [02:54<03:11, 23.89s/it][A[A

Translating ko_KR.jsonl in batches:  53%|█████▎    | 8/15 [03:19<02:49, 24.25s/it][A[A

Translating ko_KR.jsonl in batches:  60%|██████    | 9/15 [03:41<02:20, 23.45s/it][A[A

Translating ko_KR.jsonl in batches:  67%|██████▋   | 10/15 [04:03<01:54, 22.94s/it][A[A

Translating ko_


Translations for ko_KR.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/ko_KR.jsonl




Translating es_ES.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating es_ES.jsonl in batches:   7%|▋         | 1/15 [00:19<04:31, 19.37s/it][A[A

Translating es_ES.jsonl in batches:  13%|█▎        | 2/15 [00:39<04:18, 19.87s/it][A[A

Translating es_ES.jsonl in batches:  20%|██        | 3/15 [01:02<04:15, 21.31s/it][A[A

Translating es_ES.jsonl in batches:  27%|██▋       | 4/15 [01:19<03:36, 19.66s/it][A[A

Translating es_ES.jsonl in batches:  33%|███▎      | 5/15 [02:07<04:57, 29.73s/it][A[A

Translating es_ES.jsonl in batches:  40%|████      | 6/15 [02:27<03:57, 26.43s/it][A[A

Translating es_ES.jsonl in batches:  47%|████▋     | 7/15 [02:47<03:15, 24.45s/it][A[A

Translating es_ES.jsonl in batches:  53%|█████▎    | 8/15 [03:14<02:56, 25.15s/it][A[A

Translating es_ES.jsonl in batches:  60%|██████    | 9/15 [03:34<02:21, 23.57s/it][A[A

Translating es_ES.jsonl in batches:  67%|██████▋   | 10/15 [03:56<01:54, 23.00s/it][A[A

Translating es_


Translations for es_ES.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/es_ES.jsonl




Translating it_IT.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating it_IT.jsonl in batches:   7%|▋         | 1/15 [00:19<04:30, 19.35s/it][A[A

Translating it_IT.jsonl in batches:  13%|█▎        | 2/15 [00:38<04:08, 19.12s/it][A[A

Translating it_IT.jsonl in batches:  20%|██        | 3/15 [01:00<04:08, 20.74s/it][A[A

Translating it_IT.jsonl in batches:  27%|██▋       | 4/15 [01:20<03:42, 20.24s/it][A[A

Translating it_IT.jsonl in batches:  33%|███▎      | 5/15 [01:39<03:16, 19.66s/it][A[A

Translating it_IT.jsonl in batches:  40%|████      | 6/15 [01:59<02:58, 19.87s/it][A[A

Translating it_IT.jsonl in batches:  47%|████▋     | 7/15 [02:21<02:45, 20.73s/it][A[A

Translating it_IT.jsonl in batches:  53%|█████▎    | 8/15 [02:45<02:32, 21.78s/it][A[A

Translating it_IT.jsonl in batches:  60%|██████    | 9/15 [03:07<02:10, 21.75s/it][A[A

Translating it_IT.jsonl in batches:  67%|██████▋   | 10/15 [03:31<01:51, 22.37s/it][A[A

Translating it_


Translations for it_IT.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/it_IT.jsonl




Translating ja_JP.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating ja_JP.jsonl in batches:   7%|▋         | 1/15 [00:26<06:10, 26.49s/it][A[A

Translating ja_JP.jsonl in batches:  13%|█▎        | 2/15 [00:49<05:18, 24.49s/it][A[A

Translating ja_JP.jsonl in batches:  20%|██        | 3/15 [01:09<04:31, 22.60s/it][A[A

Translating ja_JP.jsonl in batches:  27%|██▋       | 4/15 [01:32<04:07, 22.50s/it][A[A

Translating ja_JP.jsonl in batches:  33%|███▎      | 5/15 [01:54<03:44, 22.42s/it][A[A

Translating ja_JP.jsonl in batches:  40%|████      | 6/15 [02:12<03:09, 21.05s/it][A[A

Translating ja_JP.jsonl in batches:  47%|████▋     | 7/15 [02:33<02:46, 20.80s/it][A[A

Translating ja_JP.jsonl in batches:  53%|█████▎    | 8/15 [02:56<02:31, 21.62s/it][A[A

Translating ja_JP.jsonl in batches:  60%|██████    | 9/15 [03:22<02:18, 23.07s/it][A[A

Translating ja_JP.jsonl in batches:  67%|██████▋   | 10/15 [03:47<01:57, 23.58s/it][A[A

Translating ja_


Translations for ja_JP.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/ja_JP.jsonl




Translating ar_AE.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating ar_AE.jsonl in batches:   7%|▋         | 1/15 [00:22<05:14, 22.49s/it][A[A

Translating ar_AE.jsonl in batches:  13%|█▎        | 2/15 [00:40<04:21, 20.14s/it][A[A

Translating ar_AE.jsonl in batches:  20%|██        | 3/15 [01:01<04:03, 20.30s/it][A[A

Translating ar_AE.jsonl in batches:  27%|██▋       | 4/15 [01:20<03:37, 19.74s/it][A[A

Translating ar_AE.jsonl in batches:  33%|███▎      | 5/15 [01:40<03:17, 19.72s/it][A[A

Translating ar_AE.jsonl in batches:  40%|████      | 6/15 [01:59<02:57, 19.74s/it][A[A

Translating ar_AE.jsonl in batches:  47%|████▋     | 7/15 [02:18<02:34, 19.27s/it][A[A

Translating ar_AE.jsonl in batches:  53%|█████▎    | 8/15 [02:36<02:13, 19.05s/it][A[A

Translating ar_AE.jsonl in batches:  60%|██████    | 9/15 [03:01<02:04, 20.76s/it][A[A

Translating ar_AE.jsonl in batches:  67%|██████▋   | 10/15 [03:20<01:41, 20.31s/it][A[A

Translating ar_


Translations for ar_AE.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/ar_AE.jsonl




Translating tr_TR.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating tr_TR.jsonl in batches:   7%|▋         | 1/15 [00:20<04:46, 20.48s/it][A[A

Translating tr_TR.jsonl in batches:  13%|█▎        | 2/15 [00:39<04:15, 19.67s/it][A[A

Translating tr_TR.jsonl in batches:  20%|██        | 3/15 [01:01<04:06, 20.57s/it][A[A

Translating tr_TR.jsonl in batches:  27%|██▋       | 4/15 [01:22<03:48, 20.77s/it][A[A

Translating tr_TR.jsonl in batches:  33%|███▎      | 5/15 [01:45<03:37, 21.79s/it][A[A

Translating tr_TR.jsonl in batches:  40%|████      | 6/15 [02:16<03:43, 24.86s/it][A[A

Translating tr_TR.jsonl in batches:  47%|████▋     | 7/15 [02:40<03:17, 24.63s/it][A[A

Translating tr_TR.jsonl in batches:  53%|█████▎    | 8/15 [03:27<03:42, 31.77s/it][A[A

Translating tr_TR.jsonl in batches:  60%|██████    | 9/15 [03:56<03:04, 30.83s/it][A[A

Translating tr_TR.jsonl in batches:  67%|██████▋   | 10/15 [04:36<02:48, 33.64s/it][A[A

Translating tr_


Translations for tr_TR.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/tr_TR.jsonl




Translating zh_TW.jsonl in batches:   0%|          | 0/15 [00:00<?, ?it/s][A[A

Translating zh_TW.jsonl in batches:   7%|▋         | 1/15 [00:19<04:32, 19.48s/it][A[A

Translating zh_TW.jsonl in batches:  13%|█▎        | 2/15 [00:44<04:55, 22.71s/it][A[A

Translating zh_TW.jsonl in batches:  20%|██        | 3/15 [01:05<04:21, 21.76s/it][A[A

Translating zh_TW.jsonl in batches:  27%|██▋       | 4/15 [01:27<04:00, 21.91s/it][A[A

Translating zh_TW.jsonl in batches:  33%|███▎      | 5/15 [01:48<03:37, 21.70s/it][A[A

Translating zh_TW.jsonl in batches:  40%|████      | 6/15 [02:11<03:17, 21.97s/it][A[A

Translating zh_TW.jsonl in batches:  47%|████▋     | 7/15 [02:43<03:23, 25.43s/it][A[A

Translating zh_TW.jsonl in batches:  53%|█████▎    | 8/15 [03:05<02:50, 24.39s/it][A[A

Translating zh_TW.jsonl in batches:  60%|██████    | 9/15 [03:25<02:17, 22.96s/it][A[A

Translating zh_TW.jsonl in batches:  67%|██████▋   | 10/15 [03:46<01:50, 22.19s/it][A[A

Translating zh_


Translations for zh_TW.jsonl saved to /content/drive/MyDrive/Colab Notebooks/data/predictions/gpt-4o/validation/zero-shot-2/zh_TW.jsonl
All files processed and translations saved.
Created 1260 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [15:00<00:00, 22.50s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 89.71
Created 1316 instances


Predicting DataLoader 0: 100%|██████████| 42/42 [17:06<00:00, 24.45s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 89.62
Created 1654 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [20:50<00:00, 24.04s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 81.81
Created 1660 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [19:19<00:00, 22.30s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 92.77
Created 1229 instances


Predicting DataLoader 0: 100%|██████████| 39/39 [14:36<00:00, 22.47s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 92.06
Created 1268 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [14:29<00:00, 21.73s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 91.06
Created 1409 instances


Predicting DataLoader 0: 100%|██████████| 45/45 [17:55<00:00, 23.90s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 93.23
Created 1177 instances


Predicting DataLoader 0: 100%|██████████| 37/37 [12:30<00:00, 20.29s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 88.55
Created 1260 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [13:36<00:00, 20.42s/it]
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Average COMET score: 92.57
Created 1544 instances


Predicting DataLoader 0: 100%|██████████| 49/49 [16:34<00:00, 20.29s/it]


Average COMET score: 92.11


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Few-shot prompt template for English -> {target_language} translation.
# Placeholders to be filled in when the template is formatted:
#   {target_language}        - the language to translate into (used three times in the text)
#   {few_shot_examples_json} - worked examples shown to the model before the actual task
#   {source_texts_json}      - the batch of new sentences to translate
# The prompt asks the model to reply with a bare JSON array of objects, each carrying the
# original "id" and a "translation" key, in the same order as the input.
# NOTE(review): presumably wrapped in a PromptTemplate before use, like the zero-shot
# templates -- confirm against the cell that consumes it.
FEW_SHOT_PROMPT_TEMPLATE_STRING = """
Your task is to translate English sentences into {target_language}.
Below are a few examples of English sentences with named entities translated correctly into {target_language}.
Pay close attention to how named entities (e.g., people, organizations, locations, product names) are translated correctly and consistently.

Examples:
{few_shot_examples_json}

Now, translate the following new English sentences into {target_language}.
Provide the translations as a JSON array of objects, where each object contains the original "id" and its "translation" (ensure the key name is "translation").
Maintain the original order of sentences from the input. Please return the json array without any additional text or formatting.

Input JSON:
{source_texts_json}
"""


In [None]:
# Few-shot exemplars, keyed by two-letter target-language code. Every entry pairs an English
# 'source' question with its 'target' translation; each language has ten examples.
# The targets are shown to the model as examples of correct entity translation, so typos in
# them matter: entity names and punctuation below have been corrected to agree with the source.
# NOTE(review): only ar/de/es/fr/it are covered, while the evaluation runs in this notebook
# also process ja_JP, ko_KR, th_TH, tr_TR and zh_TW -- confirm how the few-shot pipeline
# handles a language that has no entry here.
few_shot_examples_dict = {
    'ar': [
        {'source': 'What is the seventh tallest mountain in North America?', 'target': 'ما سابع أعلى جبل في أمريكا الشمالية؟'},
        {'source': 'Which actor was the star of Titanic and was born in Los Angeles, California?', 'target': 'مَنْ الممثل الذي لعب دور البطولة في فيلم "تيتانيك" وهو من مواليد لوس أنجلوس بكاليفورنيا؟'},
        {'source': 'What year was the first book of the A Song of Ice and Fire series published?', 'target': 'في أي عام تم نشر أول كتاب من سلسلة "أغنية الجليد والنار"؟'},
        {'source': 'Who is the youngest current US governor?', 'target': 'مَن أصغر حاكم ولاية أمريكي حالٍ؟'},
        {'source': 'How long did it take to build the Lincoln Memorial?', 'target': 'كم من الوقت استغرق بناء نصب لينكولن التذكاري؟'},
        {'source': 'Has Bernie Sanders ever been president of the United States?', 'target': 'هل كان بيرني ساندرز رئيسًا للولايات المتحدة يومًا ما؟'},
        {'source': 'Which actor was Stephenie Meyers first choice to play Edward Cullen in the movie Twilight?', 'target': 'مَن الممثل الذي وقع عليه الاختيار الأول للكاتبة ستيفاني ماير للعب دور "إدوارد كولن" في فيلم "الشفق"؟'},
        {'source': 'Which river is longer than the Mississippi River?', 'target': 'أي نهر أطول من نهر المسيسيبي؟'},
        {'source': 'What is the latest US state to be admitted to the union that is not Hawaii?', 'target': 'ما آخر ولاية أمريكية تم قبولها في الاتحاد، بخلاف هاواي؟'},
        {'source': 'What is the longest lake in the world?', 'target': 'ما أطول بحيرة في العالم؟'},
    ],
    'de': [
        {'source': 'What is the seventh tallest mountain in North America?', 'target': 'Wie heißt der siebthöchste Berg Nordamerikas?'},
        {'source': 'What year was the first book of the A Song of Ice and Fire series published?', 'target': 'In welchem Jahr wurde das erste Buch der Reihe "Das Lied von Eis und Feuer" veröffentlicht?'},
        {'source': 'Who is the youngest current US governor?', 'target': 'Wer ist derzeit der jüngste amerikanische Gouverneur?'},
        {'source': 'Has Bernie Sanders ever been president of the United States?', 'target': 'War Bernie Sanders jemals Präsident der Vereinigten Staaten?'},
        # Entity spelling aligned with the source ("Stephenie", not "Stephanie").
        {'source': 'Which actor was Stephenie Meyers first choice to play Edward Cullen in the movie Twilight?', 'target': 'Welcher Schauspieler war Stephenie Meyers erste Wahl für die Rolle des Edward Cullen in dem Film Twilight – Biss zum Morgengrauen?'},
        # Closing question mark restored.
        {'source': 'Which river is longer than the Mississippi River?', 'target': 'Welcher Fluss ist länger als der Mississippi?'},
        {'source': 'What is the longest lake in the world?', 'target': 'Welcher See ist der längste der Welt?'},
        {'source': 'Is Texas the largest state in US?', 'target': 'Ist Texas der größte Bundesstaat in den Vereinigten Staaten?'},
        {'source': 'Who was the president of Argentina from 1989 to 1999?', 'target': 'Wer war von 1989 - 1999 Präsident von Argentinien?'},
        # Stray comma after "olympischen" removed.
        {'source': 'Who was a member of the 2004 U.S. Olympic mens swim team and born in Baltimore, Maryland?', 'target': 'Wer gehörte 2004 zur olympischen Schwimmstaffel der Vereinigten Staaten und wurde in Baltimore, Maryland geboren?'},
    ],
    'es': [
        {'source': 'Which actor was the star of Titanic and was born in Los Angeles, California?', 'target': '¿Qué actor protagonizó Titanic y nació en Los Ángeles, California?'},
        {'source': 'What year was the first book of the A Song of Ice and Fire series published?', 'target': '¿En qué año se publicó el primer libro de la saga Canción de hielo y fuego?'},
        {'source': 'Which US president has had the most votes?', 'target': '¿Qué presidente de Estados Unidos obtuvo más votos?'},
        {'source': 'How long did it take to build the Lincoln Memorial?', 'target': '¿En cuánto tiempo se construyó el Monumento a Lincoln?'},
        {'source': 'Has Bernie Sanders ever been president of the United States?', 'target': '¿Bernie Sanders ha sido alguna vez presidente de los Estados Unidos?'},
        # Entity spelling aligned with the source ("Stephenie", not "Stephanie").
        {'source': 'Which actor was Stephenie Meyers first choice to play Edward Cullen in the movie Twilight?', 'target': '¿Qué actor fue la primera opción de Stephenie Meyer para interpretar a Edward Cullen en la película Crepúsculo?'},
        {'source': 'What is the latest US state to be admitted to the union that is not Hawaii?', 'target': '¿Cuál es el último estado de Estados Unidos en ser incorporado a la unión aparte de Hawái?'},
        {'source': 'What is the longest lake in the world?', 'target': '¿Cuál es el lago más largo del mundo?'},
        {'source': 'Is Texas the largest state in US?', 'target': '¿Tejas es el estado más grande de Estados Unidos?'},
        {'source': 'How many times have the Los Angeles Dodgers lost the World Series?', 'target': '¿Cuántas veces perdieron los Dodgers de Los Ángeles la Serie Mundial?'},
    ],
    'fr': [
        {'source': 'What is the seventh tallest mountain in North America?', 'target': 'Quelle est la septième plus haute montagne d’Amérique du Nord ?'},
        {'source': 'Who is the youngest current US governor?', 'target': 'Qui est l’actuel plus jeune gouverneur américain ?'},
        {'source': 'Has Bernie Sanders ever been president of the United States?', 'target': 'Bernie Sanders a-t-il déjà été Président des États-Unis ?'},
        # Entity spelling aligned with the source ("Stephenie", not "Stephanie").
        {'source': 'Which actor was Stephenie Meyers first choice to play Edward Cullen in the movie Twilight?', 'target': 'Quel acteur Stephenie Meyer a-t-elle choisi en premier pour jouer le rôle d’Edward Cullen dans le film Twilight ?'},
        {'source': 'Which river is longer than the Mississippi River?', 'target': 'Quel fleuve est plus long que le Mississippi ?'},
        {'source': 'What is the latest US state to be admitted to the union that is not Hawaii?', 'target': 'Quel est le dernier État Américain à avoir été admis dans l’Union et qui n’est pas Hawaï ?'},
        {'source': 'What is the longest lake in the world?', 'target': 'Quel est le lac le plus long du monde ?'},
        {'source': 'How many times have the Los Angeles Dodgers lost the World Series?', 'target': 'Combien de fois les Dodgers de Los Angeles ont-ils perdu dans la série mondiale ?'},
        {'source': 'Who was the president of Argentina from 1989 to 1999?', 'target': 'Qui était le Président de l’Argentine, de mille neuf cent quatre-vingt-neuf à mille neuf cent quatre-vingt-dix-neuf ?'},
        {'source': 'Who was a member of the 2004 U.S. Olympic mens swim team and born in Baltimore, Maryland?', 'target': 'Quel membre de l’équipe olympique masculine de natation des États-Unis de 2004 est né à Baltimore, dans le Maryland ?'},
    ],
    'it': [
        {'source': 'What year was the first book of the A Song of Ice and Fire series published?', 'target': 'In che anno venne pubblicato il primo libro della serie Cronache del Ghiaccio e del Fuoco?'},
        # Entity spelling aligned with the source ("Stephenie", not "Stephanie").
        {'source': 'Which actor was Stephenie Meyers first choice to play Edward Cullen in the movie Twilight?', 'target': 'Quale attore era stata la prima scelta di Stephenie Meyer per interpretare Edward Cullen nel film Twilight?'},
        {'source': 'Which river is longer than the Mississippi River?', 'target': 'Quale fiume è più lungo del Mississippi?'},
        {'source': 'What is the longest lake in the world?', 'target': 'Qual è il lago più lungo del mondo?'},
        # "Worls Series" typo in the entity name corrected.
        {'source': 'How many times have the Los Angeles Dodgers lost the World Series?', 'target': 'Quante volte i Los Angeles Dodgers hanno perso la World Series?'},
        {'source': 'Who was a member of the 2004 U.S. Olympic mens swim team and born in Baltimore, Maryland?', 'target': 'Chi era un membro della squadra olimpica di nuoto maschile americana del 2004 ed è nato a Baltimora, Maryland?'},
        # Missing apostrophe restored ("C'è"); double quotes are used so it needs no escaping.
        {'source': 'Has there ever been a Christian U.S. senator?', 'target': "C'è mai stato un senatore americano cristiano?"},
        {'source': 'How many Pulitzer Prizes has Colson Whitehead won?', 'target': 'Quanti premi Pulitzer ha vinto Colson Whitehead?'},
        {'source': 'Which movie was released first, Avengers: Endgame or Joker?', 'target': 'Quale film è uscito prima, Avengers Endgame o Joker?'},
        {'source': 'Was Friday the 13th released the same year as Halloween?', 'target': 'Venerdì 13 è uscito lo stesso anno di Halloween?'},
    ],
}

In [None]:
from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type

@retry(
    wait=wait_random_exponential(multiplier=1, min=DELAY_BETWEEN_REQUESTS_SECONDS, max=60),
    stop=stop_after_attempt(5),
    retry=retry_if_exception_type(Exception)
)
def get_translated_content_with_retries_few_shot(source_records_batch, target_language, llm_instance, template, few_shot_examples):
    """
    Translate one batch of records with a few-shot prompt.

    Args:
        source_records_batch: Records to translate; each one is read through
            its 'id' and 'source' keys.
        target_language: Language name substituted into the prompt.
        llm_instance: Object whose ``invoke(prompt)`` returns a response that
            exposes ``.content``.
        template: Prompt template formatted with ``source_texts_json``,
            ``target_language`` and ``few_shot_examples_json``.
        few_shot_examples: JSON-serialisable examples embedded in the prompt.

    Returns:
        A list with one entry per input record, in input order: either the
        model's translation or an "ERROR: ..." placeholder string when the
        reply could not be parsed or did not contain that record's id.

    Raises:
        Exceptions raised while formatting the prompt or invoking the model
        are not caught here; the ``@retry`` decorator re-runs the whole call
        (up to 5 attempts with randomised exponential backoff) and the
        failure reaches the caller once the attempts are used up. Problems
        with the *content* of the reply are never retried; they are reported
        through the placeholder strings instead.
    """
    global api_request_count

    # Prepare the input and examples JSON strings for templating
    input_json_for_prompt = [{"id": rec['id'], "text": rec['source']} for rec in source_records_batch]
    source_texts_json_str = json.dumps(input_json_for_prompt, ensure_ascii=False)
    few_shot_examples_json = json.dumps(few_shot_examples, ensure_ascii=False)

    # Format the few-shot prompt
    prompt_to_send = template.format(
        source_texts_json=source_texts_json_str,
        target_language=target_language,
        few_shot_examples_json=few_shot_examples_json
    )

    print(f"Sending prompt to model: {prompt_to_send}")  # Debug: Log full prompt

    # Invoke LLM with the prompt
    response = llm_instance.invoke(prompt_to_send)
    api_request_count += 1

    # Take a printable snapshot of the reply once, up front. The error
    # handlers below log this snapshot instead of calling .strip() again:
    # re-stripping a non-string reply would raise inside the handler and turn
    # a reportable formatting problem into a full (billable) retry.
    raw_content = response.content
    raw_text = raw_content.strip() if isinstance(raw_content, str) else str(raw_content)

    try:
        if not isinstance(raw_content, str):
            raise TypeError(f"Expected text content from the model, got {type(raw_content).__name__}.")

        # Clean potential markdown wrapping: accept both "```json ... ```"
        # and a bare "``` ... ```" fence.
        cleaned_content = raw_text
        if cleaned_content.startswith("```") and cleaned_content.endswith("```"):
            cleaned_content = cleaned_content[3:-3]
            if cleaned_content[:4].lower() == "json":
                cleaned_content = cleaned_content[4:]
            cleaned_content = cleaned_content.strip()

        # Parse model output
        translated_outputs = json.loads(cleaned_content)

        print(f"Model response: {translated_outputs}")  # Debug: Log parsed output

        # Validate structure
        if not isinstance(translated_outputs, list) or \
           not all(isinstance(item, dict) and 'id' in item and 'translation' in item for item in translated_outputs):
            raise ValueError("Model did not return a valid JSON array of translation objects.")

        # Reconstruct ordered translations (the model may reorder or drop items)
        translated_dict = {item['id']: item['translation'] for item in translated_outputs}
        ordered_translations = [translated_dict.get(record['id'], "ERROR: ID not found in JSON output")
                                for record in source_records_batch]

        return ordered_translations

    # Handle errors. JSONDecodeError subclasses ValueError, so it must be
    # listed first to keep its more specific message.
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}. Raw response: {raw_text}")
        return ["ERROR: Invalid JSON response from model"] * len(source_records_batch)
    except ValueError as e:
        print(f"Value Error (JSON format issue): {e}. Raw response: {raw_text}")
        return ["ERROR: Invalid JSON structure from model"] * len(source_records_batch)
    except Exception as e:
        print(f"An unexpected error occurred while processing model output: {e}. Raw response: {raw_text}")
        return ["ERROR: Unexpected issue processing model output"] * len(source_records_batch)


In [None]:
def few_shot_eval(template, template_id, few_shot_examples):
    """
    Translate every file in ``jsonl_files`` with a few-shot prompt and save the predictions.

    For each input file the records are translated in chunks of ``BATCH_SIZE``
    via ``get_translated_content_with_retries_few_shot`` (using the global
    ``llm``), and one JSON object per record is written to
    ``<output_prediction_dir>/<template_id>/<input file name>``.

    Args:
        template: Prompt template handed to the batch translator.
        template_id: Name of the sub-directory that receives the predictions.
        few_shot_examples: Mapping indexed by a record's 'target_locale'
            value; the matching entry is passed to the batch translator.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    overall_pbar = tqdm.tqdm(jsonl_files, desc="Processing files (few-shot)")
    last_file_index = len(jsonl_files) - 1

    # Create directory for saving few-shot predictions
    output_prediction_dir_1 = os.path.join(output_prediction_dir, template_id)
    os.makedirs(output_prediction_dir_1, exist_ok=True)

    for file_index, file_path in enumerate(overall_pbar):
        filename = os.path.basename(file_path)
        outfile_path = os.path.join(output_prediction_dir_1, filename)

        # Load JSONL input file; blank lines (e.g. a trailing newline at the
        # end of the file) are skipped instead of crashing json.loads.
        data_to_translate = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    data_to_translate.append(json.loads(line))

        translated_results_for_file = []

        # Translate in batches
        for i in tqdm.tqdm(range(0, len(data_to_translate), BATCH_SIZE), desc=f"Translating {filename} in batches (few-shot)", leave=False):
            batch_records = data_to_translate[i : i + BATCH_SIZE]
            # The first record's locale is used for the whole batch.
            target_locale = batch_records[0]['target_locale']
            target_language = get_language_name(target_locale)

            try:
                # Translate batch using few-shot prompt. The examples lookup
                # sits inside the try, so a locale missing from
                # few_shot_examples is reported by the handler below as well.
                translated_texts_batch = get_translated_content_with_retries_few_shot(
                    batch_records, target_language, llm, template, few_shot_examples[target_locale]
                )

                # Store translations with metadata
                for j, record in enumerate(batch_records):
                    record_id = record['id']
                    source_text = record['source']
                    source_locale = record['source_locale']
                    translated_text = translated_texts_batch[j] if j < len(translated_texts_batch) else "ERROR: Translation missing"
                    translated_results_for_file.append({
                        "id": record_id,
                        "source_language": get_language_name(source_locale),
                        "target_language": target_language,
                        "text": source_text,
                        "prediction": translated_text,
                    })

            except Exception as e:
                print(f"\nCRITICAL ERROR: Failed to translate a batch starting with ID '{batch_records[0]['id']}' after multiple retries. Error: {e}")
                # Keep one output row per input record so the prediction file
                # stays aligned with its input file.
                for record in batch_records:
                    translated_results_for_file.append({
                        "id": record['id'],
                        "source_language": get_language_name(record['source_locale']),
                        "target_language": get_language_name(record['target_locale']),
                        "text": record['source'],
                        "prediction": "ERROR: Batch translation failed due to API issues/rate limits.",
                    })

            # Respect rate limits between batches/files. The file position is
            # tracked with enumerate rather than overall_pbar.n: tqdm only
            # advances n after the loop body has finished, so a check against
            # it is always true here and would also sleep after the very last
            # batch of the very last file.
            is_last_batch = i + BATCH_SIZE >= len(data_to_translate)
            if not (is_last_batch and file_index == last_file_index):
                time.sleep(DELAY_BETWEEN_REQUESTS_SECONDS)

        # Save results to file
        with open(outfile_path, 'w', encoding='utf-8') as f:
            for res in translated_results_for_file:
                f.write(json.dumps(res, ensure_ascii=False) + '\n')

        print(f"\nTranslations for {filename} saved to {outfile_path}")

    overall_pbar.close()
    print("All files processed and translations saved (few-shot).")


In [None]:
# Wrap the few-shot prompt text in a PromptTemplate exposing the three
# placeholders that are filled in for every batch.
few_shot_prompt_template = PromptTemplate(
    template=FEW_SHOT_PROMPT_TEMPLATE_STRING,
    input_variables=[
        "source_texts_json",
        "target_language",
        "few_shot_examples_json",
    ],
)

# Translate every input file with that prompt; predictions are stored under
# the "few-shot" template id.
few_shot_eval(
    template=few_shot_prompt_template,
    template_id="few-shot",
    few_shot_examples=few_shot_examples_dict,
)


Processing files (few-shot):   0%|          | 0/10 [00:00<?, ?it/s]
Translating de_DE.jsonl in batches (few-shot):   0%|          | 0/15 [00:00<?, ?it/s][A

Sending prompt to model: 
Your task is to translate English sentences into German.
Below are a few examples of English sentences with named entities translated correctly into German.
Pay close attention to how named entities (e.g., people, organizations, locations, product names) are translated correctly and consistently.

Examples:
[{"source": "What is the seventh tallest mountain in North America?", "target": "Wie heißt der siebthöchste Berg Nordamerikas?"}, {"source": "What year was the first book of the A Song of Ice and Fire series published?", "target": "In welchem Jahr wurde das erste Buch der Reihe \"Das Lied von Eis und Feuer\" veröffentlicht?"}, {"source": "Who is the youngest current US governor?", "target": "Wer ist derzeit der jüngste amerikanische Gouverneur?"}, {"source": "Has Bernie Sanders ever been president of the United States?", "target": "War Bernie Sanders jemals Präsident der Vereinigten Staaten?"}, {"source": "Which actor was Stephenie Meyers first choice to pl


Translating de_DE.jsonl in batches (few-shot):   7%|▋         | 1/15 [00:28<06:41, 28.66s/it][A

Sending prompt to model: 
Your task is to translate English sentences into German.
Below are a few examples of English sentences with named entities translated correctly into German.
Pay close attention to how named entities (e.g., people, organizations, locations, product names) are translated correctly and consistently.

Examples:
[{"source": "What is the seventh tallest mountain in North America?", "target": "Wie heißt der siebthöchste Berg Nordamerikas?"}, {"source": "What year was the first book of the A Song of Ice and Fire series published?", "target": "In welchem Jahr wurde das erste Buch der Reihe \"Das Lied von Eis und Feuer\" veröffentlicht?"}, {"source": "Who is the youngest current US governor?", "target": "Wer ist derzeit der jüngste amerikanische Gouverneur?"}, {"source": "Has Bernie Sanders ever been president of the United States?", "target": "War Bernie Sanders jemals Präsident der Vereinigten Staaten?"}, {"source": "Which actor was Stephenie Meyers first choice to pl


Translating de_DE.jsonl in batches (few-shot):  13%|█▎        | 2/15 [00:53<05:41, 26.29s/it][A

Sending prompt to model: 
Your task is to translate English sentences into German.
Below are a few examples of English sentences with named entities translated correctly into German.
Pay close attention to how named entities (e.g., people, organizations, locations, product names) are translated correctly and consistently.

Examples:
[{"source": "What is the seventh tallest mountain in North America?", "target": "Wie heißt der siebthöchste Berg Nordamerikas?"}, {"source": "What year was the first book of the A Song of Ice and Fire series published?", "target": "In welchem Jahr wurde das erste Buch der Reihe \"Das Lied von Eis und Feuer\" veröffentlicht?"}, {"source": "Who is the youngest current US governor?", "target": "Wer ist derzeit der jüngste amerikanische Gouverneur?"}, {"source": "Has Bernie Sanders ever been president of the United States?", "target": "War Bernie Sanders jemals Präsident der Vereinigten Staaten?"}, {"source": "Which actor was Stephenie Meyers first choice to pl


Translating de_DE.jsonl in batches (few-shot):  20%|██        | 3/15 [01:15<04:53, 24.46s/it][A

Sending prompt to model: 
Your task is to translate English sentences into German.
Below are a few examples of English sentences with named entities translated correctly into German.
Pay close attention to how named entities (e.g., people, organizations, locations, product names) are translated correctly and consistently.

Examples:
[{"source": "What is the seventh tallest mountain in North America?", "target": "Wie heißt der siebthöchste Berg Nordamerikas?"}, {"source": "What year was the first book of the A Song of Ice and Fire series published?", "target": "In welchem Jahr wurde das erste Buch der Reihe \"Das Lied von Eis und Feuer\" veröffentlicht?"}, {"source": "Who is the youngest current US governor?", "target": "Wer ist derzeit der jüngste amerikanische Gouverneur?"}, {"source": "Has Bernie Sanders ever been president of the United States?", "target": "War Bernie Sanders jemals Präsident der Vereinigten Staaten?"}, {"source": "Which actor was Stephenie Meyers first choice to pl


Processing files (few-shot):   0%|          | 0/10 [01:20<?, ?it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-16-2405482697.py", line 6, in <cell line: 0>
    few_shot_eval(
  File "/tmp/ipython-input-15-2179269205.py", line 25, in few_shot_eval
    translated_texts_batch = get_translated_content_with_retries_few_shot(
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/tenacity/__init__.py", line 338, in wrapped_f
    return copy(f, *args, **kw)
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
             ^^^^^^^^^^^^^^^^^^

TypeError: object of type 'NoneType' has no len()

In [None]:
# Score the few-shot run; presumably this reads the predictions written under
# the "few-shot" template id (calculate_scores is defined elsewhere; confirm).
# NOTE(review): the recorded output prints ten "Average COMET score" values that
# split into a ~90 group and a ~30 group, and the translation cell above ended
# in a traceback. Check whether the low-scoring files hold "ERROR: ..."
# placeholder predictions before reporting these numbers.
calculate_scores("few-shot")

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1260 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [15:27<00:00, 23.19s/it]


Average COMET score: 89.88


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Created 1316 instances


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 42/42 [17:32<00:00, 25.05s/it]


Average COMET score: 89.92


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1654 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [20:12<00:00, 23.31s/it]


Average COMET score: 30.63


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1660 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [18:22<00:00, 21.21s/it]


Average COMET score: 31.88


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1229 instances


Predicting DataLoader 0: 100%|██████████| 39/39 [14:59<00:00, 23.06s/it]


Average COMET score: 92.62


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1268 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [14:52<00:00, 22.30s/it]


Average COMET score: 91.40


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1409 instances


Predicting DataLoader 0: 100%|██████████| 45/45 [16:25<00:00, 21.90s/it]


Average COMET score: 32.04


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1177 instances


Predicting DataLoader 0: 100%|██████████| 37/37 [13:04<00:00, 21.21s/it]


Average COMET score: 90.96


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1260 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [13:14<00:00, 19.87s/it]


Average COMET score: 33.04


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1544 instances


Predicting DataLoader 0: 100%|██████████| 49/49 [16:09<00:00, 19.79s/it]


Average COMET score: 32.44


# Chain of Thought Prompting

In [None]:
# Chat prompt for chain-of-thought translation. Despite the `_STRING` suffix,
# this name is bound to the object returned by ChatPromptTemplate.from_messages
# (a system message plus a human message), not to a plain str.
# Placeholders filled at call time: {target_language} and {source_texts_json}.
# The doubled braces in the JSON example are presumably escapes so that the
# rendered prompt shows literal braces rather than template variables.
COT_PROMPT_TEMPLATE_STRING = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert linguist and translator. Your primary goal is to translate English sentences into {target_language}.
            You must pay special attention to identifying and accurately translating named entities within the sentence.

            Here's your step-by-step Chain-of-Thought process for each translation.
            Perform these steps internally, and then provide only the final JSON output as requested below.

            1.  **Identify Named Entities:**
                Go through each English sentence and meticulously identify all named entities.
                Named entities include, but are not limited to, persons, organizations, locations, dates, times, and specific titles.

            2.  **Determine Entity Type:**
                For each named entity identified in step 1, classify its specific type.
                Examples of types include: Person, Organization (ORG), Location (LOC), Date, Time, Title, Musical work, Artwork,
                Food, Animal, Plant, Book, Book series, Fictional entity, Landmark, Movie, Place of worship,
                Natural place, TV series.
                This classification provides crucial context for accurate translation.

            3.  **Contextual Translation of Named Entities:**
                For each identified named entity, determine its most appropriate translation into {target_language}.
                Consider the entity's type and the overall context of the sentence.
                - For proper nouns (like names of people or specific places), often a transliteration or the commonly accepted international form is best.
                - For organizations, use their official translated name if available, otherwise transliterate or provide a descriptive translation.
                - For dates and times, translate them according to the {target_language}'s conventions.

            4.  **Translate Remaining Sentence:**
                Translate all parts of the English sentence that are NOT named entities.
                Ensure that this translation is grammatically correct, natural-sounding, and culturally appropriate for {target_language}.

            5.  **Combine and Finalize Translation:**
                Integrate the accurately translated named entities (from step 3) back into the translated sentence (from step 4).
                Construct the final, complete, and fluent translation of the original English sentence.

            Your final output MUST be a JSON array of objects. Each object MUST contain two fields: "id" (the original sentence ID) and "translation" (the complete translated sentence).
            Do NOT include any other text, reasoning steps, or formatting outside of the JSON array.
            Example for JSON output:
            ```json
            [
              {{"id": "sentence_id_1", "translation": "Translated sentence 1."}},
              {{"id": "sentence_id_2", "translation": "Translated sentence 2."}}
            ]
            ```
            """
        ),
        ("human", "Translate the following English sentences into {target_language}:\n{source_texts_json}"),
    ]
)

In [None]:
# Retry-enabled CoT translation handler
@retry(
    wait=wait_random_exponential(multiplier=1, min=DELAY_BETWEEN_REQUESTS_SECONDS, max=60),
    stop=stop_after_attempt(5),
    retry=retry_if_exception_type(Exception)
)
def get_translated_content_with_retries_using_cot(source_records_batch, target_language, llm_instance, template):
    """
    Translate one batch of records with a chat or plain prompt template.

    Args:
        source_records_batch: Records to translate; each one is read through
            its 'id' and 'source' keys.
        target_language: Language name substituted into the prompt.
        llm_instance: Object whose ``invoke(...)`` returns a response that
            exposes ``.content``.
        template: Either a ``ChatPromptTemplate`` (rendered with
            ``format_messages``) or any template offering ``format``; both
            receive ``target_language`` and ``source_texts_json``.

    Returns:
        A list with one entry per input record, in input order: either the
        model's translation or an "ERROR: ..." placeholder string when the
        reply could not be parsed or did not contain that record's id.

    Raises:
        Exceptions raised while rendering the prompt or invoking the model
        are not caught here; the ``@retry`` decorator re-runs the whole call
        (up to 5 attempts with randomised exponential backoff) and the
        failure reaches the caller once the attempts are used up. Problems
        with the *content* of the reply are never retried; they are reported
        through the placeholder strings instead.
    """
    global api_request_count

    # Prepare prompt input
    input_json_for_prompt = [{"id": rec['id'], "text": rec['source']} for rec in source_records_batch]
    source_texts_json_str = json.dumps(input_json_for_prompt, ensure_ascii=False)

    # Handle both ChatPromptTemplate and string template
    if isinstance(template, ChatPromptTemplate):
        response = llm_instance.invoke(template.format_messages(
            target_language=target_language,
            source_texts_json=source_texts_json_str
        ))
    else:
        prompt_to_send = template.format(
            source_texts_json=source_texts_json_str,
            target_language=target_language
        )
        response = llm_instance.invoke(prompt_to_send)

    api_request_count += 1

    # Take a printable snapshot of the reply once, up front. The error
    # handlers below log this snapshot instead of calling .strip() again:
    # re-stripping a non-string reply would raise inside the handler and turn
    # a reportable formatting problem into a full (billable) retry.
    raw_content = response.content
    raw_text = raw_content.strip() if isinstance(raw_content, str) else str(raw_content)

    try:
        if not isinstance(raw_content, str):
            raise TypeError(f"Expected text content from the model, got {type(raw_content).__name__}.")

        # Clean markdown-wrapped JSON: accept both "```json ... ```" and a
        # bare "``` ... ```" fence.
        cleaned_content = raw_text
        if cleaned_content.startswith("```") and cleaned_content.endswith("```"):
            cleaned_content = cleaned_content[3:-3]
            if cleaned_content[:4].lower() == "json":
                cleaned_content = cleaned_content[4:]
            cleaned_content = cleaned_content.strip()

        translated_outputs = json.loads(cleaned_content)
        print(translated_outputs)  # Debug output

        # Validate format
        if not isinstance(translated_outputs, list) or \
           not all(isinstance(item, dict) and 'id' in item and 'translation' in item for item in translated_outputs):
            raise ValueError("Model did not return a valid JSON array of translation objects.")

        # Reorder results to match input (the model may reorder or drop items)
        translated_dict = {item['id']: item['translation'] for item in translated_outputs}
        ordered_translations = [translated_dict.get(record['id'], "ERROR: ID not found in JSON output")
                                for record in source_records_batch]

        return ordered_translations

    # JSONDecodeError subclasses ValueError, so it must be listed first to
    # keep its more specific message.
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}. Raw response: {raw_text}")
        return ["ERROR: Invalid JSON response from model"] * len(source_records_batch)
    except ValueError as e:
        print(f"Value Error (JSON format issue): {e}. Raw response: {raw_text}")
        return ["ERROR: Invalid JSON structure from model"] * len(source_records_batch)
    except Exception as e:
        print(f"An unexpected error occurred while processing model output: {e}. Raw response: {raw_text}")
        return ["ERROR: Unexpected issue processing model output"] * len(source_records_batch)

# Main evaluation loop for CoT or other flexible templates
def translation_eval_cot(template, template_id, few_shot_examples=None):
    """
    Translate every file in ``jsonl_files`` with the given template and save the predictions.

    For each input file the records are translated in chunks of ``BATCH_SIZE``
    via ``get_translated_content_with_retries_using_cot`` (using the global
    ``llm``), and one JSON object per record is written to
    ``<output_prediction_dir>/<template_id>/<input file name>``.

    Args:
        template: A ``ChatPromptTemplate`` (used as-is) or another template
            object; for the latter, ``few_shot_examples`` can be bound into
            it through ``template.partial``.
        template_id: Name of the sub-directory that receives the predictions;
            also shown in progress messages.
        few_shot_examples: Optional iterable of example dicts read through
            their 'target_locale', 'source' and 'target' keys. Ignored when
            ``template`` is a ``ChatPromptTemplate``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    overall_pbar = tqdm.tqdm(jsonl_files, desc=f"Processing files ({template_id})")
    last_file_index = len(jsonl_files) - 1

    output_prediction_sub_dir = os.path.join(output_prediction_dir, template_id)
    os.makedirs(output_prediction_sub_dir, exist_ok=True)

    for file_index, file_path in enumerate(overall_pbar):
        filename = os.path.basename(file_path)
        outfile_path = os.path.join(output_prediction_sub_dir, filename)

        # Load input file; blank lines (e.g. a trailing newline at the end of
        # the file) are skipped instead of crashing json.loads.
        data_to_translate = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    data_to_translate.append(json.loads(line))

        translated_results_for_file = []

        for i in tqdm.tqdm(range(0, len(data_to_translate), BATCH_SIZE), desc=f"Translating {filename} in batches", leave=False):
            batch_records = data_to_translate[i : i + BATCH_SIZE]
            # The first record's locale is used for the whole batch.
            target_locale = batch_records[0]['target_locale']
            target_language = get_language_name(target_locale)

            # Handle dynamic few-shot partial or passthrough
            if isinstance(template, ChatPromptTemplate):
                current_template = template
            else:
                if few_shot_examples:
                    # Keep only the examples whose locale maps to this batch's language.
                    lang_specific_examples = [
                        ex for ex in few_shot_examples if get_language_name(ex['target_locale']) == target_language
                    ]
                    formatted_examples = json.dumps([
                        {"source": ex['source'], "target": ex['target']} for ex in lang_specific_examples
                    ], ensure_ascii=False)
                    current_template = template.partial(few_shot_examples_json=formatted_examples)
                else:
                    current_template = template

            try:
                # Run translation using selected template
                translated_texts_batch = get_translated_content_with_retries_using_cot(
                    batch_records, target_language, llm, current_template
                )

                # Format output
                for j, record in enumerate(batch_records):
                    record_id = record['id']
                    source_text = record['source']
                    source_locale = record['source_locale']
                    translated_text = translated_texts_batch[j] if j < len(translated_texts_batch) else "ERROR: Translation missing"
                    translated_results_for_file.append({
                        "id": record_id,
                        "source_language": get_language_name(source_locale),
                        "target_language": target_language,
                        "text": source_text,
                        "prediction": translated_text,
                    })

            except Exception as e:
                print(f"\nCRITICAL ERROR: Failed to translate a batch starting with ID '{batch_records[0]['id']}' after multiple retries. Error: {e}")
                # Keep one output row per input record so the prediction file
                # stays aligned with its input file.
                for record in batch_records:
                    translated_results_for_file.append({
                        "id": record['id'],
                        "source_language": get_language_name(record['source_locale']),
                        "target_language": get_language_name(record['target_locale']),
                        "text": record['source'],
                        "prediction": "ERROR: Batch translation failed due to API issues/rate limits.",
                    })

            # Respect rate limits. The file position is tracked with enumerate
            # rather than overall_pbar.n: tqdm only advances n after the loop
            # body has finished, so a check against it is always true here and
            # would also sleep after the very last batch of the very last file.
            is_last_batch = i + BATCH_SIZE >= len(data_to_translate)
            if not (is_last_batch and file_index == last_file_index):
                time.sleep(DELAY_BETWEEN_REQUESTS_SECONDS)

        # Save predictions to file
        with open(outfile_path, 'w', encoding='utf-8') as f:
            for res in translated_results_for_file:
                f.write(json.dumps(res, ensure_ascii=False) + '\n')

        print(f"\nTranslations for {filename} saved to {outfile_path}")

    overall_pbar.close()
    print(f"All files processed for {template_id} and translations saved.")


In [None]:
# Run the chain-of-thought pipeline over every input file; predictions are
# written under the "cot" template id. No few-shot examples are passed.
translation_eval_cot(COT_PROMPT_TEMPLATE_STRING, "cot")


In [None]:
# Score the chain-of-thought run; presumably this reads the predictions written
# under the "cot" template id (calculate_scores is defined elsewhere; confirm).
calculate_scores("cot")

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Created 1260 instances


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 40/40 [14:51<00:00, 22.28s/it]


Average COMET score: 89.89


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1316 instances


Predicting DataLoader 0: 100%|██████████| 42/42 [16:57<00:00, 24.23s/it]


Average COMET score: 89.86


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1654 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [20:39<00:00, 23.83s/it]


Average COMET score: 83.40


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1660 instances


Predicting DataLoader 0: 100%|██████████| 52/52 [19:08<00:00, 22.09s/it]


Average COMET score: 93.49


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1229 instances


Predicting DataLoader 0: 100%|██████████| 39/39 [14:35<00:00, 22.46s/it]


Average COMET score: 92.30


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1268 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [14:30<00:00, 21.76s/it]


Average COMET score: 91.19


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1409 instances


Predicting DataLoader 0: 100%|██████████| 45/45 [18:01<00:00, 24.03s/it]


Average COMET score: 93.31


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1177 instances


Predicting DataLoader 0: 100%|██████████| 37/37 [12:38<00:00, 20.50s/it]


Average COMET score: 89.12


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1260 instances


Predicting DataLoader 0: 100%|██████████| 40/40 [13:33<00:00, 20.33s/it]


Average COMET score: 92.24


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created 1544 instances


Predicting DataLoader 0: 100%|██████████| 49/49 [16:36<00:00, 20.35s/it]


Average COMET score: 92.53


Collecting langchain_community
  Downloading langchain_community-0.3.25-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB