# Translation pipeline notebook

In [1]:
from dotenv import load_dotenv
import os

# Load variables from .env
load_dotenv()

# Print the current working directory; the relative project path ('../../')
# used in the next cell is resolved against it.
print(os.getcwd())

/home/nlp/achimoa/workspace/hebrew_text_retrieval/notebooks/translation


In [2]:
import sys

# Resolve the project root only once per kernel session.
# '../../' is interpreted relative to the current working directory, and this
# cell changes that directory at the end. Recomputing the path on a re-run
# would therefore climb two more levels and point outside the project.
# Restart the kernel to re-resolve the path from the notebook directory.
if 'PROJECT_DIR' not in globals():
    PROJECT_DIR = os.path.abspath('../../')
SRC_DIR = os.path.join(PROJECT_DIR, 'src')

# Fail early if the expected 'src' directory is missing.
if not os.path.isdir(SRC_DIR):
    raise FileNotFoundError(f'{SRC_DIR} not found')

# Make modules under 'src' importable; skip if the entry is already present
# so repeated runs do not add duplicates.
if SRC_DIR not in sys.path:
    print(f'Adding {SRC_DIR} to sys.path')
    sys.path.append(SRC_DIR)

# Switch the working directory to the project root.
os.chdir(PROJECT_DIR)

Adding /home/nlp/achimoa/workspace/hebrew_text_retrieval/src to sys.path


In [3]:
import pandas as pd
import re
from tqdm.notebook import tqdm
from translation.api.translate import run_translation_pipeline

In [4]:
# --- Input, model and run size ---
SOURCE_FILE_PATH = "outputs/translation/BeIR/BeIR_msmarco/queries.csv"
MODEL_NAME = "gpt-4o-mini-2024-07-18"
LIMIT = 10  # forwarded to the pipeline as `limit`

# --- Prompt variants ---
# The loop in the next cell invokes the pipeline once per entry.
# Uncomment further entries to run additional variants.
PROMPT_FILE_NAMES = [
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_nocontext.yaml",
    # "prompts/translation/openai/translation_prompts_few_shot_v20250128_searchopt.yaml",
    # "prompts/translation/openai/translation_prompts_few_shot_v20250128_unified.yaml",
    # "prompts/translation/openai/translation_prompts_few_shot_v20250128_zeroshot.yaml",
    # "prompts/translation/openai/translation_prompts_few_shot_v20250128_default.yaml",
    # "prompts/translation/openai/translation_prompts_few_shot_v20250105_default.yaml",
]

# --- Key names ---
ENGLISH_KEY = "English"
HEBREW_KEY = "Hebrew"
CONTEXT_KEY = "Context"
# NOTE(review): the two keys below are not referenced in the visible cells;
# the pipeline call passes HEBREW_KEY for both the query and document keys.
# Confirm which is intended.
HEBREW_KEY_QUERY = "Hebrew Query"
HEBREW_KEY_DOCUMENT = "Hebrew Document"

In [5]:

# Run the translation pipeline once for every configured prompt file.
for prompt_file_name in tqdm(PROMPT_FILE_NAMES):
    # The version tag is the "v<8 digits>_<word chars>" part of the file name
    # (e.g. "v20250128_nocontext"); it stays None when no such tag is present.
    version_match = re.search(r"v\d{8}_\w+", prompt_file_name)
    version = version_match.group(0) if version_match else None

    print(f"Running translation pipeline for prompt file: {prompt_file_name}, version: {version}")

    # NOTE(review): hebrew_key_query and hebrew_key_document both receive
    # HEBREW_KEY, while HEBREW_KEY_QUERY / HEBREW_KEY_DOCUMENT are defined in
    # the configuration cell but unused — confirm this is intentional.
    run_translation_pipeline(
        source_file_path=SOURCE_FILE_PATH,
        prompt_file_name=prompt_file_name,
        model_name=MODEL_NAME,
        limit=LIMIT,
        english_key=ENGLISH_KEY,
        hebrew_key=HEBREW_KEY,
        context_key=CONTEXT_KEY,
        hebrew_key_query=HEBREW_KEY,
        hebrew_key_document=HEBREW_KEY,
        version=version,
    )

  0%|          | 0/1 [00:00<?, ?it/s]

Running translation pipeline for prompt file: prompts/translation/openai/translation_prompts_few_shot_v20250128_nocontext.yaml, version: v20250128_nocontext
queries_v20250128_nocontext.csv
Limiting the number of texts to 10.




Rows: 100%|██████████| 10/10 [00:09<00:00,  1.07it/s]
