# Translation pipeline notebook

In [2]:
from dotenv import load_dotenv
import os

# Load variables from .env
load_dotenv()

# Print the value of the key 'HOME'
print(os.getcwd())

/Users/asaf/Workspace/biu/hebrew_text_retrieval/notebooks/translation


In [3]:
import sys

PROJECT_DIR = os.path.abspath('../../')
SRC_DIR = os.path.join(PROJECT_DIR, 'src')

if not os.path.isdir(SRC_DIR):
    raise FileNotFoundError(f'{SRC_DIR} not found')

if SRC_DIR not in sys.path:
    print(f'Adding {SRC_DIR} to sys.path')
    sys.path.append(SRC_DIR)

os.chdir(PROJECT_DIR)

Adding /Users/asaf/Workspace/biu/hebrew_text_retrieval/src to sys.path


In [16]:
import pandas as pd
import glob
import re
from pydantic import BaseModel
from tqdm.notebook import tqdm
from translation.api.translate import run_translation_pipeline
from pydantic import BaseModel

## Translation pipeline

In [12]:
SOURCE_FILE_PATH = "outputs/translation/BeIR/BeIR_msmarco/queries.csv"
PROMPT_FILE_NAMES = [
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_nocontext.yaml",
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_searchopt.yaml", 
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_unified.yaml", 
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_zeroshot.yaml", 
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_default.yaml",          
    "prompts/translation/openai/translation_prompts_few_shot_v20250105_default.yaml"
]
MODEL_NAME = "gpt-4o-mini-2024-07-18"  
LIMIT = 100
LIMIT = 100
ENGLISH_KEY = "English"
HEBREW_KEY = "Hebrew"
HEBREW_KEY_QUERY = "Hebrew Query"
HEBREW_KEY_DOCUMENT = "Hebrew Document"
CONTEXT_KEY = "Context"

class UnifiedTranslation(BaseModel):
        hebrew_document: str
        hebrew_query: str

        def __str__(self):
            return "<hebrew_document>" + self.hebrew_document + "</hebrew_document><hebrew_query>" + self.hebrew_query + "</hebrew_query>"
        
        def __repr__(self):
            return "<hebrew_document>" + self.hebrew_document + "</hebrew_document><hebrew_query>" + self.hebrew_query + "</hebrew_query>"


In [13]:
class Translation(BaseModel):
    hebrew: str

    def __str__(self):
        return self.hebrew
    
    def __repr__(self):
        return self.hebrew
    

class UnifiedTranslation(BaseModel):
    hebrew_query: str
    hebrew_document: str

    def __str__(self):
        return f"{self.hebrew_query}</query>\n{self.hebrew_document}</document>"
    
    def __repr__(self):
        return f"{self.hebrew_query}</query>\n{self.hebrew_document}</document>"

In [14]:
for prompt_file_name in tqdm(PROMPT_FILE_NAMES):
    # Extract version from prompt file name
    match = re.search(r"v\d{8}_\w+", prompt_file_name)
    version = None
    if match:
        version = match.group(0)
    
    print(f"Running translation pipeline for prompt file: {prompt_file_name}, version: {version}")

    response_format = Translation
    if 'unified' in prompt_file_name:
        response_format = UnifiedTranslation

    # Run translation pipeline
    run_translation_pipeline(
        source_file_path=SOURCE_FILE_PATH,
        prompt_file_name=prompt_file_name,
        model_name=MODEL_NAME,
        limit=LIMIT,
        english_key=ENGLISH_KEY,
        hebrew_key=HEBREW_KEY,
        context_key=CONTEXT_KEY,
        hebrew_key_query=HEBREW_KEY,
        hebrew_key_document=HEBREW_KEY,
        version=version,
        response_format=response_format
    )

  0%|          | 0/6 [00:00<?, ?it/s]

Running translation pipeline for prompt file: prompts/translation/openai/translation_prompts_few_shot_v20250128_nocontext.yaml, version: v20250128_nocontext
Translation output file path: outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_nocontext.csv
Limiting the number of texts to 100.
Skipping translation of outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_nocontext.csv as have been translated.
Running translation pipeline for prompt file: prompts/translation/openai/translation_prompts_few_shot_v20250128_searchopt.yaml, version: v20250128_searchopt
Translation output file path: outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_searchopt.csv
Limiting the number of texts to 100.
Skipping translation of outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_searchopt.csv as have been translated.
Running translation pipeline for prompt file: prompts/translation/openai/translati

Rows: 100%|██████████| 100/100 [01:48<00:00,  1.09s/it]


Running translation pipeline for prompt file: prompts/translation/openai/translation_prompts_few_shot_v20250105_default.yaml, version: v20250105_default
Translation output file path: outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250105_default.csv
Limiting the number of texts to 100.


Rows: 100%|██████████| 100/100 [01:43<00:00,  1.03s/it]


In [20]:
# Define folder path
folder_path = "outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18"
# Find all CSV files in the folder
file_paths = glob.glob(os.path.join(folder_path, "*.csv"))
file_paths

['outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_nocontext.csv',
 'outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_searchopt.csv',
 'outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_unified.csv',
 'outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_zeroshot.csv',
 'outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_default.csv',
 'outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250105_default.csv']

In [22]:
# Define shared fields
shared_fields = ["id", "text", "context_id", "context_text", "category", "dataset_name", "model_name"]

# Define different fields
different_fields = ["translation", "input_tokens", "output_tokens", "model_time"]

# Load all files into DataFrames
dfs = [pd.read_csv(file)[shared_fields + different_fields] for file in file_paths]

# Rename differing columns to track their source
for i, (df, file) in enumerate(zip(dfs, file_paths)):
    file_name = os.path.basename(file).replace(".csv", "")  # Extract file name
    for col in different_fields:
        if col in df.columns:
            df.rename(columns={col: f"{col}_{file_name}"}, inplace=True)

# Merge all DataFrames on shared fields using an outer join
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = merged_df.merge(df, on=shared_fields, how="outer")

# Save the merged DataFrame
output_file = os.path.join(folder_path, "merged_output.csv")
merged_df.to_csv(output_file, index=False)
print(f"Merged file saved at: {output_file}")


Merged file saved at: outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/merged_output.csv
