# Translation pipeline notebook

In [2]:
from dotenv import load_dotenv
import os

# Load variables from .env
load_dotenv()

# Print the current working directory (the next cell resolves the project root relative to it)
print(os.getcwd())

/Users/asaf/Workspace/biu/hebrew_text_retrieval/notebooks/translation


In [3]:
import sys

# Remember the directory the kernel started in, once per kernel session.
# This cell ends with os.chdir(PROJECT_DIR); without the anchor, re-running it would
# resolve '../../' relative to PROJECT_DIR and point two levels too high.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Project root is two levels above the notebook directory; sources live in <root>/src.
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, '../../'))
SRC_DIR = os.path.join(PROJECT_DIR, 'src')

# Fail fast if the expected layout is not there.
if not os.path.isdir(SRC_DIR):
    raise FileNotFoundError(f'{SRC_DIR} not found')

# Make the project packages importable (only added once, so re-runs do not duplicate the entry).
if SRC_DIR not in sys.path:
    print(f'Adding {SRC_DIR} to sys.path')
    sys.path.append(SRC_DIR)

# All relative paths used later in the notebook are relative to the project root.
os.chdir(PROJECT_DIR)

Adding /Users/asaf/Workspace/biu/hebrew_text_retrieval/src to sys.path


In [4]:
import pandas as pd
import re
from tqdm.notebook import tqdm
from translation.api.translate import run_translation_pipeline
from pydantic import BaseModel

## Translation pipeline

In [5]:
# Input CSV passed to run_translation_pipeline as source_file_path.
SOURCE_FILE_PATH = "outputs/translation/BeIR/BeIR_msmarco/queries.csv"
# Prompt files to evaluate; the pipeline loop launches one run per entry.
PROMPT_FILE_NAMES = [
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_nocontext.yaml",
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_searchopt.yaml", 
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_unified.yaml", 
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_zeroshot.yaml", 
    "prompts/translation/openai/translation_prompts_few_shot_v20250128_default.yaml"            
    # "prompts/translation/openai/translation_prompts_few_shot_v20250105_default.yaml"
]
# Model identifier passed to the pipeline as model_name.
MODEL_NAME = "gpt-4o-mini-2024-07-18"  
# Passed to the pipeline as limit (the recorded run logs "Limiting the number of texts to 100.").
LIMIT = 100
# Key names passed to the pipeline as english_key / hebrew_key.
ENGLISH_KEY = "English"
HEBREW_KEY = "Hebrew"
# NOTE(review): these two constants are not referenced in the visible cells; the pipeline
# call passes HEBREW_KEY for both hebrew_key_query and hebrew_key_document. Confirm which is intended.
HEBREW_KEY_QUERY = "Hebrew Query"
HEBREW_KEY_DOCUMENT = "Hebrew Document"
# Key name passed to the pipeline as context_key.
CONTEXT_KEY = "Context"

class UnifiedTranslation(BaseModel):
    # Response schema with two string fields: the translated document and the translated query.
    # Documented with comments rather than a docstring so the model's generated schema is unchanged.
    hebrew_document: str
    hebrew_query: str

    def __str__(self):
        # Serialize as two adjacent tagged sections: document first, then query.
        return (
            f"<hebrew_document>{self.hebrew_document}</hebrew_document>"
            f"<hebrew_query>{self.hebrew_query}</hebrew_query>"
        )

    def __repr__(self):
        # repr() renders exactly the same tagged text as str().
        return self.__str__()


In [28]:
for prompt_file_name in tqdm(PROMPT_FILE_NAMES):
    # The version tag is the "v<8 digits>_<suffix>" part of the prompt file name
    # (e.g. "v20250128_unified"); it stays None when the name has no such part.
    match = re.search(r"v\d{8}_\w+", prompt_file_name)
    version = match.group(0) if match else None

    print(f"Running translation pipeline for prompt file: {prompt_file_name}, version: {version}")

    # One pipeline run per prompt file.
    # NOTE(review): HEBREW_KEY is passed for both hebrew_key_query and hebrew_key_document,
    # while HEBREW_KEY_QUERY / HEBREW_KEY_DOCUMENT are defined but unused — confirm intent.
    # NOTE(review): response_format=UnifiedTranslation is used for every prompt file,
    # not only the "unified" one — confirm this is what the other prompts expect.
    run_translation_pipeline(
        source_file_path=SOURCE_FILE_PATH,
        prompt_file_name=prompt_file_name,
        version=version,
        model_name=MODEL_NAME,
        limit=LIMIT,
        english_key=ENGLISH_KEY,
        hebrew_key=HEBREW_KEY,
        hebrew_key_query=HEBREW_KEY,
        hebrew_key_document=HEBREW_KEY,
        context_key=CONTEXT_KEY,
        response_format=UnifiedTranslation,
    )

  0%|          | 0/1 [00:00<?, ?it/s]

Running translation pipeline for prompt file: prompts/translation/openai/translation_prompts_few_shot_v20250128_unified.yaml, version: v20250128_unified
queries_v20250128_unified.csv
Limiting the number of texts to 100.


Rows: 100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


### Post-process unified translation file

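In a unified translation, both the query and the document are translated together and stored as one tagged string; the cell below splits that string back into separate `query` and `document` columns.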

In [29]:
# Split the tagged unified output ("<hebrew_document>…</hebrew_document><hebrew_query>…</hebrew_query>")
# into separate columns, keeping the untouched string in `raw_translation`.
unified_translation_df = (
    pd.read_csv(
        "outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/queries_v20250128_unified.csv",
        encoding="utf-8",
    )
    .assign(raw_translation=lambda frame: frame['translation'])
    .assign(
        # Query: the text after the opening query tag, with the closing tag removed.
        query=lambda frame: frame['raw_translation'].apply(
            lambda raw: raw.split("<hebrew_query>")[1].replace("</hebrew_query>", "").strip()
        ),
        # Document: the text before the closing document tag, with the opening tag removed.
        document=lambda frame: frame['raw_translation'].apply(
            lambda raw: raw.replace("<hebrew_document>", "").split("</hebrew_document>")[0].strip()
        ),
    )
    # `translation` ends up holding only the query.
    .assign(translation=lambda frame: frame['query'])
)
unified_translation_df.head()

Unnamed: 0,_id,title,text,context_id,context_text,category,dataset_name,tokenizer,english_key,hebrew_key,...,model_time,translation_time,timestamp,batch_idx,batch_size,batch_datetime,translation_datetime,raw_translation,query,document
0,949092,,when is the most expensive time to go to pun...,5079981,Close Gallery. Zoom Picture. New York.â Punt...,Misc,BeIR/msmarco,gpt-4o-mini-2024-07-18,English,Hebrew,...,1.270042,1.277493,2025-02-06 14:17:46.486342,0.0,1.0,2025-02-06 14:17:45.208840,2025-02-06 14:17:45.207733,<hebrew_document>סגירת גלריה. זום על התמונה. נ...,מתי הזמן היקר ביותר לנסוע לפונטה קנה?,סגירת גלריה. זום על התמונה. ניו יורק. פונטה קנ...
1,410021,,is fentanyl or dilaudid stronger,1027387,"If you get a doctor that is a pain doctor, you...",Misc,BeIR/msmarco,gpt-4o-mini-2024-07-18,English,Hebrew,...,1.094847,1.105361,2025-02-06 14:17:47.600235,1.0,1.0,2025-02-06 14:17:46.494864,2025-02-06 14:17:45.207733,<hebrew_document>אם תצליחו למצוא רופא שמתמחה ב...,"מה חזק יותר, פנטניל או דילודיד?","אם תצליחו למצוא רופא שמתמחה בכאב, תזכו להצלחה ..."
2,291894,,how many people are living in Damascus now,315872,"Damascus is the second largest city in Syria, ...",Misc,BeIR/msmarco,gpt-4o-mini-2024-07-18,English,Hebrew,...,1.217439,1.226539,2025-02-06 14:17:48.848811,2.0,1.0,2025-02-06 14:17:47.622260,2025-02-06 14:17:45.207733,<hebrew_document>דמשק היא העיר השנייה הגדולה ב...,כמה אנשים חיים בדמשק עכשיו?,"דמשק היא העיר השנייה הגדולה ביותר בסוריה, עם א..."
3,178829,,effects of nicotine lozenges on the body,5847330,The nicotine lozenge (NicoretteÂ® Lozenge) rel...,Misc,BeIR/msmarco,gpt-4o-mini-2024-07-18,English,Hebrew,...,1.596567,1.605973,2025-02-06 14:17:50.480161,3.0,1.0,2025-02-06 14:17:48.874174,2025-02-06 14:17:45.207733,<hebrew_document>סוכריית ניקוטין (NicoretteÂ® ...,מהן ההשפעות של סוכריות ניקוטין על הגוף?,סוכריית ניקוטין (NicoretteÂ® Lozenge) משחררת כ...
4,977756,,where is bath michigan located,2328514,"Bath, MI. Sponsored Topics. Bath is an unincor...",Misc,BeIR/msmarco,gpt-4o-mini-2024-07-18,English,Hebrew,...,2.016298,2.025465,2025-02-06 14:17:52.531785,4.0,1.0,2025-02-06 14:17:50.506305,2025-02-06 14:17:45.207733,"<hebrew_document>באאת, מישיגן. נושאים ממומנים....",איפה ממוקמת באת מישיגן?,"באאת, מישיגן. נושאים ממומנים. באת היא קהילה לא..."


In [1]:
# Debug check: show the text that follows the <hebrew_query> tag in the first raw translation.
# Needs `unified_translation_df` from the post-processing cell; the recorded NameError shows
# this cell was executed before that variable existed.
unified_translation_df['raw_translation'].iloc[0].split('<hebrew_query>')[1]

NameError: name 'unified_translation_df' is not defined

## Merge translations

In [6]:
import os
import pandas as pd

def load_and_merge_csvs(directory, main_filename):
    """Merge the translation columns of every CSV in ``directory`` into the main file.

    Every ``*.csv`` file in ``directory`` whose name does not start with ``_`` (and is
    not the main file itself) contributes two columns, ``<file>_translation`` and
    ``<file>_document``, left-joined onto the main frame on ``_id`` and ``context_id``.
    Files whose name ends with ``unified.csv`` store both texts in one tagged string
    (``<hebrew_document>…</hebrew_document><hebrew_query>…</hebrew_query>``); for those
    the query becomes the translation and the document is extracted separately. For all
    other files the document column is empty.

    Files are processed in sorted name order so the resulting column order is stable.

    Args:
        directory: Folder containing the main file and the per-prompt CSV files.
        main_filename: Name of the main CSV inside ``directory``.

    Returns:
        The main DataFrame with the additional prefixed columns.

    Raises:
        FileNotFoundError: If the main file does not exist.
        ValueError: If the main file or any merged file lacks ``_id`` or ``context_id``.
    """
    key_columns = ["_id", "context_id"]

    def _tag_content(raw, tag):
        # Text between <tag> and </tag>; None when the cell is not a string (e.g. NaN)
        # or the opening tag is absent, so one bad row cannot abort the whole merge.
        if not isinstance(raw, str):
            return None
        _, opening, remainder = raw.partition(f"<{tag}>")
        if not opening:
            return None
        return remainder.partition(f"</{tag}>")[0].strip()

    # Load the main file first
    main_file_path = os.path.join(directory, main_filename)
    if not os.path.exists(main_file_path):
        raise FileNotFoundError(f"Main file {main_filename} not found in {directory}")

    main_df = pd.read_csv(main_file_path, encoding="utf-8")

    # Ensure required keys exist
    if any(key not in main_df.columns for key in key_columns):
        raise ValueError(f"Main file {main_filename} must contain '_id' and 'context_id' columns")

    # os.listdir() returns entries in arbitrary order; sort for a reproducible column order.
    csv_files = sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(".csv") and not name.startswith("_") and name != main_filename
    )

    # Columns to insert
    selected_columns = ["translation", "document"]

    # Merge each file into the main DataFrame
    for csv_file in csv_files:
        temp_df = pd.read_csv(os.path.join(directory, csv_file), encoding="utf-8")

        # Ensure required keys exist in the right file
        if any(key not in temp_df.columns for key in key_columns):
            raise ValueError(f"File {csv_file} is missing '_id' or 'context_id'")

        if csv_file.endswith("unified.csv"):
            raw_translation = temp_df["translation"]
            temp_df["raw_translation"] = raw_translation
            temp_df["query"] = raw_translation.apply(lambda raw: _tag_content(raw, "hebrew_query"))
            temp_df["document"] = raw_translation.apply(lambda raw: _tag_content(raw, "hebrew_document"))
            temp_df["translation"] = temp_df["query"]

            # Report rows that could not be parsed instead of failing or hiding them.
            malformed = temp_df["query"].isna() | temp_df["document"].isna()
            if malformed.any():
                print(
                    f"Warning: {int(malformed.sum())} row(s) in {csv_file} "
                    "have a missing or malformed unified translation"
                )
        else:
            temp_df["document"] = None

        # Keep the join keys plus whichever selected columns the file provides
        present_columns = [col for col in selected_columns if col in temp_df.columns]
        temp_df = temp_df[key_columns + present_columns]

        # Prefix the columns with the file name (without extension)
        file_prefix = os.path.splitext(csv_file)[0]
        temp_df = temp_df.rename(columns={col: f"{file_prefix}_{col}" for col in present_columns})

        # Merge with main DataFrame
        main_df = main_df.merge(temp_df, on=key_columns, how="left")

    return main_df

In [7]:
# Merge every per-prompt translation file of this model run into the gold file.
directory = "outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18"
main_filename = "_gold.csv"
# The leading underscore keeps the result out of later merges (files starting with "_" are skipped).
output_file = os.path.join(directory, "_merged_queries_translation.csv")

merged_df = load_and_merge_csvs(directory, main_filename)

# Persist the merged frame without the index column
merged_df.to_csv(output_file, index=False)
print(f"Merged CSV saved as '{output_file}'")

Merged CSV saved as 'outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18/_merged_queries_translation.csv'


In [9]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Ensure required nltk resources are available.
# word_tokenize() uses the 'punkt' models on older NLTK releases and the 'punkt_tab'
# tables on recent ones, so fetch both; an unavailable package is reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to compute BLEU Score
def compute_bleu(reference, candidate):
    """Return the sentence-level BLEU score of ``candidate`` against ``reference``.

    Both strings are tokenized with ``nltk.word_tokenize``. Only unigram and bigram
    precision are weighted (0.5 each), and ``method1`` smoothing is applied, which
    matters for short texts.
    """
    tokenized_reference = nltk.word_tokenize(reference)
    tokenized_candidate = nltk.word_tokenize(candidate)

    unigram_bigram_weights = (0.5, 0.5, 0, 0)
    smoothing = SmoothingFunction().method1

    # sentence_bleu expects a list of references, hence the single-element list.
    return sentence_bleu(
        [tokenized_reference],
        tokenized_candidate,
        weights=unigram_bigram_weights,
        smoothing_function=smoothing,
    )

# Function to compute ROUGE Scores
class _UnicodeWordTokenizer:
    """Tokenizer for RougeScorer that keeps words in any script.

    With the default rouge_score tokenizer, two identical Hebrew sentences score 0.0
    on every metric (see the sanity-check cell at the end of this notebook), so the
    translations evaluated here need a Unicode-aware tokenizer.
    """

    def tokenize(self, text):
        # \w is Unicode-aware for str patterns, so Hebrew letters form tokens;
        # lower() only affects scripts that have case.
        return re.findall(r"\w+", text.lower())


# use_stemmer is off: the built-in stemmer targets English and is not applied to a custom tokenizer.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=False,
    tokenizer=_UnicodeWordTokenizer(),
)

def compute_rouge(reference, candidate):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-measures of ``candidate`` vs ``reference``.

    Returns:
        dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` mapping to floats.
    """
    scores = scorer.score(reference, candidate)
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure
    }

# Load the merged CSV
directory = "outputs/translation/BeIR/BeIR_msmarco/gpt-4o-mini-2024-07-18"
merged_csv_path = "_merged_queries_translation.csv"  # Update with the actual file path if needed
df = pd.read_csv(os.path.join(directory, merged_csv_path))

# Candidate columns are the per-file "<file>_translation" columns;
# the bare 'translation' column is the reference they are scored against.
translation_columns = [col for col in df.columns if col.endswith('_translation')]

TRANSLATION_COL = 'translation'
if TRANSLATION_COL not in df.columns:
    raise ValueError("The 'translation' column is missing from the merged CSV.")


def _is_blank(value):
    # Must run on the raw cell value: str(NaN) is the non-empty text "nan",
    # which would otherwise be scored as if it were a translation.
    return pd.isna(value) or not str(value).strip()


# Compute BLEU and ROUGE scores and add them to the DataFrame
for col in translation_columns:
    column_scores = {"BLEU": [], "ROUGE1": [], "ROUGE2": [], "ROUGEL": []}

    for ref_raw, gen_raw in zip(df[TRANSLATION_COL], df[col]):
        row_scores = dict.fromkeys(column_scores)  # all None unless scoring succeeds

        # Skip NaN or empty rows
        if not (_is_blank(ref_raw) or _is_blank(gen_raw)):
            ref_translation = str(ref_raw)  # Reference text
            gen_translation = str(gen_raw)  # Generated text
            try:
                bleu = compute_bleu(ref_translation, gen_translation)
                rouge_scores = compute_rouge(ref_translation, gen_translation)
                row_scores = {
                    "BLEU": bleu,
                    "ROUGE1": rouge_scores['rouge1'],
                    "ROUGE2": rouge_scores['rouge2'],
                    "ROUGEL": rouge_scores['rougeL'],
                }
            except Exception as e:
                # Best effort: report the failure and leave this row's scores empty.
                print(f"Error comparing translations for column {col}: {e}")

        for metric, value in row_scores.items():
            column_scores[metric].append(value)

    # Add new score columns to the DataFrame
    for metric, values in column_scores.items():
        df[f"{col}_{metric}"] = values

# Save the updated DataFrame
scores_csv_path = merged_csv_path.replace(".csv", "_scores.csv")
df.to_csv(os.path.join(directory, scores_csv_path), index=False)

print(f"Updated CSV saved as '{scores_csv_path}'")


Updated CSV saved as '_merged_queries_translation_scores.csv'


[nltk_data] Downloading package punkt to /Users/asaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
from rouge_score import rouge_scorer

# Sanity check: score a Hebrew sentence against an identical copy of itself.
reference_text = "מתי הזמן היקר ביותר לנסוע לפונטה קנה?"
generated_text = "מתי הזמן היקר ביותר לנסוע לפונטה קנה?"

# NOTE: this rebinds the global `scorer` that compute_rouge() reads.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference_text, generated_text)

# The recorded output is 0.0 for every metric although the two texts are identical;
# presumably the default tokenizer discards non-Latin characters (verify in the rouge_score docs).
print(scores)


{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0, recall=0, fmeasure=0)}


In [14]:
from rouge_score import rouge_scorer

# Control for the Hebrew check: the same comparison with identical English sentences.
reference_text = "The quick brown fox jumps over the lazy dog"
generated_text = "The quick brown fox jumps over the lazy dog"

# NOTE: this rebinds the global `scorer` that compute_rouge() reads.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference_text, generated_text)

# The recorded output is 1.0 for every metric, as expected for identical texts.
print(scores)


{'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}
