
# Training‑Free Prompting (Three‑Step) — Two Versions

This notebook implements the three-step method shown in the paper's figure:

1. **Explain** the idiom in the **target language** (default: English) → *explanation / true meaning*
2. **Literal translation** (word‑by‑word) into English
3. **Natural idiomatic translation**, combining (1) + (2)

We provide **two versions**:

- **Version A (Paper Step 3 only):** Use the CSV's `true_meaning` and `literal_translation` for steps (1) and (2), then run step (3) to produce the final idiomatic translation.
- **Version B (Fully LLM‑driven):** Ask the LLM to produce steps (1) and (2), then run step (3).

Both versions save results to CSV.


In [None]:

# (Optional) If needed:
%pip install --quiet openai pandas


## Configuration

In [None]:

import os, json, re, hashlib, pathlib, asyncio
from dataclasses import dataclass
from typing import List, Dict, Tuple, Any, Optional
from collections import Counter
import pandas as pd
from tqdm.notebook import tqdm

# --- Paths ---
INPUT_CSV = "petci_chinese_english_improved.csv"
OUT_A     = "version_A_results.csv"      # Uses CSV meanings/literals + Step 3
OUT_B     = "version_B_results.csv"      # Full LLM: Steps 1 + 2 + 3

# --- Model / Inference ---
MODEL = os.getenv("GPT5_MODEL", "gpt-5-mini")  # override with the GPT5_MODEL env var
TARGET_EXPLANATION_LANGUAGE = "English"  # change to "Chinese" or others if needed

# --- Credentials ---
# The API key is read from the environment and is never assigned here:
# writing a value into os.environ would overwrite a real exported key and
# would store a secret in the notebook file.
if not os.environ.get("OPENAI_API_KEY"):
    print(
        "WARNING: OPENAI_API_KEY is not set. Export it before starting Jupyter; "
        "requests that are not already cached will fail."
    )

# --- On-disk response cache ---
CACHE_DIR = pathlib.Path("./cache_three_step")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

@dataclass
class InferenceConfig:
    """Settings for one chat-completion request issued by this notebook."""
    # Model name sent with the request; defaults to the notebook-level MODEL.
    model: str = MODEL
    # Value forwarded as the request's `top_p` argument.
    top_p: float = 1.0
    # Value forwarded as the request's `seed` argument (None by default).
    seed: Optional[int] = None
    # Text of the system message that precedes every user prompt.
    system_prompt: str = "You are a precise bilingual translator. Output compact text, no extra commentary."


## Load CSV

In [None]:

def load_idioms(csv_path: str) -> pd.DataFrame:
    """Read the idiom table from `csv_path`.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The loaded frame. The columns `true_meaning` and
        `literal_translation` are added, filled with None, when the file
        does not provide them.

    Raises:
        ValueError: If the file has no `src` column.
    """
    frame = pd.read_csv(csv_path)
    if "src" not in frame.columns:
        raise ValueError("CSV must include a 'src' column.")
    # Optional columns are created empty so downstream code can always index them.
    for optional_col in ("true_meaning", "literal_translation"):
        if optional_col not in frame.columns:
            frame[optional_col] = None
    return frame

# Load the idiom table once; the cells below read it through the name `df`.
df = load_idioms(INPUT_CSV)
df.head()  # preview the first rows as this cell's output


## GPT‑5 Call Helper (with disk cache)

In [None]:

def _cache_key(payload: Dict[str, Any]) -> str:
    """Return the SHA-256 hex digest that identifies `payload`.

    The payload is serialised as JSON with sorted keys, so dicts holding the
    same items produce the same digest regardless of insertion order.
    """
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    hasher = hashlib.sha256()
    hasher.update(canonical.encode())
    return hasher.hexdigest()

def call_gpt5(user_content: str, cfg: InferenceConfig) -> str:
    """Return the model's reply to `user_content`, backed by a disk cache.

    The cache key covers the model, sampling settings, system prompt and user
    prompt, so a repeated request is answered from `CACHE_DIR` without a
    network call.

    Args:
        user_content: Text sent as the user message.
        cfg: Model name, sampling settings and system prompt for the request.

    Returns:
        The reply text with surrounding whitespace removed, or "" when the
        response carries no text content.

    Raises:
        Whatever the OpenAI client raises on a cache miss (missing
        credentials, network or API errors) is propagated unchanged.
    """
    payload = {
        "model": cfg.model,
        "top_p": cfg.top_p,
        "seed": cfg.seed,
        "system": cfg.system_prompt,
        "user": user_content,
    }
    key = _cache_key(payload)
    f = CACHE_DIR / f"{key}.json"
    if f.exists():
        # A truncated or undecodable entry (interrupted kernel, concurrent
        # writer, file written under another locale encoding) is treated as a
        # cache miss instead of failing every later call for this key.
        try:
            cached = json.loads(f.read_text(encoding="utf-8"))["text"]
        except (OSError, ValueError, KeyError, TypeError):
            cached = None
        if isinstance(cached, str):
            return cached

    # Imported lazily so cached reruns work without the SDK being importable.
    from openai import OpenAI
    client = OpenAI()
    resp = client.chat.completions.create(
        model=cfg.model,
        messages=[
            {"role":"system","content":cfg.system_prompt},
            {"role":"user","content":user_content},
        ],
        top_p=cfg.top_p,
        seed=cfg.seed
    )
    # `content` may be None; guard before calling .strip().
    content = resp.choices[0].message.content
    text = (content or "").strip()
    if text:
        # Explicit UTF-8: ensure_ascii=False writes raw non-ASCII characters,
        # which the platform default encoding may be unable to encode.
        f.write_text(json.dumps({"text": text}, ensure_ascii=False), encoding="utf-8")
    # An empty reply is returned but not cached, so a later run can retry it.
    return text

# Default configuration instance passed to call_gpt5 by the cells below.
CFG = InferenceConfig()


## Three Prompts (matching the paper steps)

In [None]:

# Step 1 template: asks for the idiomatic meaning of the idiom.
# Placeholders: {lang} (language of the explanation), {idiom}.
PROMPT_STEP1_EXPLAIN = """
Explain the meaning of the following Chinese idiom in {lang}.
- Audience: educated readers; be concise (<= 2 sentences).
- Do not translate word-by-word; provide the **idiomatic sense**.

Idiom: {idiom}
""".strip()

# Step 2 template: asks for a literal, word-by-word English gloss.
# Placeholder: {idiom}.
PROMPT_STEP2_LITERAL = """
Provide a **literal, word-by-word** English translation for the following Chinese idiom.
- Keep it terse and faithful to each component.
- No commentary, just the literal gloss.

Idiom: {idiom}
""".strip()

# Step 3 template: combines the explanation and the literal gloss into the
# final natural translation. Placeholders: {idiom}, {explanation}, {literal}.
PROMPT_STEP3_NATURAL = """
Produce a **natural English idiomatic translation** given:
(1) An idiom explanation (idiomatic meaning) and
(2) A literal word-by-word gloss.

Rules:
- Output a single short English phrase/sentence that a native speaker would actually say.
- Prefer clarity and naturalness over literalness.
- No extra commentary.

Idiom: {idiom}
Explanation: {explanation}
Literal: {literal}
Result:
""".strip()


## Version A — Use CSV (steps 1 & 2 from file) → Run Step 3 only

In [None]:

def version_A_run(df: pd.DataFrame) -> pd.DataFrame:
    """Run Step 3 for every row, taking Steps 1 and 2 from the frame.

    Args:
        df: Frame with the columns `src`, `true_meaning` and
            `literal_translation`; missing meanings/glosses are sent as "".

    Returns:
        One row per input row with the columns `src`, `explanation_used`,
        `literal_used` and `final_translation`.
    """
    def _text_or_blank(value) -> str:
        # Missing cells become an empty string in the prompt.
        if pd.isna(value):
            return ""
        return str(value)

    records = []
    for _, row in df.iterrows():
        source = str(row["src"])
        meaning = _text_or_blank(row["true_meaning"])
        gloss = _text_or_blank(row["literal_translation"])

        prompt = PROMPT_STEP3_NATURAL.format(
            idiom=source, explanation=meaning, literal=gloss
        )
        records.append({
            "src": source,
            "explanation_used": meaning,
            "literal_used": gloss,
            "final_translation": call_gpt5(prompt, CFG),
        })
    return pd.DataFrame(records)

# Sequential Version A run over every row of `df`.
res_A = version_A_run(df)
res_A.head()  # preview the first results as this cell's output


## Version A (Fast, Parallel) — Use CSV (steps 1 & 2 from file) → Run Step 3 only

In [None]:
# helper: run blocking call_gpt5 in a thread
async def _run_gpt(p3, cfg):
    """Await `call_gpt5(p3, cfg)` executed on the running loop's default executor."""
    pending = asyncio.get_running_loop().run_in_executor(None, call_gpt5, p3, cfg)
    return await pending

async def _worker(row):
    """Translate one record with the Step 3 prompt.

    `row` is indexed by the keys `src`, `true_meaning` and
    `literal_translation`. Returns a dict with the keys `src`,
    `explanation_used`, `literal_used` and `final_translation`.
    """
    def _cell(name):
        # Missing values are sent to the prompt as an empty string.
        value = row[name]
        return str(value) if pd.notna(value) else ""

    record = {
        "src": str(row["src"]),
        "explanation_used": _cell("true_meaning"),
        "literal_used": _cell("literal_translation"),
    }
    prompt = PROMPT_STEP3_NATURAL.format(
        idiom=record["src"],
        explanation=record["explanation_used"],
        literal=record["literal_used"],
    )
    record["final_translation"] = await _run_gpt(prompt, CFG)
    return record

async def version_A_run_parallel(df: pd.DataFrame, concurrency: int = 8) -> pd.DataFrame:
    """Run Version A (Step 3 only) for all rows with bounded concurrency.

    Args:
        df: Frame with the columns `src`, `true_meaning` and
            `literal_translation`.
        concurrency: Maximum number of rows processed at the same time.

    Returns:
        One row per input row, in the same order as `df`, with the columns
        produced by `_worker`.

    Raises:
        ValueError: If `concurrency` is smaller than 1.
    """
    if concurrency < 1:
        # Semaphore(0) would never admit a worker and the await would hang.
        raise ValueError("concurrency must be >= 1")

    # limit how many GPT calls happen at once
    sem = asyncio.Semaphore(concurrency)

    async def go(position, row):
        async with sem:
            return position, await _worker(row)

    records = df.to_dict(orient="records")
    tasks = [asyncio.create_task(go(i, row)) for i, row in enumerate(records)]

    # as_completed yields in completion order; each result is written back to
    # its input position so the output lines up with `df` row for row.
    results = [None] * len(tasks)
    try:
        for finished in tqdm(asyncio.as_completed(tasks),
                             total=len(tasks),
                             desc="Translating idioms"):
            position, record = await finished
            results[position] = record
    finally:
        # On failure or interruption, stop tasks that have not finished yet.
        for task in tasks:
            if not task.done():
                task.cancel()

    return pd.DataFrame(results)


# Top-level `await` is valid in an IPython/Jupyter cell, not in a plain script.
# This rebinds `res_A`, replacing the value assigned by the sequential run.
res_A = await version_A_run_parallel(df)
res_A.head()  # preview the first results as this cell's output


## Version B — Full LLM (steps 1 & 2 via model) → Run Step 3

In [None]:

def step1_explain(idiom: str, lang: str = TARGET_EXPLANATION_LANGUAGE) -> str:
    """Step 1: ask the model to explain `idiom` in `lang` and return the reply."""
    fields = {"idiom": idiom, "lang": lang}
    return call_gpt5(PROMPT_STEP1_EXPLAIN.format(**fields), CFG)

def step2_literal(idiom: str) -> str:
    """Step 2: ask the model for a literal gloss of `idiom` and return the reply."""
    prompt = PROMPT_STEP2_LITERAL.format(**{"idiom": idiom})
    reply = call_gpt5(prompt, CFG)
    return reply

def step3_natural(idiom: str, explanation: str, literal: str) -> str:
    """Step 3: ask the model for the natural translation built from both inputs."""
    fields = {"idiom": idiom, "explanation": explanation, "literal": literal}
    return call_gpt5(PROMPT_STEP3_NATURAL.format(**fields), CFG)

def version_B_run(df: pd.DataFrame) -> pd.DataFrame:
    """Run all three steps through the model for every row of `df`.

    Args:
        df: Frame with a `src` column holding the idioms.

    Returns:
        One row per input row with the columns `src`, `explanation_gen`,
        `literal_gen` and `final_translation`.
    """
    def _translate(source: str) -> Dict[str, str]:
        # Steps 1 and 2 come from the model; Step 3 combines their outputs.
        explained = step1_explain(source)
        glossed = step2_literal(source)
        return {
            "src": source,
            "explanation_gen": explained,
            "literal_gen": glossed,
            "final_translation": step3_natural(source, explained, glossed),
        }

    return pd.DataFrame([_translate(str(r["src"])) for _, r in df.iterrows()])

# Sequential Version B run: Steps 1, 2 and 3 are requested for every row of `df`.
res_B = version_B_run(df)
res_B.head()  # preview the first results as this cell's output


## Save Results

In [None]:

# Persist both result tables; index=False keeps the row index out of the files.
res_A.to_csv(OUT_A, index=False)
res_B.to_csv(OUT_B, index=False)
print("Saved:")
print(" - Version A →", OUT_A)
print(" - Version B →", OUT_B)
