# Translation Chain (v1 – June 2025 API)

A reproducible pipeline that…

1. **Pre-processes** an input text  
2. **Generates & refines** a translation in `n` feedback loops  
3. **Post-processes** the best translation against canonical references  
4. Lets you **apply edits**

In [None]:
# Every model call below uses the Responses API (`client.responses.create`), which
# older 1.x releases of the SDK do not provide, so require a release that ships it.
!pip install -U "openai>=1.66.0"

In [None]:
import os, json, textwrap, re, time, html, requests
from difflib import SequenceMatcher
from openai import OpenAI
from google.colab import userdata
from typing import List, Dict, Any

# Read the API key through Colab's `userdata` helper (entry "OPENAI_API_KEY")
# instead of hard-coding it in the notebook.
openai_api_key = userdata.get("OPENAI_API_KEY")
# Shared client used by every `client.responses.create(...)` call below.
client = OpenAI(api_key=openai_api_key)

# Model used for the analysis, translation, judging and revision calls.
MODEL_MAIN   = "gpt-4.1"
# Model used for the web-search-backed canonical translation lookup.
MODEL_SEARCH = "gpt-4.1"

# Input text
Paste or edit your source text below.


In [None]:
# Text to translate. `.strip()` removes the leading/trailing newline that the
# triple-quoted literal would otherwise carry into every prompt.
SOURCE_TEXT = """
엄마 걱정
기형도
열무 삼십 단을 이고
시장에 간 우리 엄마
안 오시네, 해는 시든 지 오래
나는 찬밥처럼 방에 담겨
아무리 천천히 숙제를 해도
엄마 안 오시네, 배춧잎 같은 발소리 타박타박
안 들리네, 어둡고 무서워
금간 창 틈으로 고요히 빗소리
빈 방에 혼자 엎드려 훌쩍거리던
아주 먼 옛날
지금도 내 눈시울을 뜨겁게 하는
그 시절, 내 유년의 윗목
""".strip()

# Name of the target language; interpolated into the translation request.
TARGET_LANG = "English"


---
---

# Preprocessing

#### Helpers: extract clean text and JSON from model responses

In [None]:
def _response_text(resp):
    """
    Extract plain text from a Responses-style result object.

    Lookup order:
      1. a truthy `resp.output_text`;
      2. for each item of `resp.output`: the first truthy `.text` among the
         entries of a list-valued `.content`, then the item's own `.text`;
      3. `str(resp)` when nothing above yields text.

    A text object exposing `.value` is unwrapped to that value.
    """
    direct = getattr(resp, "output_text", None)
    if direct:
        return direct

    def _unwrap(found):
        # Some shapes wrap the string in an object carrying `.value`.
        return getattr(found, "value", found)

    for item in getattr(resp, "output", None) or ():
        parts = getattr(item, "content", None)
        if isinstance(parts, list):
            for part in parts:
                found = getattr(part, "text", None)
                if found:
                    return _unwrap(found)
        found = getattr(item, "text", None)
        if found:
            return _unwrap(found)

    # Last resort: whatever the object prints as.
    return str(resp)

# Legacy greedy pattern (first opening bracket through the LAST closing bracket).
# `_parse_json_from_text` no longer uses it; it is kept so any other reference
# to the name keeps working.
_json_re = re.compile(r'(\{.*\}|\[.*\])', re.DOTALL)

# Every position at which a JSON object or array could begin.
_JSON_START_RE = re.compile(r"[{\[]")


def _parse_json_from_text(text):
    """
    Find the first parsable JSON object/array in `text` and return it.

    Each `{` or `[` is tried in order with `JSONDecoder.raw_decode`, which
    parses one complete value and ignores whatever follows it. Leading prose,
    code fences, trailing commentary, or a second JSON value later in the
    text therefore no longer break parsing (the previous greedy regex handed
    everything between the first opener and the last closer to `json.loads`).

    Raises ValueError if no position yields a valid JSON object/array.
    """
    decoder = json.JSONDecoder()
    for opener in _JSON_START_RE.finditer(text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            # Not valid JSON from this bracket (e.g. "[sic]" in prose); keep scanning.
            continue
        return value
    raise ValueError("No JSON object/array found in model output.")

## Step 1: Canonical Translation Search

In [None]:
_URL_RE = re.compile(r"^https?://[^\s]+$", re.IGNORECASE)

_PUNCT_RE = re.compile(r"[^\w\s]", re.UNICODE)

def _norm(s: str) -> str:
    """
    Normalize text for lenient comparison.

    Steps: unescape HTML entities, replace every character that is neither a
    word character nor whitespace with a space, collapse whitespace runs to a
    single space, lowercase, and trim. `None` is treated as "".

    Whitespace is collapsed AFTER punctuation is replaced. Doing it the other
    way round left runs of spaces wherever punctuation sat next to a space
    ("a, b" -> "a  b"), so two texts differing only in punctuation did not
    normalize to the same string.
    """
    s = html.unescape(s or "")
    # Typographic quotes and dashes are matched by _PUNCT_RE as well, so they
    # need no separate mapping to their ASCII forms first.
    s = _PUNCT_RE.sub(" ", s)
    s = re.sub(r"\s+", " ", s)
    return s.lower().strip()

def _fetch_page_text(url: str, timeout: float = 12.0) -> str | None:
    """
    GET `url` and return the response body as text, or None.

    None is returned when the request raises, the status code is not 200, or
    the Content-Type header contains none of the accepted textual markers.
    """
    accepted_markers = ("text/html", "text/plain", "text/markdown", "application/xhtml", "xml")
    request_headers = {"User-Agent": "Mozilla/5.0 (canon-check/1.1)"}
    try:
        response = requests.get(url, timeout=timeout, headers=request_headers)
        if response.status_code != 200:
            return None
        content_type = (response.headers.get("Content-Type") or "").lower()
        for marker in accepted_markers:
            if marker in content_type:
                return response.text
        return None
    except Exception:
        # Best effort: any failure simply means "page not available".
        return None

def _excerpt_matches_page(excerpt: str, page_text: str, min_chars: int = 20, threshold: float = 0.82,
                          max_windows: int = 300) -> bool:
    """
    Lenient match: normalize both strings, then require either
    - normalized excerpt substring in normalized page, or
    - fuzzy similarity of at least `threshold` against some page window.

    Parameters:
      excerpt      text claimed to appear on the page
      page_text    raw page text as fetched
      min_chars    excerpts shorter than this (after stripping) never match
      threshold    minimum `SequenceMatcher.ratio()` for a fuzzy hit
      max_windows  upper bound on the number of page windows compared, which
                   bounds the cost of the fuzzy pass

    Fuzzy windows are exactly as long as the normalized excerpt. `ratio()` is
    2*M / (len(a) + len(b)), so against a window three times the excerpt's
    length it can never exceed 0.5 and the fallback could never reach the
    threshold. The final window, and a page shorter than one window, are
    compared too.

    Heuristic: on pages with more than `max_windows` candidate positions the
    windows are spread evenly, so an imperfect match between two sampled
    positions can still be missed.
    """
    if not excerpt or len(excerpt.strip()) < min_chars:
        return False
    ex = _norm(excerpt)
    pt = _norm(page_text)
    if not ex or not pt:
        return False
    if ex in pt:
        return True

    size = len(ex)
    if size < 10:
        return False

    last_start = max(0, len(pt) - size)
    # Smallest stride that keeps the window count within the budget (ceil division).
    stride = max(1, -(-(last_start + 1) // max(1, max_windows)))

    # autojunk=False: the default heuristic discards frequently occurring
    # characters once a sequence reaches 200 items, which would deflate the
    # ratio for long excerpts. The excerpt is seq2 because SequenceMatcher
    # caches its analysis of the second sequence across set_seq1() calls.
    matcher = SequenceMatcher(None, autojunk=False)
    matcher.set_seq2(ex)
    for start in range(0, last_start + 1, stride):
        matcher.set_seq1(pt[start:start + size])
        # The quick ratios are cheap upper bounds; skip the full comparison
        # when even they cannot reach the threshold.
        if matcher.real_quick_ratio() < threshold or matcher.quick_ratio() < threshold:
            continue
        if matcher.ratio() >= threshold:
            return True
    return False

# ---------- Validation: require accessible URL + excerpt present (lenient) ----------
def _entry_text(entry: Dict[str, Any], key: str, default: str = "") -> str:
    """Return the stripped string stored under `key`; `default` if it is missing, empty or not a string."""
    value = entry.get(key)
    if isinstance(value, str) and value:
        return value.strip()
    return default


def _validate_entries_accessible(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Keep entries that have:
      - basic metadata (title, translator, plausible year, valid URL, snippet/title)
      - page fetch succeeds
      - translated_excerpt appears on the fetched page (lenient match)

    `entries` comes from model-generated JSON, so its shape is not trusted:
    items that are not dicts are skipped, and fields that are not strings are
    treated as missing instead of raising. `None` is treated as an empty list.

    Returns a list of normalized entry dicts (possibly empty).
    """
    cleaned = []
    for e in entries or []:
        if not isinstance(e, dict):
            continue

        title = _entry_text(e, "title")
        translator = _entry_text(e, "translator")
        work = _entry_text(e, "work")
        url = _entry_text(e, "source_url")
        source_title = _entry_text(e, "source_title")
        source_snippet = _entry_text(e, "source_snippet")
        translated_excerpt = _entry_text(e, "translated_excerpt")

        if not title or not translator or not _URL_RE.match(url) or not source_title or not source_snippet:
            continue
        try:
            year_int = int(e.get("year"))
        except (TypeError, ValueError, OverflowError):
            continue
        if not 1500 <= year_int <= 2100:
            continue

        page_text = _fetch_page_text(url)
        # Be polite: pause after EVERY request. Sleeping only after accepted
        # entries let a run of rejected pages be fetched back-to-back.
        time.sleep(0.15)
        if not page_text:
            continue
        if not _excerpt_matches_page(translated_excerpt, page_text, min_chars=20, threshold=0.82):
            continue

        cleaned.append({
            "title": title,
            "translator": translator,
            "work": work,
            "year": year_int,
            "publisher": _entry_text(e, "publisher"),
            "language": _entry_text(e, "language", "English"),
            "source_url": url,
            "source_title": source_title,
            "source_snippet": source_snippet,
            "translated_excerpt": translated_excerpt,
            "notes": _entry_text(e, "notes"),
        })
    return cleaned

def canonical_info_structured(text: str, k: int = 3) -> List[Dict[str, Any]]:
    """
    Returns only entries with an accessible page AND a verified translated excerpt (lenient match).
    If none qualify, returns [].

    Asks MODEL_SEARCH (with the `web_search_preview` tool enabled) for at
    least `k` published translations of `text` in the JSON shape given in the
    prompt, then passes the "translations" list to
    `_validate_entries_accessible`.

    A reply whose top level is not an object, or whose "translations" value
    is not a list (e.g. null or an object), yields [] rather than being handed
    to the validator, which expects a list.

    Raises ValueError when the reply contains no parsable JSON.
    """
    schema_hint = """
    Return ONLY valid JSON with this exact shape:
    {
      "translations": [
        {
          "title": "string",
          "translator": "string",
          "work": "string",
          "year": 2000,
          "publisher": "string",
          "language": "string",
          "source_url": "https://example.org/page",
          "source_title": "string",
          "source_snippet": "string",
          "translated_excerpt": "string (20-600 chars, copied from the page; no paraphrase)",
          "notes": "string"
        }
      ]
    }
    Rules:
    - Use web_search_preview to find published translations with some text visible publicly.
    - Include ONLY entries you can back with a working URL.
    - Choose pages where at least a short excerpt is visible (public preview, sample, or free edition).
    - If none meet this bar, return {"translations": []}.
    - Do not include code fences or commentary—JSON only.
    - Prefer authoritative sources (publisher, university, library, reputable archives).
    """

    prompt = f"""
    Find at least {k} published or widely cited translations of the text below
    that have some portion of the translated text visible online (not paywalled).

    {schema_hint}

    TEXT:
    ---
    {text}
    ---
    """

    resp = client.responses.create(
        model=MODEL_SEARCH,
        input=prompt,
        tools=[{"type": "web_search_preview"}],
    )

    raw = _response_text(resp)
    data = _parse_json_from_text(raw)
    translations = data.get("translations") if isinstance(data, dict) else None
    if not isinstance(translations, list):
        return []

    return _validate_entries_accessible(translations)

# Example: run the canonical-translation search on the sample text and
# pretty-print the verified entries (non-ASCII characters are kept as-is).
# NOTE: this makes a live model call and then fetches each returned URL.
entries = canonical_info_structured(SOURCE_TEXT, k=3)
print(json.dumps(entries, ensure_ascii=False, indent=2))

## Step 2: Word-by-Word Definitions, Etymologies, Connotations

In [None]:
def word_metadata(text: str) -> dict[str, dict]:
    """
    Ask MODEL_MAIN for per-word "definition", "etymology" and "connotations".

    Returns the JSON object parsed from the reply.
    Raises ValueError when the parsed JSON is not an object.
    """
    request_text = f"""
    Provide a JSON object where each key is a distinct word
    from the text below, and each value is an object with
    fields "definition", "etymology", and "connotations".

    Return ONLY valid JSON — absolutely no extra commentary.

    TEXT:
    ---
    {text}
    ---
    """
    reply = client.responses.create(input=request_text, model=MODEL_MAIN)
    parsed = _parse_json_from_text(_response_text(reply))
    if not isinstance(parsed, dict):
        raise ValueError("Expected a JSON object for word_metadata.")
    return parsed

# Demo: makes a live model call and prints the per-word metadata dict.
print(word_metadata(SOURCE_TEXT))

## Step 3: Authorial Context

In [None]:
def author_context(text: str) -> dict:
    """
    Ask MODEL_MAIN for brief context about the author of `text`.

    The prompt requests the keys "author", "life_span", "era" and "notes".
    Returns the JSON object parsed from the reply.
    Raises ValueError when the parsed JSON is not an object.
    """
    request_text = f"""
    Give concise context about the author of the text below.
    Return ONLY a JSON object with the keys
      "author", "life_span", "era", "notes".
    If the exact year of the quoted translation is known, include it in 'notes'.

    TEXT:
    ---
    {text}
    ---
    """
    reply = client.responses.create(input=request_text, model=MODEL_MAIN)
    parsed = _parse_json_from_text(_response_text(reply))
    if not isinstance(parsed, dict):
        raise ValueError("Expected a JSON object for author_context.")
    return parsed

# Demo: makes a live model call and prints the author-context dict.
print(author_context(SOURCE_TEXT))

## Step 4: Phonetics and Rhythm

In [None]:
def sound_profile(text: str) -> dict:
    """
    Ask MODEL_MAIN to analyse the phonetic and rhythmic qualities of `text`.

    The prompt requests the keys "meter", "notable_phonetics" and
    "observations". Returns the JSON object parsed from the reply.
    Raises ValueError when the parsed JSON is not an object.
    """
    request_text = f"""
    Analyse the phonetic and rhythmic qualities of the text below.
    Return ONLY a JSON object with keys
      "meter", "notable_phonetics", "observations".

    TEXT:
    ---
    {text}
    ---
    """
    reply = client.responses.create(input=request_text, model=MODEL_MAIN)
    parsed = _parse_json_from_text(_response_text(reply))
    if not isinstance(parsed, dict):
        raise ValueError("Expected a JSON object for sound_profile.")
    return parsed

# Demo: makes a live model call and prints the sound-profile dict.
print(sound_profile(SOURCE_TEXT))

## Combine Preprocessing Outputs

In [None]:
# Run all four preprocessing steps and gather their outputs into one bundle
# that is later serialized into the translation prompt.
# The search function is named `canonical_info_structured`; the previous call
# to `canonical_info` referenced a name that is never defined (NameError).
canonical_translations = canonical_info_structured(SOURCE_TEXT, k=3)
word_info             = word_metadata(SOURCE_TEXT)
author_info           = author_context(SOURCE_TEXT)
sound_info            = sound_profile(SOURCE_TEXT)

preproc_bundle = {
    "canonical_translations": canonical_translations,
    "word_info":              word_info,
    "author_info":            author_info,
    "sound_info":             sound_info,
}

print(json.dumps(preproc_bundle, ensure_ascii=False, indent=2))

---
---
# Processing

## Step 1: Generate Translation from Preprocessing

In [None]:
def translate_once(text: str,
                   bundle: dict,
                   prior_translation: str | None = None) -> str:
    """
    Produce a translation informed by `bundle`, optionally improving a draft.

    Parameters:
      text               source text to translate into TARGET_LANG
      bundle             preprocessing results; serialized to JSON and placed
                         in the developer message as context
      prior_translation  optional earlier draft to improve

    The request always carries the source text and target language. When a
    draft is supplied it is appended after them; previously the draft
    replaced the whole request, so the model saw neither the source text nor
    the target language and `text` was ignored.

    Returns the model's output text, stripped.
    """
    bundle_json = json.dumps(bundle,
                             ensure_ascii=False,
                             indent=2,
                             default=str)

    # Dedent the template BEFORE inserting the JSON: the serialized bundle has
    # lines starting at column 0, so dedenting after interpolation finds no
    # common indentation and removes nothing.
    instructions = textwrap.dedent("""
        You are an academic translator, well-versed in translational theory.

        ## Context (JSON)
        ```json
        {bundle_json}
        ```

        • Preserve meaning, style, and sonic qualities where possible.
        • If a prior draft is provided, improve it; otherwise create a new one.
        • Output **only** the translation text—no commentary.
    """).format(bundle_json=bundle_json)

    request = f"Translate into {TARGET_LANG}:\n\n{text}"
    if prior_translation:
        request += f"\n\nPrior draft to improve:\n\n{prior_translation}"

    system_msg = {"role": "developer", "content": instructions}
    user_msg = {"role": "user", "content": request}

    resp = client.responses.create(
        model=MODEL_MAIN,
        input=[system_msg, user_msg]
    )
    return resp.output_text.strip()


# First draft: translate the source using the preprocessing bundle as context (live model call).
current_translation = translate_once(SOURCE_TEXT, preproc_bundle)

## Step 2: Judge First Translation

In [None]:
def judge_translation(src: str, draft: str) -> dict:
    """
    Returns {"positives": [...], "negatives": [...]}

    Asks MODEL_MAIN to evaluate `draft` (a TARGET_LANG translation) against
    `src`. The reply is read with the same tolerant helpers the preprocessing
    steps use, so JSON wrapped in code fences or surrounded by stray prose is
    still accepted. Both keys are guaranteed to exist in the result (defaulting
    to an empty list), because `revise_translation` reads both.

    Raises ValueError when the reply contains no JSON object.
    """

    prompt = f"""
    Evaluate the {TARGET_LANG} translation against the source.
    Return **ONLY** a JSON object with two keys:
      "positives" : string list of strengths
      "negatives" : string list of weaknesses

    <source>
    {src}
    </source>

    <translation>
    {draft}
    </translation>
    """

    resp = client.responses.create(
        model=MODEL_MAIN,
        input=prompt
    )

    try:
        # json.JSONDecodeError is a ValueError subclass, so this covers both
        # "nothing JSON-like found" and "found but malformed".
        verdict = _parse_json_from_text(_response_text(resp))
    except ValueError as e:
        raise ValueError("Model did not return valid JSON; consider retrying.") from e
    if not isinstance(verdict, dict):
        raise ValueError("Model did not return valid JSON; consider retrying.")

    verdict.setdefault("positives", [])
    verdict.setdefault("negatives", [])
    return verdict


# Critique the first draft against the source (live model call).
feedback = judge_translation(SOURCE_TEXT, current_translation)


## Step 3: Revise Translation from Feedback

In [None]:
def revise_translation(src: str,
                       draft: str,
                       critique: dict) -> str:
    """
    Improve `draft` using `critique`.
    Returns the revised translation as plain text.

    Parameters:
      src       source text; sent alongside the draft so the model can check
                its fixes against the original (it was previously accepted
                but never sent)
      draft     current translation
      critique  dict with optional "positives" / "negatives" entries; a
                missing key is treated as an empty list instead of raising
                KeyError
    """
    # Dedent before interpolating so a multi-line critique value cannot
    # defeat the dedent.
    instructions = textwrap.dedent("""
        You are revising a literary translation into {target_lang}.

        The user message contains the source text and the current draft.

        ## Critique
        Positives → {positives}
        Negatives → {negatives}

        • Keep all positives intact.
        • Fix every negative issue.
        • Output **only** the revised translation text—no commentary,
          no markdown.
    """).format(
        target_lang=TARGET_LANG,
        positives=critique.get("positives", []),
        negatives=critique.get("negatives", []),
    )

    messages = [
        {
            "role": "developer",
            "content": instructions
        },
        {
            "role": "user",
            "content": f"<source>\n{src}\n</source>\n\n<translation>\n{draft}\n</translation>"
        }
    ]

    resp = client.responses.create(
        model=MODEL_MAIN,
        input=messages
    )
    return resp.output_text.strip()


# Replace the first draft with a revision driven by the critique above (live model call).
current_translation = revise_translation(
    SOURCE_TEXT,
    current_translation,
    feedback
)


## Step 4: *N* iterations

In [None]:
ITERATIONS   = 3       # set number of iterations

# NOTE(review): range(1, ITERATIONS) performs ITERATIONS - 1 judge/revise
# passes in this cell; presumably the judge + revise pass already run in the
# two preceding steps counts as the first iteration — confirm that is the
# intended total.
for i in range(1, ITERATIONS):
    # Critique the current draft, then replace it with the revision built from that critique.
    fb   = judge_translation(SOURCE_TEXT, current_translation)
    prev = current_translation
    current_translation = revise_translation(SOURCE_TEXT, prev, fb)
    print(f"Iteration {i} done")

---
---

# Post-Processing

## Step 1: Translation + Canonical Critique

In [None]:
def compare_to_canon(final: str,
                     canon: list) -> str:
    """
    Returns a short critique of `final` vs. canonical translations.

    Parameters:
      final  the translation to assess
      canon  canonical translations; serialized with `json.dumps`
             (`default=str`), so a list of strings or of entry dicts both work

    Returns the model's output text, stripped.
    """
    canon_json = json.dumps(canon, ensure_ascii=False, indent=2, default=str)

    # Dedent the template BEFORE inserting the values: both the translation
    # and the serialized canon can contain lines starting at column 0, which
    # would leave a post-interpolation dedent with nothing to remove.
    prompt = textwrap.dedent("""
        Compare the new translation to these canonical versions. Make these observations as concise and academically technical as possible. You are a research assistant to a brilliant translator, who needs to make edits quickly, not slog through your verbose comments. Be precise, identify strenghts and weaknesses. Only return the observations, no framing language of 'certainly,' or 'of course,' etc.

        <new_translation>
        {final}
        </new_translation>

        <canonical>
        {canon_json}
        </canonical>

        • Highlight unique strengths or weaknesses of the new version.
        • Mention where it surpasses or falls short of canon.
    """).format(final=final, canon_json=canon_json)

    resp = client.responses.create(
        model=MODEL_MAIN,
        input=prompt
    )
    return resp.output_text.strip()


# Compare the refined translation with the verified canonical entries gathered
# during preprocessing. The list is stored in `canonical_translations`; the
# previously passed `canonical_info` is never defined and raised NameError.
analysis = compare_to_canon(current_translation, canonical_translations)

print("### Final translation ###\n")
print(current_translation)
print("\n---\n")
print("### Comparative analysis ###\n")
print(analysis)


## Step 2: Final Edits

In [None]:
# Free-form editing instructions passed to `apply_edits` as `user_edits`;
# `.strip()` drops the newlines added by the triple-quoted literal.
USER_EDITS = """
collapse repeated clauses
""".strip()

# e.g. “Make diction simpler in line 2”, “Keep rhyme scheme”, …

---
---

# Final Translation

In [None]:
def apply_edits(draft: str,
                canon_analysis: str = "",
                user_edits: str = "") -> str:
    """
    Revise `draft` according to the canon comparison and/or the user's edits.

    Parameters:
      draft           translation to revise
      canon_analysis  observations from the canon comparison (may be blank)
      user_edits      extra instructions from the user (may be blank)

    When both inputs are blank after stripping, `draft` is returned unchanged
    and no model call is made. Otherwise the non-blank inputs are sent as a
    developer message, the draft as the user message, and the stripped model
    output is returned.
    """
    observations = canon_analysis.strip()
    requested = user_edits.strip()
    if not observations and not requested:
        return draft

    sections = ["You are revising the translation."]
    if observations:
        sections.append("## OBSERVATIONS FROM CANON COMPARISON\n" + observations)
    if requested:
        sections.append("## USER EXTRA EDITS\n" + requested)
    sections.append("Return **only** the final revised translation text.")

    reply = client.responses.create(
        model=MODEL_MAIN,
        input=[
            {"role": "developer", "content": "\n\n".join(sections)},
            {"role": "user", "content": draft},
        ],
    )
    return reply.output_text.strip()


# Final pass: fold the canon comparison and the user's edit requests into the
# refined translation (live model call unless both inputs are blank).
final_translation = apply_edits(
    draft=current_translation,
    canon_analysis=analysis,
    user_edits=USER_EDITS
)

print("### Final-final translation ###\n")
print(final_translation)
