In [1]:
"""
마크다운 논문 번역 파이프라인
- Title, ABSTRACT 추출 → 한 프롬프트로 번역
- 나머지 섹션은 각각 프롬프트로 번역
- OpenAI chat.completions API (gpt-5-mini) 사용
- 결과를 섹션별 .md와 전체 합본으로 저장
"""

import os
import re
import argparse
from pathlib import Path
from typing import List, Tuple, Dict, Optional
from openai import OpenAI

In [2]:
# ========== 유틸 ==========

# Matches a whole line that begins with 1-6 '#' characters.
# group(1) is the run of '#' (its length is the heading level) and group(2)
# is the heading text with surrounding whitespace trimmed.
# Whitespace after the '#' run is optional, so a line such as "#tag" also
# matches as a level-1 heading.
HEADING_RE = re.compile(r'^(#{1,6})\s*(.+?)\s*$', re.UNICODE)

def normalize_heading_name(name: str) -> str:
    """
    Normalize heading text so headings can be compared by name.

    Steps:
    - replace non-breaking spaces with plain spaces and collapse whitespace
    - drop a leading section number, including multi-level numbers
      ("1 INTRODUCTION", "1. INTRODUCTION", "2.1 Related Work")
    - upper-case the result and drop trailing periods

    A leading digit run only counts as a section number when it is followed
    by whitespace or one of ``. ) - :``; a heading such as
    "3D Reconstruction" therefore keeps its digits.

    Args:
        name: Raw heading text (without the leading ``#`` markers).

    Returns:
        The normalized, upper-cased heading name.
    """
    # Written as an escape so the non-breaking space is visible in the source.
    name = name.replace('\u00a0', ' ')
    name = re.sub(r'\s+', ' ', name).strip()
    # Section number: digits, optional ".digits" groups, then a separator
    # (punctuation with optional space, or at least one space).
    name = re.sub(r'^\d+(?:\.\d+)*(?:[\.\)\-:]\s*|\s+)', '', name)
    return name.strip().upper().rstrip('.')

def parse_markdown_sections(md_text: str) -> List[Dict]:
    """
    Split markdown text into a flat list of heading-delimited sections.

    Every line matched by ``HEADING_RE`` starts a new section; the section
    body is the text up to the next heading. Lines inside fenced code blocks
    (delimited by three or more backticks or tildes) are never treated as
    headings, so a ``# comment`` inside a code sample stays in the body of
    the section that contains it. Text before the first heading is dropped.

    Args:
        md_text: Full markdown document.

    Returns:
        A list of dicts with the keys ``level`` (number of ``#``), ``title``
        (heading text), ``title_norm`` (``normalize_heading_name(title)``)
        and ``content`` (body text with trailing whitespace removed).
    """
    # Opening/closing fence: up to three spaces of indent, then ``` or ~~~.
    fence_re = re.compile(r'^ {0,3}(`{3,}|~{3,})')

    sections: List[Dict] = []
    current: Optional[Dict] = None
    body: List[str] = []  # body lines of the section being built
    open_fence = ""       # marker that opened the current fence, "" if none

    for line in md_text.splitlines():
        fence = fence_re.match(line)
        if fence:
            marker = fence.group(1)
            if not open_fence:
                open_fence = marker
            elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
                # A fence is closed by the same character, at least as long.
                open_fence = ""

        # Fence delimiters and fenced lines are always plain body text.
        heading = None if (fence or open_fence) else HEADING_RE.match(line)

        if heading is None:
            # Text that appears before the first heading is ignored.
            if current is not None:
                body.append(line)
            continue

        # A heading closes the section in progress and opens a new one.
        if current is not None:
            current["content"] = "\n".join(body).rstrip()
            sections.append(current)
        title = heading.group(2).strip()
        current = {
            "level": len(heading.group(1)),
            "title": title,
            "title_norm": normalize_heading_name(title),
            "content": "",
        }
        body = []

    if current is not None:
        current["content"] = "\n".join(body).rstrip()
        sections.append(current)

    return sections

def find_title(sections: List[Dict]) -> Optional[str]:
    """Return the title of the first section, or None when there are no sections."""
    if not sections:
        return None
    first = sections[0]
    return first["title"]

def pick_section(sections: List[Dict], target_names: List[str]) -> Optional[Dict]:
    """
    Return the first section whose normalized title is one of the targets.

    The target names are upper-cased before comparison against each
    section's ``title_norm``. Example targets: ["ABSTRACT", "INTRODUCTION"].
    Returns None when no section matches.
    """
    wanted = {candidate.upper() for candidate in target_names}
    matches = (section for section in sections if section["title_norm"] in wanted)
    return next(matches, None)

def is_reserved_section(title_norm: str) -> bool:
    """
    Tell whether a normalized heading names a reserved section.

    Reserved sections are the ones callers may exclude from translation;
    the set currently holds REFERENCES and BIBLIOGRAPHY and can be extended.
    """
    reserved = frozenset(("REFERENCES", "BIBLIOGRAPHY"))
    return title_norm in reserved

In [None]:
# ========== 번역 관련 ==========

# def build_developer_message() -> str:
#     return (
#         "역할: 영어 학술 텍스트를 정확하고 매끄러운 학술 한국어로 번역하세요.\n"
#         "규칙:\n"
#         "1) 원문의 마크다운 구조(헤딩, 목록, 표, 인라인 코드, 수식 $...$)를 보존하세요.\n"
#         "2) 고유명사/모델/데이터셋/라이브러리명(TabM, MLP, BatchEnsemble 등)은 번역하지 마세요.\n"
#         "3) 참고문헌 표기 (예: (Wen et al., 2020))와 URL은 원문 그대로 두세요.\n"
#         "4) 약어는 첫 등장에 한해 한국어-영문 병기: 예) 기계학습(ML), 딥러닝(DL).\n"
#         "5) 존댓말/높임 없이 학술 문체로 간결하고 일관되게 번역하세요.\n"
#         "6) 표/그림 캡션, 변수/기호는 의미가 바뀌지 않도록 주의하세요.\n"
#     )

def build_developer_message() -> str:
    """
    Return the fixed developer (system-level) prompt for the translator.

    The prompt is a constant English instruction text: translate markdown
    from English to Korean while keeping headers, figures, tables,
    technical terms and formatting, with three worked examples.
    """
    return '''Translate markdown documents written in English into Korean, following the detailed rules below to preserve formatting, structure, and technical accuracy.
## Detailed Translation Rules

### 1. Headers
- For lines beginning with `#`, `##`, `###`, etc., translate the header text to Korean, but retain the markdown level (number of `#`).
- Example:
  - Input: `## Introduction`
  - Output: `## 소개`

### 2. Figures (Images & Captions)
- Image links (`![...](...)`):
  - Retain the image markdown as-is; do not translate or alter the filename or alt text.
  - If identical image links are repeated in succession, keep only one instance.
- Figure captions (starting with “Figure X:” immediately below an image):
  - Collect all caption sentences directly beneath the same image, treating `.`, `?`, `!` as sentence boundaries except within citations (e.g., "et al. (2020)").
  - Translate each sentence only once into Korean, omitting duplicated translations.
  - Combine translated sentences into a single caption block:
    - Format: `그림 X: <Korean sentence 1> <Korean sentence 2> ...`
    - Ensure only one such caption per figure image.

### 3. Tables
- For tables written in markdown syntax:
  - Do not translate any of the contents; output them exactly as in the source.
  - If tables are duplicated, retain only a single instance.

### 4. Regular Text Translation
- For all non-header, non-figure, and non-table text:
  - Segment by sentence (using `.`, `?`, `!`), ensuring citations (like "et al. (YEAR)", "(YEAR)." etc.) are not incorrectly split.
  - Translate each English sentence into Korean one-to-one:
    - Do not merge multiple English sentences into one, or split a single English sentence into multiple Korean ones.
    - Do not repeat any Korean translation for the same English text.
    - Do not omit any sentences.
  - Preserve original markdown formatting for lists, equations, links, images, tables, etc.
  - Leave blank lines between sections as in the original document.

### 5. Overall Common Rules
- No duplicate Korean translations for any content.
- Do not omit any sentence or structural element.
- Retain all markdown formatting of the original document, including lists, mathematics, links, images, and tables.
- Preserve technical terms, abbreviations, and model names (such as MLP, TabM, Transformer) in English, without translating them.

---

### Output Format

- Output should consist of the fully translated markdown document.
- Retain the original markdown structure, formatting, and all blank lines as in the source.
- Do not include any additional commentary or metadata.

---

### Examples

#### Example 1: Header, Image, and Caption

**Input:**
```
## Results

![Sample image](sample.png)
Figure 1: The MLP outperforms other models. This result is consistent with Smith et al. (2021).
```

**Output:**
```
## 결과

![Sample image](sample.png)
그림 1: MLP가 다른 모델보다 뛰어난 성능을 보였다. 이 결과는 Smith et al. (2021)과 일치한다.
```

#### Example 2: Table and Regular Text

**Input:**
```
Table 1 shows model accuracy.

| Model | Accuracy |
|-------|----------|
| MLP   | 89%      |

Table 1: Test results.
```

**Output:**
```
표 1은 모델 정확도를 보여준다.

| Model | Accuracy |
|-------|----------|
| MLP   | 89%      |

표 1: 테스트 결과.
```

#### Example 3: Sentence Segmentation with Citations

**Input:**
```
This approach was first introduced by Wang et al. (2020). It improves on previous work. See Table 2.
```

**Output:**
```
이 접근법은 Wang et al. (2020)에 의해 처음 도입되었다. 이는 이전 연구보다 향상된 점이 있다. 표 2를 참조.
```
(Actual examples should reflect realistic document length and structure, using placeholders as necessary.)

---

**Important Reminders:**
- Translate only the text as specified, preserving formatting.
- Avoid duplicates, omissions, or inappropriate merges/splits.
- Retain technical terms, abbreviations, and model names in English.

---

**Task Objective Recap:**
Translate markdown documents from English to Korean while preserving headers, tables, images, captions, technical terms, and formatting as described above. Ensure 1:1 sentence correspondence, avoid duplicates, and do not omit any content.'''

def build_user_prompt(title_block: str, text_block: str) -> str:
    """
    Assemble the user message sent with each translation request.

    The message holds a fixed Korean instruction line, a meta-information
    block containing the stripped ``title_block``, and the stripped
    ``text_block`` wrapped between start/end markers.
    """
    meta = title_block.strip()
    source = text_block.strip()
    pieces = [
        "아래 텍스트를 한국어로 번역하세요. 위 지침을 반드시 따르세요.\n\n",
        "=== 메타 정보 ===\n",
        f"{meta}\n\n",
        "=== 원문 시작 ===\n",
        f"{source}\n",
        "=== 원문 끝 ===",
    ]
    return "".join(pieces)

def openai_translate(client: OpenAI, text: str, meta_title: str) -> str:
    """
    Translate one markdown block through the chat.completions API.

    Args:
        client: OpenAI client used to issue the request.
        text: Markdown source to translate.
        meta_title: Short label (title or section name) placed in the meta
            block of the user prompt.

    Returns:
        The translated markdown with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the response carries no choices or no text content.
    """
    dev_msg = build_developer_message()
    user_msg = build_user_prompt(meta_title, text)
    # Both prompts are echoed in full on every call for debugging.
    print(f'Develope Prompt: \n{dev_msg}\n')
    print(f'User Prompt: \n{user_msg}\n')

    resp = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "developer", "content": [{"type": "text", "text": dev_msg}]},
            {"role": "user", "content": [{"type": "text", "text": user_msg}]},
        ],
        response_format={"type": "text"},
        verbosity="low",
        reasoning_effort="minimal",
        store=False,
    )

    # NOTE(review): the API reference types message.content as nullable
    # (e.g. a refusal); fail with a clear error instead of an AttributeError
    # on None.strip().
    if not resp.choices:
        raise RuntimeError("모델 응답에 choices가 없습니다.")
    choice = resp.choices[0]
    content = choice.message.content
    if content is None:
        reason = getattr(choice, "finish_reason", None)
        raise RuntimeError(
            f"모델이 번역 텍스트를 반환하지 않았습니다. (finish_reason={reason})"
        )
    return content.strip()

# ========== 파일 저장 ==========

def safe_slug(s: str) -> str:
    """
    Turn a heading into a lowercase, filename-friendly slug.

    The heading is normalized with ``normalize_heading_name`` and
    lower-cased; every character other than a-z, 0-9, '-', '_' and
    whitespace is removed, whitespace runs become single hyphens, and
    leading/trailing hyphens are trimmed. Falls back to "section" when
    nothing is left.
    """
    lowered = normalize_heading_name(s).lower()
    kept = re.sub(r'[^a-z0-9\-_\s]', '', lowered)
    slug = re.sub(r'\s+', '-', kept).strip('-')
    if slug:
        return slug
    return "section"

def write_text(path: Path, text: str):
    """Write ``text`` to ``path`` as UTF-8, creating missing parent directories."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)

# ========== 메인 파이프라인 ==========

def translate_markdown(
    input_path: Path,
    outdir: Path,
    skip_reserved: bool = True,
    join_output_filename: str = "korean_full.md",
    api_key: Optional[str] = None,
):
    """
    Translate a markdown paper section by section and save the results.

    The title and the ABSTRACT section are translated in one request; every
    other section gets its own request. Each translation is written to
    ``outdir`` as a numbered ``.ko.md`` file, and all translations are also
    joined into ``join_output_filename``.

    Args:
        input_path: Markdown file to translate (read as UTF-8).
        outdir: Directory that receives the per-section files and the
            combined file.
        skip_reserved: When True, sections for which
            ``is_reserved_section`` is true are not translated.
        join_output_filename: File name of the combined output inside
            ``outdir``.
        api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
            environment variable is used.

    Raises:
        RuntimeError: If no API key is available, or if the input contains
            no markdown headings.
    """
    # Resolve the key explicitly: argument first, then the environment.
    if api_key is None:
        api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("환경변수 OPENAI_API_KEY가 설정되지 않았습니다.")

    md_text = input_path.read_text(encoding="utf-8")
    sections = parse_markdown_sections(md_text)

    if not sections:
        raise RuntimeError("헤딩( # / ## / ### )이 있는 마크다운 구조를 찾지 못했습니다.")

    doc_title = find_title(sections) or "Untitled"
    abstract_sec = pick_section(sections, ["ABSTRACT"])

    client = OpenAI(api_key=api_key)

    outputs = []  # (filename, ko_markdown)

    # 1) Translate the title together with the abstract.
    title_block = f"Title: {doc_title}"
    title_abstract_src = f"# {doc_title}"
    if abstract_sec is not None:
        # Only add the ABSTRACT heading when the document actually has one,
        # so the model is never handed an empty section to fill in.
        title_abstract_src += f"\n\n## ABSTRACT\n\n{abstract_sec['content']}"
    title_abstract_src = title_abstract_src.strip()
    print("[번역] Title + ABSTRACT ...")
    ko_title_abstract = openai_translate(client, title_abstract_src, title_block)
    print(f'[번역 결과]: \n {ko_title_abstract}\n')
    fname0 = outdir / "00_title_abstract.ko.md"
    write_text(fname0, ko_title_abstract)
    outputs.append((fname0.name, ko_title_abstract))

    # 2) Translate the remaining sections one by one. The first section is
    #    the title heading (handled in step 1); ABSTRACT was handled there
    #    too, and reserved sections are dropped when skip_reserved is set.
    remaining = [
        s for s in sections[1:]
        if s["title_norm"] != "ABSTRACT"
        and not (skip_reserved and is_reserved_section(s["title_norm"]))
    ]
    for numbered, s in enumerate(remaining, start=1):
        src = f"{'#' * s['level']} {s['title']}\n\n{s['content']}".strip()
        meta = f"Section: {s['title']}"
        print(f"[번역] Section -> {s['title']}")

        ko = openai_translate(client, src, meta)
        print(f'[번역 결과]: \n {ko}\n')
        slug = safe_slug(s['title'])
        fname = outdir / f"{numbered:02d}_{slug}.ko.md"
        write_text(fname, ko)
        outputs.append((fname.name, ko))

    # 3) Save the combined document.
    combined = "\n\n".join(text for _, text in outputs)
    write_text(outdir / join_output_filename, combined)

    print("\n완료 ✅")
    print(f"- 섹션별 파일: {outdir}")
    print(f"- 합본: {outdir / join_output_filename}")

In [16]:
# Notebook driver: translate ./data/<md_name>.md and write the results
# into the same ./data directory.
md_name = 'paper2'

outdir = Path('./data').expanduser()
input_path = outdir / f'{md_name}.md'
print(f'Input path: {input_path}, Output path: {outdir}')

# skip_reserved=False: reserved sections are translated as well.
translate_markdown(
    input_path,
    outdir,
    skip_reserved=False,
    join_output_filename=f"{md_name}_korean.md",
)

Input path: data/paper2.md, Output path: data
[번역] Title + ABSTRACT ...
Develope Prompt: 
Translate markdown documents written in English into Korean, following the detailed rules below to preserve formatting, structure, and technical accuracy.
## Detailed Translation Rules

### 1. Headers
- For lines beginning with `#`, `##`, `###`, etc., translate the header text to Korean, but retain the markdown level (number of `#`).
- Example:
  - Input: `## Introduction`
  - Output: `## 소개`

### 2. Figures (Images & Captions)
- Image links (`![...](...)`):
  - Retain the image markdown as-is; do not translate or alter the filename or alt text.
  - If identical image links are repeated in succession, keep only one instance.
- Figure captions (starting with “Figure X:” immediately below an image):
  - Collect all caption sentences directly beneath the same image, treating `.`, `?`, `!` as sentence boundaries except within citations (e.g., "et al. (2020)").
  - Translate each sentence only once int

In [None]:
# ========== CLI ==========
def main():
    """
    Command-line entry point.

    Parses --input, --outdir and --no-skip-reserved, then runs
    ``translate_markdown`` with user-expanded paths. Reserved sections are
    skipped unless --no-skip-reserved is given.
    """
    parser = argparse.ArgumentParser(description="Markdown 논문 번역기 (Title+Abstract 묶음, 나머지 섹션별)")
    parser.add_argument("--input", type=str, required=True, help="입력 마크다운 파일 경로 (.md)")
    parser.add_argument("--outdir", type=str, required=True, help="번역 결과 출력 폴더")
    parser.add_argument("--no-skip-reserved", action="store_true", help="REFERENCES/APPENDIX 등도 번역")
    opts = parser.parse_args()

    translate_markdown(
        input_path=Path(opts.input).expanduser(),
        outdir=Path(opts.outdir).expanduser(),
        skip_reserved=not opts.no_skip_reserved,
    )


if __name__ == "__main__":
    main()