In [2]:
import json
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Load variables from a local .env file into the environment.
# OPENAI_API_KEY must be defined there (or already be set), otherwise the
# os.environ lookup below raises KeyError.
load_dotenv()

# Chat model used for the translation requests in this notebook.
# temperature=0 asks for the least sampling randomness; request_timeout=60
# bounds each request (presumably seconds — confirm against langchain-openai).
llm = ChatOpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    model="gpt-4o-mini",
    request_timeout=60,
    temperature=0,
)

In [5]:
# Instruction given to the model as the system message. The surrounding blank
# lines of the triple-quoted literal are stripped when the template is built.
system_prompt = """
You are a professional Korean-to-English translator.
Translate the given sentence into natural English.
\"화자 n\" in the given sentence should be translated to \"speaker n\".
Return the translated sentence only — no additional text.
"""

# Two-message chat template: the fixed system instruction followed by a human
# message whose `{source}` placeholder is filled with the sentence to translate.
system_message = ("system", system_prompt.strip())
human_message = ("human", "{source}")
translate_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Dataset locations, relative to the notebook's working directory.
output_path = Path("../dataset/dev_eng_final.json")           # where results are saved
input_path = Path("../dataset/dev_eng_question_added.json")  # existing input file

# Read the whole input file as UTF-8 text and parse it as JSON.
dataset = json.loads(input_path.read_text(encoding="utf-8"))

In [6]:
# ─────────────────────────────────────────────────────────────
# 4) Translate inference_1..3 with a tqdm progress bar
# ─────────────────────────────────────────────────────────────
# Collect every non-empty inference sentence once. The progress-bar total and
# the translation loop are then driven by the same list, so the "is there a
# sentence?" test cannot drift between counting and translating.
pending = []
for sample in dataset:
    for idx in range(1, 4):
        korean_sentence = sample["input"].get(f"inference_{idx}", "")
        if korean_sentence:
            pending.append((sample, idx, korean_sentence))

# Total number of sentences to translate (inference_1~3).
total_sentences = len(pending)

# The prompt -> model pipeline is identical for every sentence; build it once
# instead of on each loop iteration.
chain = translate_prompt | llm

# One (source key, error message) pair per request that raised.
failed_translations = []

# Using the bar as a context manager closes it even if the run is interrupted
# or an unexpected error escapes the loop.
with tqdm(total=total_sentences, desc="Translating inferences", unit="sentence") as progress:
    for sample, idx, korean_sentence in pending:
        key = f"inference_{idx}"
        trans_key = f"inference_translated_{idx}"

        try:
            response = chain.invoke({"source": korean_sentence})
            english = response.content.strip()
        except Exception as e:
            # Best effort: keep going so one failed request does not discard
            # the rest of the run. The "[Error] ..." marker format is kept
            # as-is, but the failure is also recorded so it can be reported.
            english = f"[Error] {str(e)}"
            failed_translations.append((key, str(e)))

        # Store the translation (or error marker) next to the source sentence.
        sample["input"][trans_key] = english
        progress.update(1)

# Surface failures explicitly; otherwise error markers would only be found by
# inspecting the saved data.
if failed_translations:
    print(f"⚠️ 번역 실패 {len(failed_translations)}건 — '[Error]'로 시작하는 항목을 확인하세요.")

# ─────────────────────────────────────────────────────────────
# 5) Save the results
# ─────────────────────────────────────────────────────────────
# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
# escapes; indent=2 pretty-prints the JSON.
serialized = json.dumps(dataset, ensure_ascii=False, indent=2)
output_path.write_text(serialized, encoding="utf-8")

print(f"✅ 번역 완료 · 저장 경로: {output_path.resolve()}")

Translating inferences: 100%|██████████| 453/453 [08:01<00:00,  1.06s/sentence]

✅ 번역 완료 · 저장 경로: /Users/taeyoonkwack/Documents/HCLT-KACL-2025/dataset/dev_eng_final.json



