In [1]:
#pip install -U langchain # install LangChain
#pip install -U langchain-naver 
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from pathlib import Path
import json
import os

# Load variables from a local .env file (if present) into the environment,
# then read the CLOVA Studio API key from it (None when the variable is unset).
load_dotenv()
api_key = os.environ.get("CLOVASTUDIO_API_KEY")

# Parse the evaluation set from its JSON file into memory.
input_path = Path("../dataset/dev_eng_final.json")
dataset = json.loads(input_path.read_text(encoding="utf-8"))

In [3]:
from langchain_naver import ChatClovaX

# System message: the surrounding blank lines are stripped when the template is built.
system_prompt = """
You are an assistant that infers conversational context.
Based on the dialogue below, choose the correct answer.
"""

# Human message: placeholders {transcript}, {question} and {inferences} are
# filled in per item when the chain is invoked.
human_prompt = '''
[Conversation]
{transcript}

[Question]
Among the options below, {question}

[Options]
{inferences}
     
Do not output anything except A, B, or C.
Return your answer only — no additional text.
'''

integrated_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt.strip()),
        ("human", human_prompt),
    ]
)

# Chat model configuration: temperature 0 to keep answers as repeatable as
# possible, and a 60-second request timeout. The key is passed explicitly.
llm = ChatClovaX(
    api_key=api_key,
    model="HCX-005",
    temperature=0,
    timeout=60,
)

# Prompt template piped into the model; invoked with the three template variables.
chain = integrated_prompt | llm

In [4]:
def make_transcript(conversation: list[dict]) -> str:
    """Render a conversation as text, one "speaker <id>: <utterance>" line per turn.

    Each turn must provide the keys 'speaker' and 'translated_utterance'.
    An empty conversation yields an empty string.
    """
    lines = []
    for turn in conversation:
        who = turn['speaker']
        said = turn['translated_utterance']
        lines.append(f"speaker {who}: {said}")
    return "\n".join(lines)

def make_inferences(item: dict) -> str:
    """Format the three candidate inferences as lettered options.

    Reads 'inference_translated_1' .. 'inference_translated_3' from `item`
    and returns them as "A. ...", "B. ...", "C. ..." on separate lines.
    """
    options = []
    for letter, number in zip("ABC", (1, 2, 3)):
        key = f"inference_translated_{number}"
        options.append(f"{letter}. {item[key]}")
    return "\n".join(options)

In [15]:
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
import os
import time

def _append_rows(rows: list, csv_path: str) -> None:
    """Append result rows to `csv_path`; the header is written only for a new or empty file."""
    if not rows:
        return
    file_has_data = os.path.exists(csv_path) and os.path.getsize(csv_path) > 0
    pd.DataFrame(rows).to_csv(
        csv_path,
        mode="a",
        header=not file_has_data,
        index=False,
        encoding="utf-8-sig",
    )


def _normalize_answer(raw: str) -> str:
    """Reduce a model reply to "A", "B" or "C"; return "" if it does not start with an option letter."""
    # Tolerate light decoration around the letter: "A.", "(b)", "**C**", "a".
    text = raw.strip().lstrip("([*\"'` ")
    head = text[:1].upper()
    # The letter must stand alone, so a word such as "Answer" is not read as "A".
    if head in ("A", "B", "C") and not text[1:2].isalnum():
        return head
    return ""


def evaluate(
    dataset: list[dict],
    csv_path: str = "eng_HCX-005.csv",
    *,
    max_retries: int = 5,
    retry_sleep: float = 61,
    flush_every: int = 10,
) -> None:
    """
    Collect LLM answers (raw + mapped) and append them to `csv_path`.

    Resumes automatically: records whose id is already present in the CSV are
    skipped. Rows are written in batches, and anything still buffered is
    written when the function exits for any reason (normal completion, early
    stop on an error, or an interrupt).

    Args:
        dataset: Records with keys 'id', 'output' and 'input'; 'input' must
            hold 'conversation', 'question', 'category' and the three
            'inference_translated_N' entries.
        csv_path: Results log; created on first write.
        max_retries: Rate-limit retries allowed per item before giving up.
        retry_sleep: Seconds to wait after a rate-limit error.
        flush_every: Number of buffered rows that triggers a write.

    Returns:
        None. On a non-rate-limit error (or once retries are exhausted) a
        message is printed and the evaluation stops early.
    """
    # --- load ids from any existing log ---
    # Ids are compared as strings so values such as "001" survive the CSV
    # round-trip instead of being re-read as the integer 1.
    answered_ids = set()
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        logged = pd.read_csv(csv_path, dtype={"id": str}, encoding="utf-8-sig")
        answered_ids = set(logged["id"].dropna())

    letter_to_key = {"A": "inference_1", "B": "inference_2", "C": "inference_3"}

    # --- main loop ---
    buffer = []
    try:
        for record in tqdm(dataset, desc="Evaluating", unit="item"):
            if str(record["id"]) in answered_ids:
                continue

            inp = record["input"]
            prompt_vars = {
                "transcript": make_transcript(inp["conversation"]),
                "question": inp["question"],
                "inferences": make_inferences(inp),
            }

            # Retry on rate-limit (429) a bounded number of times.
            attempts = 0
            while True:
                try:
                    answer_raw = chain.invoke(prompt_vars).content.strip()
                    break
                except Exception as err:
                    message = str(err)
                    rate_limited = "429" in message or "rate exceeded" in message.lower()
                    if rate_limited and attempts < max_retries:
                        attempts += 1
                        print(f"[RateLimit] 429 encountered — sleeping {retry_sleep:g} s …")
                        time.sleep(retry_sleep)
                        continue
                    print(f"[ERROR] {err!r} — stopping early, progress saved.")
                    return  # abort whole evaluation; `finally` flushes the buffer

            # Map the option letter to the key used for the ground truth.
            llm_answer = letter_to_key.get(_normalize_answer(answer_raw), "")

            buffer.append({
                "id": record["id"],
                "category": inp["category"],
                "ground_truth": record["output"],
                "llm_answer_raw": answer_raw,
                "llm_answer": llm_answer,
            })

            if len(buffer) >= flush_every:
                _append_rows(buffer, csv_path)
                buffer.clear()
    finally:
        # Runs on every exit path so buffered rows are never lost.
        _append_rows(buffer, csv_path)

def print_results(csv_path: str = "eng_HCX-005.csv") -> None:
    """Print accuracy per category and overall from the results CSV.

    A row counts as correct when its 'ground_truth' equals its 'llm_answer'.
    Prints a notice and returns if the file is missing or has no rows.
    """
    if not os.path.exists(csv_path):
        print("No results file found — run evaluate() first.")
        return

    results = pd.read_csv(csv_path)
    if results.empty:
        print("Results file is empty.")
        return

    is_correct = results["ground_truth"] == results["llm_answer"]

    print("=== 문제 유형별 정답률 ===")
    for category, hits in is_correct.groupby(results["category"]):
        n_right, n_rows = hits.sum(), len(hits)
        print(f"{category}: {n_right}/{n_rows} = {n_right / n_rows:.2%}")

    total_right, total_rows = is_correct.sum(), len(results)
    print("\n=== 전체 정답률 ===")
    print(f"{total_right}/{total_rows} = {total_right / total_rows:.2%}")


In [16]:
# Run the evaluation over the loaded dataset; rows are appended to the
# function's default results CSV ("eng_HCX-005.csv").
evaluate(dataset)

Evaluating:  40%|███▉      | 60/151 [00:25<00:46,  1.96item/s]

[RateLimit] 429 encountered — sleeping 61 s …


Evaluating:  79%|███████▉  | 120/151 [01:49<00:10,  2.86item/s]

[RateLimit] 429 encountered — sleeping 61 s …


Evaluating: 100%|██████████| 151/151 [03:02<00:00,  1.21s/item]


In [17]:
# Print per-category and overall accuracy from the default results CSV.
print_results()

=== 문제 유형별 정답률 ===
동기: 22/25 = 88.00%
반응: 21/25 = 84.00%
원인: 19/25 = 76.00%
전제: 21/25 = 84.00%
후행사건: 42/51 = 82.35%

=== 전체 정답률 ===
125/151 = 82.78%
