In [14]:
#pip install -U langchain # install LangChain
#pip install -U langchain-naver 
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from pathlib import Path
import json
import os

# Load variables from a local .env file (if one exists), then read the CLOVA Studio key.
# os.getenv returns None when the variable is not set.
load_dotenv()
api_key = os.getenv("CLOVASTUDIO_API_KEY")

# Evaluation set, resolved relative to the notebook's working directory.
input_path = Path("../dataset/dev_question_added.json")

# Read the whole file as UTF-8 text and parse it as JSON in one step.
dataset = json.loads(input_path.read_text(encoding="utf-8"))

In [15]:
from langchain_naver import ChatClovaX

# System message: frames the model as a dialogue-context reasoning assistant.
# It is stripped of the surrounding blank lines before being put into the template.
system_prompt = """
당신은 대화 맥락을 추론하는 assistant입니다. 
아래의 대화 내용을 참고하여 문제의 정답을 맞추시오. 
"""

# Two-message chat template. The human message is filled with three variables:
#   {transcript} - the conversation text
#   {question}   - the question to answer
#   {inferences} - the lettered answer choices
integrated_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt.strip()),
    ("human", '''
[대화]
{transcript}

[질문]
아래의 선지 중, {question}

[선지]
{inferences}
A, B, C 를 제외한 그 무엇도 출력하지 마시오. 
Return your answer only — no additional text.
'''),
])

# CLOVA X chat model used to answer the questions.
llm = ChatClovaX(
    model="HCX-005",   # general-purpose chat model
    temperature=0,     # zero temperature for deterministic answers
    timeout=60,        # per-request timeout
    api_key=api_key,   # may be omitted when the key is already in the environment
)

# Runnable pipeline: fill the prompt, then send it to the model.
chain = integrated_prompt | llm

In [16]:
def make_transcript(conversation: list[dict]) -> str:
    """Render a conversation as text, one "화자 <speaker>: <utterance>" line per turn.

    Args:
        conversation: Turns in order; each turn is a mapping with the keys
            "speaker" and "utterance".

    Returns:
        The rendered lines joined with newlines (an empty string for no turns).
    """
    lines = []
    for turn in conversation:
        lines.append(f"화자 {turn['speaker']}: {turn['utterance']}")
    return "\n".join(lines)

def make_inferences(item: dict) -> str:
    """Format the three candidate inferences as lettered choices A/B/C.

    Args:
        item: Mapping holding the keys "inference_1", "inference_2" and
            "inference_3".

    Returns:
        Three lines ("A. ...", "B. ...", "C. ...") separated by newlines, with
        no trailing newline.
    """
    choice_a = f"A. {item['inference_1']}\n"
    choice_b = f"B. {item['inference_2']}\n"
    choice_c = f"C. {item['inference_3']}"
    return choice_a + choice_b + choice_c

In [22]:
from tqdm import tqdm  
from collections import defaultdict
import pandas as pd

def _load_answered_ids(csv_path: str) -> set:
    """Return the ids already logged in csv_path, as strings.

    A missing or zero-byte file yields an empty set. The "id" column is read
    as text so that ids such as "007" are not coerced to numbers and can be
    compared reliably with the dataset's ids.
    """
    if not os.path.exists(csv_path):
        return set()
    try:
        # utf-8-sig matches the encoding used when the log is written.
        logged = pd.read_csv(csv_path, dtype={"id": str}, encoding="utf-8-sig")
    except pd.errors.EmptyDataError:
        # File exists but has no content at all (not even a header).
        return set()
    return set(logged["id"])


def _append_rows(rows: list[dict], csv_path: str) -> None:
    """Append rows to csv_path, writing the header only to a new or empty file."""
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    pd.DataFrame(rows).to_csv(
        csv_path,
        mode="a",
        header=needs_header,
        index=False,
        encoding="utf-8-sig",
    )


def evaluate(dataset: list[dict], csv_path: str = "results.csv", flush_every: int = 10) -> None:
    """Run the chain only for items not yet answered and append to csv_path.

    Each record needs an "id", an "output" (the ground truth) and an "input"
    mapping with "conversation", "question", "category" and the three
    "inference_*" entries. Records whose id is already present in csv_path
    are skipped, so an interrupted run can be resumed by calling this again.

    Args:
        dataset: Records to evaluate.
        csv_path: CSV log with the columns id, category, ground_truth and
            llm_answer. Created on first write.
        flush_every: Number of buffered rows that triggers a write to disk.

    Returns:
        None. Results are written to csv_path.

    Notes:
        Any exception raised by the chain call (for example a rate-limit
        error) is printed and ends the run early instead of propagating.
        Rows still in the buffer are written even if the loop is left through
        an exception or a keyboard interrupt.
    """
    answered_ids = _load_answered_ids(csv_path)
    buffer: list[dict] = []  # row buffer to reduce I/O

    try:
        for record in tqdm(dataset, desc="Evaluating", unit="item"):
            # Compare as strings: the ids read back from the CSV are strings.
            if str(record["id"]) in answered_ids:
                continue  # already done

            inp = record["input"]
            prompt_vars = {
                "transcript": make_transcript(inp["conversation"]),
                "question": inp["question"],
                "inferences": make_inferences(inp),
            }

            # Stop cleanly on API issues (e.g. RateLimitError 429).
            try:
                answer = chain.invoke(prompt_vars).content.strip()  # "A" | "B" | "C"
            except Exception as err:
                print(f"[ERROR] {err!r} — stopping early, progress saved.")
                break

            buffer.append({
                "id": record["id"],
                "category": inp["category"],
                "ground_truth": record["output"],
                "llm_answer": answer,
            })

            if len(buffer) >= flush_every:
                _append_rows(buffer, csv_path)
                buffer.clear()
    finally:
        # Flush any remainder, including when the loop exits abnormally, so
        # answers that were already obtained are not lost.
        if buffer:
            _append_rows(buffer, csv_path)

def print_results(csv_path: str = "results.csv") -> None:
    """Load csv_path and display accuracy by category + overall.

    A row counts as correct when its "ground_truth" value equals its
    "llm_answer" value exactly.

    Args:
        csv_path: CSV log with at least the columns category, ground_truth
            and llm_answer.

    Returns:
        None. The report is printed to stdout. A missing file, a zero-byte
        file and a header-only file each print a short notice instead.
    """
    if not os.path.exists(csv_path):
        print("No results file found — run evaluate() first.")
        return

    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; report it as empty
        # instead of letting the parser error escape.
        df = pd.DataFrame()

    if df.empty:
        print("Results file is empty.")
        return

    df["correct"] = df["ground_truth"] == df["llm_answer"]

    print("=== 문제 유형별 정답률 ===")
    for cat, group in df.groupby("category"):
        corr = group["correct"].sum()
        tot = len(group)
        print(f"{cat}: {corr}/{tot} = {corr / tot:.2%}")

    overall_corr = df["correct"].sum()
    overall_tot = len(df)
    print("\n=== 전체 정답률 ===")
    print(f"{overall_corr}/{overall_tot} = {overall_corr / overall_tot:.2%}")


In [24]:
# Run the evaluation over the loaded dataset. No csv_path is passed, so rows
# are appended to the default "results.csv" and already-logged ids are skipped.
evaluate(dataset)

Evaluating:  37%|███▋      | 56/151 [00:01<00:03, 30.85item/s]

[ERROR] RateLimitError("Error code: 429 - {'error': {'message': 'Too many requests - rate exceeded', 'code': '42901'}}") — stopping early, progress saved.





In [None]:
# Report per-category and overall accuracy from the default "results.csv".
print_results()