In [1]:
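#pip install -U langchain # install LangChain
#pip install -U langchain-openai python-dotenv pandas tqdm  # packages imported by this notebook
#pip install -U langchain-naver  # NOTE: not imported anywhere in this notebook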
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI         # ← OpenAI chat wrapper
from langchain_core.prompts import ChatPromptTemplate
from pathlib import Path
import json
import os

# Load variables from a local .env file (if any) into the process environment;
# the OpenAI key is read from the environment further down.
load_dotenv()

# Evaluation data; the path is relative to the notebook's working directory.
input_path = Path("../dataset/dev_question_added.json")

# Read the file as UTF-8 text and parse it as JSON in one step.
dataset = json.loads(input_path.read_text(encoding="utf-8"))

In [None]:
# System message. Surrounding blank lines are stripped when the template is
# built below; the text itself is sent to the model unchanged.
system_prompt = """
당신은 대화 맥락을 추론하는 assistant입니다. 
아래의 대화 내용을 참고하여 문제의 정답을 맞추시오. 
"""

# Human message template. {transcript}, {question} and {inferences} are the
# variables supplied on each chain.invoke(...) call.
human_prompt = """
[대화]
{transcript}

[질문]
아래의 선지 중, {question}

[선지]
{inferences}
A, B, C 를 제외한 그 무엇도 출력하지 마시오. 
Return your answer only — no additional text.
"""

integrated_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt.strip()),
        ("human", human_prompt),
    ]
)

# Chat model client: temperature 0 to minimise sampling randomness, and a
# request timeout of 60.
llm = ChatOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),  # must be present in the environment / .env
    model="gpt-4o-mini",
    temperature=0,
    timeout=60,
)

# Prompt → model pipeline used by the evaluation loop.
chain = integrated_prompt | llm

In [3]:
def make_transcript(conversation: list[dict]) -> str:
    """Render a conversation as one ``speaker <id>: <utterance>`` line per turn.

    Args:
        conversation: Turns in order; each turn is a mapping with
            ``"speaker"`` and ``"utterance"`` keys.

    Returns:
        The rendered lines joined with newlines (an empty string when there
        are no turns).
    """
    lines = []
    for turn in conversation:
        speaker, utterance = turn["speaker"], turn["utterance"]
        lines.append(f"speaker {speaker}: {utterance}")
    return "\n".join(lines)

def make_inferences(item: dict) -> str:
    """Format the three candidate inferences as lettered options.

    Args:
        item: Mapping holding ``"inference_1"``, ``"inference_2"`` and
            ``"inference_3"``; any other keys are ignored.

    Returns:
        Three newline-separated lines of the form ``A. ...``, ``B. ...``,
        ``C. ...``.
    """
    labelled = zip("ABC", ("inference_1", "inference_2", "inference_3"))
    return "\n".join(f"{letter}. {item[key]}" for letter, key in labelled)

In [4]:
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
import os
import time

def _append_rows(rows: list[dict], csv_path: str) -> None:
    """Append `rows` to `csv_path`; the header is written only for a new file."""
    if not rows:
        return
    pd.DataFrame(rows).to_csv(
        csv_path,
        mode="a",
        header=not os.path.exists(csv_path),
        index=False,
        encoding="utf-8-sig",
    )


def _extract_choice(raw: str) -> str:
    """Return the option letter ("A", "B" or "C") named in a model reply, else ""."""
    import re  # local import so this cell does not depend on another cell's imports

    # Replies that are a single letter plus punctuation/whitespace, any case.
    bare = raw.strip().strip(".,:;()[]\"'` ").upper()
    if bare in ("A", "B", "C"):
        return bare

    # Otherwise accept an upper-case letter that is not part of a Latin word
    # (so "Answer: B" yields B, not the A of "Answer"). Replies naming more
    # than one distinct option are treated as unparseable.
    found = set(re.findall(r"(?<![A-Za-z])[ABC](?![A-Za-z])", raw))
    return found.pop() if len(found) == 1 else ""


def evaluate(
    dataset: list[dict],
    csv_path: str = "kor_gpt-4o-mini.csv",
    max_retries: int = 5,
    retry_wait: float = 61,
) -> None:
    """
    Collect LLM answers (raw + mapped) and append them to `csv_path`.

    Each logged row holds the record id, the rendered transcript, category,
    question, the lettered options, the ground truth, the raw model reply, the
    inference key it maps to, and an O/X correctness flag. Records whose id is
    already present in `csv_path` are skipped, so an interrupted run can be
    resumed by calling the function again.

    Args:
        dataset: Records with an ``"id"``, an ``"input"`` mapping
            (``"conversation"``, ``"question"``, ``"category"``,
            ``"inference_1..3"``) and an ``"output"`` ground-truth key.
        csv_path: Results log; created on first write, appended to afterwards.
        max_retries: How many times a single record is retried after a
            rate-limit error before the run is stopped.
        retry_wait: Seconds to sleep before each rate-limit retry.

    Returns:
        None. On a non-rate-limit error, or once `max_retries` is exhausted,
        the error is printed and the function returns early. Buffered rows are
        written to `csv_path` on every exit path, including exceptions and
        keyboard interrupts.
    """

    # --- ids already logged (read as strings so "001" is not re-parsed as 1) ---
    if os.path.exists(csv_path):
        logged = pd.read_csv(
            csv_path, usecols=["id"], dtype={"id": str}, encoding="utf-8-sig"
        )
        answered_ids = set(logged["id"])
    else:
        answered_ids = set()

    letter_to_key = {"A": "inference_1", "B": "inference_2", "C": "inference_3"}

    # --- main loop ---
    buffer = []
    try:
        for record in tqdm(dataset, desc="Evaluating", unit="item"):
            if str(record["id"]) in answered_ids:
                continue

            inp = record["input"]
            transcript_str = make_transcript(inp["conversation"])
            inferences_str = make_inferences(inp)

            prompt_vars = {
                "transcript": transcript_str,
                "question": inp["question"],
                "inferences": inferences_str,
            }

            # Retry on rate-limit (429) a bounded number of times; anything
            # else, or a persistent 429, stops the run.
            attempts = 0
            while True:
                try:
                    answer_raw = chain.invoke(prompt_vars).content.strip()
                    break
                except Exception as err:
                    message = str(err)
                    rate_limited = (
                        "429" in message or "rate exceeded" in message.lower()
                    )
                    if rate_limited and attempts < max_retries:
                        attempts += 1
                        print(
                            f"[RateLimit] 429 encountered — sleeping {retry_wait:g} s … "
                            f"(retry {attempts}/{max_retries})"
                        )
                        time.sleep(retry_wait)
                        continue
                    print(f"[ERROR] {err!r} — stopping early, progress saved.")
                    return  # the `finally` below writes the buffered rows

            # Map the reply to an inference key; tolerant of "A.", "정답: B", etc.
            key = letter_to_key.get(_extract_choice(answer_raw), "")
            is_right = "O" if key == record["output"] else "X"

            buffer.append({
                "id": record["id"],
                "transcript": transcript_str,
                "category": inp["category"],
                "question": inp["question"],
                "inferences": inferences_str,
                "ground_truth": record["output"],
                "llm_answer_raw": answer_raw,
                "llm_answer": key,
                "is_right": is_right,
            })

            # Flush every 10 rows so little work is lost if the kernel dies.
            if len(buffer) >= 10:
                _append_rows(buffer, csv_path)
                buffer.clear()
    finally:
        # Leftover rows are written on normal completion, early return,
        # KeyboardInterrupt, or an unexpected exception alike.
        _append_rows(buffer, csv_path)

def print_results(csv_path: str = "kor_gpt-4o-mini.csv") -> None:
    """Print accuracy per category, then overall, from the results CSV.

    Args:
        csv_path: Results file to read. It needs a ``category`` column and
            either an ``is_right`` column ("O" marks a correct row) or both
            ``ground_truth`` and ``llm_answer`` columns.

    Returns:
        None. A short notice is printed instead of the report when the file
        is missing or contains no rows.
    """
    if not os.path.exists(csv_path):
        print("No results file found — run evaluate() first.")
        return

    results = pd.read_csv(csv_path)
    if results.empty:
        print("Results file is empty.")
        return

    # Prefer the stored O/X flag; otherwise compare the two answer columns.
    if "is_right" in results.columns:
        hits = results["is_right"].eq("O")
    else:
        hits = results["ground_truth"].eq(results["llm_answer"])
    results["correct"] = hits

    def describe(flags) -> str:
        """Format a boolean Series as ``right/total = percent``."""
        n_right, n_total = flags.sum(), len(flags)
        return f"{n_right}/{n_total} = {n_right / n_total:.2%}"

    print("=== 문제 유형별 정답률 ===")
    for category, flags in results.groupby("category")["correct"]:
        print(f"{category}: {describe(flags)}")

    print("\n=== 전체 정답률 ===")
    print(describe(results["correct"]))


In [5]:
# Score every record in `dataset`. Rows are appended to the default CSV
# ("kor_gpt-4o-mini.csv"); ids already present in that file are skipped.
evaluate(dataset)

Evaluating: 100%|██████████| 151/151 [01:48<00:00,  1.39item/s]


In [6]:
# Summarise the default results CSV: accuracy per category, then overall.
print_results()

=== 문제 유형별 정답률 ===
동기: 14/25 = 56.00%
반응: 20/25 = 80.00%
원인: 18/25 = 72.00%
전제: 21/25 = 84.00%
후행사건: 45/51 = 88.24%

=== 전체 정답률 ===
118/151 = 78.15%
