In [1]:
import re, os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_core.messages import HumanMessage

from dotenv import load_dotenv
#os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"

## 1. Agent 설정
LLM, Temperature, 프롬프트, 페르소나 등

In [8]:
# Load variables from a local .env file (if one exists) into the process
# environment so the API key does not have to be pasted into the notebook.
# Variables that are already set in the environment are left untouched.
load_dotenv()

# Chat model used for every question in this notebook.
# temperature=0 keeps the answers as repeatable as the API allows;
# os.environ[...] raises KeyError early if OPENAI_API_KEY is still missing.
llm = ChatOpenAI(model="gpt-4o-mini",
                    temperature=0,
                    request_timeout=60,
                    api_key=os.environ["OPENAI_API_KEY"])

# System instructions: force a single-word location answer so that the
# prediction can be compared against the gold answer by exact string match.
# (A stray leading "Y" line was removed from the prompt text.)
system_prompt = '''
You are a logical reasoning assistant.  
You will be shown a short story as a list of sentences and then a question.  
Your task is to answer using exactly one English word naming a location (e.g., “desk”, “bookshelf”).  
Do not add any extra words, punctuation, or explanation—just the single location word.
'''

# Two-message prompt: the fixed system instructions followed by a human
# message with two placeholders, {Story} and {Question}, that are filled in
# per call through format_messages(Story=..., Question=...).
integrated_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt.strip()),
    ("human", """
[Story]
{Story}

[Question]
{Question}

""")
])

## 2. 스토리 하나로 테스트해 보기

In [9]:
# One hand-picked sample story used to smoke-test the prompt before running
# the full experiment. Surrounding blank lines are stripped by the consumer.
story = '''
1. Oliver entered the porch.
2. Owen entered the porch.
3. Abigail entered the porch.
4. The broccoli is in the bucket.
5. Abigail exited the porch.
6. Owen exited the porch.
7. Owen hates the peach
8. Oliver moved the broccoli to the pantry.
'''

# Each row is (question type, question text, expected one-word answer).
# NOTE(review): Abigail exits (sentence 5) before the broccoli is moved
# (sentence 8), so the 'pantry' labels on the two second_order questions look
# questionable -- confirm against the dataset before relying on them.
_QA_FIELDS = ('type', 'question', 'answer')
_qa_rows = [
    ('reality', 'Where is the broccoli really?', 'pantry'),
    ('memory', 'Where was the broccoli at the beginning?', 'bucket'),
    ('first_order', 'Where will Oliver look for the broccoli?', 'pantry'),
    ('first_order', 'Where will Abigail look for the broccoli?', 'bucket'),
    ('second_order', 'Where does Oliver think that Abigail searches for the broccoli?', 'pantry'),
    ('second_order', 'Where does Abigail think that Oliver searches for the broccoli?', 'pantry'),
]

# Expand the rows into the list-of-dicts shape the test loop reads
# (keys: 'type', 'question', 'answer').
qa_list = [dict(zip(_QA_FIELDS, qa_row)) for qa_row in _qa_rows]

In [10]:
# Ask every question in qa_list about the single sample story and print the
# model's answer next to the expected one.
sample_story = story.strip()  # loop-invariant, so strip it once
for i, qa in enumerate(qa_list, 1):
    messages = integrated_prompt.format_messages(
        Story=sample_story, Question=qa['question']
    )
    response = llm.invoke(messages)
    answer = response.content.strip()
    # \033[94m ... \033[0m are ANSI escapes that colour the header line.
    # (The stray ")" that used to be printed before the question is removed.)
    print(f"\033[94mQuestion {i} ({qa['type']}): {qa['question']}\033[0m")
    print(f"  Model answer: {answer}, Correct answer: {qa['answer']}")

[94mQuestion 1 (reality): )Where is the broccoli really?[0m
  Model answer: pantry, Correct answer: pantry
[94mQuestion 2 (memory): )Where was the broccoli at the beginning?[0m
  Model answer: bucket, Correct answer: bucket
[94mQuestion 3 (first_order): )Where will Oliver look for the broccoli?[0m
  Model answer: pantry, Correct answer: pantry
[94mQuestion 4 (first_order): )Where will Abigail look for the broccoli?[0m
  Model answer: pantry, Correct answer: bucket
[94mQuestion 5 (second_order): )Where does Oliver think that Abigail searches for the broccoli?[0m
  Model answer: pantry, Correct answer: pantry
[94mQuestion 6 (second_order): )Where does Abigail think that Oliver searches for the broccoli?[0m
  Model answer: pantry, Correct answer: pantry


## 3. 데이터셋 상위 100개 스토리로 실험

In [None]:
import pandas as pd

# Raw CSV of the ToMi dataset, fetched directly from GitHub.
url = "https://raw.githubusercontent.com/beefed-up-geek/Multi-Agent/main/Experiments/ToMi/ToMi_official_github/tomi_dataset.csv"

# Download the whole CSV, then keep only the first 100 rows (stories)
# for the experiment.
df = pd.read_csv(url)
df_top100 = df.iloc[:100]

print("Loaded dataset with", df_top100.shape[0], "stories.")

Loaded dataset with 100 stories.


In [13]:
from tqdm import tqdm  # progress-bar utility

# Question/answer column pairs of the dataset, grouped by the category under
# which their accuracy is reported. Dict order fixes both the order of the
# LLM calls per story and the order of the final report.
CATEGORY_COLUMNS = {
    "Reality": [("Reality Question", "Reality Answer")],
    "Memory": [("Memory Question", "Memory Answer")],
    "First order": [
        ("First-Order Belief A Question", "First-Order Belief A Answer"),
        ("First-Order Belief B Question", "First-Order Belief B Answer"),
    ],
    "Second order": [
        ("Second-Order Belief A Question", "Second-Order Belief A Answer"),
        ("Second-Order Belief B Question", "Second-Order Belief B Answer"),
    ],
}


def ask_location(story_text, question):
    """Ask the model one question about one story.

    Args:
        story_text: The story, passed to the prompt's {Story} placeholder.
        question: The question, passed to the prompt's {Question} placeholder.

    Returns:
        The model's reply with surrounding whitespace removed, lower-cased.
    """
    messages = integrated_prompt.format_messages(Story=story_text, Question=question)
    return llm.invoke(messages).content.strip().lower()


# Per-category counters: number of exact-match hits and number of questions asked.
correct = {cat: 0 for cat in CATEGORY_COLUMNS}
total = {cat: 0 for cat in CATEGORY_COLUMNS}

# tqdm shows progress & ETA
for _, row in tqdm(df_top100.iterrows(), total=len(df_top100), desc="Stories"):
    # Deliberately not named `story`: that would overwrite the sample story
    # defined for the single-story test and break re-running that cell.
    story_text = row["Story"]

    for cat, column_pairs in CATEGORY_COLUMNS.items():
        for q_col, a_col in column_pairs:
            total[cat] += 1
            pred = ask_location(story_text, row[q_col])
            # Scoring is a case-insensitive exact match on the whole reply.
            if pred == row[a_col].strip().lower():
                correct[cat] += 1

print("\nAccuracy")
for cat in CATEGORY_COLUMNS:
    # Guard against division by zero when no question was asked for a category.
    pct = correct[cat] / total[cat] * 100 if total[cat] else 0.0
    print(f"{cat:12}: {correct[cat]}/{total[cat]} = {pct:5.2f}%")


Stories: 100%|██████████| 100/100 [06:55<00:00,  4.15s/it]


Accuracy
Reality     : 96/100 = 96.00%
Memory      : 100/100 = 100.00%
First order : 161/200 = 80.50%
Second order: 109/200 = 54.50%



