In [None]:
import os, re, json
from tqdm.auto import tqdm
from openai import OpenAI

# Module-level OpenAI client; `predict_label_gpt4` below sends its requests through it.
# NOTE(review): `userdata` is not imported in this cell -- presumably Colab's
# `google.colab.userdata` brought in by an earlier cell; confirm before running standalone.
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

# System prompt: pins the model to a bare 0/1 answer so the reply can be
# parsed with a single-digit search.
SYSTEM = (
    "You are a precise causal reasoning assistant. "
    "Given a directed graph and textual premises, decide whether the stated hypothesis is TRUE (1) or FALSE (0). "
    "Only output a single digit: 0 or 1."
)


def build_messages_openai(ex):
    """Build the system + user chat messages for one example.

    Args:
        ex: Mapping with a required, JSON-serializable ``"graph"`` entry and
            optional ``"premise"`` / ``"hypothesis"`` strings. Missing or
            falsy text fields are rendered as empty strings.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the ``SYSTEM``
        prompt followed by the formatted user prompt.

    Raises:
        KeyError: If ``ex`` has no ``"graph"`` key.
    """

    def _clean(field):
        # Treat an absent key and an explicit None/"" the same way.
        return (ex.get(field, "") or "").strip()

    template = (
        "### Task:\nDecide if the hypothesis follows from the graph and premise.\n\n"
        "### Graph (JSON):\n{graph}\n\n"
        "### Premise:\n{premise}\n\n"
        "### Hypothesis:\n{hypothesis}\n\n"
        "### Answer:\n"
    )
    prompt = template.format(
        # sort_keys keeps the serialized graph stable across runs.
        graph=json.dumps(ex["graph"], sort_keys=True),
        premise=_clean("premise"),
        hypothesis=_clean("hypothesis"),
    )

    system_msg = {"role": "system", "content": SYSTEM}
    user_msg = {"role": "user", "content": prompt}
    return [system_msg, user_msg]

def predict_label_gpt4(ex, model_name="gpt-4.1-mini"):
    """Query the model for one example and parse its 0/1 answer.

    Args:
        ex: Example mapping accepted by ``build_messages_openai``.
        model_name: Model identifier passed to the Responses API.

    Returns:
        A ``(label, text)`` tuple. ``label`` is ``0`` or ``1`` (the first
        such digit found in the reply), or ``None`` when the reply contains
        neither digit. ``text`` is the raw reply text.
    """
    messages = build_messages_openai(ex)

    resp = client.responses.create(
        model=model_name,
        # The builder already returns plain {"role", "content"} dicts.
        input=messages,
        max_output_tokens=16,
        temperature=0,
    )

    # Use the SDK's `output_text` aggregate rather than indexing
    # `resp.output[0].content[0].text.value`: in the Responses API the content
    # part's `.text` is already a str (`.value` is the Assistants API shape and
    # raises AttributeError here), and the first output item is not guaranteed
    # to be a message.
    text = resp.output_text or ""

    m = re.search(r"[01]", text)
    if m:
        return int(m.group(0)), text
    return None, text

# Score the model on `small_test_raw`: accuracy is computed over the examples
# whose reply contained a parseable 0/1; the rest are counted as skipped.
correct = 0
total = 0
skipped = 0  # replies with no parseable 0/1 digit

for ex in tqdm(small_test_raw, desc="Evaluating GPT-4.1-mini", unit="example"):
    pred, text = predict_label_gpt4(ex)
    if pred is None:
        skipped += 1
        continue
    total += 1
    correct += int(pred == int(ex["label"]))

# Guard the ratio: an empty dataset or all-unparseable replies leave total == 0.
if total:
    print(f"GPT-4.1-mini accuracy on test subset: {correct}/{total} = {correct/total:.3f}")
else:
    print("GPT-4.1-mini accuracy on test subset: 0/0 (no parseable predictions)")

# Surface dropped examples so the reported accuracy is not silently inflated.
if skipped:
    print(f"Skipped {skipped} example(s) with no parseable 0/1 answer")
