# Agent-Diff Benchmark: ReAct Agent

Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using the paper's custom ReAct agent loop with XML-tag parsing.

The agent reasons step-by-step (`<thinking>`), executes bash/curl commands (`<action>`), observes the result, and repeats until the task is done (`<done>`).

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/react_agent_benchmark.ipynb)

**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff)

In [None]:
!pip install agent-diff httpx datasets -q

In [None]:
# Credentials are supplied through environment variables. The blank values below
# set the variable to an empty string, so fill them in before running the next cells.
# NOTE(review): AGENT_DIFF_* are presumably read by the AgentDiff client - confirm against the SDK.
%env AGENT_DIFF_API_KEY=
%env AGENT_DIFF_BASE_URL=https://api.agentdiff.dev
# OpenRouter key, e.g. for https://openrouter.ai/deepseek/deepseek-chat-v3-0324
# (any OpenAI-compatible provider can be used, but the request URL in
# call_openrouter is hard-coded to OpenRouter and must be changed as well).
%env OPENROUTER_API_KEY=

In [None]:
import os
import re
import time
import json
import httpx
import random
from datasets import load_dataset
from agent_diff import AgentDiff, BashExecutorProxy

# API key for the chat-completions request; raises KeyError if the variable is unset.
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]
# Model identifier sent in the request body.
MODEL = "deepseek/deepseek-chat-v3-0324"

# Per-service values substituted into the system prompt, keyed by service id:
#   name     -> human-readable service name
#   base_url -> API root the agent is told to call
#   extra    -> optional extra context line(s) appended to the session section
SERVICE_CONFIG = {
    "slack": {
        "name": "Slack",
        "base_url": "https://slack.com/api",
        "extra": "",
    },
    "box": {
        "name": "Box",
        "base_url": "https://api.box.com/2.0",
        "extra": "",
    },
    "calendar": {
        "name": "Google Calendar",
        "base_url": "https://www.googleapis.com/calendar/v3",
        "extra": "- **Current Date/Time**: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles.\n",
    },
    "linear": {
        "name": "Linear",
        "base_url": "https://api.linear.app/graphql",
        "extra": "",
    },
}

# System prompt template for the ReAct loop. It is rendered with str.format, which
# fills the {service_name}, {base_url} and {extra_context} placeholders; any literal
# brace added to this text would therefore have to be doubled. The <thinking>,
# <action> and <done> tags it asks the model for are the ones parse_react extracts.
REACT_SYSTEM_PROMPT = """You are an AI assistant that completes tasks by interacting with APIs via bash commands.

## Current Session
- **Service**: {service_name}
- **Base URL**: {base_url}
{extra_context}

## Environment
- Authentication is handled automatically via proxy. Use placeholder tokens where credentials would go.
- You execute bash commands (primarily curl) to interact with the API.
- If you are not sure how to use the API, explore the endpoint, parameters, and learn how it works.

## Response Format
Respond using XML tags:

<thinking>Your reasoning</thinking>
<action>Your bash command</action>

When done:
<thinking>Your reasoning</thinking>
<done>Brief summary</done>

## Rules
1. Execute ONE command at a time, then wait for the result.
2. Parse API responses carefully - extract IDs and data needed for subsequent calls.
3. If a command fails, analyze the error and try a different approach.
4. Only use <done> when the task is fully completed.
"""


def call_openrouter(model, messages, max_retries=3):
    """Send a chat-completion request to OpenRouter and return the reply text.

    Transient failures are retried with exponential backoff plus jitter:
    transport-level errors (connection failures, timeouts, protocol errors)
    and HTTP 408/429/5xx responses. Other HTTP errors (e.g. 401 for a bad key)
    are raised immediately because retrying cannot fix them.

    Args:
        model: Model identifier placed in the request's ``model`` field.
        messages: Chat history placed in the request's ``messages`` field.
        max_retries: Total number of attempts; values below 1 are treated as 1.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        httpx.HTTPStatusError: On a non-retryable HTTP error status, or when a
            retryable one persists through the last attempt.
        httpx.TransportError: When a network failure or timeout persists
            through the last attempt.
    """
    # Always make at least one request instead of silently returning None.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        try:
            with httpx.Client(timeout=120) as http:
                resp = http.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
                    json={"model": model, "messages": messages},
                )
                resp.raise_for_status()
                return resp.json()["choices"][0]["message"]["content"]
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            retryable = status in (408, 429) or status >= 500
            if is_last or not retryable:
                raise
        except httpx.TransportError:
            # Base class of ConnectError and of every timeout exception.
            if is_last:
                raise
        # Backoff: 2s, 4s, 8s, ... plus up to 1s of jitter.
        time.sleep(2 * (2 ** attempt) + random.uniform(0, 1))


def parse_react(response):
    """Extract the ReAct tag contents from a model reply.

    Args:
        response: Raw reply text. ``None`` or any non-string value (e.g. a
            null ``content`` from the API) is treated as an empty reply.

    Returns:
        A ``(thinking, action, done)`` tuple. Each element is the stripped
        text of the first matching ``<tag>...</tag>`` pair, or ``None`` when
        that tag is absent.
    """
    # re.search raises TypeError on None, so normalise before matching.
    text = response if isinstance(response, str) else ""
    extracted = []
    for tag in ("thinking", "action", "done"):
        # Non-greedy with DOTALL: first closing tag wins, content may span lines.
        match = re.search(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
        extracted.append(match.group(1).strip() if match else None)
    return tuple(extracted)


def run_react_agent(model, prompt, bash, system_prompt, max_iterations=40):
    """Drive the ReAct loop: ask the model, run its command, feed back the output.

    Args:
        model: Model identifier passed to ``call_openrouter``.
        prompt: Task description sent as the first user message.
        bash: Executor exposing ``execute(command)``. A dict result is read
            for ``stdout``, ``stderr`` and ``exit_code``; any other result is
            stringified and treated as successful output.
        system_prompt: Fully rendered system prompt.
        max_iterations: Maximum number of model calls before giving up.

    Returns:
        ``{"completed": True, "iterations": n}`` once the model replies with
        ``<done>`` (and no ``<action>``), otherwise
        ``{"completed": False, "iterations": max_iterations}``.
    """
    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
    for i in range(max_iterations):
        # A null reply is handled like a reply without tags.
        text = call_openrouter(model, messages) or ""
        _, action, done = parse_react(text)
        if action:
            # An action takes priority over <done> when both are present.
            result = bash.execute(action)
            if isinstance(result, dict):
                stdout = result.get("stdout", "") or ""
                stderr = result.get("stderr", "") or ""
                exit_code = result.get("exit_code", 0)
            else:
                # Non-dict results carry no exit status; treat them as plain output.
                stdout, stderr, exit_code = str(result), "", 0
            obs = stdout.strip() or "(empty output)"
            if exit_code != 0:
                obs = f"{stdout}\n[stderr]: {stderr}".strip()
            messages.append({"role": "assistant", "content": text})
            messages.append({"role": "user", "content": f"<observation>\n{obs}\n</observation>"})
        elif done:
            return {"completed": True, "iterations": i + 1}
        else:
            # Neither tag found: nudge the model back to the expected format.
            messages.append({"role": "assistant", "content": text})
            messages.append({"role": "user", "content": "Please respond with <action> or <done>."})
    return {"completed": False, "iterations": max_iterations}

In [None]:
# Benchmark driver: for each task, create an isolated environment, let the agent
# work in it, evaluate the run against the expected output, and record the result.
client = AgentDiff()
dataset = load_dataset("hubertmarek/agent-diff-bench", split="test")

results = []

for example in dataset.select(range(5)):  # First 5 tasks; remove .select() for full benchmark
    # "info" and "answer" may arrive as JSON strings or as already-decoded objects.
    info = json.loads(example["info"]) if isinstance(example["info"], str) else example["info"]
    expected = json.loads(example["answer"]) if isinstance(example["answer"], str) else example["answer"]
    service = info["service"]
    cfg = SERVICE_CONFIG[service]

    system_prompt = REACT_SYSTEM_PROMPT.format(
        service_name=cfg["name"], base_url=cfg["base_url"], extra_context=cfg["extra"]
    )

    print(f"Running: {example.get('test_name', example['test_id'])}")

    env = client.init_env(
        templateService=info["service"],
        templateName=info["seed_template"],
        impersonateUserId=info["impersonate_user_id"],
    )
    # try/finally so the environment is deleted even if the run or evaluation fails.
    try:
        run = client.start_run(envId=env.environmentId)
        bash = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)

        start = time.perf_counter()
        try:
            trace = run_react_agent(MODEL, example["question"], bash, system_prompt)
        except Exception as e:
            # Keep the benchmark going, but surface the failure instead of hiding it.
            trace = {"completed": False, "error": str(e)}
            print(f"  Agent error: {e}")
        elapsed = time.perf_counter() - start

        # The run is evaluated even after an agent error.
        client.evaluate_run(runId=run.runId, expectedOutput=expected)
        result = client.get_results_for_run(runId=run.runId)

        results.append({
            "test_id": example["test_id"],
            "service": service,
            "passed": result.passed,
            "score": result.score,
            "time": round(elapsed, 1),
            "completed": trace["completed"],
            "iterations": trace.get("iterations"),
            "error": trace.get("error"),
        })
        print(f"  {'PASS' if result.passed else 'FAIL'} | score={result.score} | {elapsed:.1f}s")
    finally:
        client.delete_env(envId=env.environmentId)

passed = sum(1 for r in results if r["passed"])
print(f"\nResults: {passed}/{len(results)} passed")