# Agent-Diff Benchmark: ReAct Agent

Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using the paper's custom ReAct agent loop with XML-tag parsing.

The agent reasons step-by-step (`<thinking>`), executes bash/curl commands (`<action>`), observes the result, and repeats until the task is done (`<done>`).

All 4 services (Box, Calendar, Linear, Slack) are evaluated across 224 tasks.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/react_agent_benchmark.ipynb)

**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff)

In [None]:
!pip install agent-diff httpx tqdm pandas -q

In [None]:
import os
from getpass import getpass

# Credentials: each secret is read from the environment first and only
# prompted for (without echo, via getpass) when it is unset or empty.
if not os.environ.get("AGENT_DIFF_API_KEY"):
    os.environ["AGENT_DIFF_API_KEY"] = getpass("Agent-Diff API key: ")

# Fall back to the hosted Agent-Diff endpoint when no base URL is configured.
if not os.environ.get("AGENT_DIFF_BASE_URL"):
    os.environ["AGENT_DIFF_BASE_URL"] = "https://api.agentdiff.dev"

# Kept as a module-level variable (not written back to os.environ); it is
# read by call_openrouter when building the Authorization header.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") or getpass("OpenRouter API key: ")

# --- Settings ---
# These values are passed to run_benchmark(...) in the run cell further down.
MODEL = "deepseek/deepseek-chat-v3-0324"  # change to any OpenRouter model
MAX_ITERATIONS = 40       # max ReAct loop turns per task
MAX_TESTS = None          # None = run all tests; set to e.g. 5 for a quick trial
TIMEOUT_SECONDS = 480     # per-test timeout

In [None]:
import re
from typing import Optional, Tuple

# Per-service settings, keyed by the short service id accepted by
# build_system_prompt() and run_benchmark().
#   name, base_url, description, extra_context -> substituted into
#       REACT_SYSTEM_PROMPT by build_system_prompt()
#   test_suite_name -> suite name run_benchmark() looks up via
#       client.list_test_suites(name=...)
SERVICE_CONFIG = {
    "slack": {
        "name": "Slack",
        "base_url": "https://slack.com/api",
        "description": "Slack workspace messaging and collaboration API",
        "extra_context": "",
        "test_suite_name": "Slack Bench v2",
    },
    "box": {
        "name": "Box",
        "base_url": "https://api.box.com/2.0",
        "description": "Box cloud storage and file management API",
        "extra_context": "",
        "test_suite_name": "Box Bench v2",
    },
    "calendar": {
        "name": "Google Calendar",
        "base_url": "https://www.googleapis.com/calendar/v3",
        "description": "Google Calendar scheduling and events API",
        # Only the calendar service injects extra context: a fixed "current"
        # date/time the model is told to use for relative date expressions.
        "extra_context": "- **Current Date/Time**: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles. Use this as the reference point for all relative date/time expressions like 'today', 'tomorrow', 'this Saturday', etc.",
        "test_suite_name": "Calendar Bench",
    },
    "linear": {
        "name": "Linear",
        "base_url": "https://api.linear.app/graphql",
        "description": "Linear project management and issue tracking API",
        "extra_context": "",
        "test_suite_name": "Linear Bench",
    },
}

# System-prompt template rendered with str.format() in build_system_prompt().
# Placeholders: {service_name}, {base_url}, {service_description},
# {extra_context}. Because it goes through str.format(), any literal brace
# added to this text must be doubled ("{{" / "}}").
# The <thinking>/<action>/<done> tags described here are the ones
# parse_react_response() extracts from the model's reply.
REACT_SYSTEM_PROMPT = """You are an AI assistant that completes tasks by interacting with APIs via bash commands.

## Current Session
- **Service**: {service_name}
- **Base URL**: {base_url}
- **Description**: {service_description}
{extra_context}

## Environment
- You are authenticated as a user in the {service_name} workspace/account.
- Authentication is handled automatically via proxy. Use placeholder tokens like `<TOKEN>` where credentials would go.
- You execute bash commands (primarily curl) to interact with the {service_name} API.
- If you are not sure how to use {service_name} API, explore the endpoint, parameters, and learn how it works.
- The environment is stateless between commands - you cannot install packages or persist files.

## Response Format
You must respond using XML tags. Think step-by-step, then execute a command OR declare completion.

**To execute a bash command:**
<thinking>
Your reasoning about what needs to be done and why this command will help.
</thinking>

<action>
Your bash command here (e.g., curl request)
</action>

**When the task is complete:**
<thinking>
Your reasoning confirming the task is done based on API responses.
</thinking>

<done>
Brief summary of what was accomplished.
</done>

## Rules
1. Execute ONE command at a time, then wait for the result.
2. Parse API responses carefully - extract IDs and data needed for subsequent calls.
3. If a command fails, analyze the error and try a different approach.
4. Only use <done> when the task is fully completed (not just when you've gathered information).
"""


def build_system_prompt(service: str) -> str:
    """Render the ReAct system prompt for a single service.

    Args:
        service: Key into ``SERVICE_CONFIG`` (e.g. ``"slack"``).

    Returns:
        ``REACT_SYSTEM_PROMPT`` with its four placeholders filled from the
        service's config entry.

    Raises:
        KeyError: If ``service`` is not a key of ``SERVICE_CONFIG``.
    """
    entry = SERVICE_CONFIG[service]
    # Map template placeholder names to the config fields that feed them.
    placeholders = {
        "service_name": entry["name"],
        "base_url": entry["base_url"],
        "service_description": entry["description"],
        "extra_context": entry["extra_context"],
    }
    return REACT_SYSTEM_PROMPT.format(**placeholders)


def parse_react_response(response: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Extract the ReAct sections from a model reply.

    For each of the ``<thinking>``, ``<action>`` and ``<done>`` tags, the
    first occurrence is located (content may span multiple lines) and its
    inner text is returned with surrounding whitespace stripped.

    Returns:
        A ``(thinking, action, done)`` tuple; an element is ``None`` when the
        corresponding tag pair does not appear in ``response``.
    """
    tag_patterns = (
        r'<thinking>(.*?)</thinking>',
        r'<action>(.*?)</action>',
        r'<done>(.*?)</done>',
    )
    sections = []
    for pattern in tag_patterns:
        # Non-greedy match + DOTALL: first tag pair only, newlines allowed.
        hit = re.search(pattern, response, re.DOTALL)
        if hit:
            sections.append(hit.group(1).strip())
        else:
            sections.append(None)
    thinking, action, done = sections
    return thinking, action, done

In [None]:
import time
import httpx
from agent_diff import AgentDiff, BashExecutorProxy


def call_openrouter(model: str, messages: list, max_retries: int = 3) -> dict:
    """Call the OpenRouter chat-completions API, retrying transient failures.

    Args:
        model: Model identifier sent as ``model`` in the request body.
        messages: Chat messages sent as ``messages`` in the request body.
        max_retries: Total number of attempts. Values below 1 are treated
            as a single attempt.

    Returns:
        A dict ``{"content": str, "usage": {...}}`` where ``usage`` holds
        ``prompt_tokens``, ``completion_tokens``, ``total_tokens`` and
        ``cost``. Missing or null fields are normalised to ``""`` / ``0`` so
        callers can parse and sum them without extra checks.

    Raises:
        httpx.HTTPStatusError: On a non-retryable HTTP status, or a retryable
            one (429/500/502/503/504) that persists through the last attempt.
        httpx.TransportError: When a timeout, network error or dropped
            connection persists through the last attempt.
    """
    import random

    retryable_statuses = (429, 500, 502, 503, 504)
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            with httpx.Client(timeout=120) as http:
                resp = http.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json"},
                    json={"model": model, "messages": messages},
                )
                resp.raise_for_status()
                data = resp.json()
        except (
            httpx.HTTPStatusError,
            httpx.TimeoutException,      # connect/read/write/pool timeouts
            httpx.NetworkError,          # ConnectError, ReadError, WriteError, CloseError
            httpx.RemoteProtocolError,   # e.g. server disconnected mid-response
        ) as e:
            # Transport-level failures are always worth retrying; HTTP errors
            # only when the status code signals a transient condition.
            should_retry = not isinstance(e, httpx.HTTPStatusError) or e.response.status_code in retryable_statuses
            if should_retry and attempt < attempts - 1:
                # Exponential backoff (2s, 4s, 8s, ...) plus up to 1s jitter.
                delay = 2 * (2 ** attempt) + random.uniform(0, 1)
                print(f"  [RETRY] attempt {attempt+1}: {e}. Waiting {delay:.1f}s...")
                time.sleep(delay)
                continue
            raise

        choice = data["choices"][0]
        # `usage` and its fields may be absent or null in the response body.
        usage = data.get("usage") or {}
        return {
            # Null content is normalised to "" so regex parsing downstream
            # never receives None.
            "content": choice["message"].get("content") or "",
            "usage": {
                "prompt_tokens": usage.get("prompt_tokens") or 0,
                "completion_tokens": usage.get("completion_tokens") or 0,
                "total_tokens": usage.get("total_tokens") or 0,
                "cost": usage.get("cost") or 0.0,
            },
        }

    # Not reachable: the final attempt either returns or re-raises above.
    raise RuntimeError("call_openrouter exhausted its attempts without a result")


def run_react_agent(model: str, task_prompt: str, bash_executor: BashExecutorProxy, system_prompt: str, max_iterations: int = 40) -> dict:
    """Run the ReAct agent loop: think -> act -> observe -> repeat.

    Each turn asks the model for a reply, parses it with
    ``parse_react_response`` and then either:
      * runs the ``<action>`` command through ``bash_executor.execute`` and
        feeds the output back as an ``<observation>`` message,
      * stops successfully when only ``<done>`` is present, or
      * nudges the model to use one of the two tags when neither is present.
    When a reply contains both ``<action>`` and ``<done>``, the action wins
    and the loop continues.

    Args:
        model: OpenRouter model identifier.
        task_prompt: Task text, sent as the first user message.
        bash_executor: Object whose ``execute(command)`` runs the command.
        system_prompt: System message for the conversation.
        max_iterations: Maximum number of model turns.

    Returns:
        A dict with ``steps`` (per-action records, plus an ``error`` record
        if the model call failed), ``completed`` (True only when ``<done>``
        was received), ``iterations`` (number of turns actually started),
        ``summary`` (the ``<done>`` text or None) and ``usage`` (summed
        token/cost counters).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Task: {task_prompt}"},
    ]
    steps = []
    total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "cost": 0.0}
    # Turns actually started; reported instead of max_iterations so a run
    # aborted by an API error is not recorded as having used every turn.
    iterations_used = 0

    for iteration in range(max_iterations):
        iterations_used = iteration + 1
        try:
            api_resp = call_openrouter(model, messages)
        except Exception as e:
            # The model is unreachable: record why and give up on this task.
            steps.append({"iteration": iterations_used, "error": str(e)})
            break

        # Guard against null content / null usage counters in the reply.
        response_text = api_resp["content"] or ""
        for k in total_usage:
            total_usage[k] += api_resp["usage"].get(k) or 0

        thinking, action, done = parse_react_response(response_text)

        if action:
            try:
                result = bash_executor.execute(action)
                if isinstance(result, dict):
                    observation = {
                        "stdout": result.get("stdout") or "",
                        "stderr": result.get("stderr") or "",
                        "exit_code": result.get("exit_code", 0),
                    }
                else:
                    # Non-dict results are treated as plain successful output.
                    observation = {"stdout": str(result), "stderr": "", "exit_code": 0}
            except Exception as e:
                # Executor failures are shown to the model as a failed command
                # so it can react, rather than aborting the whole run.
                observation = {"stdout": "", "stderr": str(e), "exit_code": 1}

            steps.append({"iteration": iterations_used, "thinking": thinking, "action": action, "observation": observation})

            obs_text = observation["stdout"].strip() or "(empty output)"
            if observation.get("exit_code", 0) != 0:
                # On failure, include stderr and the exit code as well.
                obs_text = f"{observation['stdout']}\n[stderr]: {observation['stderr']}\n[exit_code]: {observation['exit_code']}".strip()

            messages.append({"role": "assistant", "content": response_text})
            messages.append({"role": "user", "content": f"<observation>\n{obs_text}\n</observation>"})

        elif done:
            return {"steps": steps, "completed": True, "iterations": iterations_used, "summary": done, "usage": total_usage}
        else:
            # Neither tag found: keep the reply in context and ask again.
            messages.append({"role": "assistant", "content": response_text})
            messages.append({"role": "user", "content": "Please respond with either an <action> to execute or <done> if the task is complete."})

    return {"steps": steps, "completed": False, "iterations": iterations_used, "summary": None, "usage": total_usage}

In [None]:
from tqdm.auto import tqdm


def run_single_test(client: AgentDiff, model: str, test, system_prompt: str, max_iterations: int, timeout: int) -> dict:
    """Run one test: init env -> agent loop -> evaluate -> cleanup.

    Args:
        client: Agent-Diff client used to create, evaluate and delete the
            test environment.
        model: OpenRouter model identifier passed to the agent loop.
        test: Test object; ``test.id`` and ``test.prompt`` are read, and
            ``test.name`` if present.
        system_prompt: System prompt passed to the agent loop.
        max_iterations: Maximum ReAct turns for the agent loop.
        timeout: Per-test timeout in seconds. Accepted for interface
            compatibility; this function does not currently enforce it.

    Returns:
        On success, a dict with ``test_id``, ``test_name``, ``passed``,
        ``score``, ``failures``, ``time``, ``iterations``, ``completed`` and
        ``usage``. If any step raises, a dict with ``test_id``,
        ``test_name``, ``passed=False``, ``score=0`` and ``error``.
    """
    env = None
    try:
        env = client.init_env(testId=test.id)
        run = client.start_run(envId=env.environmentId, testId=test.id)
        bash_executor = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)

        start = time.perf_counter()
        trace = run_react_agent(model, test.prompt, bash_executor, system_prompt, max_iterations)
        elapsed = time.perf_counter() - start

        client.evaluate_run(runId=run.runId)
        result = client.get_results_for_run(runId=run.runId)

        return {
            "test_id": str(test.id),
            "test_name": getattr(test, "name", ""),
            "passed": result.passed,
            "score": result.score.get("percent", 0) if isinstance(result.score, dict) else 0,
            "failures": result.failures,
            "time": round(elapsed, 2),
            "iterations": trace["iterations"],
            "completed": trace["completed"],
            "usage": trace["usage"],
        }
    except Exception as e:
        return {"test_id": str(test.id), "test_name": getattr(test, "name", ""), "passed": False, "score": 0, "error": str(e)}
    finally:
        # Cleanup is best-effort and runs exactly once on every path. A
        # failed delete must not turn an already-evaluated result into an
        # error entry, so it is reported instead of raised.
        if env is not None:
            try:
                client.delete_env(envId=env.environmentId)
            except Exception as cleanup_error:
                print(f"  [WARN] could not delete environment {env.environmentId}: {cleanup_error}")


def run_benchmark(model: str, services: list[str] | None = None, max_tests: int | None = None, max_iterations: int = 40, timeout: int = 480) -> list[dict]:
    """Run the benchmark across services and collect per-test results.

    Args:
        model: OpenRouter model identifier used for every test.
        services: Keys of ``SERVICE_CONFIG`` to run; ``None`` (or an empty
            list) runs every configured service.
        max_tests: Per-service cap on the number of tests; ``None`` (or 0)
            runs the whole suite.
        max_iterations: Maximum ReAct turns per test.
        timeout: Per-test timeout in seconds, forwarded to
            ``run_single_test``.

    Returns:
        One result dict per executed test (as produced by
        ``run_single_test``), each extended with ``service`` and ``model``.
        Services whose test suite cannot be found are skipped with a message.

    Raises:
        KeyError: If ``services`` contains a key missing from
            ``SERVICE_CONFIG``.
    """
    services = services or list(SERVICE_CONFIG.keys())
    client = AgentDiff()
    all_results = []

    for service in services:
        config = SERVICE_CONFIG[service]
        system_prompt = build_system_prompt(service)

        suite_list = client.list_test_suites(name=config["test_suite_name"])
        if not suite_list.testSuites:
            print(f"[SKIP] Test suite '{config['test_suite_name']}' not found.")
            continue
        # Use the first suite matching the configured name; expand=True is
        # requested so that suite.tests can be read below.
        suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)
        tests = suite.tests[:max_tests] if max_tests else suite.tests

        print(f"\n{'='*60}")
        print(f"  {config['name']} — {len(tests)} tests | model: {model}")
        print(f"{'='*60}")

        for test in tqdm(tests, desc=config["name"]):
            result = run_single_test(client, model, test, system_prompt, max_iterations, timeout)
            result["service"] = service
            result["model"] = model
            all_results.append(result)

            status = "PASS" if result.get("passed") else "FAIL"
            score = result.get("score", 0)
            # test_name is always present but may be empty, so fall back to
            # the id on falsiness rather than on a missing key.
            label = result.get("test_name") or result["test_id"]
            # tqdm.write keeps the log lines from corrupting the progress bar.
            tqdm.write(f"  [{status}] {label[:60]}  score={score}")

    return all_results

In [None]:
# Execute the benchmark with the settings defined in the configuration cell.
# `results` is a list with one dict per executed test; the summary cell below
# turns it into a DataFrame.
results = run_benchmark(
    model=MODEL,
    services=None,          # all 4 services; or e.g. ["slack", "box"]
    max_tests=MAX_TESTS,
    max_iterations=MAX_ITERATIONS,
    timeout=TIMEOUT_SECONDS,
)

In [None]:
import pandas as pd

# Summarise the benchmark results: per-service table, overall numbers and a
# bar chart of the mean score per service.
df = pd.DataFrame(results)

print("\n" + "=" * 60)
print(f"  Results: {MODEL}")
print("=" * 60)

# An empty result list yields a DataFrame without columns; in that case just
# print it instead of aggregating.
if "service" in df.columns and "score" in df.columns:
    summary = df.groupby("service").agg(
        tests=("score", "count"),
        passed=("passed", "sum"),
        mean_score=("score", "mean"),
        pass_rate=("passed", "mean"),
    )
    # Round each column on its own scale. Rounding the pass-rate fraction to
    # 2 decimals before scaling would truncate the percentage to a whole
    # number (e.g. 33.3% shown as 33.0).
    summary["mean_score"] = summary["mean_score"].round(2)
    summary["pass_rate"] = (summary["pass_rate"] * 100).round(1)
    print("\nPer-service summary:")
    print(summary.to_string())

    overall_score = df["score"].mean()
    overall_pass = df["passed"].mean() * 100
    # Results from failed runs have no "usage" entry, and cost may be null.
    total_cost = sum((r.get("usage") or {}).get("cost") or 0 for r in results)
    print(f"\nOverall: score={overall_score:.1f}  pass_rate={overall_pass:.1f}%  cost=${total_cost:.4f}")

    summary["mean_score"].plot.bar(title=f"Agent-Diff Score by Service ({MODEL})", ylabel="Score", xlabel="Service", rot=0)
else:
    print(df)