# Agent-Diff Benchmark: LangChain Agent

Run the [Agent-Diff benchmark](https://arxiv.org/abs/2602.11224) using LangChain's built-in agent with tool calling.

Unlike the [ReAct notebook](react_agent_benchmark.ipynb), which uses a custom XML-tag loop, this notebook lets LangChain handle the agent loop via the model's native function-calling protocol. The `BashExecutorProxy` from the `agent-diff` SDK is wrapped as a LangChain tool.

All 4 services (Box, Calendar, Linear, Slack) are evaluated across 224 tasks.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/agent-diff-bench/agent-diff/blob/main/examples/langchain_agent_benchmark.ipynb)

**Links:** [Paper](https://arxiv.org/abs/2602.11224) | [Dataset](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) | [GitHub](https://github.com/agent-diff-bench/agent-diff)

In [None]:
!pip install agent-diff langchain langchain-openai tqdm pandas -q

In [None]:
import os
from getpass import getpass

# Prompt for the Agent-Diff key only when the environment does not already
# provide a non-empty value (an empty string counts as unset).
if not os.environ.get("AGENT_DIFF_API_KEY"):
    os.environ["AGENT_DIFF_API_KEY"] = getpass("Agent-Diff API key: ")

# Fall back to the hosted API endpoint unless a base URL was set explicitly.
if not os.environ.get("AGENT_DIFF_BASE_URL"):
    os.environ["AGENT_DIFF_BASE_URL"] = "https://api.agentdiff.dev"

# The OpenRouter key is kept in a module-level variable (not written back to
# os.environ); it is prompted for only when the environment value is missing/empty.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") or getpass("OpenRouter API key: ")

# --- Settings ---
MODEL = "deepseek/deepseek-chat-v3-0324"  # change to any OpenRouter model
MAX_ITERATIONS = 40       # max agent loop turns per task
MAX_TESTS = None          # None = run all tests; set to e.g. 5 for a quick trial
TIMEOUT_SECONDS = 480     # per-test timeout

In [None]:
# Per-service benchmark configuration, keyed by the short service id used
# throughout this notebook.
#   name, base_url, description, extra_context -> substituted into the system prompt
#   test_suite_name                            -> name used to look up the service's test suite
SERVICE_CONFIG = {
    "slack": {
        "name": "Slack",
        "base_url": "https://slack.com/api",
        "description": "Slack workspace messaging and collaboration API",
        "extra_context": "",
        "test_suite_name": "Slack Bench v2",
    },
    "box": {
        "name": "Box",
        "base_url": "https://api.box.com/2.0",
        "description": "Box cloud storage and file management API",
        "extra_context": "",
        "test_suite_name": "Box Bench v2",
    },
    "calendar": {
        "name": "Google Calendar",
        "base_url": "https://www.googleapis.com/calendar/v3",
        "description": "Google Calendar scheduling and events API",
        # Pins a fixed "now" so relative dates in task prompts resolve the same way on every run.
        "extra_context": "Current Date/Time: Sunday, June 17, 2018 at 00:01 (midnight), timezone America/Los_Angeles. Use this as the reference point for all relative date/time expressions like 'today', 'tomorrow', 'this Saturday', etc.",
        "test_suite_name": "Calendar Bench",
    },
    "linear": {
        "name": "Linear",
        "base_url": "https://api.linear.app/graphql",
        "description": "Linear project management and issue tracking API",
        "extra_context": "",
        "test_suite_name": "Linear Bench",
    },
}

# System prompt shared by all services. It is a str.format template with the
# placeholders {service_name}, {base_url}, {service_description} and
# {extra_context}; when extra_context is empty its line renders blank.
SYSTEM_PROMPT_TEMPLATE = """You are an AI assistant that completes tasks by interacting with APIs via bash commands.

Current Session:
- Service: {service_name}
- Base URL: {base_url}
- Description: {service_description}
{extra_context}

Environment:
- You are authenticated as a user in the {service_name} workspace/account.
- Authentication is handled automatically via proxy. Use placeholder tokens like <TOKEN> where credentials would go.
- Use the execute_bash tool to run bash commands (primarily curl) to interact with the {service_name} API.
- If you are not sure how to use the {service_name} API, explore the endpoint, parameters, and learn how it works.
- Parse API responses carefully - extract IDs and data needed for subsequent calls.
- If a command fails, analyze the error and try a different approach.
- Only declare completion when the task is fully completed (not just when you've gathered information).
"""


def build_system_prompt(service: str) -> str:
    """Render the system prompt for one service.

    Args:
        service: Key into ``SERVICE_CONFIG`` (e.g. ``"slack"``).

    Returns:
        ``SYSTEM_PROMPT_TEMPLATE`` with the service's name, base URL,
        description and extra context filled in.

    Raises:
        KeyError: If ``service`` is not a key of ``SERVICE_CONFIG``.
    """
    entry = SERVICE_CONFIG[service]
    # Map template placeholder -> config value, then expand in one call.
    substitutions = {
        "service_name": entry["name"],
        "base_url": entry["base_url"],
        "service_description": entry["description"],
        "extra_context": entry["extra_context"],
    }
    return SYSTEM_PROMPT_TEMPLATE.format(**substitutions)

In [None]:
import time
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from agent_diff import AgentDiff, BashExecutorProxy, create_langchain_tool


def create_agent(service: str, bash_executor: BashExecutorProxy, model: str) -> AgentExecutor:
    """Create a LangChain tool-calling agent with the bash tool for a given service.

    Args:
        service: Key into ``SERVICE_CONFIG``; selects the system prompt.
        bash_executor: Executor that is wrapped as the agent's only tool.
        model: OpenRouter model identifier passed to ``ChatOpenAI``.

    Returns:
        An ``AgentExecutor`` bounded by ``MAX_ITERATIONS`` turns and by the
        ``TIMEOUT_SECONDS`` wall-clock budget.
    """
    # OpenRouter is reached through ChatOpenAI by overriding its base URL.
    llm = ChatOpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=OPENROUTER_API_KEY,
        model=model,
        temperature=0,
    )
    tool = create_langchain_tool(bash_executor)
    system_prompt = build_system_prompt(service)

    # agent_scratchpad is the placeholder create_tool_calling_agent fills with
    # the running tool-call / tool-result history.
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])

    agent = create_tool_calling_agent(llm, [tool], prompt)
    return AgentExecutor(
        agent=agent,
        tools=[tool],
        max_iterations=MAX_ITERATIONS,
        # Apply the configured per-test timeout; previously TIMEOUT_SECONDS was
        # declared but never enforced, so a stuck run was limited only by
        # MAX_ITERATIONS.
        # NOTE(review): per the LangChain docs this budget is presumably checked
        # between agent steps, so one long tool call can overrun it - confirm.
        max_execution_time=TIMEOUT_SECONDS,
        handle_parsing_errors=True,
        verbose=False,
    )

In [None]:
from tqdm.auto import tqdm


def run_single_test(client: AgentDiff, model: str, test, service: str) -> dict:
    """Run one test: init env -> LangChain agent -> evaluate -> cleanup.

    Args:
        client: Agent-Diff client used to create the environment, start and
            evaluate the run, and delete the environment afterwards.
        model: Model identifier forwarded to ``create_agent``.
        test: Test object; ``id`` and ``prompt`` are read, ``name`` is optional.
        service: Key into ``SERVICE_CONFIG`` forwarded to ``create_agent``.

    Returns:
        On success, a dict with ``test_id``, ``test_name``, ``passed``,
        ``score``, ``failures``, ``time`` (agent wall-clock seconds) and
        ``agent_output``. On any exception, a dict with ``test_id``,
        ``test_name``, ``passed=False``, ``score=0`` and ``error``.
        Exceptions are never propagated to the caller.
    """
    env = None
    try:
        env = client.init_env(testId=test.id)
        run = client.start_run(envId=env.environmentId, testId=test.id)
        bash_executor = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)

        agent_executor = create_agent(service, bash_executor, model)

        # Only the agent loop is timed; environment setup and evaluation are excluded.
        start = time.perf_counter()
        agent_output = agent_executor.invoke({"input": test.prompt})
        elapsed = time.perf_counter() - start

        client.evaluate_run(runId=run.runId)
        result = client.get_results_for_run(runId=run.runId)

        return {
            "test_id": str(test.id),
            "test_name": getattr(test, "name", ""),
            "passed": result.passed,
            # A score that is not a dict is reported as 0.
            "score": result.score.get("percent", 0) if isinstance(result.score, dict) else 0,
            "failures": result.failures,
            "time": round(elapsed, 2),
            "agent_output": agent_output.get("output", ""),
        }
    except Exception as e:
        # Deliberate boundary: one failing test must not abort the whole benchmark.
        return {"test_id": str(test.id), "test_name": getattr(test, "name", ""), "passed": False, "score": 0, "error": str(e)}
    finally:
        # Single cleanup path for success and failure. Cleanup is best-effort:
        # a failed delete must neither discard an already-evaluated result nor
        # mask the original error.
        if env is not None:
            try:
                client.delete_env(envId=env.environmentId)
            except Exception:
                pass


def run_benchmark(model: str, services: list[str] | None = None, max_tests: int | None = None) -> list[dict]:
    """Run the full benchmark across services using the LangChain agent.

    Args:
        model: Model identifier used for every test.
        services: Keys of ``SERVICE_CONFIG`` to run; ``None`` (or an empty
            list) runs every configured service.
        max_tests: Upper bound on tests per service; ``None`` (or 0) runs all.

    Returns:
        One result dict per executed test, as produced by ``run_single_test``
        and extended with ``service`` and ``model`` keys. Services whose test
        suite cannot be found are skipped with a printed notice.
    """
    services = services or list(SERVICE_CONFIG.keys())
    client = AgentDiff()
    all_results = []

    for service in services:
        config = SERVICE_CONFIG[service]

        suite_list = client.list_test_suites(name=config["test_suite_name"])
        if not suite_list.testSuites:
            print(f"[SKIP] Test suite '{config['test_suite_name']}' not found.")
            continue
        # The first matching suite is used, fetched with expand=True.
        suite = client.get_test_suite(suite_list.testSuites[0].id, expand=True)
        tests = suite.tests[:max_tests] if max_tests else suite.tests

        print(f"\n{'='*60}")
        # Em dash restored: the banner previously contained mis-encoded bytes.
        print(f"  {config['name']} — {len(tests)} tests | model: {model}")
        print(f"{'='*60}")

        for test in tqdm(tests, desc=config["name"]):
            result = run_single_test(client, model, test, service)
            result["service"] = service
            result["model"] = model
            all_results.append(result)

            status = "PASS" if result.get("passed") else "FAIL"
            score = result.get("score", 0)
            # "test_name" is always present but may be empty, so fall back to
            # the ID on falsy values rather than only on a missing key.
            label = (result.get("test_name") or result["test_id"])[:60]
            # tqdm.write keeps the per-test line from corrupting the progress bar.
            tqdm.write(f"  [{status}] {label}  score={score}")

    return all_results

In [None]:
# Execute the benchmark with the notebook settings; `results` holds one dict
# per executed test and feeds the summary cell below.
results = run_benchmark(
    model=MODEL,
    services=None,          # all 4 services; or e.g. ["slack", "box"]
    max_tests=MAX_TESTS,
)

In [None]:
import pandas as pd

# One row per executed test; columns are the keys of the result dicts.
df = pd.DataFrame(results)

print("\n" + "=" * 60)
print(f"  Results: {MODEL} (LangChain Agent)")
print("=" * 60)

# With no results the frame has no columns, so fall through to a plain dump.
if "service" in df.columns and "score" in df.columns:
    summary = df.groupby("service").agg(
        tests=("score", "count"),
        passed=("passed", "sum"),
        mean_score=("score", "mean"),
        pass_rate=("passed", "mean"),
    )
    # Round each column on its own. Rounding the whole frame to 2 decimals
    # first truncated pass_rate before it was scaled to a percentage, so the
    # one-decimal precision was lost (e.g. 1/3 showed as 33.0, not 33.3).
    summary["mean_score"] = summary["mean_score"].round(2)
    summary["pass_rate"] = (summary["pass_rate"] * 100).round(1)
    print("\nPer-service summary:")
    print(summary.to_string())

    # Overall figures are means over individual tests, not over services.
    overall_score = df["score"].mean()
    overall_pass = df["passed"].mean() * 100
    print(f"\nOverall: score={overall_score:.1f}  pass_rate={overall_pass:.1f}%")

    summary["mean_score"].plot.bar(title=f"Agent-Diff Score by Service ({MODEL}, LangChain)", ylabel="Score", xlabel="Service", rot=0)
else:
    print(df)