aganthos · bordeauxred · May 21, 2026 · Apr 27, 2026 · gemini-code-assist · Apr 27, 2026
diff --git a/README.md b/README.md
@@ -99,7 +99,8 @@ results = agent.learn(MathEnvironment(), iterations=10, episodes_per_iter=5)
 **Config-driven training (no code):**
 
 ```bash
-uv run python examples/train_runner.py examples/configs/math_harness.json
+uv run clawloop run examples/configs/math_harness.json
+uv run clawloop run examples/configs/math_harness.json --dry-run  # mock LLMs
 ```
 
 ## Choose Your Integration Path
@@ -110,7 +111,7 @@ uv run python examples/train_runner.py examples/configs/math_harness.json
 | Harness: package/module demo entry points | `uv run python -m clawloop.demo_math --dry-run` or [`examples/demo_math.py`](examples/demo_math.py) | Same math demo from an installed package or source clone |
 | Playbook internals walkthrough | `uv run python examples/playbook_demo.py --dry-run` | `forward_backward`, `optim_step`, entry scoring, structured skills |
 | Workflow: n8n webhook integration | [`examples/n8n/`](examples/n8n/) | Workflow platform sends traces to clawloop-server; no Python in the workflow |
-| Harness benchmarks: config-driven runner | `uv run python examples/train_runner.py examples/configs/math_harness.json` | Math, CRMArena, Harbor BFCL via JSON configs and litellm |
+| Harness benchmarks: config-driven runner | `uv run clawloop run examples/configs/math_harness.json` | Math, CRMArena, Harbor BFCL, TauBench via JSON configs and litellm (`examples/train_runner.py` is a deprecated shim that forwards here) |
 | Proxy harness: zero-code-change OpenClaw | `uv run python examples/openclaw_demo.py` | Transparent proxy captures traces and injects learned skills |
 | Remote OpenClaw: SSH-connected proxy harness | `uv run python examples/openclaw_demo_remote.py --host YOUR_HOST ...` | Learn from a remote OpenClaw instance and compare before/after |
 | Weights: SkyRL/Tinker training recipes | [`examples/recipes/`](examples/recipes/) | GRPO, PPO, and fine-tuning recipes for GPU training |
@@ -150,6 +151,7 @@ all layers roll back together.
 | `harbor` | [Harbor](https://harborframework.com/) sandboxed agent tasks (BFCL, etc.) | Docker + LLM API |
 | `entropic` | [CRMArenaPro](https://github.com/salesforce/CRMArena) A2A benchmark | Entropic bench + LLM API |
 | `openclaw` | Transparent proxy — captures traces + injects playbook skills | Node.js + OpenAI-compatible Chat Completions endpoint |
+| `taubench` | [tau-bench](https://github.com/sierra-research/tau2-bench) retail/airline customer-service tasks | `pip install "clawloop[taubench]"` + LLM API |
 
 ## LLM Providers
 

diff --git a/clawloop/cli.py b/clawloop/cli.py
@@ -1,28 +1,30 @@
 """ClawLoop CLI — entry points for demo and benchmark setup commands.
 
-The legacy ``run`` and ``eval`` subcommands are disabled: they only wired a
-subset of environments and drifted from the unified ``TrainConfig`` runner.
-They remain in the parser so stale muscle memory gets a truthful redirect
-instead of a misleading ``Unknown benchmark`` failure.
+``clawloop run <config.json>`` is a thin wrapper over the unified ``TrainConfig``
+runner: load JSON, validate via Pydantic, dispatch to ``train()``. The
+``--dry-run`` flag swaps real LLM clients for mocks so smoke tests work
+without API keys.
+
+``clawloop eval`` is still disabled; legacy invocations get a truthful redirect
+to ``clawloop run`` and ``clawloop demo math --dry-run``.
 """
 
 from __future__ import annotations
 
 import argparse
+import json
 import logging
 import sys
+from pathlib import Path
 from typing import Any
 
 log = logging.getLogger("clawloop")
 
-_DISABLED_MSG = (
-    "`clawloop {cmd}` is temporarily disabled. Use one of:\n"
-    "  - Real benchmark:  uv run python examples/train_runner.py \\\n"
-    "                         examples/configs/entropic_harness.json\n"
+_EVAL_DISABLED_MSG = (
+    "`clawloop eval` is disabled. Use one of:\n"
+    "  - Real benchmark:  uv run clawloop run examples/configs/math_harness.json\n"
     "  - Other configs:   examples/configs/  (math, harbor, entropic, openclaw, taubench)\n"
-    "  - No-key demo:     uv run clawloop demo math --dry-run\n"
-    "The config-driven runner covers every supported environment; "
-    "reintroduction of `{cmd}` as a thin wrapper is tracked upstream."
+    "  - No-key demo:     uv run clawloop demo math --dry-run"
 )
 
 
@@ -34,11 +36,16 @@ def _build_parser() -> argparse.ArgumentParser:
     parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
     sub = parser.add_subparsers(dest="command", required=True)
 
-    # Disabled subcommands. add_help=False so `run --help` hits the redirect
-    # rather than argparse's auto-generated help output. Any legacy flags land
-    # in `unknown` via parse_known_args() in main() and are ignored.
-    sub.add_parser("run", help="(disabled) use examples/train_runner.py", add_help=False)
-    sub.add_parser("eval", help="(disabled) use examples/train_runner.py", add_help=False)
+    run_p = sub.add_parser("run", help="Run a TrainConfig JSON via train()")
+    run_p.add_argument("config", type=Path, help="Path to TrainConfig JSON")
+    run_p.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Swap real LLM clients for mocks (no API calls)",
+    )
+
+    # Eval stays disabled. add_help=False so `eval --help` hits the redirect.
+    sub.add_parser("eval", help="(disabled) use `clawloop run` instead", add_help=False)
 
     setup_p = sub.add_parser("setup-bench", help="Install benchmark dependencies")
     setup_p.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
@@ -82,7 +89,6 @@ def _build_parser() -> argparse.ArgumentParser:
 def cmd_setup_bench(args: argparse.Namespace) -> None:
     """Install benchmark external dependencies."""
     import subprocess
-    from pathlib import Path
 
     bench = args.bench
     if bench not in BENCH_SETUP:
@@ -135,10 +141,56 @@ def cmd_demo(args: argparse.Namespace) -> None:
         sys.exit(1)
 
 
+def cmd_run(args: argparse.Namespace) -> None:
+    """Load a TrainConfig JSON and dispatch to train()."""
+    from clawloop.train import MODE_LAYERS, TrainConfig, train
+
+    raw = json.loads(args.config.read_text())
+    config = TrainConfig(**raw)  # Pydantic ValidationError surfaces fail-fast
+
+    log.info(
+        "mode=%s env=%s layers=%s",
+        config.mode,
+        config.env_type,
+        MODE_LAYERS[config.mode],
+    )
+
+    if args.dry_run:
+        _install_dry_run_clients(config)
+
+    train(config)
+
+
+def _install_dry_run_clients(config: "Any") -> None:
+    """Patch ``clawloop.train._make_llm_client`` to return mock clients.
+
+    Identifies the role (reflector / task / other) by matching the cfg
+    object identity against ``config.llm_clients``. Falls back to a generic
+    ``MockLLMClient`` for any unknown role so unfamiliar envs still run.
+    """
+    import clawloop.train as _train
+    from clawloop.demo_math import MockTaskClient, _build_mock_reflector_responses
+    from clawloop.llm import MockLLMClient
+
+    role_by_id = {id(v): k for k, v in config.llm_clients.items()}
+    original = _train._make_llm_client
+
+    def _mock_make(cfg):
+        role = role_by_id.get(id(cfg))
+        if role == "reflector":
+            return MockLLMClient(responses=_build_mock_reflector_responses())
+        if role == "task":
+            return MockTaskClient()
+        return MockLLMClient(responses=["[]"])
+
+    _train._make_llm_client = _mock_make
+    log.info("dry-run: LLM clients patched to mocks (original=%r)", original.__name__)
+
+
 def main() -> None:
     parser = _build_parser()
-    # Use parse_known_args so disabled subcommands can ignore legacy flags
-    # (`clawloop run --bench entropic`) and fall through to the redirect.
+    # parse_known_args lets the disabled `eval` subcommand swallow legacy flags
+    # (`clawloop eval --bench entropic`) and fall through to the redirect.
     args, _unknown = parser.parse_known_args()
 
     log_level = logging.DEBUG if getattr(args, "verbose", False) else logging.INFO
@@ -148,13 +200,14 @@ def main() -> None:
         datefmt="%H:%M:%S",
     )
 
-    if args.command in {"run", "eval"}:
-        print(_DISABLED_MSG.format(cmd=args.command), file=sys.stderr)
+    if args.command == "eval":
+        print(_EVAL_DISABLED_MSG, file=sys.stderr)
         sys.exit(2)
 
     # For active subcommands, re-parse strictly so typos still error.
     args = parser.parse_args()
     handlers = {
+        "run": cmd_run,
         "setup-bench": cmd_setup_bench,
         "demo": cmd_demo,
     }

diff --git a/clawloop/train.py b/clawloop/train.py
@@ -169,6 +169,19 @@ def _build_openclaw(config: TrainConfig, llm_clients: dict[str, LLMClientConfig]
     return adapter, tasks
 
 
+def _build_taubench(config: TrainConfig, llm_clients: dict[str, LLMClientConfig]) -> tuple:
+    from clawloop.environments.taubench import TauBenchAdapter
+
+    taubench_cfg = dict(config.env_config or {})
+    adapter = TauBenchAdapter()
+    adapter.setup(taubench_cfg)
+    tasks = adapter.list_tasks(taubench_cfg.get("task_split", "test"))
+    num_tasks = taubench_cfg.get("num_tasks")
+    if num_tasks is not None:
+        tasks = tasks[: int(num_tasks)]
+    return adapter, tasks
+
+
 # ---------------------------------------------------------------------------
 # Environment registry — add new envs here
 # ---------------------------------------------------------------------------
@@ -233,6 +246,7 @@ def _build_openspiel(config: "TrainConfig", llm_clients: dict[str, "LLMClientCon
     "entropic": _build_entropic,
     "openclaw": _build_openclaw,
     "openspiel": _build_openspiel,
+    "taubench": _build_taubench,
 }
 
 

diff --git a/examples/configs/taubench_harness.json b/examples/configs/taubench_harness.json
@@ -1,10 +1,22 @@
 {
-  "_comment": "TauBench harness learning config. Set domain to retail or airline.",
-  "domain": "retail",
-  "llm_agent": "gemini/gemini-2.0-flash-lite",
-  "llm_user": "gemini/gemini-2.0-flash-lite",
-  "max_steps": 30,
-  "max_concurrency": 8,
-  "task_split": "test",
-  "num_tasks": 10
+    "mode": "harness_learning",
+    "env_type": "taubench",
+    "system_prompt": "You are a customer service agent. Help the user according to the policy. Be accurate, efficient, and polite.",
+    "llm_clients": {
+        "reflector": {
+            "model": "anthropic/claude-sonnet-4-5-20250929",
+            "api_base": ""
+        }
+    },
+    "env_config": {
+        "domain": "retail",
+        "llm_agent": "gemini/gemini-2.0-flash-lite",
+        "llm_user": "gemini/gemini-2.0-flash-lite",
+        "max_steps": 30,
+        "max_concurrency": 8,
+        "task_split": "test",
+        "num_tasks": 10
+    },
+    "n_iterations": 3,
+    "episodes_per_iter": 3
 }
diff --git a/examples/train_runner.py b/examples/train_runner.py
@@ -1,54 +1,28 @@
 #!/usr/bin/env python3
-"""ClawLoop unified training runner.
+"""Deprecated shim — forwards to ``clawloop run <config.json>``.
 
-Load a JSON config, call train(). One script, two modes.
+The unified runner now lives in the CLI:
 
-    # Harness learning (prompt optimization, no GPU):
-    python examples/train_runner.py examples/configs/math_harness.json
+    uv run clawloop run examples/configs/math_harness.json
+    uv run clawloop run examples/configs/math_harness.json --dry-run
 
-    # Weight training (SkyRL GRPO on GPU):
-    python examples/train_runner.py examples/configs/math_weight.json
-
-Tinker-compatible: weight mode uses SkyRL's training infrastructure
-under the hood. ClawLoop wraps it with a unified API that lets you switch
-between prompt learning and weight training by changing one field.
+Existing invocations such as ``python examples/train_runner.py <config>``
+keep working: this shim prepends ``run`` to argv before dispatching.
 """
 
 from __future__ import annotations
 
-import json
-import logging
 import sys
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
-from clawloop.train import MODE_LAYERS, TrainConfig, train
-
-
-def main():
-    if len(sys.argv) < 2:
-        print(f"Usage: {sys.argv[0]} <config.json>")
-        sys.exit(1)
-
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
-    )
-
-    config_path = Path(sys.argv[1])
-    raw = json.loads(config_path.read_text())
-    config = TrainConfig(**raw)
+from clawloop.cli import main as cli_main
 
-    logging.getLogger("clawloop").info(
-        "mode=%s env=%s layers=%s",
-        config.mode,
-        config.env_type,
-        MODE_LAYERS[config.mode],
-    )
 
-    agent_state, state_id = train(config)
-    print(f"\nDone. Final state: {state_id.combined_hash[:12]}")
+def main() -> None:
+    sys.argv.insert(1, "run")
+    cli_main()
 
 
 if __name__ == "__main__":