From e4cb8ded00665dfc87cd1a5d8c185a6a9d65cd50 Mon Sep 17 00:00:00 2001
From: dantp-ai <1534513+dantp-ai@users.noreply.github.com>
Date: Mon, 27 Apr 2026 23:29:59 +0200
Subject: [PATCH] chore: reintroduce `clawloop run` as a TrainConfig wrapper

---
 README.md                              |   6 +-
 clawloop/cli.py                        |  95 ++++++++++++++++-----
 clawloop/train.py                      |  14 ++++
 examples/configs/taubench_harness.json |  28 +++++--
 examples/train_runner.py               |  46 +++-------
 tests/test_cli.py                      | 112 +++++++++++++++++--------
 6 files changed, 199 insertions(+), 102 deletions(-)

diff --git a/README.md b/README.md
index 50c6b9a3..07cbfedc 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,8 @@ results = agent.learn(MathEnvironment(), iterations=10, episodes_per_iter=5)
 **Config-driven training (no code):**
 
 ```bash
-uv run python examples/train_runner.py examples/configs/math_harness.json
+uv run clawloop run examples/configs/math_harness.json
+uv run clawloop run examples/configs/math_harness.json --dry-run  # mock LLMs
 ```
 
 ## Choose Your Integration Path
@@ -110,7 +111,7 @@ uv run python examples/train_runner.py examples/configs/math_harness.json
 | Harness: package/module demo entry points | `uv run python -m clawloop.demo_math --dry-run` or [`examples/demo_math.py`](examples/demo_math.py) | Same math demo from an installed package or source clone |
 | Playbook internals walkthrough | `uv run python examples/playbook_demo.py --dry-run` | `forward_backward`, `optim_step`, entry scoring, structured skills |
 | Workflow: n8n webhook integration | [`examples/n8n/`](examples/n8n/) | Workflow platform sends traces to clawloop-server; no Python in the workflow |
-| Harness benchmarks: config-driven runner | `uv run python examples/train_runner.py examples/configs/math_harness.json` | Math, CRMArena, Harbor BFCL via JSON configs and litellm |
+| Harness benchmarks: config-driven runner | `uv run clawloop run examples/configs/math_harness.json` | Math, CRMArena, Harbor BFCL, TauBench via JSON configs and litellm (`examples/train_runner.py` is a deprecated shim that forwards here) |
 | Proxy harness: zero-code-change OpenClaw | `uv run python examples/openclaw_demo.py` | Transparent proxy captures traces and injects learned skills |
 | Remote OpenClaw: SSH-connected proxy harness | `uv run python examples/openclaw_demo_remote.py --host YOUR_HOST ...` | Learn from a remote OpenClaw instance and compare before/after |
 | Weights: SkyRL/Tinker training recipes | [`examples/recipes/`](examples/recipes/) | GRPO, PPO, and fine-tuning recipes for GPU training |
@@ -150,6 +151,7 @@ all layers roll back together.
 | `harbor` | [Harbor](https://harborframework.com/) sandboxed agent tasks (BFCL, etc.) | Docker + LLM API |
 | `entropic` | [CRMArenaPro](https://github.com/salesforce/CRMArena) A2A benchmark | Entropic bench + LLM API |
 | `openclaw` | Transparent proxy — captures traces + injects playbook skills | Node.js + OpenAI-compatible Chat Completions endpoint |
+| `taubench` | [tau-bench](https://github.com/sierra-research/tau2-bench) retail/airline customer-service tasks | `pip install "clawloop[taubench]"` + LLM API |
 
 ## LLM Providers
 
diff --git a/clawloop/cli.py b/clawloop/cli.py
index b4176825..8d2e24ed 100644
--- a/clawloop/cli.py
+++ b/clawloop/cli.py
@@ -1,28 +1,30 @@
 """ClawLoop CLI — entry points for demo and benchmark setup commands.
 
-The legacy ``run`` and ``eval`` subcommands are disabled: they only wired a
-subset of environments and drifted from the unified ``TrainConfig`` runner.
-They remain in the parser so stale muscle memory gets a truthful redirect
-instead of a misleading ``Unknown benchmark`` failure.
+``clawloop run <config.json>`` is a thin wrapper over the unified ``TrainConfig``
+runner: load JSON, validate via Pydantic, dispatch to ``train()``. The
+``--dry-run`` flag swaps real LLM clients for mocks so smoke tests work
+without API keys.
+
+``clawloop eval`` is still disabled; legacy invocations get a truthful redirect
+to ``clawloop run`` and ``clawloop demo math --dry-run``.
 """
 
 from __future__ import annotations
 
 import argparse
+import json
 import logging
 import sys
+from pathlib import Path
 from typing import Any
 
 log = logging.getLogger("clawloop")
 
-_DISABLED_MSG = (
-    "`clawloop {cmd}` is temporarily disabled. Use one of:\n"
-    "  - Real benchmark:  uv run python examples/train_runner.py \\\n"
-    "                         examples/configs/entropic_harness.json\n"
+_EVAL_DISABLED_MSG = (
+    "`clawloop eval` is disabled. Use one of:\n"
+    "  - Real benchmark:  uv run clawloop run examples/configs/math_harness.json\n"
     "  - Other configs:   examples/configs/  (math, harbor, entropic, openclaw, taubench)\n"
-    "  - No-key demo:     uv run clawloop demo math --dry-run\n"
-    "The config-driven runner covers every supported environment; "
-    "reintroduction of `{cmd}` as a thin wrapper is tracked upstream."
+    "  - No-key demo:     uv run clawloop demo math --dry-run"
 )
 
 
@@ -34,11 +36,16 @@ def _build_parser() -> argparse.ArgumentParser:
     parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
     sub = parser.add_subparsers(dest="command", required=True)
 
-    # Disabled subcommands. add_help=False so `run --help` hits the redirect
-    # rather than argparse's auto-generated help output. Any legacy flags land
-    # in `unknown` via parse_known_args() in main() and are ignored.
-    sub.add_parser("run", help="(disabled) use examples/train_runner.py", add_help=False)
-    sub.add_parser("eval", help="(disabled) use examples/train_runner.py", add_help=False)
+    run_p = sub.add_parser("run", help="Run a TrainConfig JSON via train()")
+    run_p.add_argument("config", type=Path, help="Path to TrainConfig JSON")
+    run_p.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Swap real LLM clients for mocks (no API calls)",
+    )
+
+    # Eval stays disabled. add_help=False so `eval --help` hits the redirect.
+    sub.add_parser("eval", help="(disabled) use `clawloop run` instead", add_help=False)
 
     setup_p = sub.add_parser("setup-bench", help="Install benchmark dependencies")
     setup_p.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
@@ -82,7 +89,6 @@ def _build_parser() -> argparse.ArgumentParser:
 def cmd_setup_bench(args: argparse.Namespace) -> None:
     """Install benchmark external dependencies."""
     import subprocess
-    from pathlib import Path
 
     bench = args.bench
     if bench not in BENCH_SETUP:
@@ -135,10 +141,56 @@ def cmd_demo(args: argparse.Namespace) -> None:
         sys.exit(1)
 
 
+def cmd_run(args: argparse.Namespace) -> None:
+    """Load a TrainConfig JSON and dispatch to train()."""
+    from clawloop.train import MODE_LAYERS, TrainConfig, train
+
+    raw = json.loads(args.config.read_text())
+    config = TrainConfig(**raw)  # fail fast: invalid config raises pydantic.ValidationError
+
+    log.info(
+        "mode=%s env=%s layers=%s",
+        config.mode,
+        config.env_type,
+        MODE_LAYERS[config.mode],
+    )
+
+    if args.dry_run:
+        _install_dry_run_clients(config)
+
+    train(config)
+
+
+def _install_dry_run_clients(config: "Any") -> None:
+    """Patch ``clawloop.train._make_llm_client`` to return mock clients.
+
+    Identifies the role (reflector / task / other) by matching cfg object
+    identity against ``config.llm_clients``; unknown roles get a generic
+    ``MockLLMClient`` so unfamiliar envs still run. The patch is never undone.
+    """
+    import clawloop.train as _train
+    from clawloop.demo_math import MockTaskClient, _build_mock_reflector_responses
+    from clawloop.llm import MockLLMClient
+
+    role_by_id = {id(v): k for k, v in config.llm_clients.items()}
+    original = _train._make_llm_client
+
+    def _mock_make(cfg):
+        role = role_by_id.get(id(cfg))
+        if role == "reflector":
+            return MockLLMClient(responses=_build_mock_reflector_responses())
+        if role == "task":
+            return MockTaskClient()
+        return MockLLMClient(responses=["[]"])
+
+    _train._make_llm_client = _mock_make
+    log.info("dry-run: LLM clients patched to mocks (original=%r)", original.__name__)
+
+
 def main() -> None:
     parser = _build_parser()
-    # Use parse_known_args so disabled subcommands can ignore legacy flags
-    # (`clawloop run --bench entropic`) and fall through to the redirect.
+    # parse_known_args lets the disabled `eval` subcommand swallow legacy flags
+    # (`clawloop eval --bench entropic`) and fall through to the redirect.
     args, _unknown = parser.parse_known_args()
 
     log_level = logging.DEBUG if getattr(args, "verbose", False) else logging.INFO
@@ -148,13 +200,14 @@ def main() -> None:
         datefmt="%H:%M:%S",
     )
 
-    if args.command in {"run", "eval"}:
-        print(_DISABLED_MSG.format(cmd=args.command), file=sys.stderr)
+    if args.command == "eval":
+        print(_EVAL_DISABLED_MSG, file=sys.stderr)
         sys.exit(2)
 
     # For active subcommands, re-parse strictly so typos still error.
     args = parser.parse_args()
     handlers = {
+        "run": cmd_run,
         "setup-bench": cmd_setup_bench,
         "demo": cmd_demo,
     }
diff --git a/clawloop/train.py b/clawloop/train.py
index 55f1ce49..ec7324a9 100644
--- a/clawloop/train.py
+++ b/clawloop/train.py
@@ -169,6 +169,19 @@ def _build_openclaw(config: TrainConfig, llm_clients: dict[str, LLMClientConfig]
     return adapter, tasks
 
 
+def _build_taubench(config: TrainConfig, llm_clients: dict[str, LLMClientConfig]) -> tuple:
+    from clawloop.environments.taubench import TauBenchAdapter
+
+    taubench_cfg = dict(config.env_config or {})
+    adapter = TauBenchAdapter()
+    adapter.setup(taubench_cfg)
+    tasks = adapter.list_tasks(taubench_cfg.get("task_split", "test"))
+    num_tasks = taubench_cfg.get("num_tasks")
+    if num_tasks is not None:
+        tasks = tasks[: int(num_tasks)]
+    return adapter, tasks
+
+
 # ---------------------------------------------------------------------------
 # Environment registry — add new envs here
 # ---------------------------------------------------------------------------
@@ -233,6 +246,7 @@ def _build_openspiel(config: "TrainConfig", llm_clients: dict[str, "LLMClientCon
     "entropic": _build_entropic,
     "openclaw": _build_openclaw,
     "openspiel": _build_openspiel,
+    "taubench": _build_taubench,
 }
 
 
diff --git a/examples/configs/taubench_harness.json b/examples/configs/taubench_harness.json
index 99083351..9d5b6f50 100644
--- a/examples/configs/taubench_harness.json
+++ b/examples/configs/taubench_harness.json
@@ -1,10 +1,22 @@
 {
-  "_comment": "TauBench harness learning config. Set domain to retail or airline.",
-  "domain": "retail",
-  "llm_agent": "gemini/gemini-2.0-flash-lite",
-  "llm_user": "gemini/gemini-2.0-flash-lite",
-  "max_steps": 30,
-  "max_concurrency": 8,
-  "task_split": "test",
-  "num_tasks": 10
+    "mode": "harness_learning",
+    "env_type": "taubench",
+    "system_prompt": "You are a customer service agent. Help the user according to the policy. Be accurate, efficient, and polite.",
+    "llm_clients": {
+        "reflector": {
+            "model": "anthropic/claude-sonnet-4-5-20250929",
+            "api_base": ""
+        }
+    },
+    "env_config": {
+        "domain": "retail",
+        "llm_agent": "gemini/gemini-2.0-flash-lite",
+        "llm_user": "gemini/gemini-2.0-flash-lite",
+        "max_steps": 30,
+        "max_concurrency": 8,
+        "task_split": "test",
+        "num_tasks": 10
+    },
+    "n_iterations": 3,
+    "episodes_per_iter": 3
 }
diff --git a/examples/train_runner.py b/examples/train_runner.py
index 3c697045..fb500df6 100644
--- a/examples/train_runner.py
+++ b/examples/train_runner.py
@@ -1,54 +1,28 @@
 #!/usr/bin/env python3
-"""ClawLoop unified training runner.
+"""Deprecated shim — forwards to ``clawloop run <config.json>``.
 
-Load a JSON config, call train(). One script, two modes.
+The unified runner now lives in the CLI:
 
-    # Harness learning (prompt optimization, no GPU):
-    python examples/train_runner.py examples/configs/math_harness.json
+    uv run clawloop run examples/configs/math_harness.json
+    uv run clawloop run examples/configs/math_harness.json --dry-run
 
-    # Weight training (SkyRL GRPO on GPU):
-    python examples/train_runner.py examples/configs/math_weight.json
-
-Tinker-compatible: weight mode uses SkyRL's training infrastructure
-under the hood. ClawLoop wraps it with a unified API that lets you switch
-between prompt learning and weight training by changing one field.
+Existing invocations such as ``python examples/train_runner.py <config>``
+keep working: this shim prepends ``run`` to argv before dispatching.
 """
 
 from __future__ import annotations
 
-import json
-import logging
 import sys
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
-from clawloop.train import MODE_LAYERS, TrainConfig, train
-
-
-def main():
-    if len(sys.argv) < 2:
-        print(f"Usage: {sys.argv[0]} <config.json>")
-        sys.exit(1)
-
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
-    )
-
-    config_path = Path(sys.argv[1])
-    raw = json.loads(config_path.read_text())
-    config = TrainConfig(**raw)
+from clawloop.cli import main as cli_main
 
-    logging.getLogger("clawloop").info(
-        "mode=%s env=%s layers=%s",
-        config.mode,
-        config.env_type,
-        MODE_LAYERS[config.mode],
-    )
 
-    agent_state, state_id = train(config)
-    print(f"\nDone. Final state: {state_id.combined_hash[:12]}")
+def main() -> None:
+    sys.argv.insert(1, "run")
+    cli_main()
 
 
 if __name__ == "__main__":
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 739e014e..cdc7be3c 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,9 +1,19 @@
-"""CLI smoke tests — ensure disabled subcommands emit a truthful redirect."""
+"""CLI smoke tests.
+
+``run`` dispatches to ``train()``; ``eval`` is still disabled and emits a redirect.
+"""
 
 from __future__ import annotations
 
+import json
 import subprocess
 import sys
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+CONFIGS_DIR = REPO_ROOT / "examples" / "configs"
 
 
 def _run_cli(*args: str) -> subprocess.CompletedProcess[str]:
@@ -11,56 +21,88 @@ def _run_cli(*args: str) -> subprocess.CompletedProcess[str]:
         [sys.executable, "-m", "clawloop.cli", *args],
         capture_output=True,
         text=True,
+        cwd=str(REPO_ROOT),
     )
 
 
-def test_run_subcommand_prints_redirect_and_exits_nonzero():
-    result = _run_cli("run")
-    assert result.returncode != 0
-    combined = result.stdout + result.stderr
-    assert "train_runner.py" in combined
-    assert "clawloop demo" in combined
+# ---------------------------------------------------------------------------
+# eval (still disabled)
+# ---------------------------------------------------------------------------
 
 
-def test_run_subcommand_redirects_even_with_legacy_flags():
-    result = _run_cli("run", "--bench", "entropic", "--iterations", "1")
+def test_eval_subcommand_prints_redirect_and_exits_nonzero():
+    result = _run_cli("eval")
     assert result.returncode != 0
     combined = result.stdout + result.stderr
-    assert "train_runner.py" in combined
+    assert "clawloop run" in combined
 
 
-def test_run_subcommand_redirects_with_global_verbose_flag():
-    result = _run_cli("-v", "run", "--bench", "entropic")
+def test_eval_redirect_ignores_legacy_flags_with_values():
+    result = _run_cli("eval", "--bench", "entropic", "--config", "/tmp/nope.json")
     assert result.returncode != 0
     combined = result.stdout + result.stderr
-    assert "train_runner.py" in combined
+    assert "clawloop run" in combined
 
 
-def test_run_subcommand_redirects_on_help_flag():
-    result = _run_cli("run", "--help")
-    assert result.returncode != 0
-    combined = result.stdout + result.stderr
-    assert "train_runner.py" in combined
+# ---------------------------------------------------------------------------
+# demo (regression guard from PR #49)
+# ---------------------------------------------------------------------------
 
 
-def test_eval_subcommand_prints_redirect_and_exits_nonzero():
-    result = _run_cli("eval")
-    assert result.returncode != 0
-    combined = result.stdout + result.stderr
-    assert "train_runner.py" in combined
+def test_demo_math_dry_run_still_works():
+    result = _run_cli("demo", "math", "--dry-run", "--iterations", "1", "--episodes", "1")
+    assert result.returncode == 0, f"demo math failed: {result.stderr}"
 
 
-def test_run_redirect_ignores_unknown_subparser_flags_with_values():
-    # Regression guard for the class of failure flagged in review: a flag that
-    # *takes a value* must not cause the disabled redirect to miss. Since only
-    # the outer parser has globals today, we prove the intercept is robust by
-    # passing a value-taking flag to the run subparser.
-    result = _run_cli("run", "--config", "/tmp/does-not-exist.json")
-    assert result.returncode != 0
-    combined = result.stdout + result.stderr
-    assert "train_runner.py" in combined
+# ---------------------------------------------------------------------------
+# run: every public config validates as TrainConfig (no `Unknown env_type`)
+# ---------------------------------------------------------------------------
 
 
-def test_demo_math_dry_run_still_works():
-    result = _run_cli("demo", "math", "--dry-run", "--iterations", "1", "--episodes", "1")
-    assert result.returncode == 0, f"demo math failed: {result.stderr}"
+@pytest.mark.parametrize(
+    "config_path",
+    sorted(CONFIGS_DIR.glob("*.json")),
+    ids=lambda p: p.name,
+)
+def test_public_configs_validate_as_trainconfig(config_path: Path):
+    """Every JSON under examples/configs/ must instantiate TrainConfig and
+    name a known env_type. Acceptance criterion from #50: no `Unknown
+    benchmark` failures from public configs."""
+    from clawloop.train import ENV_BUILDERS, TrainConfig
+
+    raw = json.loads(config_path.read_text())
+    cfg = TrainConfig(**raw)  # raises pydantic.ValidationError on schema drift
+    assert cfg.env_type in ENV_BUILDERS, (
+        f"{config_path.name} uses env_type={cfg.env_type!r} "
+        f"which is not in ENV_BUILDERS ({sorted(ENV_BUILDERS)})"
+    )
+
+
+# ---------------------------------------------------------------------------
+# run: math happy path with --dry-run
+# ---------------------------------------------------------------------------
+
+
+def test_run_math_harness_dry_run_smoke(tmp_path: Path):
+    """`clawloop run <math config> --dry-run` runs end-to-end with mocks."""
+    raw = json.loads((CONFIGS_DIR / "math_harness.json").read_text())
+    raw["n_iterations"] = 1
+    raw["episodes_per_iter"] = 1
+    cfg_path = tmp_path / "math_tiny.json"
+    cfg_path.write_text(json.dumps(raw))
+
+    result = _run_cli("run", str(cfg_path), "--dry-run")
+    assert result.returncode == 0, f"run math --dry-run failed: {result.stderr}"
+
+
+# ---------------------------------------------------------------------------
+# run: missing config path surfaces a real error (not a redirect)
+# ---------------------------------------------------------------------------
+
+
+def test_run_missing_config_errors():
+    result = _run_cli("run", "/tmp/clawloop-does-not-exist.json")
+    assert result.returncode != 0
+    # Expected: FileNotFoundError; only the absence of the old redirect text is asserted.
+    combined = result.stdout + result.stderr
+    assert "train_runner.py" not in combined