From e4cb8ded00665dfc87cd1a5d8c185a6a9d65cd50 Mon Sep 17 00:00:00 2001 From: dantp-ai <1534513+dantp-ai@users.noreply.github.com> Date: Mon, 27 Apr 2026 23:29:59 +0200 Subject: [PATCH] chore: reintroduce `clawloop run` as a TrainConfig wrapper --- README.md | 6 +- clawloop/cli.py | 95 ++++++++++++++++----- clawloop/train.py | 14 ++++ examples/configs/taubench_harness.json | 28 +++++-- examples/train_runner.py | 46 +++------- tests/test_cli.py | 112 +++++++++++++++++-------- 6 files changed, 199 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index 50c6b9a3..07cbfedc 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,8 @@ results = agent.learn(MathEnvironment(), iterations=10, episodes_per_iter=5) **Config-driven training (no code):** ```bash -uv run python examples/train_runner.py examples/configs/math_harness.json +uv run clawloop run examples/configs/math_harness.json +uv run clawloop run examples/configs/math_harness.json --dry-run # mock LLMs ``` ## Choose Your Integration Path @@ -110,7 +111,7 @@ uv run python examples/train_runner.py examples/configs/math_harness.json | Harness: package/module demo entry points | `uv run python -m clawloop.demo_math --dry-run` or [`examples/demo_math.py`](examples/demo_math.py) | Same math demo from an installed package or source clone | | Playbook internals walkthrough | `uv run python examples/playbook_demo.py --dry-run` | `forward_backward`, `optim_step`, entry scoring, structured skills | | Workflow: n8n webhook integration | [`examples/n8n/`](examples/n8n/) | Workflow platform sends traces to clawloop-server; no Python in the workflow | -| Harness benchmarks: config-driven runner | `uv run python examples/train_runner.py examples/configs/math_harness.json` | Math, CRMArena, Harbor BFCL via JSON configs and litellm | +| Harness benchmarks: config-driven runner | `uv run clawloop run examples/configs/math_harness.json` | Math, CRMArena, Harbor BFCL, TauBench via JSON configs and litellm (`examples/train_runner.py` is a deprecated shim that forwards here) | | Proxy harness: zero-code-change OpenClaw | `uv run python examples/openclaw_demo.py` | Transparent proxy captures traces and injects learned skills | | Remote OpenClaw: SSH-connected proxy harness | `uv run python examples/openclaw_demo_remote.py --host YOUR_HOST ...` | Learn from a remote OpenClaw instance and compare before/after | | Weights: SkyRL/Tinker training recipes | [`examples/recipes/`](examples/recipes/) | GRPO, PPO, and fine-tuning recipes for GPU training | @@ -150,6 +151,7 @@ all layers roll back together. | `harbor` | [Harbor](https://harborframework.com/) sandboxed agent tasks (BFCL, etc.) | Docker + LLM API | | `entropic` | [CRMArenaPro](https://github.com/salesforce/CRMArena) A2A benchmark | Entropic bench + LLM API | | `openclaw` | Transparent proxy — captures traces + injects playbook skills | Node.js + OpenAI-compatible Chat Completions endpoint | +| `taubench` | [tau-bench](https://github.com/sierra-research/tau2-bench) retail/airline customer-service tasks | `pip install "clawloop[taubench]"` + LLM API | ## LLM Providers diff --git a/clawloop/cli.py b/clawloop/cli.py index b4176825..8d2e24ed 100644 --- a/clawloop/cli.py +++ b/clawloop/cli.py @@ -1,28 +1,30 @@ """ClawLoop CLI — entry points for demo and benchmark setup commands. -The legacy ``run`` and ``eval`` subcommands are disabled: they only wired a -subset of environments and drifted from the unified ``TrainConfig`` runner. -They remain in the parser so stale muscle memory gets a truthful redirect -instead of a misleading ``Unknown benchmark`` failure. +`clawloop run ` is a thin wrapper over the unified ``TrainConfig`` +runner: load JSON, validate via Pydantic, dispatch to ``train()``. The +``--dry-run`` flag swaps real LLM clients for mocks so smoke tests work +without API keys. + +`clawloop eval` is still disabled; legacy invocations get a truthful redirect +to ``clawloop run`` and ``clawloop demo math --dry-run``. """ from __future__ import annotations import argparse +import json import logging import sys +from pathlib import Path from typing import Any log = logging.getLogger("clawloop") -_DISABLED_MSG = ( - "`clawloop {cmd}` is temporarily disabled. Use one of:\n" - " - Real benchmark: uv run python examples/train_runner.py \\\n" - " examples/configs/entropic_harness.json\n" +_EVAL_DISABLED_MSG = ( + "`clawloop eval` is disabled. Use one of:\n" + " - Real benchmark: uv run clawloop run examples/configs/math_harness.json\n" " - Other configs: examples/configs/ (math, harbor, entropic, openclaw, taubench)\n" - " - No-key demo: uv run clawloop demo math --dry-run\n" - "The config-driven runner covers every supported environment; " - "reintroduction of `{cmd}` as a thin wrapper is tracked upstream." + " - No-key demo: uv run clawloop demo math --dry-run" ) @@ -34,11 +36,16 @@ def _build_parser() -> argparse.ArgumentParser: parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging") sub = parser.add_subparsers(dest="command", required=True) - # Disabled subcommands. add_help=False so `run --help` hits the redirect - # rather than argparse's auto-generated help output. Any legacy flags land - # in `unknown` via parse_known_args() in main() and are ignored. - sub.add_parser("run", help="(disabled) use examples/train_runner.py", add_help=False) - sub.add_parser("eval", help="(disabled) use examples/train_runner.py", add_help=False) + run_p = sub.add_parser("run", help="Run a TrainConfig JSON via train()") + run_p.add_argument("config", type=Path, help="Path to TrainConfig JSON") + run_p.add_argument( + "--dry-run", + action="store_true", + help="Swap real LLM clients for mocks (no API calls)", + ) + + # Eval stays disabled. add_help=False so `eval --help` hits the redirect. + sub.add_parser("eval", help="(disabled) use `clawloop run` instead", add_help=False) setup_p = sub.add_parser("setup-bench", help="Install benchmark dependencies") setup_p.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging") @@ -82,7 +89,6 @@ def _build_parser() -> argparse.ArgumentParser: def cmd_setup_bench(args: argparse.Namespace) -> None: """Install benchmark external dependencies.""" import subprocess - from pathlib import Path bench = args.bench if bench not in BENCH_SETUP: @@ -135,10 +141,56 @@ def cmd_demo(args: argparse.Namespace) -> None: sys.exit(1) +def cmd_run(args: argparse.Namespace) -> None: + """Load a TrainConfig JSON and dispatch to train().""" + from clawloop.train import MODE_LAYERS, TrainConfig, train + + raw = json.loads(args.config.read_text()) + config = TrainConfig(**raw) # Pydantic ValidationError surfaces fail-fast + + log.info( + "mode=%s env=%s layers=%s", + config.mode, + config.env_type, + MODE_LAYERS[config.mode], + ) + + if args.dry_run: + _install_dry_run_clients(config) + + train(config) + + +def _install_dry_run_clients(config: "Any") -> None: + """Patch ``clawloop.train._make_llm_client`` to return mock clients. + + Identifies the role (reflector / task / other) by matching the cfg + object identity against ``config.llm_clients``. Falls back to a generic + ``MockLLMClient`` for any unknown role so unfamiliar envs still run. + """ + import clawloop.train as _train + from clawloop.demo_math import MockTaskClient, _build_mock_reflector_responses + from clawloop.llm import MockLLMClient + + role_by_id = {id(v): k for k, v in config.llm_clients.items()} + original = _train._make_llm_client + + def _mock_make(cfg): + role = role_by_id.get(id(cfg)) + if role == "reflector": + return MockLLMClient(responses=_build_mock_reflector_responses()) + if role == "task": + return MockTaskClient() + return MockLLMClient(responses=["[]"]) + + _train._make_llm_client = _mock_make + log.info("dry-run: LLM clients patched to mocks (original=%r)", original.__name__) + + def main() -> None: parser = _build_parser() - # Use parse_known_args so disabled subcommands can ignore legacy flags - # (`clawloop run --bench entropic`) and fall through to the redirect. + # parse_known_args lets the disabled `eval` subcommand swallow legacy flags + # (`clawloop eval --bench entropic`) and fall through to the redirect. args, _unknown = parser.parse_known_args() log_level = logging.DEBUG if getattr(args, "verbose", False) else logging.INFO @@ -148,13 +200,14 @@ def main() -> None: datefmt="%H:%M:%S", ) - if args.command in {"run", "eval"}: - print(_DISABLED_MSG.format(cmd=args.command), file=sys.stderr) + if args.command == "eval": + print(_EVAL_DISABLED_MSG, file=sys.stderr) sys.exit(2) # For active subcommands, re-parse strictly so typos still error. args = parser.parse_args() handlers = { + "run": cmd_run, "setup-bench": cmd_setup_bench, "demo": cmd_demo, } diff --git a/clawloop/train.py b/clawloop/train.py index 55f1ce49..ec7324a9 100644 --- a/clawloop/train.py +++ b/clawloop/train.py @@ -169,6 +169,19 @@ def _build_openclaw(config: TrainConfig, llm_clients: dict[str, LLMClientConfig] return adapter, tasks +def _build_taubench(config: TrainConfig, llm_clients: dict[str, LLMClientConfig]) -> tuple: + from clawloop.environments.taubench import TauBenchAdapter + + taubench_cfg = dict(config.env_config or {}) + adapter = TauBenchAdapter() + adapter.setup(taubench_cfg) + tasks = adapter.list_tasks(taubench_cfg.get("task_split", "test")) + num_tasks = taubench_cfg.get("num_tasks") + if num_tasks is not None: + tasks = tasks[: int(num_tasks)] + return adapter, tasks + + # --------------------------------------------------------------------------- # Environment registry — add new envs here # --------------------------------------------------------------------------- @@ -233,6 +246,7 @@ def _build_openspiel(config: "TrainConfig", llm_clients: dict[str, "LLMClientCon "entropic": _build_entropic, "openclaw": _build_openclaw, "openspiel": _build_openspiel, + "taubench": _build_taubench, } diff --git a/examples/configs/taubench_harness.json b/examples/configs/taubench_harness.json index 99083351..9d5b6f50 100644 --- a/examples/configs/taubench_harness.json +++ b/examples/configs/taubench_harness.json @@ -1,10 +1,22 @@ { - "_comment": "TauBench harness learning config. Set domain to retail or airline.", - "domain": "retail", - "llm_agent": "gemini/gemini-2.0-flash-lite", - "llm_user": "gemini/gemini-2.0-flash-lite", - "max_steps": 30, - "max_concurrency": 8, - "task_split": "test", - "num_tasks": 10 + "mode": "harness_learning", + "env_type": "taubench", + "system_prompt": "You are a customer service agent. Help the user according to the policy. Be accurate, efficient, and polite.", + "llm_clients": { + "reflector": { + "model": "anthropic/claude-sonnet-4-5-20250929", + "api_base": "" + } + }, + "env_config": { + "domain": "retail", + "llm_agent": "gemini/gemini-2.0-flash-lite", + "llm_user": "gemini/gemini-2.0-flash-lite", + "max_steps": 30, + "max_concurrency": 8, + "task_split": "test", + "num_tasks": 10 + }, + "n_iterations": 3, + "episodes_per_iter": 3 } diff --git a/examples/train_runner.py b/examples/train_runner.py index 3c697045..fb500df6 100644 --- a/examples/train_runner.py +++ b/examples/train_runner.py @@ -1,54 +1,28 @@ #!/usr/bin/env python3 -"""ClawLoop unified training runner. +"""Deprecated shim — forwards to ``clawloop run ``. -Load a JSON config, call train(). One script, two modes. +The unified runner now lives in the CLI: - # Harness learning (prompt optimization, no GPU): - python examples/train_runner.py examples/configs/math_harness.json + uv run clawloop run examples/configs/math_harness.json + uv run clawloop run examples/configs/math_harness.json --dry-run - # Weight training (SkyRL GRPO on GPU): - python examples/train_runner.py examples/configs/math_weight.json - -Tinker-compatible: weight mode uses SkyRL's training infrastructure -under the hood. ClawLoop wraps it with a unified API that lets you switch -between prompt learning and weight training by changing one field. +Existing invocations such as ``python examples/train_runner.py `` +keep working: this shim prepends ``run`` to argv before dispatching. """ from __future__ import annotations -import json -import logging import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) -from clawloop.train import MODE_LAYERS, TrainConfig, train - - -def main(): - if len(sys.argv) < 2: - print(f"Usage: {sys.argv[0]} ") - sys.exit(1) - - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s %(levelname)s %(name)s: %(message)s", - ) - - config_path = Path(sys.argv[1]) - raw = json.loads(config_path.read_text()) - config = TrainConfig(**raw) +from clawloop.cli import main as cli_main - logging.getLogger("clawloop").info( - "mode=%s env=%s layers=%s", - config.mode, - config.env_type, - MODE_LAYERS[config.mode], - ) - agent_state, state_id = train(config) - print(f"\nDone. Final state: {state_id.combined_hash[:12]}") +def main() -> None: + sys.argv.insert(1, "run") + cli_main() if __name__ == "__main__": diff --git a/tests/test_cli.py b/tests/test_cli.py index 739e014e..cdc7be3c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,9 +1,19 @@ -"""CLI smoke tests — ensure disabled subcommands emit a truthful redirect.""" +"""CLI smoke tests. + +`run` dispatches to ``train()``; `eval` is still disabled and emits a redirect. +""" from __future__ import annotations +import json import subprocess import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +CONFIGS_DIR = REPO_ROOT / "examples" / "configs" def _run_cli(*args: str) -> subprocess.CompletedProcess[str]: @@ -11,56 +21,88 @@ def _run_cli(*args: str) -> subprocess.CompletedProcess[str]: [sys.executable, "-m", "clawloop.cli", *args], capture_output=True, text=True, + cwd=str(REPO_ROOT), ) -def test_run_subcommand_prints_redirect_and_exits_nonzero(): - result = _run_cli("run") - assert result.returncode != 0 - combined = result.stdout + result.stderr - assert "train_runner.py" in combined - assert "clawloop demo" in combined +# --------------------------------------------------------------------------- +# eval (still disabled) +# --------------------------------------------------------------------------- -def test_run_subcommand_redirects_even_with_legacy_flags(): - result = _run_cli("run", "--bench", "entropic", "--iterations", "1") +def test_eval_subcommand_prints_redirect_and_exits_nonzero(): + result = _run_cli("eval") assert result.returncode != 0 combined = result.stdout + result.stderr - assert "train_runner.py" in combined + assert "clawloop run" in combined -def test_run_subcommand_redirects_with_global_verbose_flag(): - result = _run_cli("-v", "run", "--bench", "entropic") +def test_eval_redirect_ignores_legacy_flags_with_values(): + result = _run_cli("eval", "--bench", "entropic", "--config", "/tmp/nope.json") assert result.returncode != 0 combined = result.stdout + result.stderr - assert "train_runner.py" in combined + assert "clawloop run" in combined -def test_run_subcommand_redirects_on_help_flag(): - result = _run_cli("run", "--help") - assert result.returncode != 0 - combined = result.stdout + result.stderr - assert "train_runner.py" in combined +# --------------------------------------------------------------------------- +# demo (regression guard from PR #49) +# --------------------------------------------------------------------------- -def test_eval_subcommand_prints_redirect_and_exits_nonzero(): - result = _run_cli("eval") - assert result.returncode != 0 - combined = result.stdout + result.stderr - assert "train_runner.py" in combined +def test_demo_math_dry_run_still_works(): + result = _run_cli("demo", "math", "--dry-run", "--iterations", "1", "--episodes", "1") + assert result.returncode == 0, f"demo math failed: {result.stderr}" -def test_run_redirect_ignores_unknown_subparser_flags_with_values(): - # Regression guard for the class of failure flagged in review: a flag that - # *takes a value* must not cause the disabled redirect to miss. Since only - # the outer parser has globals today, we prove the intercept is robust by - # passing a value-taking flag to the run subparser. - result = _run_cli("run", "--config", "/tmp/does-not-exist.json") - assert result.returncode != 0 - combined = result.stdout + result.stderr - assert "train_runner.py" in combined +# --------------------------------------------------------------------------- +# run: every public config validates as TrainConfig (no `Unknown env_type`) +# --------------------------------------------------------------------------- -def test_demo_math_dry_run_still_works(): - result = _run_cli("demo", "math", "--dry-run", "--iterations", "1", "--episodes", "1") - assert result.returncode == 0, f"demo math failed: {result.stderr}" +@pytest.mark.parametrize( + "config_path", + sorted(CONFIGS_DIR.glob("*.json")), + ids=lambda p: p.name, +) +def test_public_configs_validate_as_trainconfig(config_path: Path): + """Every JSON under examples/configs/ must instantiate TrainConfig and + name a known env_type. Acceptance criterion from #50: no `Unknown + benchmark` failures from public configs.""" + from clawloop.train import ENV_BUILDERS, TrainConfig + + raw = json.loads(config_path.read_text()) + cfg = TrainConfig(**raw) # raises pydantic.ValidationError on schema drift + assert cfg.env_type in ENV_BUILDERS, ( + f"{config_path.name} uses env_type={cfg.env_type!r} " + f"which is not in ENV_BUILDERS ({sorted(ENV_BUILDERS)})" + ) + + +# --------------------------------------------------------------------------- +# run: math happy path with --dry-run +# --------------------------------------------------------------------------- + + +def test_run_math_harness_dry_run_smoke(tmp_path: Path): + """`clawloop run --dry-run` runs end-to-end with mocks.""" + raw = json.loads((CONFIGS_DIR / "math_harness.json").read_text()) + raw["n_iterations"] = 1 + raw["episodes_per_iter"] = 1 + cfg_path = tmp_path / "math_tiny.json" + cfg_path.write_text(json.dumps(raw)) + + result = _run_cli("run", str(cfg_path), "--dry-run") + assert result.returncode == 0, f"run math --dry-run failed: {result.stderr}" + + +# --------------------------------------------------------------------------- +# run: missing config path surfaces a real error (not a redirect) +# --------------------------------------------------------------------------- + + +def test_run_missing_config_errors(): + result = _run_cli("run", "/tmp/clawloop-does-not-exist.json") + assert result.returncode != 0 + # Should be a FileNotFoundError, not the old disabled-redirect text. + combined = result.stdout + result.stderr + assert "train_runner.py" not in combined