-
Notifications
You must be signed in to change notification settings - Fork 6
chore: reintroduce clawloop run as a TrainConfig wrapper
#60
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,28 +1,30 @@ | ||
| """ClawLoop CLI — entry points for demo and benchmark setup commands. | ||
|
|
||
| The legacy ``run`` and ``eval`` subcommands are disabled: they only wired a | ||
| subset of environments and drifted from the unified ``TrainConfig`` runner. | ||
| They remain in the parser so stale muscle memory gets a truthful redirect | ||
| instead of a misleading ``Unknown benchmark`` failure. | ||
| `clawloop run <config.json>` is a thin wrapper over the unified ``TrainConfig`` | ||
| runner: load JSON, validate via Pydantic, dispatch to ``train()``. The | ||
| ``--dry-run`` flag swaps real LLM clients for mocks so smoke tests work | ||
| without API keys. | ||
|
|
||
| `clawloop eval` is still disabled; legacy invocations get a truthful redirect | ||
| to ``clawloop run`` and ``clawloop demo math --dry-run``. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import argparse | ||
| import json | ||
| import logging | ||
| import sys | ||
| from pathlib import Path | ||
| from typing import Any | ||
|
|
||
| log = logging.getLogger("clawloop") | ||
|
|
||
| _DISABLED_MSG = ( | ||
| "`clawloop {cmd}` is temporarily disabled. Use one of:\n" | ||
| " - Real benchmark: uv run python examples/train_runner.py \\\n" | ||
| " examples/configs/entropic_harness.json\n" | ||
| _EVAL_DISABLED_MSG = ( | ||
| "`clawloop eval` is disabled. Use one of:\n" | ||
| " - Real benchmark: uv run clawloop run examples/configs/math_harness.json\n" | ||
| " - Other configs: examples/configs/ (math, harbor, entropic, openclaw, taubench)\n" | ||
| " - No-key demo: uv run clawloop demo math --dry-run\n" | ||
| "The config-driven runner covers every supported environment; " | ||
| "reintroduction of `{cmd}` as a thin wrapper is tracked upstream." | ||
| " - No-key demo: uv run clawloop demo math --dry-run" | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -34,11 +36,16 @@ def _build_parser() -> argparse.ArgumentParser: | |
| parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging") | ||
| sub = parser.add_subparsers(dest="command", required=True) | ||
|
|
||
| # Disabled subcommands. add_help=False so `run --help` hits the redirect | ||
| # rather than argparse's auto-generated help output. Any legacy flags land | ||
| # in `unknown` via parse_known_args() in main() and are ignored. | ||
| sub.add_parser("run", help="(disabled) use examples/train_runner.py", add_help=False) | ||
| sub.add_parser("eval", help="(disabled) use examples/train_runner.py", add_help=False) | ||
| run_p = sub.add_parser("run", help="Run a TrainConfig JSON via train()") | ||
| run_p.add_argument("config", type=Path, help="Path to TrainConfig JSON") | ||
| run_p.add_argument( | ||
| "--dry-run", | ||
| action="store_true", | ||
| help="Swap real LLM clients for mocks (no API calls)", | ||
| ) | ||
|
|
||
| # Eval stays disabled. add_help=False so `eval --help` hits the redirect. | ||
| sub.add_parser("eval", help="(disabled) use `clawloop run` instead", add_help=False) | ||
|
|
||
| setup_p = sub.add_parser("setup-bench", help="Install benchmark dependencies") | ||
| setup_p.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging") | ||
|
|
@@ -82,7 +89,6 @@ def _build_parser() -> argparse.ArgumentParser: | |
| def cmd_setup_bench(args: argparse.Namespace) -> None: | ||
| """Install benchmark external dependencies.""" | ||
| import subprocess | ||
| from pathlib import Path | ||
|
|
||
| bench = args.bench | ||
| if bench not in BENCH_SETUP: | ||
|
|
@@ -135,10 +141,56 @@ def cmd_demo(args: argparse.Namespace) -> None: | |
| sys.exit(1) | ||
|
|
||
|
|
||
| def cmd_run(args: argparse.Namespace) -> None: | ||
| """Load a TrainConfig JSON and dispatch to train().""" | ||
| from clawloop.train import MODE_LAYERS, TrainConfig, train | ||
|
|
||
| raw = json.loads(args.config.read_text()) | ||
| config = TrainConfig(**raw) # Pydantic ValidationError surfaces fail-fast | ||
|
|
||
| log.info( | ||
| "mode=%s env=%s layers=%s", | ||
| config.mode, | ||
| config.env_type, | ||
| MODE_LAYERS[config.mode], | ||
| ) | ||
|
|
||
| if args.dry_run: | ||
| _install_dry_run_clients(config) | ||
|
|
||
| train(config) | ||
|
|
||
|
|
||
| def _install_dry_run_clients(config: "Any") -> None: | ||
| """Patch ``clawloop.train._make_llm_client`` to return mock clients. | ||
|
|
||
| Identifies the role (reflector / task / other) by matching the cfg | ||
| object identity against ``config.llm_clients``. Falls back to a generic | ||
| ``MockLLMClient`` for any unknown role so unfamiliar envs still run. | ||
| """ | ||
| import clawloop.train as _train | ||
| from clawloop.demo_math import MockTaskClient, _build_mock_reflector_responses | ||
| from clawloop.llm import MockLLMClient | ||
|
|
||
| role_by_id = {id(v): k for k, v in config.llm_clients.items()} | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using |
||
| original = _train._make_llm_client | ||
|
|
||
| def _mock_make(cfg): | ||
| role = role_by_id.get(id(cfg)) | ||
| if role == "reflector": | ||
| return MockLLMClient(responses=_build_mock_reflector_responses()) | ||
| if role == "task": | ||
| return MockTaskClient() | ||
| return MockLLMClient(responses=["[]"]) | ||
|
|
||
| _train._make_llm_client = _mock_make | ||
| log.info("dry-run: LLM clients patched to mocks (original=%r)", original.__name__) | ||
|
|
||
|
|
||
| def main() -> None: | ||
| parser = _build_parser() | ||
| # Use parse_known_args so disabled subcommands can ignore legacy flags | ||
| # (`clawloop run --bench entropic`) and fall through to the redirect. | ||
| # parse_known_args lets the disabled `eval` subcommand swallow legacy flags | ||
| # (`clawloop eval --bench entropic`) and fall through to the redirect. | ||
| args, _unknown = parser.parse_known_args() | ||
|
|
||
| log_level = logging.DEBUG if getattr(args, "verbose", False) else logging.INFO | ||
|
|
@@ -148,13 +200,14 @@ def main() -> None: | |
| datefmt="%H:%M:%S", | ||
| ) | ||
|
|
||
| if args.command in {"run", "eval"}: | ||
| print(_DISABLED_MSG.format(cmd=args.command), file=sys.stderr) | ||
| if args.command == "eval": | ||
| print(_EVAL_DISABLED_MSG, file=sys.stderr) | ||
| sys.exit(2) | ||
|
|
||
| # For active subcommands, re-parse strictly so typos still error. | ||
| args = parser.parse_args() | ||
| handlers = { | ||
| "run": cmd_run, | ||
| "setup-bench": cmd_setup_bench, | ||
| "demo": cmd_demo, | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -169,6 +169,19 @@ def _build_openclaw(config: TrainConfig, llm_clients: dict[str, LLMClientConfig] | |||||
| return adapter, tasks | ||||||
|
|
||||||
|
|
||||||
| def _build_taubench(config: TrainConfig, llm_clients: dict[str, LLMClientConfig]) -> tuple: | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The return type hint
Suggested change
|
||||||
| from clawloop.environments.taubench import TauBenchAdapter | ||||||
|
|
||||||
| taubench_cfg = dict(config.env_config or {}) | ||||||
| adapter = TauBenchAdapter() | ||||||
| adapter.setup(taubench_cfg) | ||||||
| tasks = adapter.list_tasks(taubench_cfg.get("task_split", "test")) | ||||||
| num_tasks = taubench_cfg.get("num_tasks") | ||||||
| if num_tasks is not None: | ||||||
| tasks = tasks[: int(num_tasks)] | ||||||
| return adapter, tasks | ||||||
|
|
||||||
|
|
||||||
| # --------------------------------------------------------------------------- | ||||||
| # Environment registry — add new envs here | ||||||
| # --------------------------------------------------------------------------- | ||||||
|
|
@@ -233,6 +246,7 @@ def _build_openspiel(config: "TrainConfig", llm_clients: dict[str, "LLMClientCon | |||||
| "entropic": _build_entropic, | ||||||
| "openclaw": _build_openclaw, | ||||||
| "openspiel": _build_openspiel, | ||||||
| "taubench": _build_taubench, | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The References
|
||||||
| } | ||||||
|
|
||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,22 @@ | ||
| { | ||
| "_comment": "TauBench harness learning config. Set domain to retail or airline.", | ||
| "domain": "retail", | ||
| "llm_agent": "gemini/gemini-2.0-flash-lite", | ||
| "llm_user": "gemini/gemini-2.0-flash-lite", | ||
| "max_steps": 30, | ||
| "max_concurrency": 8, | ||
| "task_split": "test", | ||
| "num_tasks": 10 | ||
| "mode": "harness_learning", | ||
| "env_type": "taubench", | ||
| "system_prompt": "You are a customer service agent. Help the user according to the policy. Be accurate, efficient, and polite.", | ||
| "llm_clients": { | ||
| "reflector": { | ||
| "model": "anthropic/claude-sonnet-4-5-20250929", | ||
| "api_base": "" | ||
| } | ||
| }, | ||
| "env_config": { | ||
| "domain": "retail", | ||
| "llm_agent": "gemini/gemini-2.0-flash-lite", | ||
| "llm_user": "gemini/gemini-2.0-flash-lite", | ||
| "max_steps": 30, | ||
| "max_concurrency": 8, | ||
| "task_split": "test", | ||
| "num_tasks": 10 | ||
| }, | ||
| "n_iterations": 3, | ||
| "episodes_per_iter": 3 | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `_install_dry_run_clients` function monkey-patches `clawloop.train._make_llm_client` to provide mock clients. However, several environments, including the newly added `taubench` and the existing `entropic`, do not use `_make_llm_client` for their task LLMs. Instead, they manage LLM configuration via `env_config` and internal logic (or external libraries like `tau2`). This means that using `--dry-run` with these environments will not effectively mock the task execution, potentially leading to real API calls or failures if API keys are missing. Consider refactoring these environments to use the unified `_make_llm_client` helper or extending the dry-run mechanism to handle environment-specific mocking.