From d25415c19007bd3794a34da27b5d65103cc5d76e Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 09:41:31 -0700
Subject: [PATCH 1/5] AI integration

---
 README.md                      |  4 ++--
 pyproject.toml                 |  1 +
 src/metacoder/configuration.py |  2 +-
 src/metacoder/metacoder.py     |  2 +-
 uv.lock                        | 27 +++++++++++++++++++++++++++
 5 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b591c01..e4e6846 100644
--- a/README.md
+++ b/README.md
@@ -24,10 +24,10 @@ metacoder "Write a Python function to calculate fibonacci numbers" -c claude -w
 ...
 
 # With custom instructions
-metacoder "Refactor this code" -c claude --instructions coding_guidelines.md
+metacoder "Refactor this code" -c claude --instructions coding_guidelines.md -w my-repo
 ...
 
-# Using MCPs
+# Using MCPs (e.g. GitHub MCP)
 metacoder "Fix issue 1234" -w path/to/my-repo --mcp-collection github_mcps.yaml
 ...
 
diff --git a/pyproject.toml b/pyproject.toml
index 5a09dfa..020908e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dev = [
     "mkdocstrings-python>=1.14.0",
     "mypy>=1.17.1",
     "pytest>=8.4.1",
+    "ruff>=0.12.8",
     "types-click>=7.1.8",
     "types-pyyaml>=6.0.12.20250516",
 ]
diff --git a/src/metacoder/configuration.py b/src/metacoder/configuration.py
index 03cefca..97222ae 100644
--- a/src/metacoder/configuration.py
+++ b/src/metacoder/configuration.py
@@ -54,7 +54,7 @@ class AIModelProvider(BaseModel):
     name: str = Field(..., description="Name of the model provider")
     api_key: str | None = Field(None, description="API key for the model provider")
     metadata: dict[str, Any] = Field({}, description="Metadata for the model provider")
-    base_url: str | None = Field(None, description="Base URL for the model provider")
+    base_url: str | None = Field(None, description="Base URL for the model provider, e.g. 
https://api.cborg.lbl.gov ") class AIModelConfig(BaseModel): diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index a74035a..cf40da2 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -239,7 +239,7 @@ def cli(ctx): "--provider", "-p", type=str, help="AI provider (e.g., openai, anthropic, google)" ) @click.option( - "--model", type=str, help="AI model name (e.g., gpt-4, claude-3-opus, gemini-pro)" + "--model", type=str, help="AI model name (e.g., gpt-4o, claude-4-sonnet, gemini-2.5pro)" ) @click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging") @click.option("--quiet", "-q", is_flag=True, help="Quiet mode") diff --git a/uv.lock b/uv.lock index 875df9f..b68d2f3 100644 --- a/uv.lock +++ b/uv.lock @@ -1515,6 +1515,7 @@ dev = [ { name = "mkdocstrings-python" }, { name = "mypy" }, { name = "pytest" }, + { name = "ruff" }, { name = "types-click" }, { name = "types-pyyaml" }, ] @@ -1539,6 +1540,7 @@ dev = [ { name = "mkdocstrings-python", specifier = ">=1.14.0" }, { name = "mypy", specifier = ">=1.17.1" }, { name = "pytest", specifier = ">=8.4.1" }, + { name = "ruff", specifier = ">=0.12.8" }, { name = "types-click", specifier = ">=7.1.8" }, { name = "types-pyyaml", specifier = ">=6.0.12.20250516" }, ] @@ -3271,6 +3273,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696 }, ] +[[package]] +name = "ruff" +version = "0.12.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4b/da/5bd7565be729e86e1442dad2c9a364ceeff82227c2dece7c29697a9795eb/ruff-0.12.8.tar.gz", hash = "sha256:4cb3a45525176e1009b2b64126acf5f9444ea59066262791febf55e40493a033", size = 5242373 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/1e/c843bfa8ad1114fab3eb2b78235dda76acd66384c663a4e0415ecc13aa1e/ruff-0.12.8-py3-none-linux_armv6l.whl", hash = "sha256:63cb5a5e933fc913e5823a0dfdc3c99add73f52d139d6cd5cc8639d0e0465513", size = 11675315 }, + { url = "https://files.pythonhosted.org/packages/24/ee/af6e5c2a8ca3a81676d5480a1025494fd104b8896266502bb4de2a0e8388/ruff-0.12.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:9a9bbe28f9f551accf84a24c366c1aa8774d6748438b47174f8e8565ab9dedbc", size = 12456653 }, + { url = "https://files.pythonhosted.org/packages/99/9d/e91f84dfe3866fa648c10512904991ecc326fd0b66578b324ee6ecb8f725/ruff-0.12.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2fae54e752a3150f7ee0e09bce2e133caf10ce9d971510a9b925392dc98d2fec", size = 11659690 }, + { url = "https://files.pythonhosted.org/packages/fe/ac/a363d25ec53040408ebdd4efcee929d48547665858ede0505d1d8041b2e5/ruff-0.12.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0acbcf01206df963d9331b5838fb31f3b44fa979ee7fa368b9b9057d89f4a53", size = 11896923 }, + { url = "https://files.pythonhosted.org/packages/58/9f/ea356cd87c395f6ade9bb81365bd909ff60860975ca1bc39f0e59de3da37/ruff-0.12.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ae3e7504666ad4c62f9ac8eedb52a93f9ebdeb34742b8b71cd3cccd24912719f", size = 11477612 }, + { url = "https://files.pythonhosted.org/packages/1a/46/92e8fa3c9dcfd49175225c09053916cb97bb7204f9f899c2f2baca69e450/ruff-0.12.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb82efb5d35d07497813a1c5647867390a7d83304562607f3579602fa3d7d46f", size 
= 13182745 }, + { url = "https://files.pythonhosted.org/packages/5e/c4/f2176a310f26e6160deaf661ef60db6c3bb62b7a35e57ae28f27a09a7d63/ruff-0.12.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:dbea798fc0065ad0b84a2947b0aff4233f0cb30f226f00a2c5850ca4393de609", size = 14206885 }, + { url = "https://files.pythonhosted.org/packages/87/9d/98e162f3eeeb6689acbedbae5050b4b3220754554526c50c292b611d3a63/ruff-0.12.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:49ebcaccc2bdad86fd51b7864e3d808aad404aab8df33d469b6e65584656263a", size = 13639381 }, + { url = "https://files.pythonhosted.org/packages/81/4e/1b7478b072fcde5161b48f64774d6edd59d6d198e4ba8918d9f4702b8043/ruff-0.12.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ac9c570634b98c71c88cb17badd90f13fc076a472ba6ef1d113d8ed3df109fb", size = 12613271 }, + { url = "https://files.pythonhosted.org/packages/e8/67/0c3c9179a3ad19791ef1b8f7138aa27d4578c78700551c60d9260b2c660d/ruff-0.12.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:560e0cd641e45591a3e42cb50ef61ce07162b9c233786663fdce2d8557d99818", size = 12847783 }, + { url = "https://files.pythonhosted.org/packages/4e/2a/0b6ac3dd045acf8aa229b12c9c17bb35508191b71a14904baf99573a21bd/ruff-0.12.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:71c83121512e7743fba5a8848c261dcc454cafb3ef2934a43f1b7a4eb5a447ea", size = 11702672 }, + { url = "https://files.pythonhosted.org/packages/9d/ee/f9fdc9f341b0430110de8b39a6ee5fa68c5706dc7c0aa940817947d6937e/ruff-0.12.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:de4429ef2ba091ecddedd300f4c3f24bca875d3d8b23340728c3cb0da81072c3", size = 11440626 }, + { url = "https://files.pythonhosted.org/packages/89/fb/b3aa2d482d05f44e4d197d1de5e3863feb13067b22c571b9561085c999dc/ruff-0.12.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a2cab5f60d5b65b50fba39a8950c8746df1627d54ba1197f970763917184b161", size = 12462162 }, + { url = "https://files.pythonhosted.org/packages/18/9f/5c5d93e1d00d854d5013c96e1a92c33b703a0332707a7cdbd0a4880a84fb/ruff-0.12.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:45c32487e14f60b88aad6be9fd5da5093dbefb0e3e1224131cb1d441d7cb7d46", size = 12913212 }, + { url = "https://files.pythonhosted.org/packages/71/13/ab9120add1c0e4604c71bfc2e4ef7d63bebece0cfe617013da289539cef8/ruff-0.12.8-py3-none-win32.whl", hash = "sha256:daf3475060a617fd5bc80638aeaf2f5937f10af3ec44464e280a9d2218e720d3", size = 11694382 }, + { url = "https://files.pythonhosted.org/packages/f6/dc/a2873b7c5001c62f46266685863bee2888caf469d1edac84bf3242074be2/ruff-0.12.8-py3-none-win_amd64.whl", hash = "sha256:7209531f1a1fcfbe8e46bcd7ab30e2f43604d8ba1c49029bb420b103d0b5f76e", size = 12740482 }, + { url = "https://files.pythonhosted.org/packages/cb/5c/799a1efb8b5abab56e8a9f2a0b72d12bd64bb55815e9476c7d0a2887d2f7/ruff-0.12.8-py3-none-win_arm64.whl", hash = "sha256:c90e1a334683ce41b0e7a04f41790c429bf5073b62c1ae701c9dc5b3d14f0749", size = 11884718 }, +] + [[package]] name = "sentry-sdk" version = "2.34.1" From 690a65e6b5cf376a86f78d305fe6cc32d4fc84cd Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Fri, 15 Aug 2025 17:32:58 -0700 Subject: [PATCH 2/5] ruff-ruff --- src/metacoder/coders/base_coder.py | 44 +++-- src/metacoder/coders/claude.py | 40 +++-- src/metacoder/coders/codex.py | 1 - src/metacoder/coders/dummy.py | 88 ++++++---- src/metacoder/coders/gemini.py | 20 +-- src/metacoder/coders/goose.py | 52 ++++-- src/metacoder/configuration.py | 5 +- src/metacoder/evals/runner.py | 1 - 
 src/metacoder/metacoder.py                   | 150 +++++++++-------
 tests/test_claude_tool_use.py                |  83 ++++-----
 tests/test_coders/test_coder_availability.py |  12 +-
 tests/test_coders/test_coders_basic.py       |  24 +--
 tests/test_dummy_coder_tool_capture.py       |  57 ++++---
 tests/test_dummy_tool_use.py                 |  27 +--
 tests/test_evals/test_deep_eval.py           |   2 -
 tests/test_goose_tool_use.py                 | 170 ++++++++++---------
 tests/test_instructions_option.py            |  16 +-
 tests/test_introspect_mcp.py                 |  23 +--
 tests/test_mcps/test_gemini_mcp.py           |  44 +++--
 tests/test_registry_loading.py               |  82 +++++----
 20 files changed, 521 insertions(+), 420 deletions(-)

diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py
index a8be80b..b44c6ec 100644
--- a/src/metacoder/coders/base_coder.py
+++ b/src/metacoder/coders/base_coder.py
@@ -21,10 +21,15 @@
 class ToolUse(BaseModel):
     """Tool use from the coder."""
 
-    name: str = Field(..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext")
+
+    name: str = Field(
+        ..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext"
+    )
     arguments: dict[str, Any] = Field(..., description="Arguments to the tool")
     success: bool = Field(..., description="Whether the tool call was successful")
-    error: str | None = Field(default=None, description="Error message if the tool call failed")
+    error: str | None = Field(
+        default=None, description="Error message if the tool call failed"
+    )
     result: Any = Field(..., description="Result of the tool")
 
 
@@ -87,6 +92,7 @@ class BaseCoder(BaseModel, ABC):
     Subclasses should implement the following methods:
     - run(self, input_text: str) -> CoderOutput: Run the coder on the input text
     """
+
     workdir: str = Field(default="workdir", description="Working dir ")
     config: CoderConfig | None = Field(default=None, description="Config for the coder")
     params: dict | None = Field(default=None, description="Parameters for the coder")
@@ -115,8 +121,6 @@ def validate_mcp_support(self):
             )
         return self
-
-
 
     @abstractmethod
     def run(self, input_text: str) -> CoderOutput:
         """Run the coder on the input text.
@@ -129,7 +133,6 @@ def run(self, input_text: str) -> CoderOutput:
         """
         raise NotImplementedError
-
 
     @classmethod
     def default_config_paths(cls) -> dict[Path, ConfigFileRole]:
         """Return config files as a dictionary of filename/dirname to role."""
@@ -220,7 +223,6 @@ def stream_output(pipe, output_lines, stream):
 
         return CoderOutput(stdout=stdout_text, stderr=stderr_text)
-
 
     def expand_env(self, env: dict[str, str] | None = None) -> dict[str, str]:
         """
         Expand environment variables in the coder config.
@@ -257,7 +259,7 @@ def expand_env(self, env: dict[str, str] | None = None) -> dict[str, str]:
 
     def expand_prompt(self, input_text: str) -> str:
         """Expand environment variables in the prompt.
-        
+
         Typically this just returns the prompt as is:
 
         Example:
@@ -278,7 +280,7 @@ def expand_prompt(self, input_text: str) -> str:
     def default_config_objects(self) -> list[CoderConfigObject]:
         """Default config objects for the coder."""
         raise NotImplementedError("default_config_objects is not implemented")
-    
+
     def set_instructions(self, instructions: str):
         """Set the instructions for the coder.
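
[For reference, ToolUse — reformatted above — is the record that the Claude and Goose tool-call parsers later in this series populate, and that the tests assert against. A minimal self-contained sketch, assuming pydantic v2 and exactly the fields shown in this hunk; the example values are illustrative, echoing the dummy-coder tests further down:

from typing import Any

from pydantic import BaseModel, Field


class ToolUse(BaseModel):
    """Tool use from the coder."""

    name: str = Field(..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext")
    arguments: dict[str, Any] = Field(..., description="Arguments to the tool")
    success: bool = Field(..., description="Whether the tool call was successful")
    error: str | None = Field(default=None, description="Error message if the tool call failed")
    result: Any = Field(..., description="Result of the tool")


# Illustrative values (mirroring the dummy coder tests below):
call = ToolUse(
    name="mcp__pubmed__search_papers",
    arguments={"query": "test query", "limit": 10},
    success=True,
    result={"papers": ["paper1", "paper2"], "count": 2},
)
assert call.model_dump()["error"] is None  # error defaults to None

This is why the parsers below keep a pending_tool_uses map: a ToolUse can only be built once both the request (name/arguments) and the result (success/error/result) have been seen.]
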
@@ -291,7 +293,7 @@ def set_instructions(self, instructions: str):
         >>> coder.set_instructions("you are an awesome coder")
         >>> coder.config_objects
         [CoderConfigObject(file_type=<FileType.TEXT: 'text'>, relative_path='CLAUDE.md', content='you are an awesome coder')]
-        
+
         Args:
             instructions: The instructions to set
         """
@@ -300,16 +302,25 @@ def set_instructions(self, instructions: str):
             if not self.config_objects:
                 self.config_objects = []
             for obj in self.config_objects:
-                if obj.relative_path == str(path) or obj.relative_path == str(path.name):
+                if obj.relative_path == str(path) or obj.relative_path == str(
+                    path.name
+                ):
                     obj.content = instructions
                     return
             else:
-                self.config_objects.append(CoderConfigObject(relative_path=str(path), content=instructions, file_type=FileType.TEXT))
+                self.config_objects.append(
+                    CoderConfigObject(
+                        relative_path=str(path),
+                        content=instructions,
+                        file_type=FileType.TEXT,
+                    )
+                )
                 return
         else:
             raise ValueError(f"Cannot set instructions for {typ}")
-        raise ValueError(f"No primary instruction file found for {self.__class__.__name__}")
-
+        raise ValueError(
+            f"No primary instruction file found for {self.__class__.__name__}"
+        )
 
     def prepare_workdir(self):
         """Prepare the workdir for the coder.
@@ -330,11 +341,7 @@ def prepare_workdir(self):
         # Check if MCP extensions are configured but not supported
         if self.config and self.config.extensions:
             logger.debug(f"šŸ”§ Checking MCP extensions: {self.config.extensions}")
-            mcp_extensions = [
-                ext
-                for ext in self.config.extensions
-                if ext.enabled
-            ]
+            mcp_extensions = [ext for ext in self.config.extensions if ext.enabled]
             if mcp_extensions and not self.supports_mcp():
                 raise ValueError(
                     f"MCP extensions are configured but {self.__class__.__name__} does not support MCP. "
@@ -353,6 +360,7 @@ def prepare_workdir(self):
                     logger.debug(f"  šŸ—‘ļø Removing old config object: {path}")
                     if path.is_dir():
                         import shutil
+
                         shutil.rmtree(path)
                     else:
                         path.unlink()
diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
index 1c67c20..cf1af7c 100644
--- a/src/metacoder/coders/claude.py
+++ b/src/metacoder/coders/claude.py
@@ -147,6 +147,7 @@ def run(self, input_text: str) -> CoderOutput:
         # time the command
         start_time = time.time()
         ao = self.run_process(command, env)
+
         # parse the jsonl output
         def parse_jsonl_line(text: str) -> dict[str, Any]:
             try:
@@ -154,17 +155,20 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 return result
             except json.JSONDecodeError:
                 return {"original": text, "error": "JSONDecodeError"}
+
         ao.structured_messages = [
             parse_jsonl_line(line) for line in ao.stdout.split("\n") if line
         ]
-        ao.structured_messages = [m for m in ao.structured_messages if m is not None]
+        ao.structured_messages = [
+            m for m in ao.structured_messages if m is not None
+        ]
         total_cost_usd = None
         is_error = None
-        
+
         # Extract tool uses
         tool_uses = []
         pending_tool_uses = {}  # Map tool_use_id to tool data
-        
+
         for message in ao.structured_messages:
             if "total_cost_usd" in message:
                 total_cost_usd = message["total_cost_usd"]
@@ -172,7 +176,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 is_error = message["is_error"]
             if "result" in message:
                 ao.result_text = message["result"]
-            
+
             # Check for tool_use in assistant messages
             if message.get("type") == "assistant" and message.get("message"):
                 msg_content = message["message"].get("content", [])
@@ -182,16 +186,16 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                         tool_id = content_item.get("id")
                         tool_name = content_item.get("name", "")
                         tool_input = content_item.get("input", {})
-                        
+
                         # Store 
pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_input, "success": False, # Default to False until we see result "error": None, - "result": None + "result": None, } - + # Check for tool_result in user messages elif message.get("type") == "user" and message.get("message"): msg_content = message["message"].get("content", []) @@ -201,31 +205,35 @@ def parse_jsonl_line(text: str) -> dict[str, Any]: tool_id = content_item.get("tool_use_id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] - + # Update with result is_tool_error = content_item.get("is_error", False) tool_data["success"] = not is_tool_error - tool_data["result"] = content_item.get("content", "") - + tool_data["result"] = content_item.get( + "content", "" + ) + if is_tool_error: - tool_data["error"] = content_item.get("content", "Tool error occurred") - + tool_data["error"] = content_item.get( + "content", "Tool error occurred" + ) + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Add any remaining pending tool uses (shouldn't happen in normal flow) for tool_data in pending_tool_uses.values(): tool_data["error"] = "No result received for tool call" tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + if tool_uses: ao.tool_uses = tool_uses - + end_time = time.time() logger.info(f"šŸ¤– Command took {end_time - start_time} seconds") ao.total_cost_usd = total_cost_usd diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 2f29483..8e9169e 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -26,7 +26,6 @@ def is_available(cls) -> bool: """Check if codex command is available.""" return shutil.which("codex") is not None - @property def instructions_path(self) -> Path: return Path("AGENTS.md") diff --git a/src/metacoder/coders/dummy.py b/src/metacoder/coders/dummy.py index bb93159..d55378d 100644 --- a/src/metacoder/coders/dummy.py +++ b/src/metacoder/coders/dummy.py @@ -1,17 +1,22 @@ from pathlib import Path -from metacoder.coders.base_coder import BaseCoder, CoderConfigObject, CoderOutput, ToolUse +from metacoder.coders.base_coder import ( + BaseCoder, + CoderConfigObject, + CoderOutput, + ToolUse, +) from metacoder.configuration import ConfigFileRole class DummyCoder(BaseCoder): """ Dummy coder for testing. - + Simulates tool use when input contains keywords: - "tool" or "mcp": Adds a generic test tool - "search" or "pubmed": Simulates a PubMed search tool - "error": Simulates a tool failure - + Multiple keywords can trigger multiple tools. 
""" @@ -34,58 +39,71 @@ def run(self, input_text: str) -> CoderOutput: instructions_content = None if self.config_objects: for obj in self.config_objects: - if obj.relative_path == "INSTRUCTIONS.md" or obj.relative_path == str(Path("INSTRUCTIONS.md")): + if obj.relative_path == "INSTRUCTIONS.md" or obj.relative_path == str( + Path("INSTRUCTIONS.md") + ): instructions_content = obj.content break - + # Create response based on whether instructions exist if instructions_content: - response = f"Instructions loaded: {instructions_content}\nProcessing: {input_text}" + response = ( + f"Instructions loaded: {instructions_content}\nProcessing: {input_text}" + ) else: response = f"you said: {input_text}" - + output = CoderOutput( stdout=response, stderr="", result_text=response, ) - + # Add fake tool uses if input mentions tools, MCP, or specific services - if any(keyword in input_text.lower() for keyword in ["tool", "mcp", "pubmed", "search"]): + if any( + keyword in input_text.lower() + for keyword in ["tool", "mcp", "pubmed", "search"] + ): # Create some fake tool uses for testing tool_uses = [] - + # Simulate a successful tool call if "search" in input_text.lower() or "pubmed" in input_text.lower(): - tool_uses.append(ToolUse( - name="mcp__pubmed__search_papers", - arguments={"query": "test query", "limit": 10}, - success=True, - error=None, - result={"papers": ["paper1", "paper2"], "count": 2} - )) - + tool_uses.append( + ToolUse( + name="mcp__pubmed__search_papers", + arguments={"query": "test query", "limit": 10}, + success=True, + error=None, + result={"papers": ["paper1", "paper2"], "count": 2}, + ) + ) + # Simulate a tool with an error if "error" in input_text.lower(): - tool_uses.append(ToolUse( - name="mcp__test__failing_tool", - arguments={"param": "value"}, - success=False, - error="Simulated tool error for testing", - result=None - )) - + tool_uses.append( + ToolUse( + name="mcp__test__failing_tool", + arguments={"param": "value"}, + success=False, + error="Simulated tool error for testing", + result=None, + ) + ) + # Default tool if no specific keywords but general tool/mcp mentioned if not tool_uses: - tool_uses.append(ToolUse( - name="mcp__dummy__test_tool", - arguments={"input": input_text}, - success=True, - error=None, - result="Test tool executed successfully" - )) - + tool_uses.append( + ToolUse( + name="mcp__dummy__test_tool", + arguments={"input": input_text}, + success=True, + error=None, + result="Test tool executed successfully", + ) + ) + if tool_uses: output.tool_uses = tool_uses - + return output diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index f81a5c8..20564a9 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -32,7 +32,7 @@ class GeminiCoder(BaseCoder): - `.gemini/commands/` - Custom commands directory MCP Support: - + Gemini CLI supports MCP (Model Context Protocol) servers through the mcpServers configuration in .gemini/settings.json. When MCPs are configured through Metacoder, they will be automatically added to the settings file. 
@@ -86,29 +86,27 @@ def mcp_config_to_gemini_format(self, mcp: MCPConfig) -> dict[str, Any]: # For HTTP type MCPs elif mcp.type == MCPType.HTTP: - raise NotImplementedError( - "HTTP MCPs are not supported for Gemini CLI yet" - ) + raise NotImplementedError("HTTP MCPs are not supported for Gemini CLI yet") return server_config def default_config_objects(self) -> list[CoderConfigObject]: """Generate config objects including MCP configuration.""" config_objects = [] - + # Create .gemini/settings.json if we have MCP extensions settings_content: dict[str, Any] = {} - + # Add MCP servers configuration if extensions are present if self.config and self.config.extensions: mcp_servers = {} for mcp in self.config.extensions: if mcp.enabled: mcp_servers[mcp.name] = self.mcp_config_to_gemini_format(mcp) - + if mcp_servers: settings_content["mcpServers"] = mcp_servers - + # Add settings.json if we have content to write if settings_content: config_objects.append( @@ -118,10 +116,10 @@ def default_config_objects(self) -> list[CoderConfigObject]: content=settings_content, ) ) - + # Add GEMINI.md if present in config # This could contain instructions specific to the task - + return config_objects def run(self, input_text: str) -> CoderOutput: @@ -136,7 +134,7 @@ def run(self, input_text: str) -> CoderOutput: env["HOME"] = "." text = self.expand_prompt(input_text) - + # Build the command # The gemini CLI uses conversational interface, so we need to handle it differently # For now, we'll use echo to pipe the prompt diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 9b76f4b..514dc2b 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -40,7 +40,6 @@ def supports_mcp(cls) -> bool: """GooseCoder supports MCP extensions.""" return True - def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict: """Convert an MCPConfig to Goose extension format.""" extension = { @@ -69,7 +68,7 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict: extension["bundled"] = None return extension - + @classmethod def default_config_paths(cls) -> dict[Path, ConfigFileRole]: return { @@ -196,38 +195,44 @@ def run(self, input_text: str) -> CoderOutput: if ao.structured_messages: tool_uses = [] pending_tool_uses = {} # Map tool request id to tool data - + for message in ao.structured_messages: # Check for tool requests in assistant messages if message.get("role") == "assistant" and "content" in message: for content in message.get("content", []): - if isinstance(content, dict) and content.get("type") == "toolRequest": + if ( + isinstance(content, dict) + and content.get("type") == "toolRequest" + ): tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) tool_name = tool_value.get("name", "") tool_args = tool_value.get("arguments", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_args, "success": False, # Default until we see result "error": None, - "result": None + "result": None, } - + # Check for tool responses in user messages elif message.get("role") == "user" and "content" in message: for content in message.get("content", []): - if isinstance(content, dict) and content.get("type") == "toolResponse": + if ( + isinstance(content, dict) + and content.get("type") == "toolResponse" + ): tool_id = content.get("id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = 
content.get("toolResult", {}) - + # Update with result if tool_result.get("status") == "success": tool_data["success"] = True @@ -236,29 +241,40 @@ def run(self, input_text: str) -> CoderOutput: if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": - result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): + result_texts.append( + item.get("text", "") + ) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) else: tool_data["success"] = False - tool_data["error"] = tool_result.get("error", "Tool execution failed") + tool_data["error"] = tool_result.get( + "error", "Tool execution failed" + ) tool_data["result"] = None - + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Add any remaining pending tool uses (shouldn't happen in normal flow) for tool_data in pending_tool_uses.values(): tool_data["error"] = "No result received for tool call" tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + if tool_uses: ao.tool_uses = tool_uses diff --git a/src/metacoder/configuration.py b/src/metacoder/configuration.py index 97222ae..5110fe8 100644 --- a/src/metacoder/configuration.py +++ b/src/metacoder/configuration.py @@ -54,7 +54,10 @@ class AIModelProvider(BaseModel): name: str = Field(..., description="Name of the model provider") api_key: str | None = Field(None, description="API key for the model provider") metadata: dict[str, Any] = Field({}, description="Metadata for the model provider") - base_url: str | None = Field(None, description="Base URL for the model provider, e.g. https://api.cborg.lbl.gov ") + base_url: str | None = Field( + None, + description="Base URL for the model provider, e.g. https://api.cborg.lbl.gov ", + ) class AIModelConfig(BaseModel): diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index a12658a..d2ed34b 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -103,7 +103,6 @@ def create_coder(coder_name: str, workdir: str, config=None) -> BaseCoder: return coder - class EvalResult(BaseModel): """Result of a single evaluation.""" diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index cf40da2..f62d3df 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -51,16 +51,16 @@ def load_mcp_collection(collection_path: Path) -> MCPCollectionConfig: def load_mcp_registry(registry_path: str) -> MCPCollectionConfig: """Load MCPs from the registry based on a path pattern. 
- + Args: registry_path: Path pattern like 'metacoder' (all) or 'metacoder.basics' - + Returns: MCPCollectionConfig containing all matched MCPs """ # Base directory for registry registry_base = Path(__file__).parent / "mcps" / "registry" - + # Convert dot notation to file path if registry_path == "metacoder": # Load all yaml files in registry @@ -68,21 +68,21 @@ def load_mcp_registry(registry_path: str) -> MCPCollectionConfig: else: # Convert metacoder.basics to basics.yaml if registry_path.startswith("metacoder."): - registry_path = registry_path[len("metacoder."):] + registry_path = registry_path[len("metacoder.") :] yaml_files = [registry_base / f"{registry_path}.yaml"] - + # Collect all MCPs all_mcps = [] for yaml_file in yaml_files: if not yaml_file.exists(): raise click.ClickException(f"Registry file not found: {yaml_file}") - + try: with open(yaml_file, "r") as f: data = yaml.safe_load(f) except yaml.YAMLError as e: raise click.ClickException(f"Invalid YAML in {yaml_file}: {e}") - + # The registry files contain a list of MCP extensions directly if isinstance(data, list): for mcp_data in data: @@ -99,7 +99,7 @@ def load_mcp_registry(registry_path: str) -> MCPCollectionConfig: logger.warning(f"Invalid MCP in {yaml_file}: {e}") for mcp in all_mcps: mcp.enabled = False - + # Create a collection config collection_name = f"Registry: {registry_path}" return MCPCollectionConfig(name=collection_name, description=None, servers=all_mcps) @@ -239,7 +239,9 @@ def cli(ctx): "--provider", "-p", type=str, help="AI provider (e.g., openai, anthropic, google)" ) @click.option( - "--model", type=str, help="AI model name (e.g., gpt-4o, claude-4-sonnet, gemini-2.5pro)" + "--model", + type=str, + help="AI model name (e.g., gpt-4o, claude-4-sonnet, gemini-2.5pro)", ) @click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging") @click.option("--quiet", "-q", is_flag=True, help="Quiet mode") @@ -323,7 +325,7 @@ def run( raise click.ClickException("Cannot use both verbose and quiet mode") if verbose: logging.basicConfig(level=logging.DEBUG) - elif quiet: # quiet mode is a bit different, it's just no output + elif quiet: # quiet mode is a bit different, it's just no output logging.basicConfig(level=logging.WARNING) else: logging.basicConfig(level=logging.INFO) @@ -358,26 +360,29 @@ def run( click.echo( f" Enabling MCPs: {', '.join(enabled_list)} (all enabled by default)" ) - + # Load MCPs from registry if provided if registry: click.echo(f"šŸ“š Loading MCPs from registry: {registry}") registry_config = load_mcp_registry(registry) - + # Merge with existing MCP collection if any if mcp_collection_config: # Merge the servers lists for mcp in registry_config.servers: # Avoid duplicates by name - if not any(existing.name == mcp.name for existing in mcp_collection_config.servers): + if not any( + existing.name == mcp.name + for existing in mcp_collection_config.servers + ): mcp_collection_config.servers.append(mcp) else: mcp_collection_config = registry_config - + # Show available MCPs from registry registry_mcps = [mcp.name for mcp in registry_config.servers] click.echo(f" Registry MCPs: {', '.join(registry_mcps)}") - + # Note that registry MCPs are not enabled by default if not enable_mcp: click.echo(" Use -e/--enable-mcp to enable specific MCPs") @@ -421,7 +426,7 @@ def run( ) if coder_config and coder_config.extensions: - for mcp in coder_config.extensions : + for mcp in coder_config.extensions: # use emoji to indicate enabled/disabled if mcp.enabled: click.echo(f" āœ… MCP: {mcp.name}") @@ 
-476,7 +481,9 @@ def run( click.echo("\nšŸ“‹ Tool uses:") for tool_use in result.tool_uses: success = "āœ…" if tool_use.success else "āŒ" - click.echo(f" {success} {tool_use.name} with arguments: {tool_use.arguments}") + click.echo( + f" {success} {tool_use.name} with arguments: {tool_use.arguments}" + ) if tool_use.error: click.echo(f" Error: {tool_use.error}") @@ -485,7 +492,7 @@ def run( f"\nšŸ“‹ Structured messages ({len(result.structured_messages)} total)" ) for i, msg in enumerate(result.structured_messages): - click.echo(f" {i+1}. {msg}") + click.echo(f" {i + 1}. {msg}") @cli.command("list-coders") @@ -588,10 +595,10 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: click.echo("\nšŸ“ˆ Summary:") click.echo(f" Total: {summary['total_evaluations']}") click.echo( - f" Passed: {summary['passed']} ({summary['passed']/summary['total_evaluations']*100:.1f}%)" + f" Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)" ) click.echo( - f" Failed: {summary['failed']} ({summary['failed']/summary['total_evaluations']*100:.1f}%)" + f" Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)" ) if summary["errors"] > 0: click.echo(f" Errors: {summary['errors']} āš ļø") @@ -640,22 +647,22 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose: bool): """ Introspect an MCP server to list its available tools, resources, and prompts. - + MCP_SPEC can be: - A URL (http://localhost:8080) - A command (uvx mcp-server-fetch) - An MCP name when used with --registry - + Examples: - + \b # Introspect a running MCP server metacoder introspect-mcp http://localhost:8080 - + \b # Introspect an MCP from registry metacoder introspect-mcp fetch --registry metacoder.basics - + \b # Introspect a command-based MCP metacoder introspect-mcp "uvx mcp-server-fetch" @@ -665,18 +672,24 @@ def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) - + # Run the introspection with proper cleanup import os import sys - + # Suppress the specific asyncio warning by running with -W flag env = os.environ.copy() - env['PYTHONWARNINGS'] = 'ignore::RuntimeWarning:asyncio.base_subprocess' - + env["PYTHONWARNINGS"] = "ignore::RuntimeWarning:asyncio.base_subprocess" + # Run in a subprocess to isolate the asyncio event loop import subprocess - args = [sys.executable, "-W", "ignore::RuntimeWarning:asyncio.base_subprocess", "-c", f""" + + args = [ + sys.executable, + "-W", + "ignore::RuntimeWarning:asyncio.base_subprocess", + "-c", + f""" import asyncio import sys sys.path.insert(0, {repr(str(Path(__file__).parent.parent))}) @@ -688,26 +701,26 @@ def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose except Exception as e: print(f"Error: {{e}}", file=sys.stderr) sys.exit(1) -"""] - +""", + ] + try: # Run with stderr captured to filter out asyncio warnings result = subprocess.run( - args, - env=env, - timeout=timeout + 5, - stderr=subprocess.PIPE, - text=True + args, env=env, timeout=timeout + 5, stderr=subprocess.PIPE, text=True ) - + # Filter out the specific asyncio warning from stderr if result.stderr: error_lines = [] skip_next = 0 lines = result.stderr.splitlines() - + for i, line in enumerate(lines): - if "Exception ignored in: 0: @@ -717,12 +730,12 @@ def introspect_mcp(mcp_spec: str, 
registry: Optional[str], timeout: int, verbose skip_next = 0 # Stop skipping after this line else: error_lines.append(line) - + # Print any remaining stderr if error_lines: for line in error_lines: click.echo(line, err=True) - + if result.returncode != 0: raise click.ClickException("Failed to introspect MCP server") except subprocess.TimeoutExpired: @@ -736,48 +749,50 @@ async def _introspect_mcp_async( ): """Async implementation of MCP introspection.""" from fastmcp import Client - + mcp_config = None spec_to_use: Union[str, list[str]] = mcp_spec - + # If registry is specified, load the MCP config if registry: click.echo(f"šŸ“š Loading MCP '{mcp_spec}' from registry: {registry}") registry_config = load_mcp_registry(registry) - + # Find the MCP in the registry mcp_config = None for mcp in registry_config.servers: if mcp.name == mcp_spec: mcp_config = mcp break - + if not mcp_config: available = [mcp.name for mcp in registry_config.servers] raise click.ClickException( f"MCP '{mcp_spec}' not found in registry. Available: {', '.join(available)}" ) - + # Build the command from MCP config if mcp_config.command and mcp_config.args: spec_to_use = [mcp_config.command] + mcp_config.args else: - raise click.ClickException(f"MCP '{mcp_spec}' has incomplete command configuration") - + raise click.ClickException( + f"MCP '{mcp_spec}' has incomplete command configuration" + ) + click.echo(f"šŸ” Introspecting MCP: {spec_to_use}") - + # Create client based on the spec type if isinstance(spec_to_use, list): # Command-based MCP - FastMCP expects a single server config dict server_config = { "server_name": { "command": spec_to_use[0], - "args": spec_to_use[1:] if len(spec_to_use) > 1 else [] + "args": spec_to_use[1:] if len(spec_to_use) > 1 else [], } } if mcp_config and mcp_config.env: server_config["server_name"]["env"] = mcp_config.env # type: ignore - + # FastMCP expects the full config with mcpServers key full_config = {"mcpServers": server_config} client = Client(full_config) @@ -787,28 +802,29 @@ async def _introspect_mcp_async( else: # Try as command import shlex + parts = shlex.split(spec_to_use) server_config = { "server_name": { "command": parts[0], - "args": parts[1:] if len(parts) > 1 else [] + "args": parts[1:] if len(parts) > 1 else [], } } full_config = {"mcpServers": server_config} client = Client(full_config) - + async with client: click.echo("āœ… Connected to MCP server") - + # Get server info if available - if hasattr(client, 'server_info'): + if hasattr(client, "server_info"): info = client.server_info click.echo("\nšŸ“‹ Server Info:") click.echo(f" Name: {info.name}") click.echo(f" Version: {info.version}") - if hasattr(info, 'description') and info.description: + if hasattr(info, "description") and info.description: click.echo(f" Description: {info.description}") - + # List tools click.echo("\nšŸ”§ Available Tools:") try: @@ -818,13 +834,15 @@ async def _introspect_mcp_async( click.echo(f"\n šŸ“Œ {tool.name}") if tool.description: click.echo(f" Description: {tool.description}") - if verbose and hasattr(tool, 'inputSchema') and tool.inputSchema: - click.echo(f" Input Schema: {yaml.dump(tool.inputSchema, default_flow_style=False, indent=8).strip()}") + if verbose and hasattr(tool, "inputSchema") and tool.inputSchema: + click.echo( + f" Input Schema: {yaml.dump(tool.inputSchema, default_flow_style=False, indent=8).strip()}" + ) else: click.echo(" (No tools available)") except Exception as e: click.echo(f" āš ļø Error listing tools: {e}") - + # List resources click.echo("\nšŸ“ Available 
Resources:") try: @@ -841,7 +859,7 @@ async def _introspect_mcp_async( click.echo(" (No resources available)") except Exception as e: click.echo(f" āš ļø Error listing resources: {e}") - + # List prompts click.echo("\nšŸ’¬ Available Prompts:") try: @@ -851,16 +869,18 @@ async def _introspect_mcp_async( click.echo(f"\n šŸ’” {prompt.name}") if prompt.description: click.echo(f" Description: {prompt.description}") - if verbose and hasattr(prompt, 'arguments') and prompt.arguments: + if verbose and hasattr(prompt, "arguments") and prompt.arguments: click.echo(" Arguments:") for arg in prompt.arguments: req = "required" if arg.required else "optional" - click.echo(f" - {arg.name} ({req}): {arg.description}") + click.echo( + f" - {arg.name} ({req}): {arg.description}" + ) else: click.echo(" (No prompts available)") except Exception as e: click.echo(f" āš ļø Error listing prompts: {e}") - + click.echo("\nāœ… Introspection complete!") diff --git a/tests/test_claude_tool_use.py b/tests/test_claude_tool_use.py index 817617f..3d74485 100644 --- a/tests/test_claude_tool_use.py +++ b/tests/test_claude_tool_use.py @@ -1,10 +1,11 @@ """Test ClaudeCoder tool use extraction.""" + from metacoder.coders.base_coder import CoderOutput, ToolUse def test_claude_tool_use_extraction(): """Test that ClaudeCoder correctly extracts tool uses from structured messages.""" - + # Create a mock output with tool use in structured messages output = CoderOutput( stdout="", @@ -18,31 +19,31 @@ def test_claude_tool_use_extraction(): "type": "tool_use", "id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s", "name": "mcp__pubmed__get_paper_fulltext", - "input": {"pmid": "35743164"} + "input": {"pmid": "35743164"}, } ] - } + }, }, { - "type": "user", + "type": "user", "message": { "content": [ { "type": "tool_result", "content": "Paper content here...", "is_error": False, - "tool_use_id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s" + "tool_use_id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s", } ] - } - } - ] + }, + }, + ], ) - + # Process structured messages to extract tool uses tool_uses = [] pending_tool_uses = {} - + for message in output.structured_messages: # Check for tool_use in assistant messages if message.get("type") == "assistant" and message.get("message"): @@ -53,16 +54,16 @@ def test_claude_tool_use_extraction(): tool_id = content_item.get("id") tool_name = content_item.get("name", "") tool_input = content_item.get("input", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_input, "success": False, "error": None, - "result": None + "result": None, } - + # Check for tool_result in user messages elif message.get("type") == "user" and message.get("message"): msg_content = message["message"].get("content", []) @@ -72,22 +73,24 @@ def test_claude_tool_use_extraction(): tool_id = content_item.get("tool_use_id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] - + # Update with result is_tool_error = content_item.get("is_error", False) tool_data["success"] = not is_tool_error tool_data["result"] = content_item.get("content", "") - + if is_tool_error: - tool_data["error"] = content_item.get("content", "Tool error occurred") - + tool_data["error"] = content_item.get( + "content", "Tool error occurred" + ) + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Verify extraction assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -100,7 +103,7 @@ def test_claude_tool_use_extraction(): def 
test_claude_tool_use_error(): """Test that ClaudeCoder correctly handles tool errors.""" - + # Create a mock output with tool error output = CoderOutput( stdout="", @@ -114,31 +117,31 @@ def test_claude_tool_use_error(): "type": "tool_use", "id": "toolu_test", "name": "mcp__pubmed__get_paper_fulltext", - "input": {"pmid": "invalid"} + "input": {"pmid": "invalid"}, } ] - } + }, }, { - "type": "user", + "type": "user", "message": { "content": [ { "type": "tool_result", "content": "MCP tool response exceeds maximum allowed tokens", "is_error": True, - "tool_use_id": "toolu_test" + "tool_use_id": "toolu_test", } ] - } - } - ] + }, + }, + ], ) - + # Process structured messages to extract tool uses tool_uses = [] pending_tool_uses = {} - + for message in output.structured_messages: # Check for tool_use in assistant messages if message.get("type") == "assistant" and message.get("message"): @@ -149,16 +152,16 @@ def test_claude_tool_use_error(): tool_id = content_item.get("id") tool_name = content_item.get("name", "") tool_input = content_item.get("input", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_input, "success": False, "error": None, - "result": None + "result": None, } - + # Check for tool_result in user messages elif message.get("type") == "user" and message.get("message"): msg_content = message["message"].get("content", []) @@ -168,22 +171,24 @@ def test_claude_tool_use_error(): tool_id = content_item.get("tool_use_id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] - + # Update with result is_tool_error = content_item.get("is_error", False) tool_data["success"] = not is_tool_error tool_data["result"] = content_item.get("content", "") - + if is_tool_error: - tool_data["error"] = content_item.get("content", "Tool error occurred") - + tool_data["error"] = content_item.get( + "content", "Tool error occurred" + ) + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Verify error handling assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -191,4 +196,4 @@ def test_claude_tool_use_error(): assert tool_use.arguments == {"pmid": "invalid"} assert tool_use.success is False assert tool_use.error == "MCP tool response exceeds maximum allowed tokens" - assert tool_use.result == "MCP tool response exceeds maximum allowed tokens" \ No newline at end of file + assert tool_use.result == "MCP tool response exceeds maximum allowed tokens" diff --git a/tests/test_coders/test_coder_availability.py b/tests/test_coders/test_coder_availability.py index d9a75d3..4b53b63 100644 --- a/tests/test_coders/test_coder_availability.py +++ b/tests/test_coders/test_coder_availability.py @@ -59,12 +59,12 @@ def test_all_coders_have_availability_method(): from metacoder.metacoder import AVAILABLE_CODERS for coder_name, coder_class in AVAILABLE_CODERS.items(): - assert hasattr( - coder_class, "is_available" - ), f"{coder_name} missing is_available method" - assert callable( - coder_class.is_available - ), f"{coder_name}.is_available is not callable" + assert hasattr(coder_class, "is_available"), ( + f"{coder_name} missing is_available method" + ) + assert callable(coder_class.is_available), ( + f"{coder_name}.is_available is not callable" + ) @patch("shutil.which") diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py index 4180e92..cb73641 100644 --- a/tests/test_coders/test_coders_basic.py +++ 
b/tests/test_coders/test_coders_basic.py @@ -62,17 +62,17 @@ def test_llm_coder_basic_arithmetic(coder_name, coder_class): # Check result assert result is not None - assert ( - result.stdout or result.result_text - ), "Coder should produce some output" + assert result.stdout or result.result_text, ( + "Coder should produce some output" + ) # Get the actual output text output_text = result.result_text or result.stdout # All LLM coders should include "4" in their answer - assert ( - "4" in output_text - ), f"{coder_name} should answer '4' to 'What is 2+2?'" + assert "4" in output_text, ( + f"{coder_name} should answer '4' to 'What is 2+2?'" + ) except Exception as e: pytest.fail(f"Coder {coder_name} failed with error: {e}") @@ -137,12 +137,12 @@ def test_llm_coder_code_generation(coder_name, coder_class): assert output_text, "Coder should produce some output" # Verify the output contains Python code elements - assert ( - "def" in output_text - ), f"{coder_name} should generate a Python function" - assert ( - "return" in output_text or "print" in output_text - ), f"{coder_name} should have return or print" + assert "def" in output_text, ( + f"{coder_name} should generate a Python function" + ) + assert "return" in output_text or "print" in output_text, ( + f"{coder_name} should have return or print" + ) except Exception as e: pytest.fail(f"Coder {coder_name} failed with error: {e}") diff --git a/tests/test_dummy_coder_tool_capture.py b/tests/test_dummy_coder_tool_capture.py index a3d6364..b92b7ab 100644 --- a/tests/test_dummy_coder_tool_capture.py +++ b/tests/test_dummy_coder_tool_capture.py @@ -1,4 +1,5 @@ """Test that DummyCoder properly captures tool calls in CoderOutput.""" + from metacoder.coders.dummy import DummyCoder from metacoder.coders.base_coder import CoderOutput, ToolUse @@ -6,23 +7,25 @@ def test_dummy_coder_captures_tool_calls(): """Test that DummyCoder captures tool calls in the CoderOutput.""" coder = DummyCoder(workdir="test") - + # Run with input that triggers tool use output = coder.run("Use MCP to search PubMed for cancer research") - + # Verify output is a CoderOutput instance assert isinstance(output, CoderOutput) - + # Verify basic output fields assert output.stdout == "you said: Use MCP to search PubMed for cancer research" assert output.stderr == "" - assert output.result_text == "you said: Use MCP to search PubMed for cancer research" - + assert ( + output.result_text == "you said: Use MCP to search PubMed for cancer research" + ) + # Verify tool_uses is populated assert output.tool_uses is not None assert isinstance(output.tool_uses, list) assert len(output.tool_uses) == 1 - + # Verify the tool use is properly structured tool_use = output.tool_uses[0] assert isinstance(tool_use, ToolUse) @@ -36,27 +39,27 @@ def test_dummy_coder_captures_tool_calls(): def test_dummy_coder_captures_multiple_tools(): """Test that DummyCoder can capture multiple tool calls.""" coder = DummyCoder(workdir="test") - + # Run with input that triggers multiple tools output = coder.run("Search PubMed and then cause an error") - + # Verify multiple tools are captured assert output.tool_uses is not None assert len(output.tool_uses) == 2 - + # Verify each tool is a proper ToolUse instance for tool in output.tool_uses: assert isinstance(tool, ToolUse) - assert hasattr(tool, 'name') - assert hasattr(tool, 'arguments') - assert hasattr(tool, 'success') - assert hasattr(tool, 'error') - assert hasattr(tool, 'result') - + assert hasattr(tool, "name") + assert hasattr(tool, "arguments") + assert 
hasattr(tool, "success") + assert hasattr(tool, "error") + assert hasattr(tool, "result") + # Check first tool (PubMed search) assert output.tool_uses[0].name == "mcp__pubmed__search_papers" assert output.tool_uses[0].success is True - + # Check second tool (error) assert output.tool_uses[1].name == "mcp__test__failing_tool" assert output.tool_uses[1].success is False @@ -66,14 +69,14 @@ def test_dummy_coder_captures_multiple_tools(): def test_dummy_coder_no_tools_when_not_triggered(): """Test that DummyCoder doesn't add tools when not triggered.""" coder = DummyCoder(workdir="test") - + # Run with input that doesn't trigger tools output = coder.run("What is the weather today?") - + # Verify output structure assert isinstance(output, CoderOutput) assert output.stdout == "you said: What is the weather today?" - + # Verify no tools are added assert output.tool_uses is None @@ -81,14 +84,14 @@ def test_dummy_coder_no_tools_when_not_triggered(): def test_dummy_coder_tool_error_capture(): """Test that DummyCoder properly captures tool errors.""" coder = DummyCoder(workdir="test") - + # Run with input that triggers an error output = coder.run("Use tool with error") - + # Verify error tool is captured assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + error_tool = output.tool_uses[0] assert error_tool.name == "mcp__test__failing_tool" assert error_tool.success is False @@ -100,22 +103,22 @@ def test_dummy_coder_tool_error_capture(): def test_dummy_coder_tool_serialization(): """Test that tool uses can be serialized properly.""" coder = DummyCoder(workdir="test") - + # Run with tool trigger output = coder.run("Use MCP tool") - + # Verify tool uses can be converted to dict (for serialization) assert output.tool_uses is not None tool_dict = output.tool_uses[0].model_dump() - + assert isinstance(tool_dict, dict) assert "name" in tool_dict assert "arguments" in tool_dict assert "success" in tool_dict assert "error" in tool_dict assert "result" in tool_dict - + # Verify values assert tool_dict["name"] == "mcp__dummy__test_tool" assert tool_dict["success"] is True - assert tool_dict["error"] is None \ No newline at end of file + assert tool_dict["error"] is None diff --git a/tests/test_dummy_tool_use.py b/tests/test_dummy_tool_use.py index ca3f2b4..f9f675e 100644 --- a/tests/test_dummy_tool_use.py +++ b/tests/test_dummy_tool_use.py @@ -1,4 +1,5 @@ """Test DummyCoder fake tool use generation.""" + from metacoder.coders.dummy import DummyCoder @@ -6,7 +7,7 @@ def test_dummy_no_tools(): """Test that dummy coder doesn't add tools when not mentioned.""" coder = DummyCoder(workdir="test") output = coder.run("What is 2 + 2?") - + assert output.stdout == "you said: What is 2 + 2?" 
assert output.tool_uses is None @@ -15,10 +16,10 @@ def test_dummy_default_tool(): """Test that dummy coder adds default tool when mentioned.""" coder = DummyCoder(workdir="test") output = coder.run("Use a tool to help me") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__dummy__test_tool" assert tool.arguments == {"input": "Use a tool to help me"} @@ -31,10 +32,10 @@ def test_dummy_pubmed_search(): """Test that dummy coder simulates PubMed search.""" coder = DummyCoder(workdir="test") output = coder.run("Search PubMed for papers about cancer") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__pubmed__search_papers" assert tool.arguments == {"query": "test query", "limit": 10} @@ -47,10 +48,10 @@ def test_dummy_tool_error(): """Test that dummy coder simulates tool errors.""" coder = DummyCoder(workdir="test") output = coder.run("Use MCP tool but simulate an error") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__test__failing_tool" assert tool.arguments == {"param": "value"} @@ -63,15 +64,15 @@ def test_dummy_multiple_tools(): """Test that dummy coder can simulate multiple tools.""" coder = DummyCoder(workdir="test") output = coder.run("Search PubMed and then simulate an error with MCP") - + assert output.tool_uses is not None assert len(output.tool_uses) == 2 - + # First tool - PubMed search tool1 = output.tool_uses[0] assert tool1.name == "mcp__pubmed__search_papers" assert tool1.success is True - + # Second tool - error simulation tool2 = output.tool_uses[1] assert tool2.name == "mcp__test__failing_tool" @@ -82,10 +83,10 @@ def test_dummy_mcp_keyword(): """Test that MCP keyword triggers tool use.""" coder = DummyCoder(workdir="test") output = coder.run("Test MCP functionality") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__dummy__test_tool" - assert tool.success is True \ No newline at end of file + assert tool.success is True diff --git a/tests/test_evals/test_deep_eval.py b/tests/test_evals/test_deep_eval.py index c55dec2..3690e70 100644 --- a/tests/test_evals/test_deep_eval.py +++ b/tests/test_evals/test_deep_eval.py @@ -61,8 +61,6 @@ def test_hallucination_eval(metric_cls): print(yaml.dump(results.model_dump())) - - correctness_metric = GEval( name="Correctness", criteria="Determine whether the actual output is factually correct based on the expected output.", diff --git a/tests/test_goose_tool_use.py b/tests/test_goose_tool_use.py index 94d83aa..3b643ae 100644 --- a/tests/test_goose_tool_use.py +++ b/tests/test_goose_tool_use.py @@ -1,4 +1,5 @@ """Test GooseCoder tool use extraction.""" + from metacoder.coders.base_coder import ToolUse @@ -13,7 +14,7 @@ def test_goose_tool_use_extraction(): "content": [ { "type": "text", - "text": "I'll help you find information about diseases associated with ITPR1 mutations." 
+ "text": "I'll help you find information about diseases associated with ITPR1 mutations.", }, { "type": "toolRequest", @@ -22,11 +23,11 @@ def test_goose_tool_use_extraction(): "status": "success", "value": { "name": "pubmed__get_paper_fulltext", - "arguments": {"pmid": "35743164"} - } - } - } - ] + "arguments": {"pmid": "35743164"}, + }, + }, + }, + ], }, { "id": None, @@ -38,22 +39,17 @@ def test_goose_tool_use_extraction(): "id": "toolu_01RbESTBH9tyWu9Q9uAVRjja", "toolResult": { "status": "success", - "value": [ - { - "type": "text", - "text": "Paper content here..." - } - ] - } + "value": [{"type": "text", "text": "Paper content here..."}], + }, } - ] - } + ], + }, ] - + # Process structured messages to extract tool uses (mimicking goose logic) tool_uses = [] pending_tool_uses = {} - + for message in structured_messages: # Check for tool requests in assistant messages if message.get("role") == "assistant" and "content" in message: @@ -61,21 +57,21 @@ def test_goose_tool_use_extraction(): if isinstance(content, dict) and content.get("type") == "toolRequest": tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) tool_name = tool_value.get("name", "") tool_args = tool_value.get("arguments", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_args, "success": False, "error": None, - "result": None + "result": None, } - + # Check for tool responses in user messages elif message.get("role") == "user" and "content" in message: for content in message.get("content", []): @@ -84,7 +80,7 @@ def test_goose_tool_use_extraction(): if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = content.get("toolResult", {}) - + # Update with result if tool_result.get("status") == "success": tool_data["success"] = True @@ -93,23 +89,32 @@ def test_goose_tool_use_extraction(): if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) else: tool_data["success"] = False - tool_data["error"] = tool_result.get("error", "Tool execution failed") + tool_data["error"] = tool_result.get( + "error", "Tool execution failed" + ) tool_data["result"] = None - + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Verify extraction assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -132,13 +137,10 @@ def test_goose_tool_use_error(): "id": "toolu_test", "toolCall": { "status": "success", - "value": { - "name": "test_tool", - "arguments": {"param": "value"} - } - } + "value": {"name": "test_tool", "arguments": {"param": "value"}}, + }, } - ] + ], }, { "role": "user", @@ -148,37 +150,37 @@ def test_goose_tool_use_error(): "id": "toolu_test", "toolResult": { "status": "error", - "error": "Tool failed to execute" - } + "error": "Tool failed to execute", + }, } - ] - } + ], + }, ] - + # Process structured messages to extract tool uses tool_uses = [] pending_tool_uses = {} - + for message in structured_messages: if message.get("role") == 
"assistant" and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolRequest": tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) tool_name = tool_value.get("name", "") tool_args = tool_value.get("arguments", {}) - + pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_args, "success": False, "error": None, - "result": None + "result": None, } - + elif message.get("role") == "user" and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolResponse": @@ -186,27 +188,36 @@ def test_goose_tool_use_error(): if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = content.get("toolResult", {}) - + if tool_result.get("status") == "success": tool_data["success"] = True result_value = tool_result.get("value", []) if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) else: tool_data["success"] = False - tool_data["error"] = tool_result.get("error", "Tool execution failed") + tool_data["error"] = tool_result.get( + "error", "Tool execution failed" + ) tool_data["result"] = None - + tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) del pending_tool_uses[tool_id] - + # Verify error handling assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -230,11 +241,11 @@ def test_goose_multiple_tools(): "status": "success", "value": { "name": "search_tool", - "arguments": {"query": "test"} - } - } + "arguments": {"query": "test"}, + }, + }, } - ] + ], }, { "role": "user", @@ -244,10 +255,10 @@ def test_goose_multiple_tools(): "id": "tool1", "toolResult": { "status": "success", - "value": [{"type": "text", "text": "Search results"}] - } + "value": [{"type": "text", "text": "Search results"}], + }, } - ] + ], }, { "role": "assistant", @@ -259,11 +270,11 @@ def test_goose_multiple_tools(): "status": "success", "value": { "name": "fetch_tool", - "arguments": {"url": "http://example.com"} - } - } + "arguments": {"url": "http://example.com"}, + }, + }, } - ] + ], }, { "role": "user", @@ -273,24 +284,24 @@ def test_goose_multiple_tools(): "id": "tool2", "toolResult": { "status": "success", - "value": [{"type": "text", "text": "Fetched content"}] - } + "value": [{"type": "text", "text": "Fetched content"}], + }, } - ] - } + ], + }, ] - + # Process structured messages tool_uses = [] pending_tool_uses = {} - + for message in structured_messages: if message.get("role") == "assistant" and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolRequest": tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) pending_tool_uses[tool_id] = { @@ -298,9 +309,9 @@ def test_goose_multiple_tools(): "arguments": tool_value.get("arguments", {}), "success": False, "error": None, - "result": None + "result": None, } - + elif message.get("role") == "user" 
and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolResponse": @@ -308,22 +319,29 @@ def test_goose_multiple_tools(): if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = content.get("toolResult", {}) - + if tool_result.get("status") == "success": tool_data["success"] = True result_value = tool_result.get("value", []) if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) - + tool_uses.append(ToolUse(**tool_data)) del pending_tool_uses[tool_id] - + # Verify multiple tools assert len(tool_uses) == 2 assert tool_uses[0].name == "search_tool" @@ -331,4 +349,4 @@ def test_goose_multiple_tools(): assert tool_uses[0].result == "Search results" assert tool_uses[1].name == "fetch_tool" assert tool_uses[1].success is True - assert tool_uses[1].result == "Fetched content" \ No newline at end of file + assert tool_uses[1].result == "Fetched content" diff --git a/tests/test_instructions_option.py b/tests/test_instructions_option.py index 681e275..ff9cc4e 100644 --- a/tests/test_instructions_option.py +++ b/tests/test_instructions_option.py @@ -21,7 +21,7 @@ def test_instructions_option_with_dummy_coder(runner): instructions_file = Path(temp_dir) / "test_instructions.md" instructions_content = "# Test Instructions\n\nBe helpful and concise." 
instructions_file.write_text(instructions_content) - + # Run with instructions result = runner.invoke( main, @@ -36,7 +36,7 @@ def test_instructions_option_with_dummy_coder(runner): temp_dir, ], ) - + # Check that instructions were loaded assert result.exit_code == 0 assert "Loaded instructions from:" in result.output @@ -58,7 +58,7 @@ def test_no_instructions_still_works(runner): temp_dir, ], ) - + assert result.exit_code == 0 assert "you said: Hello" in result.output assert "Instructions loaded:" not in result.output @@ -80,7 +80,7 @@ def test_instructions_file_not_found(runner): temp_dir, ], ) - + # Should fail with appropriate error assert result.exit_code != 0 assert "does not exist" in result.output @@ -98,11 +98,11 @@ def test_instructions_with_config(runner): extensions: [] """ config_file.write_text(config_content) - + # Create instructions file instructions_file = Path(temp_dir) / "instructions.md" instructions_file.write_text("Custom instructions") - + result = runner.invoke( main, [ @@ -118,6 +118,6 @@ def test_instructions_with_config(runner): temp_dir, ], ) - + assert result.exit_code == 0 - assert "Loaded instructions from:" in result.output \ No newline at end of file + assert "Loaded instructions from:" in result.output diff --git a/tests/test_introspect_mcp.py b/tests/test_introspect_mcp.py index 2f51ae0..4e5fd59 100644 --- a/tests/test_introspect_mcp.py +++ b/tests/test_introspect_mcp.py @@ -6,7 +6,7 @@ def test_introspect_mcp_help(): """Test introspect-mcp help command.""" runner = CliRunner() result = runner.invoke(cli, ["introspect-mcp", "--help"]) - + assert result.exit_code == 0 assert "Introspect an MCP server" in result.output assert "MCP_SPEC" in result.output @@ -17,12 +17,10 @@ def test_introspect_mcp_help(): def test_introspect_mcp_with_invalid_registry(): """Test introspect-mcp with non-existent registry MCP.""" runner = CliRunner() - result = runner.invoke(cli, [ - "introspect-mcp", - "nonexistent", - "--registry", "metacoder.basics" - ]) - + result = runner.invoke( + cli, ["introspect-mcp", "nonexistent", "--registry", "metacoder.basics"] + ) + assert result.exit_code != 0 assert "not found in registry" in result.output @@ -30,12 +28,9 @@ def test_introspect_mcp_with_invalid_registry(): def test_introspect_mcp_with_registry_no_mcp(): """Test introspect-mcp with invalid registry.""" runner = CliRunner() - result = runner.invoke(cli, [ - "introspect-mcp", - "fetch", - "--registry", "metacoder.nonexistent" - ]) - + result = runner.invoke( + cli, ["introspect-mcp", "fetch", "--registry", "metacoder.nonexistent"] + ) + assert result.exit_code != 0 assert "Registry file not found" in result.output - diff --git a/tests/test_mcps/test_gemini_mcp.py b/tests/test_mcps/test_gemini_mcp.py index 25b9c24..288f61c 100644 --- a/tests/test_mcps/test_gemini_mcp.py +++ b/tests/test_mcps/test_gemini_mcp.py @@ -13,7 +13,7 @@ def test_gemini_supports_mcp(): def test_gemini_mcp_config_conversion(): """Test conversion of MCPConfig to Gemini format.""" coder = GeminiCoder(workdir="/tmp/test") - + # Test stdio MCP mcp = MCPConfig( name="test_server", @@ -21,11 +21,11 @@ def test_gemini_mcp_config_conversion(): args=["-y", "@modelcontextprotocol/server-test"], env={"API_KEY": "${TEST_KEY}"}, enabled=True, - type=MCPType.STDIO + type=MCPType.STDIO, ) - + result = coder.mcp_config_to_gemini_format(mcp) - + assert result["command"] == "npx" assert result["args"] == ["-y", "@modelcontextprotocol/server-test"] assert result["env"] == {"API_KEY": "${TEST_KEY}"} @@ -35,13 +35,9 @@ def 
test_gemini_mcp_config_conversion(): def test_gemini_http_mcp_not_supported(): """Test that HTTP MCPs raise NotImplementedError.""" coder = GeminiCoder(workdir="/tmp/test") - - mcp = MCPConfig( - name="http_server", - enabled=True, - type=MCPType.HTTP - ) - + + mcp = MCPConfig(name="http_server", enabled=True, type=MCPType.HTTP) + with pytest.raises(NotImplementedError, match="HTTP MCPs are not supported"): coder.mcp_config_to_gemini_format(mcp) @@ -56,7 +52,7 @@ def test_gemini_mcp_settings_generation(): command="npx", args=["-y", "@modelcontextprotocol/server-filesystem"], enabled=True, - type=MCPType.STDIO + type=MCPType.STDIO, ), MCPConfig( name="github", @@ -64,41 +60,41 @@ def test_gemini_mcp_settings_generation(): args=["mcp-github"], env={"GITHUB_TOKEN": "${GITHUB_TOKEN}"}, enabled=True, - type=MCPType.STDIO + type=MCPType.STDIO, ), MCPConfig( name="disabled_server", command="uvx", args=["mcp-disabled"], enabled=False, - type=MCPType.STDIO + type=MCPType.STDIO, ), - ] + ], ) - + coder = GeminiCoder(workdir="/tmp/test", config=config) config_objects = coder.default_config_objects() - + # Should have created settings.json assert len(config_objects) == 1 settings_obj = config_objects[0] - + assert settings_obj.relative_path == ".gemini/settings.json" assert "mcpServers" in settings_obj.content - + mcp_servers = settings_obj.content["mcpServers"] - + # Should only include enabled servers assert "filesystem" in mcp_servers assert "github" in mcp_servers assert "disabled_server" not in mcp_servers - + # Check filesystem server config fs_config = mcp_servers["filesystem"] assert fs_config["command"] == "npx" assert fs_config["args"] == ["-y", "@modelcontextprotocol/server-filesystem"] assert fs_config["timeout"] == 30000 - + # Check github server config gh_config = mcp_servers["github"] assert gh_config["command"] == "uvx" @@ -111,6 +107,6 @@ def test_gemini_no_mcp_no_settings(): """Test that no settings.json is created when no MCPs are configured.""" coder = GeminiCoder(workdir="/tmp/test") config_objects = coder.default_config_objects() - + # Should not create any config files when no MCPs - assert len(config_objects) == 0 \ No newline at end of file + assert len(config_objects) == 0 diff --git a/tests/test_registry_loading.py b/tests/test_registry_loading.py index c96781e..c2f67a2 100644 --- a/tests/test_registry_loading.py +++ b/tests/test_registry_loading.py @@ -7,15 +7,15 @@ def test_load_mcp_registry_basics(): """Test loading basics registry.""" collection = load_mcp_registry("metacoder.basics") - + assert isinstance(collection, MCPCollectionConfig) assert len(collection.servers) > 0 - + # Check that fetch is in basics mcp_names = [mcp.name for mcp in collection.servers] assert "fetch" in mcp_names assert "taskmasterai" in mcp_names - + # Check that all are disabled by default for mcp in collection.servers: assert not mcp.enabled and mcp.enabled is not None @@ -24,10 +24,10 @@ def test_load_mcp_registry_basics(): def test_load_mcp_registry_scilit(): """Test loading scilit registry.""" collection = load_mcp_registry("metacoder.scilit") - + assert isinstance(collection, MCPCollectionConfig) assert len(collection.servers) > 0 - + # Check that scilit MCPs are present mcp_names = [mcp.name for mcp in collection.servers] assert "pdfreader" in mcp_names @@ -38,9 +38,9 @@ def test_load_mcp_registry_scilit(): def test_load_mcp_registry_all(): """Test loading all registries with 'metacoder'.""" collection = load_mcp_registry("metacoder") - + assert isinstance(collection, MCPCollectionConfig) 
-    
+
     # Should have MCPs from both basics and scilit
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "fetch" in mcp_names  # from basics
@@ -50,7 +50,7 @@
 def test_load_mcp_registry_without_prefix():
     """Test loading registry without metacoder prefix."""
     collection = load_mcp_registry("basics")
-    
+
     # Should work the same as with prefix
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "fetch" in mcp_names
@@ -59,17 +59,24 @@
 def test_cli_with_registry():
     """Test CLI with registry option."""
     runner = CliRunner()
-    
+
     # Test with registry and enable specific MCP
-    result = runner.invoke(cli, [
-        "run",
-        "test prompt",
-        "--coder", "dummy",
-        "--registry", "metacoder.basics",
-        "--enable-mcp", "fetch",
-        "--workdir", "test_workdir"
-    ])
-    
+    result = runner.invoke(
+        cli,
+        [
+            "run",
+            "test prompt",
+            "--coder",
+            "dummy",
+            "--registry",
+            "metacoder.basics",
+            "--enable-mcp",
+            "fetch",
+            "--workdir",
+            "test_workdir",
+        ],
+    )
+
     assert result.exit_code == 0
     assert "Loading MCPs from registry: metacoder.basics" in result.output
     assert "Registry MCPs:" in result.output
@@ -79,7 +86,7 @@
 def test_cli_registry_with_mcp_collection():
     """Test CLI with both registry and MCP collection."""
     runner = CliRunner()
-    
+
     # Create a temporary MCP collection file
     with runner.isolated_filesystem():
         with open("test_mcps.yaml", "w") as f:
@@ -91,18 +98,27 @@
                 args: ["test"]
                 enabled: true
             """)
-    
-        result = runner.invoke(cli, [
-            "run",
-            "test prompt",
-            "--coder", "dummy",
-            "--mcp-collection", "test_mcps.yaml",
-            "--registry", "metacoder.basics",
-            "--enable-mcp", "fetch",
-            "--enable-mcp", "custom_mcp",
-            "--workdir", "test_workdir"
-        ])
-    
+
+        result = runner.invoke(
+            cli,
+            [
+                "run",
+                "test prompt",
+                "--coder",
+                "dummy",
+                "--mcp-collection",
+                "test_mcps.yaml",
+                "--registry",
+                "metacoder.basics",
+                "--enable-mcp",
+                "fetch",
+                "--enable-mcp",
+                "custom_mcp",
+                "--workdir",
+                "test_workdir",
+            ],
+        )
+
         assert result.exit_code == 0
         assert "Loading MCP collection from: test_mcps.yaml" in result.output
         assert "Loading MCPs from registry: metacoder.basics" in result.output
@@ -113,5 +129,5 @@ def test_registry_nonexistent():
     """Test loading nonexistent registry."""
     with pytest.raises(Exception) as exc_info:
         load_mcp_registry("metacoder.nonexistent")
-    
-    assert "Registry file not found" in str(exc_info.value)
\ No newline at end of file
+
+    assert "Registry file not found" in str(exc_info.value)

From 9e73e3bc529d053b471a55145dedf3b4e5912b30 Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 17:43:05 -0700
Subject: [PATCH 3/5] refactor: Convert METRICS to lazy initialization to avoid network calls on import
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The METRICS dictionary was previously instantiated at module level, causing
GEval objects to be created during import. This triggered network dependencies
even when the metrics weren't being used.
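In outline, the fix is the standard lazy-factory pattern: importing the module
only defines a function, and the expensive objects are built when that function
is first called. A minimal self-contained sketch of the pattern (the class and
names below are illustrative stand-ins, not the actual metacoder code):

    class NetworkBackedMetric:
        """Hypothetical stand-in for a metric whose constructor does network setup."""

        def __init__(self) -> None:
            print("expensive setup runs here")


    # Eager (old shape): the instance is created as a side effect of import.
    # METRICS = {"CorrectnessMetric": NetworkBackedMetric()}


    # Lazy (new shape): creation is deferred to the first caller.
    def get_default_metrics() -> dict[str, NetworkBackedMetric]:
        return {"CorrectnessMetric": NetworkBackedMetric()}

Importing a module shaped like this is cheap; the constructor, and any network
handshake it performs, runs only at call time.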
Changes:
- Replace top-level METRICS dictionary with get_default_metrics() function
- Move GEval and DummyMetric instantiation into the function for lazy creation
- Update run_single_eval() to call get_default_metrics() when needed

This prevents unnecessary network calls during module import and improves
startup performance, especially in environments where network access may be
restricted or when metrics are not used in the execution path.

šŸ¤– Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 src/metacoder/evals/runner.py | 50 +++++++++++++++++------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py
index d2ed34b..67a9619 100644
--- a/src/metacoder/evals/runner.py
+++ b/src/metacoder/evals/runner.py
@@ -59,30 +59,27 @@ def is_successful(self) -> bool:
         return self.success
 
 
-correctness_metric = GEval(
-    name="Correctness",
-    criteria="Determine whether the actual output is factually correct based on the expected output.",
-    # NOTE: you can only provide either criteria or evaluation_steps, and not both
-    evaluation_steps=[
-        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-        "You should also heavily penalize omission of detail",
-        "Vague language, or contradicting OPINIONS, are OK",
-    ],
-    threshold=0.8,
-    evaluation_params=[
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ],
-)
-
-# instances
-dummy_metric = DummyMetric(threshold=0.5)
-
-METRICS = {
-    "CorrectnessMetric": correctness_metric,
-    "DummyMetric": dummy_metric,
-}
+def get_default_metrics() -> Dict[str, BaseMetric]:
+    """Get default metrics. Creates instances lazily to avoid network calls during import."""
+    return {
+        "CorrectnessMetric": GEval(
+            name="Correctness",
+            criteria="Determine whether the actual output is factually correct based on the expected output.",
+            # NOTE: you can only provide either criteria or evaluation_steps, and not both
+            evaluation_steps=[
+                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
+                "You should also heavily penalize omission of detail",
+                "Vague language, or contradicting OPINIONS, are OK",
+            ],
+            threshold=0.8,
+            evaluation_params=[
+                LLMTestCaseParams.INPUT,
+                LLMTestCaseParams.ACTUAL_OUTPUT,
+                LLMTestCaseParams.EXPECTED_OUTPUT,
+            ],
+        ),
+        "DummyMetric": DummyMetric(threshold=0.5),
+    }
 
 
 def create_coder(coder_name: str, workdir: str, config=None) -> BaseCoder:
@@ -226,8 +223,9 @@ def run_single_eval(
 
         # Run each metric
         for metric_name in case.metrics:
-            if metric_name in METRICS:
-                metric = METRICS[metric_name]
+            default_metrics = get_default_metrics()
+            if metric_name in default_metrics:
+                metric = default_metrics[metric_name]
             else:
                 # Get metric class and instantiate
                 metric_class = self.get_metric_class(metric_name)

From 8e82f1059cca4cca95b61888ba31d54a2992491e Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 17:47:38 -0700
Subject: [PATCH 4/5] doesn't actually test metacoder functionality

---
 tests/test_evals/test_deep_eval.py | 95 ------------------------------
 1 file changed, 95 deletions(-)
 delete mode 100644 tests/test_evals/test_deep_eval.py

diff --git a/tests/test_evals/test_deep_eval.py b/tests/test_evals/test_deep_eval.py
deleted file mode 100644
index 3690e70..0000000
--- a/tests/test_evals/test_deep_eval.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""
-Test the deepeval library.
-
-https://github.com/metacoder-ai/deepeval
-
-Note this doesn't actually test any metacoder functonality, it is more to explore
-deepeval metrics, it can probably be removed in the future.
-"""
-
-from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
-from deepeval import evaluate
-from deepeval.metrics import (
-    FaithfulnessMetric,
-    HallucinationMetric,
-)
-from deepeval.test_case import LLMTestCase
-import pytest
-
-
-@pytest.mark.llm
-@pytest.mark.parametrize("metric_cls", [FaithfulnessMetric])
-def test_generic_eval(metric_cls):
-    """Test FaithfulnessMetric with correct output matching context."""
-    metric = metric_cls(threshold=0.7)
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="The answer to the question 'what is the title of PMID:28027860?' is 'From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.'",
-        actual_output='The answer to the question "what is the title of PMID:28027860?" is "From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."',
-        context=[
-            "Title: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-        retrieval_context=[
-            "PMID:28027860? Title: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))
-
-
-@pytest.mark.llm
-@pytest.mark.parametrize("metric_cls", [HallucinationMetric])
-def test_hallucination_eval(metric_cls):
-    """Test HallucinationMetric detects incorrect information not supported by context."""
-    metric = metric_cls(threshold=0.7)
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.",
-        actual_output='The title of the article with PMID:28027860 is "Predictors of acute and persisting fatigue in people with relapsing and remitting multiple sclerosis: A cohort study."',
-        context=[
-            "Title of PMID:28027860: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))
-
-
-correctness_metric = GEval(
-    name="Correctness",
-    criteria="Determine whether the actual output is factually correct based on the expected output.",
-    # NOTE: you can only provide either criteria or evaluation_steps, and not both
-    evaluation_steps=[
-        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-        "You should also heavily penalize omission of detail",
-        "Vague language, or contradicting OPINIONS, are OK",
-    ],
-    threshold=0.8,
-    evaluation_params=[
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ],
-)
-
-
-@pytest.mark.llm
-def test_geval_eval():
-    """Test GEval correctness metric catches factual errors in output."""
-    metric = correctness_metric
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.",
-        actual_output='The title of the article with PMID:28027860 is "Predictors of acute and persisting fatigue in people with relapsing and remitting multiple sclerosis: A cohort study."',
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))

From 1017f637ed8e9f7096a7be49f6a892bb0dc9208b Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 17:55:30 -0700
Subject: [PATCH 5/5] marking more tests as llm

---
 tests/test_evals/test_runner.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_evals/test_runner.py b/tests/test_evals/test_runner.py
index 0515237..d1f0c3e 100644
--- a/tests/test_evals/test_runner.py
+++ b/tests/test_evals/test_runner.py
@@ -1,6 +1,9 @@
 """Tests for the evaluation runner.
 
 This uses only dummy coders, so can be used in non-integration contexts.
+
+TODO: some of these are marked llm because they use an LLM in the eval
+phase, even if they use a dummy coder - figure out a way to have a dummy LLM eval too
 """
 
 import pytest
@@ -137,6 +140,7 @@ def test_create_test_case_with_list_context(self):
         test_case = runner.create_test_case(eval_case, "4")
         assert test_case.retrieval_context == ["Math fact 1", "Math fact 2"]
 
+    @pytest.mark.llm
     def test_run_single_eval_with_dummy(self, simple_config, tmp_path):
         """Test running a single evaluation with dummy coder."""
         runner = EvalRunner()
@@ -244,6 +248,7 @@ def test_save_and_load_results(self, tmp_path):
         assert data["results"][0]["model"] == "model1"
         assert data["results"][0]["score"] == 0.9
 
+    @pytest.mark.llm
     def test_run_all_evals_with_dummy(self, simple_config, tmp_path):
         """Test running all evaluations with dummy coder."""
         runner = EvalRunner()
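A closing note on the llm marker used above: the series applies
@pytest.mark.llm but does not show where the marker is registered. One common
way to register a custom marker, so pytest does not warn about an unknown
mark, is a pytest_configure hook; a minimal sketch assuming a conftest.py,
which these patches do not include:

    # conftest.py (hypothetical; not part of this patch series)
    def pytest_configure(config):
        # Register the custom marker so pytest does not warn about an unregistered mark.
        config.addinivalue_line(
            "markers", "llm: tests that call a live LLM during the eval phase"
        )

Once registered, the LLM-dependent tests can be deselected in offline runs
with: pytest -m "not llm"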