From d25415c19007bd3794a34da27b5d65103cc5d76e Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 09:41:31 -0700
Subject: [PATCH 1/5] AI integration

---
 README.md                      |  4 ++--
 pyproject.toml                 |  1 +
 src/metacoder/configuration.py |  2 +-
 src/metacoder/metacoder.py     |  2 +-
 uv.lock                        | 27 +++++++++++++++++++++++++++
 5 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b591c01..e4e6846 100644
--- a/README.md
+++ b/README.md
@@ -24,10 +24,10 @@ metacoder "Write a Python function to calculate fibonacci numbers" -c claude -w
 ...
 
 # With custom instructions
-metacoder "Refactor this code" -c claude --instructions coding_guidelines.md
+metacoder "Refactor this code" -c claude --instructions coding_guidelines.md -w my-repo
 ...
 
-# Using MCPs
+# Using MCPs (e.g. GitHub MCP)
 metacoder "Fix issue 1234" -w path/to/my-repo --mcp-collection github_mcps.yaml
 ...
 
diff --git a/pyproject.toml b/pyproject.toml
index 5a09dfa..020908e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dev = [
     "mkdocstrings-python>=1.14.0",
     "mypy>=1.17.1",
     "pytest>=8.4.1",
+    "ruff>=0.12.8",
     "types-click>=7.1.8",
     "types-pyyaml>=6.0.12.20250516",
 ]
diff --git a/src/metacoder/configuration.py b/src/metacoder/configuration.py
index 03cefca..97222ae 100644
--- a/src/metacoder/configuration.py
+++ b/src/metacoder/configuration.py
@@ -54,7 +54,7 @@ class AIModelProvider(BaseModel):
     name: str = Field(..., description="Name of the model provider")
     api_key: str | None = Field(None, description="API key for the model provider")
     metadata: dict[str, Any] = Field({}, description="Metadata for the model provider")
-    base_url: str | None = Field(None, description="Base URL for the model provider")
+    base_url: str | None = Field(None, description="Base URL for the model provider, e.g. 
https://api.cborg.lbl.gov ") class AIModelConfig(BaseModel): diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index a74035a..cf40da2 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -239,7 +239,7 @@ def cli(ctx): "--provider", "-p", type=str, help="AI provider (e.g., openai, anthropic, google)" ) @click.option( - "--model", type=str, help="AI model name (e.g., gpt-4, claude-3-opus, gemini-pro)" + "--model", type=str, help="AI model name (e.g., gpt-4o, claude-4-sonnet, gemini-2.5pro)" ) @click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging") @click.option("--quiet", "-q", is_flag=True, help="Quiet mode") diff --git a/uv.lock b/uv.lock index 875df9f..b68d2f3 100644 --- a/uv.lock +++ b/uv.lock @@ -1515,6 +1515,7 @@ dev = [ { name = "mkdocstrings-python" }, { name = "mypy" }, { name = "pytest" }, + { name = "ruff" }, { name = "types-click" }, { name = "types-pyyaml" }, ] @@ -1539,6 +1540,7 @@ dev = [ { name = "mkdocstrings-python", specifier = ">=1.14.0" }, { name = "mypy", specifier = ">=1.17.1" }, { name = "pytest", specifier = ">=8.4.1" }, + { name = "ruff", specifier = ">=0.12.8" }, { name = "types-click", specifier = ">=7.1.8" }, { name = "types-pyyaml", specifier = ">=6.0.12.20250516" }, ] @@ -3271,6 +3273,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696 }, ] +[[package]] +name = "ruff" +version = "0.12.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4b/da/5bd7565be729e86e1442dad2c9a364ceeff82227c2dece7c29697a9795eb/ruff-0.12.8.tar.gz", hash = "sha256:4cb3a45525176e1009b2b64126acf5f9444ea59066262791febf55e40493a033", size = 5242373 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/1e/c843bfa8ad1114fab3eb2b78235dda76acd66384c663a4e0415ecc13aa1e/ruff-0.12.8-py3-none-linux_armv6l.whl", hash = "sha256:63cb5a5e933fc913e5823a0dfdc3c99add73f52d139d6cd5cc8639d0e0465513", size = 11675315 }, + { url = "https://files.pythonhosted.org/packages/24/ee/af6e5c2a8ca3a81676d5480a1025494fd104b8896266502bb4de2a0e8388/ruff-0.12.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:9a9bbe28f9f551accf84a24c366c1aa8774d6748438b47174f8e8565ab9dedbc", size = 12456653 }, + { url = "https://files.pythonhosted.org/packages/99/9d/e91f84dfe3866fa648c10512904991ecc326fd0b66578b324ee6ecb8f725/ruff-0.12.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2fae54e752a3150f7ee0e09bce2e133caf10ce9d971510a9b925392dc98d2fec", size = 11659690 }, + { url = "https://files.pythonhosted.org/packages/fe/ac/a363d25ec53040408ebdd4efcee929d48547665858ede0505d1d8041b2e5/ruff-0.12.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0acbcf01206df963d9331b5838fb31f3b44fa979ee7fa368b9b9057d89f4a53", size = 11896923 }, + { url = "https://files.pythonhosted.org/packages/58/9f/ea356cd87c395f6ade9bb81365bd909ff60860975ca1bc39f0e59de3da37/ruff-0.12.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ae3e7504666ad4c62f9ac8eedb52a93f9ebdeb34742b8b71cd3cccd24912719f", size = 11477612 }, + { url = "https://files.pythonhosted.org/packages/1a/46/92e8fa3c9dcfd49175225c09053916cb97bb7204f9f899c2f2baca69e450/ruff-0.12.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb82efb5d35d07497813a1c5647867390a7d83304562607f3579602fa3d7d46f", size 
= 13182745 }, + { url = "https://files.pythonhosted.org/packages/5e/c4/f2176a310f26e6160deaf661ef60db6c3bb62b7a35e57ae28f27a09a7d63/ruff-0.12.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:dbea798fc0065ad0b84a2947b0aff4233f0cb30f226f00a2c5850ca4393de609", size = 14206885 }, + { url = "https://files.pythonhosted.org/packages/87/9d/98e162f3eeeb6689acbedbae5050b4b3220754554526c50c292b611d3a63/ruff-0.12.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:49ebcaccc2bdad86fd51b7864e3d808aad404aab8df33d469b6e65584656263a", size = 13639381 }, + { url = "https://files.pythonhosted.org/packages/81/4e/1b7478b072fcde5161b48f64774d6edd59d6d198e4ba8918d9f4702b8043/ruff-0.12.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ac9c570634b98c71c88cb17badd90f13fc076a472ba6ef1d113d8ed3df109fb", size = 12613271 }, + { url = "https://files.pythonhosted.org/packages/e8/67/0c3c9179a3ad19791ef1b8f7138aa27d4578c78700551c60d9260b2c660d/ruff-0.12.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:560e0cd641e45591a3e42cb50ef61ce07162b9c233786663fdce2d8557d99818", size = 12847783 }, + { url = "https://files.pythonhosted.org/packages/4e/2a/0b6ac3dd045acf8aa229b12c9c17bb35508191b71a14904baf99573a21bd/ruff-0.12.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:71c83121512e7743fba5a8848c261dcc454cafb3ef2934a43f1b7a4eb5a447ea", size = 11702672 }, + { url = "https://files.pythonhosted.org/packages/9d/ee/f9fdc9f341b0430110de8b39a6ee5fa68c5706dc7c0aa940817947d6937e/ruff-0.12.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:de4429ef2ba091ecddedd300f4c3f24bca875d3d8b23340728c3cb0da81072c3", size = 11440626 }, + { url = "https://files.pythonhosted.org/packages/89/fb/b3aa2d482d05f44e4d197d1de5e3863feb13067b22c571b9561085c999dc/ruff-0.12.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a2cab5f60d5b65b50fba39a8950c8746df1627d54ba1197f970763917184b161", size = 12462162 }, + { url = "https://files.pythonhosted.org/packages/18/9f/5c5d93e1d00d854d5013c96e1a92c33b703a0332707a7cdbd0a4880a84fb/ruff-0.12.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:45c32487e14f60b88aad6be9fd5da5093dbefb0e3e1224131cb1d441d7cb7d46", size = 12913212 }, + { url = "https://files.pythonhosted.org/packages/71/13/ab9120add1c0e4604c71bfc2e4ef7d63bebece0cfe617013da289539cef8/ruff-0.12.8-py3-none-win32.whl", hash = "sha256:daf3475060a617fd5bc80638aeaf2f5937f10af3ec44464e280a9d2218e720d3", size = 11694382 }, + { url = "https://files.pythonhosted.org/packages/f6/dc/a2873b7c5001c62f46266685863bee2888caf469d1edac84bf3242074be2/ruff-0.12.8-py3-none-win_amd64.whl", hash = "sha256:7209531f1a1fcfbe8e46bcd7ab30e2f43604d8ba1c49029bb420b103d0b5f76e", size = 12740482 }, + { url = "https://files.pythonhosted.org/packages/cb/5c/799a1efb8b5abab56e8a9f2a0b72d12bd64bb55815e9476c7d0a2887d2f7/ruff-0.12.8-py3-none-win_arm64.whl", hash = "sha256:c90e1a334683ce41b0e7a04f41790c429bf5073b62c1ae701c9dc5b3d14f0749", size = 11884718 }, +] + [[package]] name = "sentry-sdk" version = "2.34.1" From 690a65e6b5cf376a86f78d305fe6cc32d4fc84cd Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Fri, 15 Aug 2025 17:32:58 -0700 Subject: [PATCH 2/5] ruff-ruff --- src/metacoder/coders/base_coder.py | 44 +++-- src/metacoder/coders/claude.py | 40 +++-- src/metacoder/coders/codex.py | 1 - src/metacoder/coders/dummy.py | 88 ++++++---- src/metacoder/coders/gemini.py | 20 +-- src/metacoder/coders/goose.py | 52 ++++-- src/metacoder/configuration.py | 5 +- src/metacoder/evals/runner.py | 1 - 
 src/metacoder/metacoder.py                   | 150 +++++++++-------
 tests/test_claude_tool_use.py                |  83 ++++-----
 tests/test_coders/test_coder_availability.py |  12 +-
 tests/test_coders/test_coders_basic.py       |  24 +--
 tests/test_dummy_coder_tool_capture.py       |  57 ++++---
 tests/test_dummy_tool_use.py                 |  27 +--
 tests/test_evals/test_deep_eval.py           |   2 -
 tests/test_goose_tool_use.py                 | 170 ++++++++++---------
 tests/test_instructions_option.py            |  16 +-
 tests/test_introspect_mcp.py                 |  23 +--
 tests/test_mcps/test_gemini_mcp.py           |  44 +++--
 tests/test_registry_loading.py               |  82 +++++----
 20 files changed, 521 insertions(+), 420 deletions(-)

diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py
index a8be80b..b44c6ec 100644
--- a/src/metacoder/coders/base_coder.py
+++ b/src/metacoder/coders/base_coder.py
@@ -21,10 +21,15 @@
 class ToolUse(BaseModel):
     """Tool use from the coder."""
 
-    name: str = Field(..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext")
+
+    name: str = Field(
+        ..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext"
+    )
     arguments: dict[str, Any] = Field(..., description="Arguments to the tool")
     success: bool = Field(..., description="Whether the tool call was successful")
-    error: str | None = Field(default=None, description="Error message if the tool call failed")
+    error: str | None = Field(
+        default=None, description="Error message if the tool call failed"
+    )
     result: Any = Field(..., description="Result of the tool")
 
 
@@ -87,6 +92,7 @@ class BaseCoder(BaseModel, ABC):
     Subclasses should implement the following methods:
     - run(self, input_text: str) -> CoderOutput: Run the coder on the input text
     """
+
     workdir: str = Field(default="workdir", description="Working dir ")
     config: CoderConfig | None = Field(default=None, description="Config for the coder")
     params: dict | None = Field(default=None, description="Parameters for the coder")
@@ -115,8 +121,6 @@ def validate_mcp_support(self):
             )
         return self
-
-
 
     @abstractmethod
     def run(self, input_text: str) -> CoderOutput:
         """Run the coder on the input text.
@@ -129,7 +133,6 @@ def run(self, input_text: str) -> CoderOutput:
         """
         raise NotImplementedError
-
 
     @classmethod
     def default_config_paths(cls) -> dict[Path, ConfigFileRole]:
         """Return config files as a dictionary of filename/dirname to role."""
@@ -220,7 +223,6 @@ def stream_output(pipe, output_lines, stream):
 
         return CoderOutput(stdout=stdout_text, stderr=stderr_text)
-
 
     def expand_env(self, env: dict[str, str] | None = None) -> dict[str, str]:
         """
         Expand environment variables in the coder config.
@@ -257,7 +259,7 @@ def expand_env(self, env: dict[str, str] | None = None) -> dict[str, str]:
 
     def expand_prompt(self, input_text: str) -> str:
         """Expand environment variables in the prompt.
-        
+
         Typically this just returns the prompt as is:
 
         Example:
@@ -278,7 +280,7 @@ def expand_prompt(self, input_text: str) -> str:
     def default_config_objects(self) -> list[CoderConfigObject]:
         """Default config objects for the coder."""
         raise NotImplementedError("default_config_objects is not implemented")
-    
+
     def set_instructions(self, instructions: str):
         """Set the instructions for the coder.
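
[For reference, ToolUse — reformatted above — is the record that the Claude and Goose tool-call parsers later in this series populate, and that the tests assert against. A minimal self-contained sketch, assuming pydantic v2 and exactly the fields shown in this hunk; the example values are illustrative, echoing the dummy-coder tests further down:

from typing import Any

from pydantic import BaseModel, Field


class ToolUse(BaseModel):
    """Tool use from the coder."""

    name: str = Field(..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext")
    arguments: dict[str, Any] = Field(..., description="Arguments to the tool")
    success: bool = Field(..., description="Whether the tool call was successful")
    error: str | None = Field(default=None, description="Error message if the tool call failed")
    result: Any = Field(..., description="Result of the tool")


# Illustrative values (mirroring the dummy coder tests below):
call = ToolUse(
    name="mcp__pubmed__search_papers",
    arguments={"query": "test query", "limit": 10},
    success=True,
    result={"papers": ["paper1", "paper2"], "count": 2},
)
assert call.model_dump()["error"] is None  # error defaults to None

This is why the parsers below keep a pending_tool_uses map: a ToolUse can only be built once both the request (name/arguments) and the result (success/error/result) have been seen.]
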
@@ -291,7 +293,7 @@ def set_instructions(self, instructions: str):
         >>> coder.set_instructions("you are an awesome coder")
         >>> coder.config_objects
         [CoderConfigObject(file_type=<FileType.TEXT: 'text'>, relative_path='CLAUDE.md', content='you are an awesome coder')]
-        
+
         Args:
             instructions: The instructions to set
         """
@@ -300,16 +302,25 @@ def set_instructions(self, instructions: str):
             if not self.config_objects:
                 self.config_objects = []
             for obj in self.config_objects:
-                if obj.relative_path == str(path) or obj.relative_path == str(path.name):
+                if obj.relative_path == str(path) or obj.relative_path == str(
+                    path.name
+                ):
                     obj.content = instructions
                     return
             else:
-                self.config_objects.append(CoderConfigObject(relative_path=str(path), content=instructions, file_type=FileType.TEXT))
+                self.config_objects.append(
+                    CoderConfigObject(
+                        relative_path=str(path),
+                        content=instructions,
+                        file_type=FileType.TEXT,
+                    )
+                )
                 return
         else:
             raise ValueError(f"Cannot set instructions for {typ}")
-        raise ValueError(f"No primary instruction file found for {self.__class__.__name__}")
-
+        raise ValueError(
+            f"No primary instruction file found for {self.__class__.__name__}"
+        )
 
     def prepare_workdir(self):
         """Prepare the workdir for the coder.
@@ -330,11 +341,7 @@ def prepare_workdir(self):
         # Check if MCP extensions are configured but not supported
         if self.config and self.config.extensions:
             logger.debug(f"šŸ”§ Checking MCP extensions: {self.config.extensions}")
-            mcp_extensions = [
-                ext
-                for ext in self.config.extensions
-                if ext.enabled
-            ]
+            mcp_extensions = [ext for ext in self.config.extensions if ext.enabled]
             if mcp_extensions and not self.supports_mcp():
                 raise ValueError(
                     f"MCP extensions are configured but {self.__class__.__name__} does not support MCP. "
@@ -353,6 +360,7 @@ def prepare_workdir(self):
                     logger.debug(f"  šŸ—‘ļø Removing old config object: {path}")
                     if path.is_dir():
                         import shutil
+
                         shutil.rmtree(path)
                     else:
                         path.unlink()
diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
index 1c67c20..cf1af7c 100644
--- a/src/metacoder/coders/claude.py
+++ b/src/metacoder/coders/claude.py
@@ -147,6 +147,7 @@ def run(self, input_text: str) -> CoderOutput:
         # time the command
         start_time = time.time()
         ao = self.run_process(command, env)
+
         # parse the jsonl output
         def parse_jsonl_line(text: str) -> dict[str, Any]:
             try:
@@ -154,17 +155,20 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 return result
             except json.JSONDecodeError:
                 return {"original": text, "error": "JSONDecodeError"}
+
         ao.structured_messages = [
             parse_jsonl_line(line) for line in ao.stdout.split("\n") if line
         ]
-        ao.structured_messages = [m for m in ao.structured_messages if m is not None]
+        ao.structured_messages = [
+            m for m in ao.structured_messages if m is not None
+        ]
         total_cost_usd = None
         is_error = None
-        
+
         # Extract tool uses
         tool_uses = []
         pending_tool_uses = {}  # Map tool_use_id to tool data
-        
+
         for message in ao.structured_messages:
             if "total_cost_usd" in message:
                 total_cost_usd = message["total_cost_usd"]
@@ -172,7 +176,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 is_error = message["is_error"]
             if "result" in message:
                 ao.result_text = message["result"]
-            
+
             # Check for tool_use in assistant messages
             if message.get("type") == "assistant" and message.get("message"):
                 msg_content = message["message"].get("content", [])
@@ -182,16 +186,16 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                         tool_id = content_item.get("id")
                         tool_name = content_item.get("name", "")
                         tool_input = content_item.get("input", {})
-                        
+
                         # Store 
pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_input, "success": False, # Default to False until we see result "error": None, - "result": None + "result": None, } - + # Check for tool_result in user messages elif message.get("type") == "user" and message.get("message"): msg_content = message["message"].get("content", []) @@ -201,31 +205,35 @@ def parse_jsonl_line(text: str) -> dict[str, Any]: tool_id = content_item.get("tool_use_id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] - + # Update with result is_tool_error = content_item.get("is_error", False) tool_data["success"] = not is_tool_error - tool_data["result"] = content_item.get("content", "") - + tool_data["result"] = content_item.get( + "content", "" + ) + if is_tool_error: - tool_data["error"] = content_item.get("content", "Tool error occurred") - + tool_data["error"] = content_item.get( + "content", "Tool error occurred" + ) + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Add any remaining pending tool uses (shouldn't happen in normal flow) for tool_data in pending_tool_uses.values(): tool_data["error"] = "No result received for tool call" tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + if tool_uses: ao.tool_uses = tool_uses - + end_time = time.time() logger.info(f"šŸ¤– Command took {end_time - start_time} seconds") ao.total_cost_usd = total_cost_usd diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 2f29483..8e9169e 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -26,7 +26,6 @@ def is_available(cls) -> bool: """Check if codex command is available.""" return shutil.which("codex") is not None - @property def instructions_path(self) -> Path: return Path("AGENTS.md") diff --git a/src/metacoder/coders/dummy.py b/src/metacoder/coders/dummy.py index bb93159..d55378d 100644 --- a/src/metacoder/coders/dummy.py +++ b/src/metacoder/coders/dummy.py @@ -1,17 +1,22 @@ from pathlib import Path -from metacoder.coders.base_coder import BaseCoder, CoderConfigObject, CoderOutput, ToolUse +from metacoder.coders.base_coder import ( + BaseCoder, + CoderConfigObject, + CoderOutput, + ToolUse, +) from metacoder.configuration import ConfigFileRole class DummyCoder(BaseCoder): """ Dummy coder for testing. - + Simulates tool use when input contains keywords: - "tool" or "mcp": Adds a generic test tool - "search" or "pubmed": Simulates a PubMed search tool - "error": Simulates a tool failure - + Multiple keywords can trigger multiple tools. 
""" @@ -34,58 +39,71 @@ def run(self, input_text: str) -> CoderOutput: instructions_content = None if self.config_objects: for obj in self.config_objects: - if obj.relative_path == "INSTRUCTIONS.md" or obj.relative_path == str(Path("INSTRUCTIONS.md")): + if obj.relative_path == "INSTRUCTIONS.md" or obj.relative_path == str( + Path("INSTRUCTIONS.md") + ): instructions_content = obj.content break - + # Create response based on whether instructions exist if instructions_content: - response = f"Instructions loaded: {instructions_content}\nProcessing: {input_text}" + response = ( + f"Instructions loaded: {instructions_content}\nProcessing: {input_text}" + ) else: response = f"you said: {input_text}" - + output = CoderOutput( stdout=response, stderr="", result_text=response, ) - + # Add fake tool uses if input mentions tools, MCP, or specific services - if any(keyword in input_text.lower() for keyword in ["tool", "mcp", "pubmed", "search"]): + if any( + keyword in input_text.lower() + for keyword in ["tool", "mcp", "pubmed", "search"] + ): # Create some fake tool uses for testing tool_uses = [] - + # Simulate a successful tool call if "search" in input_text.lower() or "pubmed" in input_text.lower(): - tool_uses.append(ToolUse( - name="mcp__pubmed__search_papers", - arguments={"query": "test query", "limit": 10}, - success=True, - error=None, - result={"papers": ["paper1", "paper2"], "count": 2} - )) - + tool_uses.append( + ToolUse( + name="mcp__pubmed__search_papers", + arguments={"query": "test query", "limit": 10}, + success=True, + error=None, + result={"papers": ["paper1", "paper2"], "count": 2}, + ) + ) + # Simulate a tool with an error if "error" in input_text.lower(): - tool_uses.append(ToolUse( - name="mcp__test__failing_tool", - arguments={"param": "value"}, - success=False, - error="Simulated tool error for testing", - result=None - )) - + tool_uses.append( + ToolUse( + name="mcp__test__failing_tool", + arguments={"param": "value"}, + success=False, + error="Simulated tool error for testing", + result=None, + ) + ) + # Default tool if no specific keywords but general tool/mcp mentioned if not tool_uses: - tool_uses.append(ToolUse( - name="mcp__dummy__test_tool", - arguments={"input": input_text}, - success=True, - error=None, - result="Test tool executed successfully" - )) - + tool_uses.append( + ToolUse( + name="mcp__dummy__test_tool", + arguments={"input": input_text}, + success=True, + error=None, + result="Test tool executed successfully", + ) + ) + if tool_uses: output.tool_uses = tool_uses - + return output diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index f81a5c8..20564a9 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -32,7 +32,7 @@ class GeminiCoder(BaseCoder): - `.gemini/commands/` - Custom commands directory MCP Support: - + Gemini CLI supports MCP (Model Context Protocol) servers through the mcpServers configuration in .gemini/settings.json. When MCPs are configured through Metacoder, they will be automatically added to the settings file. 
@@ -86,29 +86,27 @@ def mcp_config_to_gemini_format(self, mcp: MCPConfig) -> dict[str, Any]: # For HTTP type MCPs elif mcp.type == MCPType.HTTP: - raise NotImplementedError( - "HTTP MCPs are not supported for Gemini CLI yet" - ) + raise NotImplementedError("HTTP MCPs are not supported for Gemini CLI yet") return server_config def default_config_objects(self) -> list[CoderConfigObject]: """Generate config objects including MCP configuration.""" config_objects = [] - + # Create .gemini/settings.json if we have MCP extensions settings_content: dict[str, Any] = {} - + # Add MCP servers configuration if extensions are present if self.config and self.config.extensions: mcp_servers = {} for mcp in self.config.extensions: if mcp.enabled: mcp_servers[mcp.name] = self.mcp_config_to_gemini_format(mcp) - + if mcp_servers: settings_content["mcpServers"] = mcp_servers - + # Add settings.json if we have content to write if settings_content: config_objects.append( @@ -118,10 +116,10 @@ def default_config_objects(self) -> list[CoderConfigObject]: content=settings_content, ) ) - + # Add GEMINI.md if present in config # This could contain instructions specific to the task - + return config_objects def run(self, input_text: str) -> CoderOutput: @@ -136,7 +134,7 @@ def run(self, input_text: str) -> CoderOutput: env["HOME"] = "." text = self.expand_prompt(input_text) - + # Build the command # The gemini CLI uses conversational interface, so we need to handle it differently # For now, we'll use echo to pipe the prompt diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 9b76f4b..514dc2b 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -40,7 +40,6 @@ def supports_mcp(cls) -> bool: """GooseCoder supports MCP extensions.""" return True - def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict: """Convert an MCPConfig to Goose extension format.""" extension = { @@ -69,7 +68,7 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict: extension["bundled"] = None return extension - + @classmethod def default_config_paths(cls) -> dict[Path, ConfigFileRole]: return { @@ -196,38 +195,44 @@ def run(self, input_text: str) -> CoderOutput: if ao.structured_messages: tool_uses = [] pending_tool_uses = {} # Map tool request id to tool data - + for message in ao.structured_messages: # Check for tool requests in assistant messages if message.get("role") == "assistant" and "content" in message: for content in message.get("content", []): - if isinstance(content, dict) and content.get("type") == "toolRequest": + if ( + isinstance(content, dict) + and content.get("type") == "toolRequest" + ): tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) tool_name = tool_value.get("name", "") tool_args = tool_value.get("arguments", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_args, "success": False, # Default until we see result "error": None, - "result": None + "result": None, } - + # Check for tool responses in user messages elif message.get("role") == "user" and "content" in message: for content in message.get("content", []): - if isinstance(content, dict) and content.get("type") == "toolResponse": + if ( + isinstance(content, dict) + and content.get("type") == "toolResponse" + ): tool_id = content.get("id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = 
content.get("toolResult", {}) - + # Update with result if tool_result.get("status") == "success": tool_data["success"] = True @@ -236,29 +241,40 @@ def run(self, input_text: str) -> CoderOutput: if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": - result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): + result_texts.append( + item.get("text", "") + ) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) else: tool_data["success"] = False - tool_data["error"] = tool_result.get("error", "Tool execution failed") + tool_data["error"] = tool_result.get( + "error", "Tool execution failed" + ) tool_data["result"] = None - + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Add any remaining pending tool uses (shouldn't happen in normal flow) for tool_data in pending_tool_uses.values(): tool_data["error"] = "No result received for tool call" tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + if tool_uses: ao.tool_uses = tool_uses diff --git a/src/metacoder/configuration.py b/src/metacoder/configuration.py index 97222ae..5110fe8 100644 --- a/src/metacoder/configuration.py +++ b/src/metacoder/configuration.py @@ -54,7 +54,10 @@ class AIModelProvider(BaseModel): name: str = Field(..., description="Name of the model provider") api_key: str | None = Field(None, description="API key for the model provider") metadata: dict[str, Any] = Field({}, description="Metadata for the model provider") - base_url: str | None = Field(None, description="Base URL for the model provider, e.g. https://api.cborg.lbl.gov ") + base_url: str | None = Field( + None, + description="Base URL for the model provider, e.g. https://api.cborg.lbl.gov ", + ) class AIModelConfig(BaseModel): diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index a12658a..d2ed34b 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -103,7 +103,6 @@ def create_coder(coder_name: str, workdir: str, config=None) -> BaseCoder: return coder - class EvalResult(BaseModel): """Result of a single evaluation.""" diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index cf40da2..f62d3df 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -51,16 +51,16 @@ def load_mcp_collection(collection_path: Path) -> MCPCollectionConfig: def load_mcp_registry(registry_path: str) -> MCPCollectionConfig: """Load MCPs from the registry based on a path pattern. 
- + Args: registry_path: Path pattern like 'metacoder' (all) or 'metacoder.basics' - + Returns: MCPCollectionConfig containing all matched MCPs """ # Base directory for registry registry_base = Path(__file__).parent / "mcps" / "registry" - + # Convert dot notation to file path if registry_path == "metacoder": # Load all yaml files in registry @@ -68,21 +68,21 @@ def load_mcp_registry(registry_path: str) -> MCPCollectionConfig: else: # Convert metacoder.basics to basics.yaml if registry_path.startswith("metacoder."): - registry_path = registry_path[len("metacoder."):] + registry_path = registry_path[len("metacoder.") :] yaml_files = [registry_base / f"{registry_path}.yaml"] - + # Collect all MCPs all_mcps = [] for yaml_file in yaml_files: if not yaml_file.exists(): raise click.ClickException(f"Registry file not found: {yaml_file}") - + try: with open(yaml_file, "r") as f: data = yaml.safe_load(f) except yaml.YAMLError as e: raise click.ClickException(f"Invalid YAML in {yaml_file}: {e}") - + # The registry files contain a list of MCP extensions directly if isinstance(data, list): for mcp_data in data: @@ -99,7 +99,7 @@ def load_mcp_registry(registry_path: str) -> MCPCollectionConfig: logger.warning(f"Invalid MCP in {yaml_file}: {e}") for mcp in all_mcps: mcp.enabled = False - + # Create a collection config collection_name = f"Registry: {registry_path}" return MCPCollectionConfig(name=collection_name, description=None, servers=all_mcps) @@ -239,7 +239,9 @@ def cli(ctx): "--provider", "-p", type=str, help="AI provider (e.g., openai, anthropic, google)" ) @click.option( - "--model", type=str, help="AI model name (e.g., gpt-4o, claude-4-sonnet, gemini-2.5pro)" + "--model", + type=str, + help="AI model name (e.g., gpt-4o, claude-4-sonnet, gemini-2.5pro)", ) @click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging") @click.option("--quiet", "-q", is_flag=True, help="Quiet mode") @@ -323,7 +325,7 @@ def run( raise click.ClickException("Cannot use both verbose and quiet mode") if verbose: logging.basicConfig(level=logging.DEBUG) - elif quiet: # quiet mode is a bit different, it's just no output + elif quiet: # quiet mode is a bit different, it's just no output logging.basicConfig(level=logging.WARNING) else: logging.basicConfig(level=logging.INFO) @@ -358,26 +360,29 @@ def run( click.echo( f" Enabling MCPs: {', '.join(enabled_list)} (all enabled by default)" ) - + # Load MCPs from registry if provided if registry: click.echo(f"šŸ“š Loading MCPs from registry: {registry}") registry_config = load_mcp_registry(registry) - + # Merge with existing MCP collection if any if mcp_collection_config: # Merge the servers lists for mcp in registry_config.servers: # Avoid duplicates by name - if not any(existing.name == mcp.name for existing in mcp_collection_config.servers): + if not any( + existing.name == mcp.name + for existing in mcp_collection_config.servers + ): mcp_collection_config.servers.append(mcp) else: mcp_collection_config = registry_config - + # Show available MCPs from registry registry_mcps = [mcp.name for mcp in registry_config.servers] click.echo(f" Registry MCPs: {', '.join(registry_mcps)}") - + # Note that registry MCPs are not enabled by default if not enable_mcp: click.echo(" Use -e/--enable-mcp to enable specific MCPs") @@ -421,7 +426,7 @@ def run( ) if coder_config and coder_config.extensions: - for mcp in coder_config.extensions : + for mcp in coder_config.extensions: # use emoji to indicate enabled/disabled if mcp.enabled: click.echo(f" āœ… MCP: {mcp.name}") @@ 
-476,7 +481,9 @@ def run( click.echo("\nšŸ“‹ Tool uses:") for tool_use in result.tool_uses: success = "āœ…" if tool_use.success else "āŒ" - click.echo(f" {success} {tool_use.name} with arguments: {tool_use.arguments}") + click.echo( + f" {success} {tool_use.name} with arguments: {tool_use.arguments}" + ) if tool_use.error: click.echo(f" Error: {tool_use.error}") @@ -485,7 +492,7 @@ def run( f"\nšŸ“‹ Structured messages ({len(result.structured_messages)} total)" ) for i, msg in enumerate(result.structured_messages): - click.echo(f" {i+1}. {msg}") + click.echo(f" {i + 1}. {msg}") @cli.command("list-coders") @@ -588,10 +595,10 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: click.echo("\nšŸ“ˆ Summary:") click.echo(f" Total: {summary['total_evaluations']}") click.echo( - f" Passed: {summary['passed']} ({summary['passed']/summary['total_evaluations']*100:.1f}%)" + f" Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)" ) click.echo( - f" Failed: {summary['failed']} ({summary['failed']/summary['total_evaluations']*100:.1f}%)" + f" Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)" ) if summary["errors"] > 0: click.echo(f" Errors: {summary['errors']} āš ļø") @@ -640,22 +647,22 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose: bool): """ Introspect an MCP server to list its available tools, resources, and prompts. - + MCP_SPEC can be: - A URL (http://localhost:8080) - A command (uvx mcp-server-fetch) - An MCP name when used with --registry - + Examples: - + \b # Introspect a running MCP server metacoder introspect-mcp http://localhost:8080 - + \b # Introspect an MCP from registry metacoder introspect-mcp fetch --registry metacoder.basics - + \b # Introspect a command-based MCP metacoder introspect-mcp "uvx mcp-server-fetch" @@ -665,18 +672,24 @@ def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) - + # Run the introspection with proper cleanup import os import sys - + # Suppress the specific asyncio warning by running with -W flag env = os.environ.copy() - env['PYTHONWARNINGS'] = 'ignore::RuntimeWarning:asyncio.base_subprocess' - + env["PYTHONWARNINGS"] = "ignore::RuntimeWarning:asyncio.base_subprocess" + # Run in a subprocess to isolate the asyncio event loop import subprocess - args = [sys.executable, "-W", "ignore::RuntimeWarning:asyncio.base_subprocess", "-c", f""" + + args = [ + sys.executable, + "-W", + "ignore::RuntimeWarning:asyncio.base_subprocess", + "-c", + f""" import asyncio import sys sys.path.insert(0, {repr(str(Path(__file__).parent.parent))}) @@ -688,26 +701,26 @@ def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose except Exception as e: print(f"Error: {{e}}", file=sys.stderr) sys.exit(1) -"""] - +""", + ] + try: # Run with stderr captured to filter out asyncio warnings result = subprocess.run( - args, - env=env, - timeout=timeout + 5, - stderr=subprocess.PIPE, - text=True + args, env=env, timeout=timeout + 5, stderr=subprocess.PIPE, text=True ) - + # Filter out the specific asyncio warning from stderr if result.stderr: error_lines = [] skip_next = 0 lines = result.stderr.splitlines() - + for i, line in enumerate(lines): - if "Exception ignored in: 0: @@ -717,12 +730,12 @@ def introspect_mcp(mcp_spec: str, 
registry: Optional[str], timeout: int, verbose skip_next = 0 # Stop skipping after this line else: error_lines.append(line) - + # Print any remaining stderr if error_lines: for line in error_lines: click.echo(line, err=True) - + if result.returncode != 0: raise click.ClickException("Failed to introspect MCP server") except subprocess.TimeoutExpired: @@ -736,48 +749,50 @@ async def _introspect_mcp_async( ): """Async implementation of MCP introspection.""" from fastmcp import Client - + mcp_config = None spec_to_use: Union[str, list[str]] = mcp_spec - + # If registry is specified, load the MCP config if registry: click.echo(f"šŸ“š Loading MCP '{mcp_spec}' from registry: {registry}") registry_config = load_mcp_registry(registry) - + # Find the MCP in the registry mcp_config = None for mcp in registry_config.servers: if mcp.name == mcp_spec: mcp_config = mcp break - + if not mcp_config: available = [mcp.name for mcp in registry_config.servers] raise click.ClickException( f"MCP '{mcp_spec}' not found in registry. Available: {', '.join(available)}" ) - + # Build the command from MCP config if mcp_config.command and mcp_config.args: spec_to_use = [mcp_config.command] + mcp_config.args else: - raise click.ClickException(f"MCP '{mcp_spec}' has incomplete command configuration") - + raise click.ClickException( + f"MCP '{mcp_spec}' has incomplete command configuration" + ) + click.echo(f"šŸ” Introspecting MCP: {spec_to_use}") - + # Create client based on the spec type if isinstance(spec_to_use, list): # Command-based MCP - FastMCP expects a single server config dict server_config = { "server_name": { "command": spec_to_use[0], - "args": spec_to_use[1:] if len(spec_to_use) > 1 else [] + "args": spec_to_use[1:] if len(spec_to_use) > 1 else [], } } if mcp_config and mcp_config.env: server_config["server_name"]["env"] = mcp_config.env # type: ignore - + # FastMCP expects the full config with mcpServers key full_config = {"mcpServers": server_config} client = Client(full_config) @@ -787,28 +802,29 @@ async def _introspect_mcp_async( else: # Try as command import shlex + parts = shlex.split(spec_to_use) server_config = { "server_name": { "command": parts[0], - "args": parts[1:] if len(parts) > 1 else [] + "args": parts[1:] if len(parts) > 1 else [], } } full_config = {"mcpServers": server_config} client = Client(full_config) - + async with client: click.echo("āœ… Connected to MCP server") - + # Get server info if available - if hasattr(client, 'server_info'): + if hasattr(client, "server_info"): info = client.server_info click.echo("\nšŸ“‹ Server Info:") click.echo(f" Name: {info.name}") click.echo(f" Version: {info.version}") - if hasattr(info, 'description') and info.description: + if hasattr(info, "description") and info.description: click.echo(f" Description: {info.description}") - + # List tools click.echo("\nšŸ”§ Available Tools:") try: @@ -818,13 +834,15 @@ async def _introspect_mcp_async( click.echo(f"\n šŸ“Œ {tool.name}") if tool.description: click.echo(f" Description: {tool.description}") - if verbose and hasattr(tool, 'inputSchema') and tool.inputSchema: - click.echo(f" Input Schema: {yaml.dump(tool.inputSchema, default_flow_style=False, indent=8).strip()}") + if verbose and hasattr(tool, "inputSchema") and tool.inputSchema: + click.echo( + f" Input Schema: {yaml.dump(tool.inputSchema, default_flow_style=False, indent=8).strip()}" + ) else: click.echo(" (No tools available)") except Exception as e: click.echo(f" āš ļø Error listing tools: {e}") - + # List resources click.echo("\nšŸ“ Available 
Resources:") try: @@ -841,7 +859,7 @@ async def _introspect_mcp_async( click.echo(" (No resources available)") except Exception as e: click.echo(f" āš ļø Error listing resources: {e}") - + # List prompts click.echo("\nšŸ’¬ Available Prompts:") try: @@ -851,16 +869,18 @@ async def _introspect_mcp_async( click.echo(f"\n šŸ’” {prompt.name}") if prompt.description: click.echo(f" Description: {prompt.description}") - if verbose and hasattr(prompt, 'arguments') and prompt.arguments: + if verbose and hasattr(prompt, "arguments") and prompt.arguments: click.echo(" Arguments:") for arg in prompt.arguments: req = "required" if arg.required else "optional" - click.echo(f" - {arg.name} ({req}): {arg.description}") + click.echo( + f" - {arg.name} ({req}): {arg.description}" + ) else: click.echo(" (No prompts available)") except Exception as e: click.echo(f" āš ļø Error listing prompts: {e}") - + click.echo("\nāœ… Introspection complete!") diff --git a/tests/test_claude_tool_use.py b/tests/test_claude_tool_use.py index 817617f..3d74485 100644 --- a/tests/test_claude_tool_use.py +++ b/tests/test_claude_tool_use.py @@ -1,10 +1,11 @@ """Test ClaudeCoder tool use extraction.""" + from metacoder.coders.base_coder import CoderOutput, ToolUse def test_claude_tool_use_extraction(): """Test that ClaudeCoder correctly extracts tool uses from structured messages.""" - + # Create a mock output with tool use in structured messages output = CoderOutput( stdout="", @@ -18,31 +19,31 @@ def test_claude_tool_use_extraction(): "type": "tool_use", "id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s", "name": "mcp__pubmed__get_paper_fulltext", - "input": {"pmid": "35743164"} + "input": {"pmid": "35743164"}, } ] - } + }, }, { - "type": "user", + "type": "user", "message": { "content": [ { "type": "tool_result", "content": "Paper content here...", "is_error": False, - "tool_use_id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s" + "tool_use_id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s", } ] - } - } - ] + }, + }, + ], ) - + # Process structured messages to extract tool uses tool_uses = [] pending_tool_uses = {} - + for message in output.structured_messages: # Check for tool_use in assistant messages if message.get("type") == "assistant" and message.get("message"): @@ -53,16 +54,16 @@ def test_claude_tool_use_extraction(): tool_id = content_item.get("id") tool_name = content_item.get("name", "") tool_input = content_item.get("input", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_input, "success": False, "error": None, - "result": None + "result": None, } - + # Check for tool_result in user messages elif message.get("type") == "user" and message.get("message"): msg_content = message["message"].get("content", []) @@ -72,22 +73,24 @@ def test_claude_tool_use_extraction(): tool_id = content_item.get("tool_use_id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] - + # Update with result is_tool_error = content_item.get("is_error", False) tool_data["success"] = not is_tool_error tool_data["result"] = content_item.get("content", "") - + if is_tool_error: - tool_data["error"] = content_item.get("content", "Tool error occurred") - + tool_data["error"] = content_item.get( + "content", "Tool error occurred" + ) + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Verify extraction assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -100,7 +103,7 @@ def test_claude_tool_use_extraction(): def 
test_claude_tool_use_error(): """Test that ClaudeCoder correctly handles tool errors.""" - + # Create a mock output with tool error output = CoderOutput( stdout="", @@ -114,31 +117,31 @@ def test_claude_tool_use_error(): "type": "tool_use", "id": "toolu_test", "name": "mcp__pubmed__get_paper_fulltext", - "input": {"pmid": "invalid"} + "input": {"pmid": "invalid"}, } ] - } + }, }, { - "type": "user", + "type": "user", "message": { "content": [ { "type": "tool_result", "content": "MCP tool response exceeds maximum allowed tokens", "is_error": True, - "tool_use_id": "toolu_test" + "tool_use_id": "toolu_test", } ] - } - } - ] + }, + }, + ], ) - + # Process structured messages to extract tool uses tool_uses = [] pending_tool_uses = {} - + for message in output.structured_messages: # Check for tool_use in assistant messages if message.get("type") == "assistant" and message.get("message"): @@ -149,16 +152,16 @@ def test_claude_tool_use_error(): tool_id = content_item.get("id") tool_name = content_item.get("name", "") tool_input = content_item.get("input", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_input, "success": False, "error": None, - "result": None + "result": None, } - + # Check for tool_result in user messages elif message.get("type") == "user" and message.get("message"): msg_content = message["message"].get("content", []) @@ -168,22 +171,24 @@ def test_claude_tool_use_error(): tool_id = content_item.get("tool_use_id") if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] - + # Update with result is_tool_error = content_item.get("is_error", False) tool_data["success"] = not is_tool_error tool_data["result"] = content_item.get("content", "") - + if is_tool_error: - tool_data["error"] = content_item.get("content", "Tool error occurred") - + tool_data["error"] = content_item.get( + "content", "Tool error occurred" + ) + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Verify error handling assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -191,4 +196,4 @@ def test_claude_tool_use_error(): assert tool_use.arguments == {"pmid": "invalid"} assert tool_use.success is False assert tool_use.error == "MCP tool response exceeds maximum allowed tokens" - assert tool_use.result == "MCP tool response exceeds maximum allowed tokens" \ No newline at end of file + assert tool_use.result == "MCP tool response exceeds maximum allowed tokens" diff --git a/tests/test_coders/test_coder_availability.py b/tests/test_coders/test_coder_availability.py index d9a75d3..4b53b63 100644 --- a/tests/test_coders/test_coder_availability.py +++ b/tests/test_coders/test_coder_availability.py @@ -59,12 +59,12 @@ def test_all_coders_have_availability_method(): from metacoder.metacoder import AVAILABLE_CODERS for coder_name, coder_class in AVAILABLE_CODERS.items(): - assert hasattr( - coder_class, "is_available" - ), f"{coder_name} missing is_available method" - assert callable( - coder_class.is_available - ), f"{coder_name}.is_available is not callable" + assert hasattr(coder_class, "is_available"), ( + f"{coder_name} missing is_available method" + ) + assert callable(coder_class.is_available), ( + f"{coder_name}.is_available is not callable" + ) @patch("shutil.which") diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py index 4180e92..cb73641 100644 --- a/tests/test_coders/test_coders_basic.py +++ 
b/tests/test_coders/test_coders_basic.py @@ -62,17 +62,17 @@ def test_llm_coder_basic_arithmetic(coder_name, coder_class): # Check result assert result is not None - assert ( - result.stdout or result.result_text - ), "Coder should produce some output" + assert result.stdout or result.result_text, ( + "Coder should produce some output" + ) # Get the actual output text output_text = result.result_text or result.stdout # All LLM coders should include "4" in their answer - assert ( - "4" in output_text - ), f"{coder_name} should answer '4' to 'What is 2+2?'" + assert "4" in output_text, ( + f"{coder_name} should answer '4' to 'What is 2+2?'" + ) except Exception as e: pytest.fail(f"Coder {coder_name} failed with error: {e}") @@ -137,12 +137,12 @@ def test_llm_coder_code_generation(coder_name, coder_class): assert output_text, "Coder should produce some output" # Verify the output contains Python code elements - assert ( - "def" in output_text - ), f"{coder_name} should generate a Python function" - assert ( - "return" in output_text or "print" in output_text - ), f"{coder_name} should have return or print" + assert "def" in output_text, ( + f"{coder_name} should generate a Python function" + ) + assert "return" in output_text or "print" in output_text, ( + f"{coder_name} should have return or print" + ) except Exception as e: pytest.fail(f"Coder {coder_name} failed with error: {e}") diff --git a/tests/test_dummy_coder_tool_capture.py b/tests/test_dummy_coder_tool_capture.py index a3d6364..b92b7ab 100644 --- a/tests/test_dummy_coder_tool_capture.py +++ b/tests/test_dummy_coder_tool_capture.py @@ -1,4 +1,5 @@ """Test that DummyCoder properly captures tool calls in CoderOutput.""" + from metacoder.coders.dummy import DummyCoder from metacoder.coders.base_coder import CoderOutput, ToolUse @@ -6,23 +7,25 @@ def test_dummy_coder_captures_tool_calls(): """Test that DummyCoder captures tool calls in the CoderOutput.""" coder = DummyCoder(workdir="test") - + # Run with input that triggers tool use output = coder.run("Use MCP to search PubMed for cancer research") - + # Verify output is a CoderOutput instance assert isinstance(output, CoderOutput) - + # Verify basic output fields assert output.stdout == "you said: Use MCP to search PubMed for cancer research" assert output.stderr == "" - assert output.result_text == "you said: Use MCP to search PubMed for cancer research" - + assert ( + output.result_text == "you said: Use MCP to search PubMed for cancer research" + ) + # Verify tool_uses is populated assert output.tool_uses is not None assert isinstance(output.tool_uses, list) assert len(output.tool_uses) == 1 - + # Verify the tool use is properly structured tool_use = output.tool_uses[0] assert isinstance(tool_use, ToolUse) @@ -36,27 +39,27 @@ def test_dummy_coder_captures_tool_calls(): def test_dummy_coder_captures_multiple_tools(): """Test that DummyCoder can capture multiple tool calls.""" coder = DummyCoder(workdir="test") - + # Run with input that triggers multiple tools output = coder.run("Search PubMed and then cause an error") - + # Verify multiple tools are captured assert output.tool_uses is not None assert len(output.tool_uses) == 2 - + # Verify each tool is a proper ToolUse instance for tool in output.tool_uses: assert isinstance(tool, ToolUse) - assert hasattr(tool, 'name') - assert hasattr(tool, 'arguments') - assert hasattr(tool, 'success') - assert hasattr(tool, 'error') - assert hasattr(tool, 'result') - + assert hasattr(tool, "name") + assert hasattr(tool, "arguments") + assert 
hasattr(tool, "success") + assert hasattr(tool, "error") + assert hasattr(tool, "result") + # Check first tool (PubMed search) assert output.tool_uses[0].name == "mcp__pubmed__search_papers" assert output.tool_uses[0].success is True - + # Check second tool (error) assert output.tool_uses[1].name == "mcp__test__failing_tool" assert output.tool_uses[1].success is False @@ -66,14 +69,14 @@ def test_dummy_coder_captures_multiple_tools(): def test_dummy_coder_no_tools_when_not_triggered(): """Test that DummyCoder doesn't add tools when not triggered.""" coder = DummyCoder(workdir="test") - + # Run with input that doesn't trigger tools output = coder.run("What is the weather today?") - + # Verify output structure assert isinstance(output, CoderOutput) assert output.stdout == "you said: What is the weather today?" - + # Verify no tools are added assert output.tool_uses is None @@ -81,14 +84,14 @@ def test_dummy_coder_no_tools_when_not_triggered(): def test_dummy_coder_tool_error_capture(): """Test that DummyCoder properly captures tool errors.""" coder = DummyCoder(workdir="test") - + # Run with input that triggers an error output = coder.run("Use tool with error") - + # Verify error tool is captured assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + error_tool = output.tool_uses[0] assert error_tool.name == "mcp__test__failing_tool" assert error_tool.success is False @@ -100,22 +103,22 @@ def test_dummy_coder_tool_error_capture(): def test_dummy_coder_tool_serialization(): """Test that tool uses can be serialized properly.""" coder = DummyCoder(workdir="test") - + # Run with tool trigger output = coder.run("Use MCP tool") - + # Verify tool uses can be converted to dict (for serialization) assert output.tool_uses is not None tool_dict = output.tool_uses[0].model_dump() - + assert isinstance(tool_dict, dict) assert "name" in tool_dict assert "arguments" in tool_dict assert "success" in tool_dict assert "error" in tool_dict assert "result" in tool_dict - + # Verify values assert tool_dict["name"] == "mcp__dummy__test_tool" assert tool_dict["success"] is True - assert tool_dict["error"] is None \ No newline at end of file + assert tool_dict["error"] is None diff --git a/tests/test_dummy_tool_use.py b/tests/test_dummy_tool_use.py index ca3f2b4..f9f675e 100644 --- a/tests/test_dummy_tool_use.py +++ b/tests/test_dummy_tool_use.py @@ -1,4 +1,5 @@ """Test DummyCoder fake tool use generation.""" + from metacoder.coders.dummy import DummyCoder @@ -6,7 +7,7 @@ def test_dummy_no_tools(): """Test that dummy coder doesn't add tools when not mentioned.""" coder = DummyCoder(workdir="test") output = coder.run("What is 2 + 2?") - + assert output.stdout == "you said: What is 2 + 2?" 
assert output.tool_uses is None @@ -15,10 +16,10 @@ def test_dummy_default_tool(): """Test that dummy coder adds default tool when mentioned.""" coder = DummyCoder(workdir="test") output = coder.run("Use a tool to help me") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__dummy__test_tool" assert tool.arguments == {"input": "Use a tool to help me"} @@ -31,10 +32,10 @@ def test_dummy_pubmed_search(): """Test that dummy coder simulates PubMed search.""" coder = DummyCoder(workdir="test") output = coder.run("Search PubMed for papers about cancer") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__pubmed__search_papers" assert tool.arguments == {"query": "test query", "limit": 10} @@ -47,10 +48,10 @@ def test_dummy_tool_error(): """Test that dummy coder simulates tool errors.""" coder = DummyCoder(workdir="test") output = coder.run("Use MCP tool but simulate an error") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__test__failing_tool" assert tool.arguments == {"param": "value"} @@ -63,15 +64,15 @@ def test_dummy_multiple_tools(): """Test that dummy coder can simulate multiple tools.""" coder = DummyCoder(workdir="test") output = coder.run("Search PubMed and then simulate an error with MCP") - + assert output.tool_uses is not None assert len(output.tool_uses) == 2 - + # First tool - PubMed search tool1 = output.tool_uses[0] assert tool1.name == "mcp__pubmed__search_papers" assert tool1.success is True - + # Second tool - error simulation tool2 = output.tool_uses[1] assert tool2.name == "mcp__test__failing_tool" @@ -82,10 +83,10 @@ def test_dummy_mcp_keyword(): """Test that MCP keyword triggers tool use.""" coder = DummyCoder(workdir="test") output = coder.run("Test MCP functionality") - + assert output.tool_uses is not None assert len(output.tool_uses) == 1 - + tool = output.tool_uses[0] assert tool.name == "mcp__dummy__test_tool" - assert tool.success is True \ No newline at end of file + assert tool.success is True diff --git a/tests/test_evals/test_deep_eval.py b/tests/test_evals/test_deep_eval.py index c55dec2..3690e70 100644 --- a/tests/test_evals/test_deep_eval.py +++ b/tests/test_evals/test_deep_eval.py @@ -61,8 +61,6 @@ def test_hallucination_eval(metric_cls): print(yaml.dump(results.model_dump())) - - correctness_metric = GEval( name="Correctness", criteria="Determine whether the actual output is factually correct based on the expected output.", diff --git a/tests/test_goose_tool_use.py b/tests/test_goose_tool_use.py index 94d83aa..3b643ae 100644 --- a/tests/test_goose_tool_use.py +++ b/tests/test_goose_tool_use.py @@ -1,4 +1,5 @@ """Test GooseCoder tool use extraction.""" + from metacoder.coders.base_coder import ToolUse @@ -13,7 +14,7 @@ def test_goose_tool_use_extraction(): "content": [ { "type": "text", - "text": "I'll help you find information about diseases associated with ITPR1 mutations." 
+ "text": "I'll help you find information about diseases associated with ITPR1 mutations.", }, { "type": "toolRequest", @@ -22,11 +23,11 @@ def test_goose_tool_use_extraction(): "status": "success", "value": { "name": "pubmed__get_paper_fulltext", - "arguments": {"pmid": "35743164"} - } - } - } - ] + "arguments": {"pmid": "35743164"}, + }, + }, + }, + ], }, { "id": None, @@ -38,22 +39,17 @@ def test_goose_tool_use_extraction(): "id": "toolu_01RbESTBH9tyWu9Q9uAVRjja", "toolResult": { "status": "success", - "value": [ - { - "type": "text", - "text": "Paper content here..." - } - ] - } + "value": [{"type": "text", "text": "Paper content here..."}], + }, } - ] - } + ], + }, ] - + # Process structured messages to extract tool uses (mimicking goose logic) tool_uses = [] pending_tool_uses = {} - + for message in structured_messages: # Check for tool requests in assistant messages if message.get("role") == "assistant" and "content" in message: @@ -61,21 +57,21 @@ def test_goose_tool_use_extraction(): if isinstance(content, dict) and content.get("type") == "toolRequest": tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) tool_name = tool_value.get("name", "") tool_args = tool_value.get("arguments", {}) - + # Store pending tool use pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_args, "success": False, "error": None, - "result": None + "result": None, } - + # Check for tool responses in user messages elif message.get("role") == "user" and "content" in message: for content in message.get("content", []): @@ -84,7 +80,7 @@ def test_goose_tool_use_extraction(): if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = content.get("toolResult", {}) - + # Update with result if tool_result.get("status") == "success": tool_data["success"] = True @@ -93,23 +89,32 @@ def test_goose_tool_use_extraction(): if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) else: tool_data["success"] = False - tool_data["error"] = tool_result.get("error", "Tool execution failed") + tool_data["error"] = tool_result.get( + "error", "Tool execution failed" + ) tool_data["result"] = None - + # Create ToolUse object tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) - + # Remove from pending del pending_tool_uses[tool_id] - + # Verify extraction assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -132,13 +137,10 @@ def test_goose_tool_use_error(): "id": "toolu_test", "toolCall": { "status": "success", - "value": { - "name": "test_tool", - "arguments": {"param": "value"} - } - } + "value": {"name": "test_tool", "arguments": {"param": "value"}}, + }, } - ] + ], }, { "role": "user", @@ -148,37 +150,37 @@ def test_goose_tool_use_error(): "id": "toolu_test", "toolResult": { "status": "error", - "error": "Tool failed to execute" - } + "error": "Tool failed to execute", + }, } - ] - } + ], + }, ] - + # Process structured messages to extract tool uses tool_uses = [] pending_tool_uses = {} - + for message in structured_messages: if message.get("role") == 
"assistant" and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolRequest": tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) tool_name = tool_value.get("name", "") tool_args = tool_value.get("arguments", {}) - + pending_tool_uses[tool_id] = { "name": tool_name, "arguments": tool_args, "success": False, "error": None, - "result": None + "result": None, } - + elif message.get("role") == "user" and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolResponse": @@ -186,27 +188,36 @@ def test_goose_tool_use_error(): if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = content.get("toolResult", {}) - + if tool_result.get("status") == "success": tool_data["success"] = True result_value = tool_result.get("value", []) if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) else: tool_data["success"] = False - tool_data["error"] = tool_result.get("error", "Tool execution failed") + tool_data["error"] = tool_result.get( + "error", "Tool execution failed" + ) tool_data["result"] = None - + tool_use = ToolUse(**tool_data) tool_uses.append(tool_use) del pending_tool_uses[tool_id] - + # Verify error handling assert len(tool_uses) == 1 tool_use = tool_uses[0] @@ -230,11 +241,11 @@ def test_goose_multiple_tools(): "status": "success", "value": { "name": "search_tool", - "arguments": {"query": "test"} - } - } + "arguments": {"query": "test"}, + }, + }, } - ] + ], }, { "role": "user", @@ -244,10 +255,10 @@ def test_goose_multiple_tools(): "id": "tool1", "toolResult": { "status": "success", - "value": [{"type": "text", "text": "Search results"}] - } + "value": [{"type": "text", "text": "Search results"}], + }, } - ] + ], }, { "role": "assistant", @@ -259,11 +270,11 @@ def test_goose_multiple_tools(): "status": "success", "value": { "name": "fetch_tool", - "arguments": {"url": "http://example.com"} - } - } + "arguments": {"url": "http://example.com"}, + }, + }, } - ] + ], }, { "role": "user", @@ -273,24 +284,24 @@ def test_goose_multiple_tools(): "id": "tool2", "toolResult": { "status": "success", - "value": [{"type": "text", "text": "Fetched content"}] - } + "value": [{"type": "text", "text": "Fetched content"}], + }, } - ] - } + ], + }, ] - + # Process structured messages tool_uses = [] pending_tool_uses = {} - + for message in structured_messages: if message.get("role") == "assistant" and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolRequest": tool_id = content.get("id") tool_call = content.get("toolCall", {}) - + if tool_call.get("status") == "success": tool_value = tool_call.get("value", {}) pending_tool_uses[tool_id] = { @@ -298,9 +309,9 @@ def test_goose_multiple_tools(): "arguments": tool_value.get("arguments", {}), "success": False, "error": None, - "result": None + "result": None, } - + elif message.get("role") == "user" 
and "content" in message: for content in message.get("content", []): if isinstance(content, dict) and content.get("type") == "toolResponse": @@ -308,22 +319,29 @@ def test_goose_multiple_tools(): if tool_id in pending_tool_uses: tool_data = pending_tool_uses[tool_id] tool_result = content.get("toolResult", {}) - + if tool_result.get("status") == "success": tool_data["success"] = True result_value = tool_result.get("value", []) if isinstance(result_value, list): result_texts = [] for item in result_value: - if isinstance(item, dict) and item.get("type") == "text": + if ( + isinstance(item, dict) + and item.get("type") == "text" + ): result_texts.append(item.get("text", "")) - tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value) + tool_data["result"] = ( + "\n".join(result_texts) + if result_texts + else str(result_value) + ) else: tool_data["result"] = str(result_value) - + tool_uses.append(ToolUse(**tool_data)) del pending_tool_uses[tool_id] - + # Verify multiple tools assert len(tool_uses) == 2 assert tool_uses[0].name == "search_tool" @@ -331,4 +349,4 @@ def test_goose_multiple_tools(): assert tool_uses[0].result == "Search results" assert tool_uses[1].name == "fetch_tool" assert tool_uses[1].success is True - assert tool_uses[1].result == "Fetched content" \ No newline at end of file + assert tool_uses[1].result == "Fetched content" diff --git a/tests/test_instructions_option.py b/tests/test_instructions_option.py index 681e275..ff9cc4e 100644 --- a/tests/test_instructions_option.py +++ b/tests/test_instructions_option.py @@ -21,7 +21,7 @@ def test_instructions_option_with_dummy_coder(runner): instructions_file = Path(temp_dir) / "test_instructions.md" instructions_content = "# Test Instructions\n\nBe helpful and concise." 
instructions_file.write_text(instructions_content) - + # Run with instructions result = runner.invoke( main, @@ -36,7 +36,7 @@ def test_instructions_option_with_dummy_coder(runner): temp_dir, ], ) - + # Check that instructions were loaded assert result.exit_code == 0 assert "Loaded instructions from:" in result.output @@ -58,7 +58,7 @@ def test_no_instructions_still_works(runner): temp_dir, ], ) - + assert result.exit_code == 0 assert "you said: Hello" in result.output assert "Instructions loaded:" not in result.output @@ -80,7 +80,7 @@ def test_instructions_file_not_found(runner): temp_dir, ], ) - + # Should fail with appropriate error assert result.exit_code != 0 assert "does not exist" in result.output @@ -98,11 +98,11 @@ def test_instructions_with_config(runner): extensions: [] """ config_file.write_text(config_content) - + # Create instructions file instructions_file = Path(temp_dir) / "instructions.md" instructions_file.write_text("Custom instructions") - + result = runner.invoke( main, [ @@ -118,6 +118,6 @@ def test_instructions_with_config(runner): temp_dir, ], ) - + assert result.exit_code == 0 - assert "Loaded instructions from:" in result.output \ No newline at end of file + assert "Loaded instructions from:" in result.output diff --git a/tests/test_introspect_mcp.py b/tests/test_introspect_mcp.py index 2f51ae0..4e5fd59 100644 --- a/tests/test_introspect_mcp.py +++ b/tests/test_introspect_mcp.py @@ -6,7 +6,7 @@ def test_introspect_mcp_help(): """Test introspect-mcp help command.""" runner = CliRunner() result = runner.invoke(cli, ["introspect-mcp", "--help"]) - + assert result.exit_code == 0 assert "Introspect an MCP server" in result.output assert "MCP_SPEC" in result.output @@ -17,12 +17,10 @@ def test_introspect_mcp_help(): def test_introspect_mcp_with_invalid_registry(): """Test introspect-mcp with non-existent registry MCP.""" runner = CliRunner() - result = runner.invoke(cli, [ - "introspect-mcp", - "nonexistent", - "--registry", "metacoder.basics" - ]) - + result = runner.invoke( + cli, ["introspect-mcp", "nonexistent", "--registry", "metacoder.basics"] + ) + assert result.exit_code != 0 assert "not found in registry" in result.output @@ -30,12 +28,9 @@ def test_introspect_mcp_with_invalid_registry(): def test_introspect_mcp_with_registry_no_mcp(): """Test introspect-mcp with invalid registry.""" runner = CliRunner() - result = runner.invoke(cli, [ - "introspect-mcp", - "fetch", - "--registry", "metacoder.nonexistent" - ]) - + result = runner.invoke( + cli, ["introspect-mcp", "fetch", "--registry", "metacoder.nonexistent"] + ) + assert result.exit_code != 0 assert "Registry file not found" in result.output - diff --git a/tests/test_mcps/test_gemini_mcp.py b/tests/test_mcps/test_gemini_mcp.py index 25b9c24..288f61c 100644 --- a/tests/test_mcps/test_gemini_mcp.py +++ b/tests/test_mcps/test_gemini_mcp.py @@ -13,7 +13,7 @@ def test_gemini_supports_mcp(): def test_gemini_mcp_config_conversion(): """Test conversion of MCPConfig to Gemini format.""" coder = GeminiCoder(workdir="/tmp/test") - + # Test stdio MCP mcp = MCPConfig( name="test_server", @@ -21,11 +21,11 @@ def test_gemini_mcp_config_conversion(): args=["-y", "@modelcontextprotocol/server-test"], env={"API_KEY": "${TEST_KEY}"}, enabled=True, - type=MCPType.STDIO + type=MCPType.STDIO, ) - + result = coder.mcp_config_to_gemini_format(mcp) - + assert result["command"] == "npx" assert result["args"] == ["-y", "@modelcontextprotocol/server-test"] assert result["env"] == {"API_KEY": "${TEST_KEY}"} @@ -35,13 +35,9 @@ def 
test_gemini_mcp_config_conversion(): def test_gemini_http_mcp_not_supported(): """Test that HTTP MCPs raise NotImplementedError.""" coder = GeminiCoder(workdir="/tmp/test") - - mcp = MCPConfig( - name="http_server", - enabled=True, - type=MCPType.HTTP - ) - + + mcp = MCPConfig(name="http_server", enabled=True, type=MCPType.HTTP) + with pytest.raises(NotImplementedError, match="HTTP MCPs are not supported"): coder.mcp_config_to_gemini_format(mcp) @@ -56,7 +52,7 @@ def test_gemini_mcp_settings_generation(): command="npx", args=["-y", "@modelcontextprotocol/server-filesystem"], enabled=True, - type=MCPType.STDIO + type=MCPType.STDIO, ), MCPConfig( name="github", @@ -64,41 +60,41 @@ def test_gemini_mcp_settings_generation(): args=["mcp-github"], env={"GITHUB_TOKEN": "${GITHUB_TOKEN}"}, enabled=True, - type=MCPType.STDIO + type=MCPType.STDIO, ), MCPConfig( name="disabled_server", command="uvx", args=["mcp-disabled"], enabled=False, - type=MCPType.STDIO + type=MCPType.STDIO, ), - ] + ], ) - + coder = GeminiCoder(workdir="/tmp/test", config=config) config_objects = coder.default_config_objects() - + # Should have created settings.json assert len(config_objects) == 1 settings_obj = config_objects[0] - + assert settings_obj.relative_path == ".gemini/settings.json" assert "mcpServers" in settings_obj.content - + mcp_servers = settings_obj.content["mcpServers"] - + # Should only include enabled servers assert "filesystem" in mcp_servers assert "github" in mcp_servers assert "disabled_server" not in mcp_servers - + # Check filesystem server config fs_config = mcp_servers["filesystem"] assert fs_config["command"] == "npx" assert fs_config["args"] == ["-y", "@modelcontextprotocol/server-filesystem"] assert fs_config["timeout"] == 30000 - + # Check github server config gh_config = mcp_servers["github"] assert gh_config["command"] == "uvx" @@ -111,6 +107,6 @@ def test_gemini_no_mcp_no_settings(): """Test that no settings.json is created when no MCPs are configured.""" coder = GeminiCoder(workdir="/tmp/test") config_objects = coder.default_config_objects() - + # Should not create any config files when no MCPs - assert len(config_objects) == 0 \ No newline at end of file + assert len(config_objects) == 0 diff --git a/tests/test_registry_loading.py b/tests/test_registry_loading.py index c96781e..c2f67a2 100644 --- a/tests/test_registry_loading.py +++ b/tests/test_registry_loading.py @@ -7,15 +7,15 @@ def test_load_mcp_registry_basics(): """Test loading basics registry.""" collection = load_mcp_registry("metacoder.basics") - + assert isinstance(collection, MCPCollectionConfig) assert len(collection.servers) > 0 - + # Check that fetch is in basics mcp_names = [mcp.name for mcp in collection.servers] assert "fetch" in mcp_names assert "taskmasterai" in mcp_names - + # Check that all are disabled by default for mcp in collection.servers: assert not mcp.enabled and mcp.enabled is not None @@ -24,10 +24,10 @@ def test_load_mcp_registry_basics(): def test_load_mcp_registry_scilit(): """Test loading scilit registry.""" collection = load_mcp_registry("metacoder.scilit") - + assert isinstance(collection, MCPCollectionConfig) assert len(collection.servers) > 0 - + # Check that scilit MCPs are present mcp_names = [mcp.name for mcp in collection.servers] assert "pdfreader" in mcp_names @@ -38,9 +38,9 @@ def test_load_mcp_registry_scilit(): def test_load_mcp_registry_all(): """Test loading all registries with 'metacoder'.""" collection = load_mcp_registry("metacoder") - + assert isinstance(collection, MCPCollectionConfig) 
-    
+
     # Should have MCPs from both basics and scilit
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "fetch" in mcp_names  # from basics
@@ -50,7 +50,7 @@
 def test_load_mcp_registry_without_prefix():
     """Test loading registry without metacoder prefix."""
     collection = load_mcp_registry("basics")
-    
+
     # Should work the same as with prefix
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "fetch" in mcp_names
@@ -59,17 +59,24 @@
 def test_cli_with_registry():
     """Test CLI with registry option."""
     runner = CliRunner()
-    
+
     # Test with registry and enable specific MCP
-    result = runner.invoke(cli, [
-        "run",
-        "test prompt",
-        "--coder", "dummy",
-        "--registry", "metacoder.basics",
-        "--enable-mcp", "fetch",
-        "--workdir", "test_workdir"
-    ])
-    
+    result = runner.invoke(
+        cli,
+        [
+            "run",
+            "test prompt",
+            "--coder",
+            "dummy",
+            "--registry",
+            "metacoder.basics",
+            "--enable-mcp",
+            "fetch",
+            "--workdir",
+            "test_workdir",
+        ],
+    )
+
     assert result.exit_code == 0
     assert "Loading MCPs from registry: metacoder.basics" in result.output
     assert "Registry MCPs:" in result.output
@@ -79,7 +86,7 @@
 def test_cli_registry_with_mcp_collection():
     """Test CLI with both registry and MCP collection."""
     runner = CliRunner()
-    
+
     # Create a temporary MCP collection file
     with runner.isolated_filesystem():
         with open("test_mcps.yaml", "w") as f:
@@ -91,18 +98,27 @@
                 args: ["test"]
                 enabled: true
             """)
-    
-        result = runner.invoke(cli, [
-            "run",
-            "test prompt",
-            "--coder", "dummy",
-            "--mcp-collection", "test_mcps.yaml",
-            "--registry", "metacoder.basics",
-            "--enable-mcp", "fetch",
-            "--enable-mcp", "custom_mcp",
-            "--workdir", "test_workdir"
-        ])
-    
+
+        result = runner.invoke(
+            cli,
+            [
+                "run",
+                "test prompt",
+                "--coder",
+                "dummy",
+                "--mcp-collection",
+                "test_mcps.yaml",
+                "--registry",
+                "metacoder.basics",
+                "--enable-mcp",
+                "fetch",
+                "--enable-mcp",
+                "custom_mcp",
+                "--workdir",
+                "test_workdir",
+            ],
+        )
+
         assert result.exit_code == 0
         assert "Loading MCP collection from: test_mcps.yaml" in result.output
         assert "Loading MCPs from registry: metacoder.basics" in result.output
@@ -113,5 +129,5 @@ def test_registry_nonexistent():
     """Test loading nonexistent registry."""
     with pytest.raises(Exception) as exc_info:
         load_mcp_registry("metacoder.nonexistent")
-    
-    assert "Registry file not found" in str(exc_info.value)
\ No newline at end of file
+
+    assert "Registry file not found" in str(exc_info.value)

From 9e73e3bc529d053b471a55145dedf3b4e5912b30 Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 17:43:05 -0700
Subject: [PATCH 3/5] refactor: Convert METRICS to lazy initialization to avoid network calls on import
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The METRICS dictionary was previously instantiated at module level, causing
GEval objects to be created during import. This triggered network dependencies
even when the metrics weren't being used.
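In outline, the fix is the standard lazy-factory pattern: importing the module
only defines a function, and the expensive objects are built when that function
is first called. A minimal self-contained sketch of the pattern (the class and
names below are illustrative stand-ins, not the actual metacoder code):

    class NetworkBackedMetric:
        """Hypothetical stand-in for a metric whose constructor does network setup."""

        def __init__(self) -> None:
            print("expensive setup runs here")


    # Eager (old shape): the instance is created as a side effect of import.
    # METRICS = {"CorrectnessMetric": NetworkBackedMetric()}


    # Lazy (new shape): creation is deferred to the first caller.
    def get_default_metrics() -> dict[str, NetworkBackedMetric]:
        return {"CorrectnessMetric": NetworkBackedMetric()}

Importing a module shaped like this is cheap; the constructor, and any network
handshake it performs, runs only at call time.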
Changes:
- Replace top-level METRICS dictionary with get_default_metrics() function
- Move GEval and DummyMetric instantiation into the function for lazy creation
- Update run_single_eval() to call get_default_metrics() when needed

This prevents unnecessary network calls during module import and improves
startup performance, especially in environments where network access may be
restricted or when metrics are not used in the execution path.

šŸ¤– Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 src/metacoder/evals/runner.py | 50 +++++++++++++++++------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py
index d2ed34b..67a9619 100644
--- a/src/metacoder/evals/runner.py
+++ b/src/metacoder/evals/runner.py
@@ -59,30 +59,27 @@ def is_successful(self) -> bool:
         return self.success
 
 
-correctness_metric = GEval(
-    name="Correctness",
-    criteria="Determine whether the actual output is factually correct based on the expected output.",
-    # NOTE: you can only provide either criteria or evaluation_steps, and not both
-    evaluation_steps=[
-        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-        "You should also heavily penalize omission of detail",
-        "Vague language, or contradicting OPINIONS, are OK",
-    ],
-    threshold=0.8,
-    evaluation_params=[
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ],
-)
-
-# instances
-dummy_metric = DummyMetric(threshold=0.5)
-
-METRICS = {
-    "CorrectnessMetric": correctness_metric,
-    "DummyMetric": dummy_metric,
-}
+def get_default_metrics() -> Dict[str, BaseMetric]:
+    """Get default metrics. Creates instances lazily to avoid network calls during import."""
+    return {
+        "CorrectnessMetric": GEval(
+            name="Correctness",
+            criteria="Determine whether the actual output is factually correct based on the expected output.",
+            # NOTE: you can only provide either criteria or evaluation_steps, and not both
+            evaluation_steps=[
+                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
+                "You should also heavily penalize omission of detail",
+                "Vague language, or contradicting OPINIONS, are OK",
+            ],
+            threshold=0.8,
+            evaluation_params=[
+                LLMTestCaseParams.INPUT,
+                LLMTestCaseParams.ACTUAL_OUTPUT,
+                LLMTestCaseParams.EXPECTED_OUTPUT,
+            ],
+        ),
+        "DummyMetric": DummyMetric(threshold=0.5),
+    }
 
 
 def create_coder(coder_name: str, workdir: str, config=None) -> BaseCoder:
@@ -226,8 +223,9 @@ def run_single_eval(
 
         # Run each metric
         for metric_name in case.metrics:
-            if metric_name in METRICS:
-                metric = METRICS[metric_name]
+            default_metrics = get_default_metrics()
+            if metric_name in default_metrics:
+                metric = default_metrics[metric_name]
             else:
                 # Get metric class and instantiate
                 metric_class = self.get_metric_class(metric_name)

From 8e82f1059cca4cca95b61888ba31d54a2992491e Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 17:47:38 -0700
Subject: [PATCH 4/5] doesn't actually test metacoder functionality

---
 tests/test_evals/test_deep_eval.py | 95 ------------------------------
 1 file changed, 95 deletions(-)
 delete mode 100644 tests/test_evals/test_deep_eval.py

diff --git a/tests/test_evals/test_deep_eval.py b/tests/test_evals/test_deep_eval.py
deleted file mode 100644
index 3690e70..0000000
--- a/tests/test_evals/test_deep_eval.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""
-Test the deepeval library.
-
-https://github.com/metacoder-ai/deepeval
-
-Note this doesn't actually test any metacoder functonality, it is more to explore
-deepeval metrics, it can probably be removed in the future.
-"""
-
-from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
-from deepeval import evaluate
-from deepeval.metrics import (
-    FaithfulnessMetric,
-    HallucinationMetric,
-)
-from deepeval.test_case import LLMTestCase
-import pytest
-
-
-@pytest.mark.llm
-@pytest.mark.parametrize("metric_cls", [FaithfulnessMetric])
-def test_generic_eval(metric_cls):
-    """Test FaithfulnessMetric with correct output matching context."""
-    metric = metric_cls(threshold=0.7)
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="The answer to the question 'what is the title of PMID:28027860?' is 'From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.'",
-        actual_output='The answer to the question "what is the title of PMID:28027860?" is "From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."',
-        context=[
-            "Title: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-        retrieval_context=[
-            "PMID:28027860? Title: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))
-
-
-@pytest.mark.llm
-@pytest.mark.parametrize("metric_cls", [HallucinationMetric])
-def test_hallucination_eval(metric_cls):
-    """Test HallucinationMetric detects incorrect information not supported by context."""
-    metric = metric_cls(threshold=0.7)
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.",
-        actual_output='The title of the article with PMID:28027860 is "Predictors of acute and persisting fatigue in people with relapsing and remitting multiple sclerosis: A cohort study."',
-        context=[
-            "Title of PMID:28027860: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))
-
-
-correctness_metric = GEval(
-    name="Correctness",
-    criteria="Determine whether the actual output is factually correct based on the expected output.",
-    # NOTE: you can only provide either criteria or evaluation_steps, and not both
-    evaluation_steps=[
-        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-        "You should also heavily penalize omission of detail",
-        "Vague language, or contradicting OPINIONS, are OK",
-    ],
-    threshold=0.8,
-    evaluation_params=[
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ],
-)
-
-
-@pytest.mark.llm
-def test_geval_eval():
-    """Test GEval correctness metric catches factual errors in output."""
-    metric = correctness_metric
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.",
-        actual_output='The title of the article with PMID:28027860 is "Predictors of acute and persisting fatigue in people with relapsing and remitting multiple sclerosis: A cohort study."',
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))

From 1017f637ed8e9f7096a7be49f6a892bb0dc9208b Mon Sep 17 00:00:00 2001
From: Chris Mungall
Date: Fri, 15 Aug 2025 17:55:30 -0700
Subject: [PATCH 5/5] marking more tests as llm

---
 tests/test_evals/test_runner.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_evals/test_runner.py b/tests/test_evals/test_runner.py
index 0515237..d1f0c3e 100644
--- a/tests/test_evals/test_runner.py
+++ b/tests/test_evals/test_runner.py
@@ -1,6 +1,9 @@
 """Tests for the evaluation runner.
 
 This uses only dummy coders, so can be used in non-integration contexts.
+
+TODO: some of these are marked llm because they use an LLM in the eval
+phase, even if they use a dummy coder - figure out a way to have a dummy LLM eval too
 """
 
 import pytest
@@ -137,6 +140,7 @@ def test_create_test_case_with_list_context(self):
         test_case = runner.create_test_case(eval_case, "4")
         assert test_case.retrieval_context == ["Math fact 1", "Math fact 2"]
 
+    @pytest.mark.llm
     def test_run_single_eval_with_dummy(self, simple_config, tmp_path):
         """Test running a single evaluation with dummy coder."""
         runner = EvalRunner()
@@ -244,6 +248,7 @@ def test_save_and_load_results(self, tmp_path):
         assert data["results"][0]["model"] == "model1"
         assert data["results"][0]["score"] == 0.9
 
+    @pytest.mark.llm
     def test_run_all_evals_with_dummy(self, simple_config, tmp_path):
         """Test running all evaluations with dummy coder."""
         runner = EvalRunner()
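A closing note on the llm marker used above: the series applies
@pytest.mark.llm but does not show where the marker is registered. One common
way to register a custom marker, so pytest does not warn about an unknown
mark, is a pytest_configure hook; a minimal sketch assuming a conftest.py,
which these patches do not include:

    # conftest.py (hypothetical; not part of this patch series)
    def pytest_configure(config):
        # Register the custom marker so pytest does not warn about an unregistered mark.
        config.addinivalue_line(
            "markers", "llm: tests that call a live LLM during the eval phase"
        )

Once registered, the LLM-dependent tests can be deselected in offline runs
with: pytest -m "not llm"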