diff --git a/README.md b/README.md
index b591c01..e4e6846 100644
--- a/README.md
+++ b/README.md
@@ -24,10 +24,10 @@ metacoder "Write a Python function to calculate fibonacci numbers" -c claude -w
 ...
 
 # With custom instructions
-metacoder "Refactor this code" -c claude --instructions coding_guidelines.md
+metacoder "Refactor this code" -c claude --instructions coding_guidelines.md -w my-repo
 ...
 
-# Using MCPs
+# Using MCPs (e.g. GitHub MCP)
 metacoder "Fix issue 1234" -w path/to/my-repo --mcp-collection github_mcps.yaml
 ...
diff --git a/pyproject.toml b/pyproject.toml
index 5a09dfa..020908e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dev = [
     "mkdocstrings-python>=1.14.0",
     "mypy>=1.17.1",
     "pytest>=8.4.1",
+    "ruff>=0.12.8",
     "types-click>=7.1.8",
     "types-pyyaml>=6.0.12.20250516",
 ]
diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py
index a8be80b..b44c6ec 100644
--- a/src/metacoder/coders/base_coder.py
+++ b/src/metacoder/coders/base_coder.py
@@ -21,10 +21,15 @@ class ToolUse(BaseModel):
     """Tool use from the coder."""
 
-    name: str = Field(..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext")
+
+    name: str = Field(
+        ..., description="Name of the tool; e.g. mcp.pubmed.get_paper_fulltext"
+    )
     arguments: dict[str, Any] = Field(..., description="Arguments to the tool")
     success: bool = Field(..., description="Whether the tool call was successful")
-    error: str | None = Field(default=None, description="Error message if the tool call failed")
+    error: str | None = Field(
+        default=None, description="Error message if the tool call failed"
+    )
     result: Any = Field(..., description="Result of the tool")
@@ -87,6 +92,7 @@ class BaseCoder(BaseModel, ABC):
     Subclasses should implement the following methods:
     - run(self, input_text: str) -> CoderOutput: Run the coder on the input text
     """
+
     workdir: str = Field(default="workdir", description="Working dir ")
     config: CoderConfig | None = Field(default=None, description="Config for the coder")
     params: dict | None = Field(default=None, description="Parameters for the coder")
@@ -115,8 +121,6 @@ def validate_mcp_support(self):
             )
         return self
 
-
-
     @abstractmethod
     def run(self, input_text: str) -> CoderOutput:
         """Run the coder on the input text.
@@ -129,7 +133,6 @@ def run(self, input_text: str) -> CoderOutput:
         """
         raise NotImplementedError
 
-
     @classmethod
     def default_config_paths(cls) -> dict[Path, ConfigFileRole]:
         """Return config files as a dictionary of filename/dirname to role."""
@@ -220,7 +223,6 @@ def stream_output(pipe, output_lines, stream):
 
         return CoderOutput(stdout=stdout_text, stderr=stderr_text)
 
-
     def expand_env(self, env: dict[str, str] | None = None) -> dict[str, str]:
         """
         Expand environment variables in the coder config.
@@ -257,7 +259,7 @@ def expand_env(self, env: dict[str, str] | None = None) -> dict[str, str]:
 
     def expand_prompt(self, input_text: str) -> str:
         """Expand environment variables in the prompt.
-        
+
         Typically this just returns the prompt as is:
 
         Example:
@@ -278,7 +280,7 @@ def expand_prompt(self, input_text: str) -> str:
     def default_config_objects(self) -> list[CoderConfigObject]:
         """Default config objects for the coder."""
         raise NotImplementedError("default_config_objects is not implemented")
-    
+
     def set_instructions(self, instructions: str):
         """Set the instructions for the coder.
@@ -291,7 +293,7 @@ def set_instructions(self, instructions: str):
         >>> coder.set_instructions("you are an awesome coder")
         >>> coder.config_objects
         [CoderConfigObject(file_type=<FileType.TEXT: 'text'>, relative_path='CLAUDE.md', content='you are an awesome coder')]
-        
+
         Args:
             instructions: The instructions to set
         """
@@ -300,16 +302,25 @@ def set_instructions(self, instructions: str):
             if not self.config_objects:
                 self.config_objects = []
             for obj in self.config_objects:
-                if obj.relative_path == str(path) or obj.relative_path == str(path.name):
+                if obj.relative_path == str(path) or obj.relative_path == str(
+                    path.name
+                ):
                     obj.content = instructions
                     return
             else:
-                self.config_objects.append(CoderConfigObject(relative_path=str(path), content=instructions, file_type=FileType.TEXT))
+                self.config_objects.append(
+                    CoderConfigObject(
+                        relative_path=str(path),
+                        content=instructions,
+                        file_type=FileType.TEXT,
+                    )
+                )
                 return
         else:
             raise ValueError(f"Cannot set instructions for {typ}")
-    raise ValueError(f"No primary instruction file found for {self.__class__.__name__}")
-
+    raise ValueError(
+        f"No primary instruction file found for {self.__class__.__name__}"
+    )
 
     def prepare_workdir(self):
         """Prepare the workdir for the coder.
@@ -330,11 +341,7 @@ def prepare_workdir(self):
         # Check if MCP extensions are configured but not supported
         if self.config and self.config.extensions:
             logger.debug(f"šŸ”§ Checking MCP extensions: {self.config.extensions}")
-            mcp_extensions = [
-                ext
-                for ext in self.config.extensions
-                if ext.enabled
-            ]
+            mcp_extensions = [ext for ext in self.config.extensions if ext.enabled]
             if mcp_extensions and not self.supports_mcp():
                 raise ValueError(
                     f"MCP extensions are configured but {self.__class__.__name__} does not support MCP. "
@@ -353,6 +360,7 @@ def prepare_workdir(self):
                 logger.debug(f"  šŸ—‘ļø Removing old config object: {path}")
                 if path.is_dir():
                     import shutil
+
                     shutil.rmtree(path)
                 else:
                     path.unlink()
diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
index 1c67c20..cf1af7c 100644
--- a/src/metacoder/coders/claude.py
+++ b/src/metacoder/coders/claude.py
@@ -147,6 +147,7 @@ def run(self, input_text: str) -> CoderOutput:
         # time the command
         start_time = time.time()
         ao = self.run_process(command, env)
+
         # parse the jsonl output
         def parse_jsonl_line(text: str) -> dict[str, Any]:
             try:
@@ -154,17 +155,20 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 return result
             except json.JSONDecodeError:
                 return {"original": text, "error": "JSONDecodeError"}
+
         ao.structured_messages = [
             parse_jsonl_line(line) for line in ao.stdout.split("\n") if line
         ]
-        ao.structured_messages = [m for m in ao.structured_messages if m is not None]
+        ao.structured_messages = [
+            m for m in ao.structured_messages if m is not None
+        ]
         total_cost_usd = None
         is_error = None
-        
+
         # Extract tool uses
         tool_uses = []
         pending_tool_uses = {}  # Map tool_use_id to tool data
-        
+
         for message in ao.structured_messages:
             if "total_cost_usd" in message:
                 total_cost_usd = message["total_cost_usd"]
@@ -172,7 +176,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 is_error = message["is_error"]
             if "result" in message:
                 ao.result_text = message["result"]
-            
+
             # Check for tool_use in assistant messages
             if message.get("type") == "assistant" and message.get("message"):
                 msg_content = message["message"].get("content", [])
@@ -182,16 +186,16 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                         tool_id = content_item.get("id")
                         tool_name = content_item.get("name", "")
                         tool_input = content_item.get("input", {})
-                        
+
                         # Store pending tool use
                         pending_tool_uses[tool_id] = {
                             "name": tool_name,
                             "arguments": tool_input,
                             "success": False,  # Default to False until we see result
                             "error": None,
-                            "result": None
+                            "result": None,
                         }
-                        
+
             # Check for tool_result in user messages
             elif message.get("type") == "user" and message.get("message"):
                 msg_content = message["message"].get("content", [])
@@ -201,31 +205,35 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                         tool_id = content_item.get("tool_use_id")
                         if tool_id in pending_tool_uses:
                             tool_data = pending_tool_uses[tool_id]
-                            
+
                             # Update with result
                             is_tool_error = content_item.get("is_error", False)
                             tool_data["success"] = not is_tool_error
-                            tool_data["result"] = content_item.get("content", "")
-                            
+                            tool_data["result"] = content_item.get(
+                                "content", ""
+                            )
+
                             if is_tool_error:
-                                tool_data["error"] = content_item.get("content", "Tool error occurred")
-                            
+                                tool_data["error"] = content_item.get(
+                                    "content", "Tool error occurred"
+                                )
+
                             # Create ToolUse object
                             tool_use = ToolUse(**tool_data)
                             tool_uses.append(tool_use)
-                            
+
                             # Remove from pending
                             del pending_tool_uses[tool_id]
-        
+
         # Add any remaining pending tool uses (shouldn't happen in normal flow)
         for tool_data in pending_tool_uses.values():
             tool_data["error"] = "No result received for tool call"
             tool_use = ToolUse(**tool_data)
             tool_uses.append(tool_use)
-        
+
         if tool_uses:
             ao.tool_uses = tool_uses
-        
+
         end_time = time.time()
         logger.info(f"šŸ¤– Command took {end_time - start_time} seconds")
         ao.total_cost_usd = total_cost_usd
diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py
index 2f29483..8e9169e 100644
--- a/src/metacoder/coders/codex.py
+++ b/src/metacoder/coders/codex.py
@@ -26,7 +26,6 @@ def is_available(cls) -> bool:
         """Check if codex command is available."""
         return shutil.which("codex") is not None
 
-
     @property
     def instructions_path(self) -> Path:
         return Path("AGENTS.md")
diff --git a/src/metacoder/coders/dummy.py b/src/metacoder/coders/dummy.py
index bb93159..d55378d 100644
--- a/src/metacoder/coders/dummy.py
+++ b/src/metacoder/coders/dummy.py
@@ -1,17 +1,22 @@
 from pathlib import Path
 
-from metacoder.coders.base_coder import BaseCoder, CoderConfigObject, CoderOutput, ToolUse
+from metacoder.coders.base_coder import (
+    BaseCoder,
+    CoderConfigObject,
+    CoderOutput,
+    ToolUse,
+)
 from metacoder.configuration import ConfigFileRole
 
 
 class DummyCoder(BaseCoder):
     """
     Dummy coder for testing.
-    
+
     Simulates tool use when input contains keywords:
     - "tool" or "mcp": Adds a generic test tool
     - "search" or "pubmed": Simulates a PubMed search tool
     - "error": Simulates a tool failure
-    
+
     Multiple keywords can trigger multiple tools.
     """
@@ -34,58 +39,71 @@ def run(self, input_text: str) -> CoderOutput:
         instructions_content = None
         if self.config_objects:
             for obj in self.config_objects:
-                if obj.relative_path == "INSTRUCTIONS.md" or obj.relative_path == str(Path("INSTRUCTIONS.md")):
+                if obj.relative_path == "INSTRUCTIONS.md" or obj.relative_path == str(
+                    Path("INSTRUCTIONS.md")
+                ):
                     instructions_content = obj.content
                     break
-        
+
         # Create response based on whether instructions exist
         if instructions_content:
-            response = f"Instructions loaded: {instructions_content}\nProcessing: {input_text}"
+            response = (
+                f"Instructions loaded: {instructions_content}\nProcessing: {input_text}"
+            )
         else:
             response = f"you said: {input_text}"
-        
+
         output = CoderOutput(
             stdout=response,
             stderr="",
             result_text=response,
         )
-        
+
         # Add fake tool uses if input mentions tools, MCP, or specific services
-        if any(keyword in input_text.lower() for keyword in ["tool", "mcp", "pubmed", "search"]):
+        if any(
+            keyword in input_text.lower()
+            for keyword in ["tool", "mcp", "pubmed", "search"]
+        ):
             # Create some fake tool uses for testing
             tool_uses = []
-            
+
             # Simulate a successful tool call
             if "search" in input_text.lower() or "pubmed" in input_text.lower():
-                tool_uses.append(ToolUse(
-                    name="mcp__pubmed__search_papers",
-                    arguments={"query": "test query", "limit": 10},
-                    success=True,
-                    error=None,
-                    result={"papers": ["paper1", "paper2"], "count": 2}
-                ))
-            
+                tool_uses.append(
+                    ToolUse(
+                        name="mcp__pubmed__search_papers",
+                        arguments={"query": "test query", "limit": 10},
+                        success=True,
+                        error=None,
+                        result={"papers": ["paper1", "paper2"], "count": 2},
+                    )
+                )
+
             # Simulate a tool with an error
             if "error" in input_text.lower():
-                tool_uses.append(ToolUse(
-                    name="mcp__test__failing_tool",
-                    arguments={"param": "value"},
-                    success=False,
-                    error="Simulated tool error for testing",
-                    result=None
-                ))
-            
+                tool_uses.append(
+                    ToolUse(
+                        name="mcp__test__failing_tool",
+                        arguments={"param": "value"},
+                        success=False,
+                        error="Simulated tool error for testing",
+                        result=None,
+                    )
+                )
+
             # Default tool if no specific keywords but general tool/mcp mentioned
             if not tool_uses:
-                tool_uses.append(ToolUse(
-                    name="mcp__dummy__test_tool",
-                    arguments={"input": input_text},
-                    success=True,
-                    error=None,
-                    result="Test tool executed successfully"
-                ))
-            
+                tool_uses.append(
+                    ToolUse(
+                        name="mcp__dummy__test_tool",
+                        arguments={"input": input_text},
+                        success=True,
+                        error=None,
+                        result="Test tool executed successfully",
+                    )
+                )
+
             if tool_uses:
                 output.tool_uses = tool_uses
-        
+
         return output
diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py
index f81a5c8..20564a9 100644
--- a/src/metacoder/coders/gemini.py
+++ b/src/metacoder/coders/gemini.py
@@ -32,7 +32,7 @@ class GeminiCoder(BaseCoder):
     - `.gemini/commands/` - Custom commands directory
 
     MCP Support:
-    
+
     Gemini CLI supports MCP (Model Context Protocol) servers through the
     mcpServers configuration in .gemini/settings.json. When MCPs are configured
     through Metacoder, they will be automatically added to the settings file.
@@ -86,29 +86,27 @@ def mcp_config_to_gemini_format(self, mcp: MCPConfig) -> dict[str, Any]:
 
         # For HTTP type MCPs
         elif mcp.type == MCPType.HTTP:
-            raise NotImplementedError(
-                "HTTP MCPs are not supported for Gemini CLI yet"
-            )
+            raise NotImplementedError("HTTP MCPs are not supported for Gemini CLI yet")
 
         return server_config
 
     def default_config_objects(self) -> list[CoderConfigObject]:
         """Generate config objects including MCP configuration."""
         config_objects = []
-        
+
         # Create .gemini/settings.json if we have MCP extensions
         settings_content: dict[str, Any] = {}
-        
+
         # Add MCP servers configuration if extensions are present
         if self.config and self.config.extensions:
             mcp_servers = {}
             for mcp in self.config.extensions:
                 if mcp.enabled:
                     mcp_servers[mcp.name] = self.mcp_config_to_gemini_format(mcp)
-            
+
             if mcp_servers:
                 settings_content["mcpServers"] = mcp_servers
-        
+
         # Add settings.json if we have content to write
         if settings_content:
             config_objects.append(
@@ -118,10 +116,10 @@ def default_config_objects(self) -> list[CoderConfigObject]:
                     content=settings_content,
                 )
             )
-        
+
         # Add GEMINI.md if present in config
         # This could contain instructions specific to the task
-        
+
         return config_objects
 
     def run(self, input_text: str) -> CoderOutput:
@@ -136,7 +134,7 @@ def run(self, input_text: str) -> CoderOutput:
             env["HOME"] = "."
 
         text = self.expand_prompt(input_text)
-        
+
         # Build the command
         # The gemini CLI uses conversational interface, so we need to handle it differently
         # For now, we'll use echo to pipe the prompt
diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py
index 9b76f4b..514dc2b 100644
--- a/src/metacoder/coders/goose.py
+++ b/src/metacoder/coders/goose.py
@@ -40,7 +40,6 @@ def supports_mcp(cls) -> bool:
         """GooseCoder supports MCP extensions."""
         return True
 
-
     def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict:
         """Convert an MCPConfig to Goose extension format."""
         extension = {
@@ -69,7 +68,7 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict:
             extension["bundled"] = None
 
         return extension
-    
+
     @classmethod
     def default_config_paths(cls) -> dict[Path, ConfigFileRole]:
         return {
@@ -196,38 +195,44 @@ def run(self, input_text: str) -> CoderOutput:
         if ao.structured_messages:
             tool_uses = []
             pending_tool_uses = {}  # Map tool request id to tool data
-            
+
             for message in ao.structured_messages:
                 # Check for tool requests in assistant messages
                 if message.get("role") == "assistant" and "content" in message:
                     for content in message.get("content", []):
-                        if isinstance(content, dict) and content.get("type") == "toolRequest":
+                        if (
+                            isinstance(content, dict)
+                            and content.get("type") == "toolRequest"
+                        ):
                             tool_id = content.get("id")
                             tool_call = content.get("toolCall", {})
-                            
+
                             if tool_call.get("status") == "success":
                                 tool_value = tool_call.get("value", {})
                                 tool_name = tool_value.get("name", "")
                                 tool_args = tool_value.get("arguments", {})
-                                
+
                                 # Store pending tool use
                                 pending_tool_uses[tool_id] = {
                                     "name": tool_name,
                                     "arguments": tool_args,
                                     "success": False,  # Default until we see result
                                     "error": None,
-                                    "result": None
+                                    "result": None,
                                 }
-                
+
                 # Check for tool responses in user messages
                 elif message.get("role") == "user" and "content" in message:
                     for content in message.get("content", []):
-                        if isinstance(content, dict) and content.get("type") == "toolResponse":
+                        if (
+                            isinstance(content, dict)
+                            and content.get("type") == "toolResponse"
+                        ):
                             tool_id = content.get("id")
                             if tool_id in pending_tool_uses:
                                 tool_data = pending_tool_uses[tool_id]
                                 tool_result = content.get("toolResult", {})
-                                
+
                                 # Update with result
                                 if tool_result.get("status") == "success":
                                     tool_data["success"] = True
@@ -236,29 +241,40 @@ def run(self, input_text: str) -> CoderOutput:
                                     result_value = tool_result.get("value", [])
                                     if isinstance(result_value, list):
                                         result_texts = []
                                         for item in result_value:
-                                            if isinstance(item, dict) and item.get("type") == "text":
-                                                result_texts.append(item.get("text", ""))
-                                        tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value)
+                                            if (
+                                                isinstance(item, dict)
+                                                and item.get("type") == "text"
+                                            ):
+                                                result_texts.append(
+                                                    item.get("text", "")
+                                                )
+                                        tool_data["result"] = (
+                                            "\n".join(result_texts)
+                                            if result_texts
+                                            else str(result_value)
+                                        )
                                     else:
                                         tool_data["result"] = str(result_value)
                                 else:
                                     tool_data["success"] = False
-                                    tool_data["error"] = tool_result.get("error", "Tool execution failed")
+                                    tool_data["error"] = tool_result.get(
+                                        "error", "Tool execution failed"
+                                    )
                                     tool_data["result"] = None
-                                
+
                                 # Create ToolUse object
                                 tool_use = ToolUse(**tool_data)
                                 tool_uses.append(tool_use)
-                                
+
                                 # Remove from pending
                                 del pending_tool_uses[tool_id]
-            
+
             # Add any remaining pending tool uses (shouldn't happen in normal flow)
             for tool_data in pending_tool_uses.values():
                 tool_data["error"] = "No result received for tool call"
                 tool_use = ToolUse(**tool_data)
                 tool_uses.append(tool_use)
-            
+
             if tool_uses:
                 ao.tool_uses = tool_uses
diff --git a/src/metacoder/configuration.py b/src/metacoder/configuration.py
index 03cefca..5110fe8 100644
--- a/src/metacoder/configuration.py
+++ b/src/metacoder/configuration.py
@@ -54,7 +54,10 @@ class AIModelProvider(BaseModel):
     name: str = Field(..., description="Name of the model provider")
    api_key: str | None = Field(None, description="API key for the model provider")
     metadata: dict[str, Any] = Field({}, description="Metadata for the model provider")
-    base_url: str | None = Field(None, description="Base URL for the model provider")
+    base_url: str | None = Field(
+        None,
+        description="Base URL for the model provider, e.g. https://api.cborg.lbl.gov",
+    )
 
 
 class AIModelConfig(BaseModel):
diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py
index a12658a..67a9619 100644
--- a/src/metacoder/evals/runner.py
+++ b/src/metacoder/evals/runner.py
@@ -59,30 +59,27 @@ def is_successful(self) -> bool:
         return self.success
 
 
-correctness_metric = GEval(
-    name="Correctness",
-    criteria="Determine whether the actual output is factually correct based on the expected output.",
-    # NOTE: you can only provide either criteria or evaluation_steps, and not both
-    evaluation_steps=[
-        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-        "You should also heavily penalize omission of detail",
-        "Vague language, or contradicting OPINIONS, are OK",
-    ],
-    threshold=0.8,
-    evaluation_params=[
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ],
-)
-
-# instances
-dummy_metric = DummyMetric(threshold=0.5)
-
-METRICS = {
-    "CorrectnessMetric": correctness_metric,
-    "DummyMetric": dummy_metric,
-}
+def get_default_metrics() -> Dict[str, BaseMetric]:
+    """Get default metrics. Creates instances lazily to avoid network calls during import."""
+    return {
+        "CorrectnessMetric": GEval(
+            name="Correctness",
+            criteria="Determine whether the actual output is factually correct based on the expected output.",
+            # NOTE: you can only provide either criteria or evaluation_steps, and not both
+            evaluation_steps=[
+                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
+                "You should also heavily penalize omission of detail",
+                "Vague language, or contradicting OPINIONS, are OK",
+            ],
+            threshold=0.8,
+            evaluation_params=[
+                LLMTestCaseParams.INPUT,
+                LLMTestCaseParams.ACTUAL_OUTPUT,
+                LLMTestCaseParams.EXPECTED_OUTPUT,
+            ],
+        ),
+        "DummyMetric": DummyMetric(threshold=0.5),
+    }
 
 
 def create_coder(coder_name: str, workdir: str, config=None) -> BaseCoder:
@@ -103,7 +100,6 @@ def create_coder(coder_name: str, workdir: str, config=None) -> BaseCoder:
 
     return coder
 
-
 class EvalResult(BaseModel):
     """Result of a single evaluation."""
@@ -227,8 +223,9 @@ def run_single_eval(
 
         # Run each metric
         for metric_name in case.metrics:
-            if metric_name in METRICS:
-                metric = METRICS[metric_name]
+            default_metrics = get_default_metrics()
+            if metric_name in default_metrics:
+                metric = default_metrics[metric_name]
             else:
                 # Get metric class and instantiate
                 metric_class = self.get_metric_class(metric_name)
diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py
index a74035a..f62d3df 100644
--- a/src/metacoder/metacoder.py
+++ b/src/metacoder/metacoder.py
@@ -51,16 +51,16 @@ def load_mcp_collection(collection_path: Path) -> MCPCollectionConfig:
 
 def load_mcp_registry(registry_path: str) -> MCPCollectionConfig:
     """Load MCPs from the registry based on a path pattern.
-    
+
     Args:
         registry_path: Path pattern like 'metacoder' (all) or 'metacoder.basics'
-    
+
     Returns:
         MCPCollectionConfig containing all matched MCPs
     """
     # Base directory for registry
     registry_base = Path(__file__).parent / "mcps" / "registry"
-    
+
     # Convert dot notation to file path
     if registry_path == "metacoder":
         # Load all yaml files in registry
@@ -68,21 +68,21 @@ def load_mcp_registry(registry_path: str) -> MCPCollectionConfig:
     else:
         # Convert metacoder.basics to basics.yaml
         if registry_path.startswith("metacoder."):
-            registry_path = registry_path[len("metacoder."):]
+            registry_path = registry_path[len("metacoder.") :]
         yaml_files = [registry_base / f"{registry_path}.yaml"]
-    
+
     # Collect all MCPs
     all_mcps = []
     for yaml_file in yaml_files:
         if not yaml_file.exists():
             raise click.ClickException(f"Registry file not found: {yaml_file}")
-        
+
         try:
             with open(yaml_file, "r") as f:
                 data = yaml.safe_load(f)
         except yaml.YAMLError as e:
             raise click.ClickException(f"Invalid YAML in {yaml_file}: {e}")
-        
+
         # The registry files contain a list of MCP extensions directly
         if isinstance(data, list):
             for mcp_data in data:
@@ -99,7 +99,7 @@ def load_mcp_registry(registry_path: str) -> MCPCollectionConfig:
                 logger.warning(f"Invalid MCP in {yaml_file}: {e}")
     for mcp in all_mcps:
         mcp.enabled = False
-    
+
     # Create a collection config
     collection_name = f"Registry: {registry_path}"
     return MCPCollectionConfig(name=collection_name, description=None, servers=all_mcps)
@@ -239,7 +239,9 @@ def cli(ctx):
     "--provider", "-p", type=str, help="AI provider (e.g., openai, anthropic, google)"
 )
 @click.option(
-    "--model", type=str, help="AI model name (e.g., gpt-4, claude-3-opus, gemini-pro)"
+    "--model",
+    type=str,
+    help="AI model name (e.g., gpt-4o, claude-4-sonnet, gemini-2.5-pro)",
 )
 @click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging")
 @click.option("--quiet", "-q", is_flag=True, help="Quiet mode")
@@ -323,7 +325,7 @@ def run(
         raise click.ClickException("Cannot use both verbose and quiet mode")
     if verbose:
         logging.basicConfig(level=logging.DEBUG)
-    elif quiet: # quiet mode is a bit different, it's just no output
+    elif quiet:  # quiet mode is a bit different, it's just no output
         logging.basicConfig(level=logging.WARNING)
     else:
         logging.basicConfig(level=logging.INFO)
@@ -358,26 +360,29 @@ def run(
             click.echo(
                 f"   Enabling MCPs: {', '.join(enabled_list)} (all enabled by default)"
             )
-    
+
     # Load MCPs from registry if provided
     if registry:
         click.echo(f"šŸ“š Loading MCPs from registry: {registry}")
         registry_config = load_mcp_registry(registry)
-        
+
         # Merge with existing MCP collection if any
         if mcp_collection_config:
             # Merge the servers lists
             for mcp in registry_config.servers:
                 # Avoid duplicates by name
-                if not any(existing.name == mcp.name for existing in mcp_collection_config.servers):
+                if not any(
+                    existing.name == mcp.name
+                    for existing in mcp_collection_config.servers
+                ):
                     mcp_collection_config.servers.append(mcp)
         else:
             mcp_collection_config = registry_config
-        
+
         # Show available MCPs from registry
         registry_mcps = [mcp.name for mcp in registry_config.servers]
         click.echo(f"   Registry MCPs: {', '.join(registry_mcps)}")
-        
+
         # Note that registry MCPs are not enabled by default
         if not enable_mcp:
             click.echo("   Use -e/--enable-mcp to enable specific MCPs")
@@ -421,7 +426,7 @@ def run(
     )
 
     if coder_config and coder_config.extensions:
-        for mcp in coder_config.extensions :
+        for mcp in coder_config.extensions:
             # use emoji to indicate enabled/disabled
             if mcp.enabled:
                 click.echo(f"  āœ… MCP: {mcp.name}")
@@ -476,7 +481,9 @@ def run(
             click.echo("\nšŸ“‹ Tool uses:")
             for tool_use in result.tool_uses:
                 success = "āœ…" if tool_use.success else "āŒ"
-                click.echo(f"  {success} {tool_use.name} with arguments: {tool_use.arguments}")
+                click.echo(
+                    f"  {success} {tool_use.name} with arguments: {tool_use.arguments}"
+                )
                 if tool_use.error:
                     click.echo(f"    Error: {tool_use.error}")
@@ -485,7 +492,7 @@ def run(
                 f"\nšŸ“‹ Structured messages ({len(result.structured_messages)} total)"
             )
             for i, msg in enumerate(result.structured_messages):
-                click.echo(f"  {i+1}. {msg}")
+                click.echo(f"  {i + 1}. {msg}")
 
 
 @cli.command("list-coders")
@@ -588,10 +595,10 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose:
     click.echo("\nšŸ“ˆ Summary:")
     click.echo(f"   Total: {summary['total_evaluations']}")
     click.echo(
-        f"   Passed: {summary['passed']} ({summary['passed']/summary['total_evaluations']*100:.1f}%)"
+        f"   Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)"
     )
     click.echo(
-        f"   Failed: {summary['failed']} ({summary['failed']/summary['total_evaluations']*100:.1f}%)"
+        f"   Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)"
     )
     if summary["errors"] > 0:
         click.echo(f"   Errors: {summary['errors']} āš ļø")
@@ -640,22 +647,22 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose:
 def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose: bool):
     """
     Introspect an MCP server to list its available tools, resources, and prompts.
-    
+
     MCP_SPEC can be:
     - A URL (http://localhost:8080)
     - A command (uvx mcp-server-fetch)
     - An MCP name when used with --registry
-    
+
     Examples:
-    
+
         \b
         # Introspect a running MCP server
        metacoder introspect-mcp http://localhost:8080
-    
+
        \b
        # Introspect an MCP from registry
        metacoder introspect-mcp fetch --registry metacoder.basics
-    
+
        \b
        # Introspect a command-based MCP
        metacoder introspect-mcp "uvx mcp-server-fetch"
@@ -665,18 +672,24 @@ def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose: bool):
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
-    
+
     # Run the introspection with proper cleanup
     import os
     import sys
-    
+
     # Suppress the specific asyncio warning by running with -W flag
     env = os.environ.copy()
-    env['PYTHONWARNINGS'] = 'ignore::RuntimeWarning:asyncio.base_subprocess'
-    
+    env["PYTHONWARNINGS"] = "ignore::RuntimeWarning:asyncio.base_subprocess"
+
     # Run in a subprocess to isolate the asyncio event loop
     import subprocess
-    args = [sys.executable, "-W", "ignore::RuntimeWarning:asyncio.base_subprocess", "-c", f"""
+
+    args = [
+        sys.executable,
+        "-W",
+        "ignore::RuntimeWarning:asyncio.base_subprocess",
+        "-c",
+        f"""
 import asyncio
 import sys
 sys.path.insert(0, {repr(str(Path(__file__).parent.parent))})
@@ -688,26 +701,26 @@ def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose: bool):
 except Exception as e:
     print(f"Error: {{e}}", file=sys.stderr)
     sys.exit(1)
-"""]
-
+""",
+    ]
+
     try:
         # Run with stderr captured to filter out asyncio warnings
         result = subprocess.run(
-            args, 
-            env=env, 
-            timeout=timeout + 5,
-            stderr=subprocess.PIPE,
-            text=True
+            args, env=env, timeout=timeout + 5, stderr=subprocess.PIPE, text=True
         )
-        
+
         # Filter out the specific asyncio warning from stderr
         if result.stderr:
             error_lines = []
             skip_next = 0
             lines = result.stderr.splitlines()
-            
+
             for i, line in enumerate(lines):
-                if "Exception ignored in: <function BaseSubprocessTransport.__del__" in line:
+                if (
+                    "Exception ignored in: <function BaseSubprocessTransport.__del__"
+                    in line
+                ):
                     skip_next = 1
                 elif skip_next > 0:
@@ -717,12 +730,12 @@ def introspect_mcp(mcp_spec: str, registry: Optional[str], timeout: int, verbose: bool):
                         skip_next = 0  # Stop skipping after this line
                     else:
                         error_lines.append(line)
-            
+
             # Print any remaining stderr
             if error_lines:
                 for line in error_lines:
                     click.echo(line, err=True)
-        
+
         if result.returncode != 0:
             raise click.ClickException("Failed to introspect MCP server")
     except subprocess.TimeoutExpired:
@@ -736,48 +749,50 @@ async def _introspect_mcp_async(
 ):
     """Async implementation of MCP introspection."""
     from fastmcp import Client
-    
+
     mcp_config = None
     spec_to_use: Union[str, list[str]] = mcp_spec
-    
+
     # If registry is specified, load the MCP config
     if registry:
         click.echo(f"šŸ“š Loading MCP '{mcp_spec}' from registry: {registry}")
         registry_config = load_mcp_registry(registry)
-        
+
         # Find the MCP in the registry
         mcp_config = None
         for mcp in registry_config.servers:
             if mcp.name == mcp_spec:
                 mcp_config = mcp
                 break
-        
+
         if not mcp_config:
             available = [mcp.name for mcp in registry_config.servers]
             raise click.ClickException(
                 f"MCP '{mcp_spec}' not found in registry. Available: {', '.join(available)}"
             )
-        
+
         # Build the command from MCP config
         if mcp_config.command and mcp_config.args:
             spec_to_use = [mcp_config.command] + mcp_config.args
         else:
-            raise click.ClickException(f"MCP '{mcp_spec}' has incomplete command configuration")
-    
+            raise click.ClickException(
+                f"MCP '{mcp_spec}' has incomplete command configuration"
+            )
+
     click.echo(f"šŸ” Introspecting MCP: {spec_to_use}")
-    
+
     # Create client based on the spec type
     if isinstance(spec_to_use, list):
         # Command-based MCP - FastMCP expects a single server config dict
         server_config = {
             "server_name": {
                 "command": spec_to_use[0],
-                "args": spec_to_use[1:] if len(spec_to_use) > 1 else []
+                "args": spec_to_use[1:] if len(spec_to_use) > 1 else [],
             }
         }
         if mcp_config and mcp_config.env:
             server_config["server_name"]["env"] = mcp_config.env  # type: ignore
-        
+
         # FastMCP expects the full config with mcpServers key
         full_config = {"mcpServers": server_config}
         client = Client(full_config)
@@ -787,28 +802,29 @@ async def _introspect_mcp_async(
     else:
         # Try as command
         import shlex
+
         parts = shlex.split(spec_to_use)
         server_config = {
             "server_name": {
                 "command": parts[0],
-                "args": parts[1:] if len(parts) > 1 else []
+                "args": parts[1:] if len(parts) > 1 else [],
             }
         }
         full_config = {"mcpServers": server_config}
         client = Client(full_config)
-    
+
     async with client:
         click.echo("āœ… Connected to MCP server")
-        
+
         # Get server info if available
-        if hasattr(client, 'server_info'):
+        if hasattr(client, "server_info"):
             info = client.server_info
             click.echo("\nšŸ“‹ Server Info:")
             click.echo(f"   Name: {info.name}")
             click.echo(f"   Version: {info.version}")
-            if hasattr(info, 'description') and info.description:
+            if hasattr(info, "description") and info.description:
                 click.echo(f"   Description: {info.description}")
-        
+
         # List tools
         click.echo("\nšŸ”§ Available Tools:")
         try:
@@ -818,13 +834,15 @@ async def _introspect_mcp_async(
                 click.echo(f"\n   šŸ“Œ {tool.name}")
                 if tool.description:
                     click.echo(f"      Description: {tool.description}")
-                if verbose and hasattr(tool, 'inputSchema') and tool.inputSchema:
-                    click.echo(f"      Input Schema: {yaml.dump(tool.inputSchema, default_flow_style=False, indent=8).strip()}")
+                if verbose and hasattr(tool, "inputSchema") and tool.inputSchema:
+                    click.echo(
+                        f"      Input Schema: {yaml.dump(tool.inputSchema, default_flow_style=False, indent=8).strip()}"
+                    )
             else:
                 click.echo("   (No tools available)")
         except Exception as e:
             click.echo(f"   āš ļø Error listing tools: {e}")
-        
+
         # List resources
         click.echo("\nšŸ“ Available Resources:")
         try:
@@ -841,7 +859,7 @@ async def _introspect_mcp_async(
                 click.echo("   (No resources available)")
         except Exception as e:
             click.echo(f"   āš ļø Error listing resources: {e}")
-        
+
         # List prompts
         click.echo("\nšŸ’¬ Available Prompts:")
         try:
@@ -851,16 +869,18 @@ async def _introspect_mcp_async(
                 click.echo(f"\n   šŸ’” {prompt.name}")
                 if prompt.description:
                     click.echo(f"      Description: {prompt.description}")
-                if verbose and hasattr(prompt, 'arguments') and prompt.arguments:
+                if verbose and hasattr(prompt, "arguments") and prompt.arguments:
                     click.echo("      Arguments:")
                     for arg in prompt.arguments:
                         req = "required" if arg.required else "optional"
-                        click.echo(f"        - {arg.name} ({req}): {arg.description}")
+                        click.echo(
+                            f"        - {arg.name} ({req}): {arg.description}"
+                        )
             else:
                 click.echo("   (No prompts available)")
         except Exception as e:
             click.echo(f"   āš ļø Error listing prompts: {e}")
-        
+
         click.echo("\nāœ… Introspection complete!")
diff --git a/tests/test_claude_tool_use.py b/tests/test_claude_tool_use.py
index 817617f..3d74485 100644
--- a/tests/test_claude_tool_use.py
+++ b/tests/test_claude_tool_use.py
@@ -1,10 +1,11 @@
 """Test ClaudeCoder tool use extraction."""
+
 from metacoder.coders.base_coder import CoderOutput, ToolUse
 
 
 def test_claude_tool_use_extraction():
     """Test that ClaudeCoder correctly extracts tool uses from structured messages."""
-    
+
     # Create a mock output with tool use in structured messages
     output = CoderOutput(
         stdout="",
@@ -18,31 +19,31 @@ def test_claude_tool_use_extraction():
                             "type": "tool_use",
                             "id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s",
                             "name": "mcp__pubmed__get_paper_fulltext",
-                            "input": {"pmid": "35743164"}
+                            "input": {"pmid": "35743164"},
                         }
                     ]
-                }
+                },
             },
             {
-                "type": "user", 
+                "type": "user",
                 "message": {
                     "content": [
                         {
                             "type": "tool_result",
                             "content": "Paper content here...",
                             "is_error": False,
-                            "tool_use_id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s"
+                            "tool_use_id": "toolu_019mJqdgpJSP1Z6UcfsMhx7s",
                        }
                     ]
-                }
-            }
-        ]
+                },
+            },
+        ],
     )
-    
+
     # Process structured messages to extract tool uses
     tool_uses = []
     pending_tool_uses = {}
-    
+
     for message in output.structured_messages:
         # Check for tool_use in assistant messages
         if message.get("type") == "assistant" and message.get("message"):
@@ -53,16 +54,16 @@ def test_claude_tool_use_extraction():
                     tool_id = content_item.get("id")
                     tool_name = content_item.get("name", "")
                     tool_input = content_item.get("input", {})
-                    
+
                     # Store pending tool use
                     pending_tool_uses[tool_id] = {
                         "name": tool_name,
                        "arguments": tool_input,
                         "success": False,
                         "error": None,
-                        "result": None
+                        "result": None,
                     }
-                    
+
         # Check for tool_result in user messages
         elif message.get("type") == "user" and message.get("message"):
             msg_content = message["message"].get("content", [])
@@ -72,22 +73,24 @@ def test_claude_tool_use_extraction():
                     tool_id = content_item.get("tool_use_id")
                     if tool_id in pending_tool_uses:
                         tool_data = pending_tool_uses[tool_id]
-                        
+
                         # Update with result
                         is_tool_error = content_item.get("is_error", False)
                         tool_data["success"] = not is_tool_error
                         tool_data["result"] = content_item.get("content", "")
-                        
+
                         if is_tool_error:
-                            tool_data["error"] = content_item.get("content", "Tool error occurred")
-                        
+                            tool_data["error"] = content_item.get(
+                                "content", "Tool error occurred"
+                            )
+
                         # Create ToolUse object
                         tool_use = ToolUse(**tool_data)
                         tool_uses.append(tool_use)
-                        
+
                         # Remove from pending
                         del pending_tool_uses[tool_id]
-    
+
     # Verify extraction
     assert len(tool_uses) == 1
     tool_use = tool_uses[0]
@@ -100,7 +103,7 @@ def test_claude_tool_use_extraction():
 
 def test_claude_tool_use_error():
     """Test that ClaudeCoder correctly handles tool errors."""
-    
+
     # Create a mock output with tool error
     output = CoderOutput(
         stdout="",
@@ -114,31 +117,31 @@ def test_claude_tool_use_error():
                             "type": "tool_use",
                             "id": "toolu_test",
                             "name": "mcp__pubmed__get_paper_fulltext",
-                            "input": {"pmid": "invalid"}
+                            "input": {"pmid": "invalid"},
                         }
                     ]
-                }
+                },
             },
             {
-                "type": "user", 
+                "type": "user",
                 "message": {
                     "content": [
                         {
                             "type": "tool_result",
                             "content": "MCP tool response exceeds maximum allowed tokens",
                             "is_error": True,
-                            "tool_use_id": "toolu_test"
+                            "tool_use_id": "toolu_test",
                         }
                     ]
-                }
-            }
-        ]
+                },
+            },
+        ],
     )
-    
+
     # Process structured messages to extract tool uses
     tool_uses = []
     pending_tool_uses = {}
-    
+
     for message in output.structured_messages:
         # Check for tool_use in assistant messages
         if message.get("type") == "assistant" and message.get("message"):
@@ -149,16 +152,16 @@ def test_claude_tool_use_error():
                     tool_id = content_item.get("id")
                     tool_name = content_item.get("name", "")
                     tool_input = content_item.get("input", {})
-                    
+
                     # Store pending tool use
                     pending_tool_uses[tool_id] = {
                         "name": tool_name,
                         "arguments": tool_input,
                         "success": False,
                         "error": None,
-                        "result": None
+                        "result": None,
                     }
-                    
+
         # Check for tool_result in user messages
         elif message.get("type") == "user" and message.get("message"):
             msg_content = message["message"].get("content", [])
@@ -168,22 +171,24 @@ def test_claude_tool_use_error():
                     tool_id = content_item.get("tool_use_id")
                     if tool_id in pending_tool_uses:
                         tool_data = pending_tool_uses[tool_id]
-                        
+
                         # Update with result
                         is_tool_error = content_item.get("is_error", False)
                         tool_data["success"] = not is_tool_error
                         tool_data["result"] = content_item.get("content", "")
-                        
+
                         if is_tool_error:
-                            tool_data["error"] = content_item.get("content", "Tool error occurred")
-                        
+                            tool_data["error"] = content_item.get(
+                                "content", "Tool error occurred"
+                            )
+
                         # Create ToolUse object
                         tool_use = ToolUse(**tool_data)
                         tool_uses.append(tool_use)
-                        
+
                         # Remove from pending
                         del pending_tool_uses[tool_id]
-    
+
     # Verify error handling
     assert len(tool_uses) == 1
     tool_use = tool_uses[0]
@@ -191,4 +196,4 @@ def test_claude_tool_use_error():
     assert tool_use.arguments == {"pmid": "invalid"}
     assert tool_use.success is False
     assert tool_use.error == "MCP tool response exceeds maximum allowed tokens"
-    assert tool_use.result == "MCP tool response exceeds maximum allowed tokens"
\ No newline at end of file
+    assert tool_use.result == "MCP tool response exceeds maximum allowed tokens"
diff --git a/tests/test_coders/test_coder_availability.py b/tests/test_coders/test_coder_availability.py
index d9a75d3..4b53b63 100644
--- a/tests/test_coders/test_coder_availability.py
+++ b/tests/test_coders/test_coder_availability.py
@@ -59,12 +59,12 @@ def test_all_coders_have_availability_method():
     from metacoder.metacoder import AVAILABLE_CODERS
 
     for coder_name, coder_class in AVAILABLE_CODERS.items():
-        assert hasattr(
-            coder_class, "is_available"
-        ), f"{coder_name} missing is_available method"
-        assert callable(
-            coder_class.is_available
-        ), f"{coder_name}.is_available is not callable"
+        assert hasattr(coder_class, "is_available"), (
+            f"{coder_name} missing is_available method"
+        )
+        assert callable(coder_class.is_available), (
+            f"{coder_name}.is_available is not callable"
+        )
 
 
 @patch("shutil.which")
diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py
index 4180e92..cb73641 100644
--- a/tests/test_coders/test_coders_basic.py
+++ b/tests/test_coders/test_coders_basic.py
@@ -62,17 +62,17 @@ def test_llm_coder_basic_arithmetic(coder_name, coder_class):
 
         # Check result
         assert result is not None
-        assert (
-            result.stdout or result.result_text
-        ), "Coder should produce some output"
+        assert result.stdout or result.result_text, (
+            "Coder should produce some output"
+        )
 
         # Get the actual output text
         output_text = result.result_text or result.stdout
 
         # All LLM coders should include "4" in their answer
-        assert (
-            "4" in output_text
-        ), f"{coder_name} should answer '4' to 'What is 2+2?'"
+        assert "4" in output_text, (
+            f"{coder_name} should answer '4' to 'What is 2+2?'"
+        )
 
     except Exception as e:
         pytest.fail(f"Coder {coder_name} failed with error: {e}")
@@ -137,12 +137,12 @@ def test_llm_coder_code_generation(coder_name, coder_class):
         assert output_text, "Coder should produce some output"
 
         # Verify the output contains Python code elements
-        assert (
-            "def" in output_text
-        ), f"{coder_name} should generate a Python function"
-        assert (
-            "return" in output_text or "print" in output_text
-        ), f"{coder_name} should have return or print"
+        assert "def" in output_text, (
+            f"{coder_name} should generate a Python function"
+        )
+        assert "return" in output_text or "print" in output_text, (
+            f"{coder_name} should have return or print"
+        )
 
     except Exception as e:
         pytest.fail(f"Coder {coder_name} failed with error: {e}")
diff --git a/tests/test_dummy_coder_tool_capture.py b/tests/test_dummy_coder_tool_capture.py
index a3d6364..b92b7ab 100644
--- a/tests/test_dummy_coder_tool_capture.py
+++ b/tests/test_dummy_coder_tool_capture.py
@@ -1,4 +1,5 @@
 """Test that DummyCoder properly captures tool calls in CoderOutput."""
+
 from metacoder.coders.dummy import DummyCoder
 from metacoder.coders.base_coder import CoderOutput, ToolUse
 
@@ -6,23 +7,25 @@
 def test_dummy_coder_captures_tool_calls():
     """Test that DummyCoder captures tool calls in the CoderOutput."""
     coder = DummyCoder(workdir="test")
-    
+
     # Run with input that triggers tool use
     output = coder.run("Use MCP to search PubMed for cancer research")
-    
+
     # Verify output is a CoderOutput instance
     assert isinstance(output, CoderOutput)
-    
+
     # Verify basic output fields
     assert output.stdout == "you said: Use MCP to search PubMed for cancer research"
     assert output.stderr == ""
-    assert output.result_text == "you said: Use MCP to search PubMed for cancer research"
-    
+    assert (
+        output.result_text == "you said: Use MCP to search PubMed for cancer research"
+    )
+
     # Verify tool_uses is populated
     assert output.tool_uses is not None
     assert isinstance(output.tool_uses, list)
     assert len(output.tool_uses) == 1
-    
+
     # Verify the tool use is properly structured
     tool_use = output.tool_uses[0]
     assert isinstance(tool_use, ToolUse)
@@ -36,27 +39,27 @@ def test_dummy_coder_captures_tool_calls():
 def test_dummy_coder_captures_multiple_tools():
     """Test that DummyCoder can capture multiple tool calls."""
     coder = DummyCoder(workdir="test")
-    
+
     # Run with input that triggers multiple tools
     output = coder.run("Search PubMed and then cause an error")
-    
+
     # Verify multiple tools are captured
     assert output.tool_uses is not None
     assert len(output.tool_uses) == 2
-    
+
     # Verify each tool is a proper ToolUse instance
     for tool in output.tool_uses:
         assert isinstance(tool, ToolUse)
-        assert hasattr(tool, 'name')
-        assert hasattr(tool, 'arguments')
-        assert hasattr(tool, 'success')
-        assert hasattr(tool, 'error')
-        assert hasattr(tool, 'result')
-    
+        assert hasattr(tool, "name")
+        assert hasattr(tool, "arguments")
+        assert hasattr(tool, "success")
+        assert hasattr(tool, "error")
+        assert hasattr(tool, "result")
+
     # Check first tool (PubMed search)
     assert output.tool_uses[0].name == "mcp__pubmed__search_papers"
     assert output.tool_uses[0].success is True
-    
+
     # Check second tool (error)
     assert output.tool_uses[1].name == "mcp__test__failing_tool"
     assert output.tool_uses[1].success is False
@@ -66,14 +69,14 @@ def test_dummy_coder_no_tools_when_not_triggered():
     """Test that DummyCoder doesn't add tools when not triggered."""
     coder = DummyCoder(workdir="test")
-    
+
     # Run with input that doesn't trigger tools
     output = coder.run("What is the weather today?")
-    
+
     # Verify output structure
     assert isinstance(output, CoderOutput)
     assert output.stdout == "you said: What is the weather today?"
-    
+
     # Verify no tools are added
     assert output.tool_uses is None
 
@@ -81,14 +84,14 @@ def test_dummy_coder_tool_error_capture():
     """Test that DummyCoder properly captures tool errors."""
     coder = DummyCoder(workdir="test")
-    
+
     # Run with input that triggers an error
     output = coder.run("Use tool with error")
-    
+
     # Verify error tool is captured
     assert output.tool_uses is not None
     assert len(output.tool_uses) == 1
-    
+
     error_tool = output.tool_uses[0]
     assert error_tool.name == "mcp__test__failing_tool"
     assert error_tool.success is False
@@ -100,22 +103,22 @@ def test_dummy_coder_tool_serialization():
     """Test that tool uses can be serialized properly."""
     coder = DummyCoder(workdir="test")
-    
+
     # Run with tool trigger
     output = coder.run("Use MCP tool")
-    
+
     # Verify tool uses can be converted to dict (for serialization)
     assert output.tool_uses is not None
     tool_dict = output.tool_uses[0].model_dump()
-    
+
     assert isinstance(tool_dict, dict)
     assert "name" in tool_dict
     assert "arguments" in tool_dict
     assert "success" in tool_dict
     assert "error" in tool_dict
     assert "result" in tool_dict
-    
+
     # Verify values
     assert tool_dict["name"] == "mcp__dummy__test_tool"
     assert tool_dict["success"] is True
-    assert tool_dict["error"] is None
\ No newline at end of file
+    assert tool_dict["error"] is None
diff --git a/tests/test_dummy_tool_use.py b/tests/test_dummy_tool_use.py
index ca3f2b4..f9f675e 100644
--- a/tests/test_dummy_tool_use.py
+++ b/tests/test_dummy_tool_use.py
@@ -1,4 +1,5 @@
 """Test DummyCoder fake tool use generation."""
+
 from metacoder.coders.dummy import DummyCoder
 
 
@@ -6,7 +7,7 @@ def test_dummy_no_tools():
     """Test that dummy coder doesn't add tools when not mentioned."""
     coder = DummyCoder(workdir="test")
     output = coder.run("What is 2 + 2?")
-    
+
     assert output.stdout == "you said: What is 2 + 2?"
     assert output.tool_uses is None
 
@@ -15,10 +16,10 @@ def test_dummy_default_tool():
     """Test that dummy coder adds default tool when mentioned."""
     coder = DummyCoder(workdir="test")
     output = coder.run("Use a tool to help me")
-    
+
     assert output.tool_uses is not None
     assert len(output.tool_uses) == 1
-    
+
     tool = output.tool_uses[0]
     assert tool.name == "mcp__dummy__test_tool"
     assert tool.arguments == {"input": "Use a tool to help me"}
@@ -31,10 +32,10 @@ def test_dummy_pubmed_search():
     """Test that dummy coder simulates PubMed search."""
     coder = DummyCoder(workdir="test")
     output = coder.run("Search PubMed for papers about cancer")
-    
+
     assert output.tool_uses is not None
     assert len(output.tool_uses) == 1
-    
+
     tool = output.tool_uses[0]
     assert tool.name == "mcp__pubmed__search_papers"
     assert tool.arguments == {"query": "test query", "limit": 10}
@@ -47,10 +48,10 @@ def test_dummy_tool_error():
     """Test that dummy coder simulates tool errors."""
     coder = DummyCoder(workdir="test")
     output = coder.run("Use MCP tool but simulate an error")
-    
+
     assert output.tool_uses is not None
     assert len(output.tool_uses) == 1
-    
+
     tool = output.tool_uses[0]
     assert tool.name == "mcp__test__failing_tool"
     assert tool.arguments == {"param": "value"}
@@ -63,15 +64,15 @@ def test_dummy_multiple_tools():
     """Test that dummy coder can simulate multiple tools."""
     coder = DummyCoder(workdir="test")
     output = coder.run("Search PubMed and then simulate an error with MCP")
-    
+
     assert output.tool_uses is not None
     assert len(output.tool_uses) == 2
-    
+
     # First tool - PubMed search
     tool1 = output.tool_uses[0]
     assert tool1.name == "mcp__pubmed__search_papers"
     assert tool1.success is True
-    
+
     # Second tool - error simulation
     tool2 = output.tool_uses[1]
     assert tool2.name == "mcp__test__failing_tool"
@@ -82,10 +83,10 @@ def test_dummy_mcp_keyword():
     """Test that MCP keyword triggers tool use."""
     coder = DummyCoder(workdir="test")
     output = coder.run("Test MCP functionality")
-    
+
     assert output.tool_uses is not None
     assert len(output.tool_uses) == 1
-    
+
     tool = output.tool_uses[0]
     assert tool.name == "mcp__dummy__test_tool"
-    assert tool.success is True
\ No newline at end of file
+    assert tool.success is True
diff --git a/tests/test_evals/test_deep_eval.py b/tests/test_evals/test_deep_eval.py
deleted file mode 100644
index c55dec2..0000000
--- a/tests/test_evals/test_deep_eval.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-Test the deepeval library.
-
-https://github.com/metacoder-ai/deepeval
-
-Note this doesn't actually test any metacoder functonality, it is more to explore
-deepeval metrics, it can probably be removed in the future.
-"""
-
-from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
-from deepeval import evaluate
-from deepeval.metrics import (
-    FaithfulnessMetric,
-    HallucinationMetric,
-)
-from deepeval.test_case import LLMTestCase
-import pytest
-
-
-@pytest.mark.llm
-@pytest.mark.parametrize("metric_cls", [FaithfulnessMetric])
-def test_generic_eval(metric_cls):
-    """Test FaithfulnessMetric with correct output matching context."""
-    metric = metric_cls(threshold=0.7)
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="The answer to the question 'what is the title of PMID:28027860?' is 'From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.'",
-        actual_output='The answer to the question "what is the title of PMID:28027860?" is "From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."',
-        context=[
-            "Title: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-        retrieval_context=[
-            "PMID:28027860? Title: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))
-
-
-@pytest.mark.llm
-@pytest.mark.parametrize("metric_cls", [HallucinationMetric])
-def test_hallucination_eval(metric_cls):
-    """Test HallucinationMetric detects incorrect information not supported by context."""
-    metric = metric_cls(threshold=0.7)
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.",
-        actual_output='The title of the article with PMID:28027860 is "Predictors of acute and persisting fatigue in people with relapsing and remitting multiple sclerosis: A cohort study."',
-        context=[
-            "Title of PMID:28027860: From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy."
-        ],
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))
-
-
-
-
-correctness_metric = GEval(
-    name="Correctness",
-    criteria="Determine whether the actual output is factually correct based on the expected output.",
-    # NOTE: you can only provide either criteria or evaluation_steps, and not both
-    evaluation_steps=[
-        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-        "You should also heavily penalize omission of detail",
-        "Vague language, or contradicting OPINIONS, are OK",
-    ],
-    threshold=0.8,
-    evaluation_params=[
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ],
-)
-
-
-@pytest.mark.llm
-def test_geval_eval():
-    """Test GEval correctness metric catches factual errors in output."""
-    metric = correctness_metric
-    test_case = LLMTestCase(
-        input="What is the title of PMID:28027860?",
-        expected_output="From nocturnal frontal lobe epilepsy to Sleep-Related Hypermotor Epilepsy.",
-        actual_output='The title of the article with PMID:28027860 is "Predictors of acute and persisting fatigue in people with relapsing and remitting multiple sclerosis: A cohort study."',
-    )
-    results = evaluate([test_case], [metric])
-    import yaml
-
-    print(results)
-    print(yaml.dump(results.model_dump()))
diff --git a/tests/test_evals/test_runner.py b/tests/test_evals/test_runner.py
index 0515237..d1f0c3e 100644
--- a/tests/test_evals/test_runner.py
+++ b/tests/test_evals/test_runner.py
@@ -1,6 +1,9 @@
 """Tests for the evaluation runner.
 
 This uses only dummy coders, so can be used in non-integration contexts.
+
+TODO: some of these are marked llm because they use an LLM in the eval
+phase, even if they use a dummy coder - figure out a way to have a dummy LLM Eval too
 """
 
 import pytest
@@ -137,6 +140,7 @@ def test_create_test_case_with_list_context(self):
         test_case = runner.create_test_case(eval_case, "4")
         assert test_case.retrieval_context == ["Math fact 1", "Math fact 2"]
 
+    @pytest.mark.llm
     def test_run_single_eval_with_dummy(self, simple_config, tmp_path):
         """Test running a single evaluation with dummy coder."""
         runner = EvalRunner()
@@ -244,6 +248,7 @@ def test_save_and_load_results(self, tmp_path):
         assert data["results"][0]["model"] == "model1"
         assert data["results"][0]["score"] == 0.9
 
+    @pytest.mark.llm
     def test_run_all_evals_with_dummy(self, simple_config, tmp_path):
         """Test running all evaluations with dummy coder."""
         runner = EvalRunner()
diff --git a/tests/test_goose_tool_use.py b/tests/test_goose_tool_use.py
index 94d83aa..3b643ae 100644
--- a/tests/test_goose_tool_use.py
+++ b/tests/test_goose_tool_use.py
@@ -1,4 +1,5 @@
 """Test GooseCoder tool use extraction."""
+
 from metacoder.coders.base_coder import ToolUse
 
 
@@ -13,7 +14,7 @@ def test_goose_tool_use_extraction():
             "content": [
                 {
                     "type": "text",
-                    "text": "I'll help you find information about diseases associated with ITPR1 mutations."
+                    "text": "I'll help you find information about diseases associated with ITPR1 mutations.",
                 },
                 {
                     "type": "toolRequest",
@@ -22,11 +23,11 @@ def test_goose_tool_use_extraction():
                         "status": "success",
                         "value": {
                             "name": "pubmed__get_paper_fulltext",
-                            "arguments": {"pmid": "35743164"}
-                        }
-                    }
-                }
-            ]
+                            "arguments": {"pmid": "35743164"},
+                        },
+                    },
+                },
+            ],
         },
         {
             "id": None,
@@ -38,22 +39,17 @@ def test_goose_tool_use_extraction():
                     "id": "toolu_01RbESTBH9tyWu9Q9uAVRjja",
                     "toolResult": {
                         "status": "success",
-                        "value": [
-                            {
-                                "type": "text",
-                                "text": "Paper content here..."
-                            }
-                        ]
-                    }
+                        "value": [{"type": "text", "text": "Paper content here..."}],
+                    },
                 }
-            ]
-        }
+            ],
+        },
     ]
-    
+
     # Process structured messages to extract tool uses (mimicking goose logic)
     tool_uses = []
     pending_tool_uses = {}
-    
+
     for message in structured_messages:
         # Check for tool requests in assistant messages
         if message.get("role") == "assistant" and "content" in message:
@@ -61,21 +57,21 @@ def test_goose_tool_use_extraction():
                 if isinstance(content, dict) and content.get("type") == "toolRequest":
                     tool_id = content.get("id")
                     tool_call = content.get("toolCall", {})
-                    
+
                     if tool_call.get("status") == "success":
                         tool_value = tool_call.get("value", {})
                         tool_name = tool_value.get("name", "")
                         tool_args = tool_value.get("arguments", {})
-                        
+
                         # Store pending tool use
                         pending_tool_uses[tool_id] = {
                             "name": tool_name,
                             "arguments": tool_args,
                             "success": False,
                             "error": None,
-                            "result": None
+                            "result": None,
                        }
-                        
+
         # Check for tool responses in user messages
         elif message.get("role") == "user" and "content" in message:
             for content in message.get("content", []):
@@ -84,7 +80,7 @@ def test_goose_tool_use_extraction():
                    if tool_id in pending_tool_uses:
                         tool_data = pending_tool_uses[tool_id]
                         tool_result = content.get("toolResult", {})
-                        
+
                         # Update with result
                         if tool_result.get("status") == "success":
                             tool_data["success"] = True
@@ -93,23 +89,32 @@ def test_goose_tool_use_extraction():
                             if isinstance(result_value, list):
                                 result_texts = []
                                 for item in result_value:
-                                    if isinstance(item, dict) and item.get("type") == "text":
+                                    if (
+                                        isinstance(item, dict)
+                                        and item.get("type") == "text"
+                                    ):
                                         result_texts.append(item.get("text", ""))
-                                tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value)
+                                tool_data["result"] = (
+                                    "\n".join(result_texts)
+                                    if result_texts
+                                    else str(result_value)
+                                )
                             else:
                                 tool_data["result"] = str(result_value)
                         else:
                             tool_data["success"] = False
-                            tool_data["error"] = tool_result.get("error", "Tool execution failed")
+                            tool_data["error"] = tool_result.get(
+                                "error", "Tool execution failed"
+                            )
                             tool_data["result"] = None
-                        
+
                         # Create ToolUse object
                         tool_use = ToolUse(**tool_data)
                         tool_uses.append(tool_use)
-                        
+
                         # Remove from pending
                         del pending_tool_uses[tool_id]
-    
+
     # Verify extraction
     assert len(tool_uses) == 1
     tool_use = tool_uses[0]
@@ -132,13 +137,10 @@ def test_goose_tool_use_error():
                     "id": "toolu_test",
                     "toolCall": {
                         "status": "success",
-                        "value": {
-                            "name": "test_tool",
-                            "arguments": {"param": "value"}
-                        }
-                    }
+                        "value": {"name": "test_tool", "arguments": {"param": "value"}},
+                    },
                 }
-            ]
+            ],
         },
         {
             "role": "user",
@@ -148,37 +150,37 @@ def test_goose_tool_use_error():
                     "id": "toolu_test",
                     "toolResult": {
                         "status": "error",
-                        "error": "Tool failed to execute"
-                    }
+                        "error": "Tool failed to execute",
+                    },
                 }
-            ]
-        }
+            ],
+        },
     ]
-    
+
     # Process structured messages to extract tool uses
     tool_uses = []
     pending_tool_uses = {}
-    
+
     for message in structured_messages:
         if message.get("role") == "assistant" and "content" in message:
             for content in message.get("content", []):
                 if isinstance(content, dict) and content.get("type") == "toolRequest":
                     tool_id = content.get("id")
                     tool_call = content.get("toolCall", {})
-                    
+
                     if tool_call.get("status") == "success":
                         tool_value = tool_call.get("value", {})
                         tool_name = tool_value.get("name", "")
                         tool_args = tool_value.get("arguments", {})
-                        
+
                         pending_tool_uses[tool_id] = {
                             "name": tool_name,
                             "arguments": tool_args,
                             "success": False,
                             "error": None,
-                            "result": None
+                            "result": None,
                         }
-                    
+
         elif message.get("role") == "user" and "content" in message:
             for content in message.get("content", []):
                 if isinstance(content, dict) and content.get("type") == "toolResponse":
@@ -186,27 +188,36 @@ def test_goose_tool_use_error():
                     if tool_id in pending_tool_uses:
                         tool_data = pending_tool_uses[tool_id]
                         tool_result = content.get("toolResult", {})
-                        
+
                         if tool_result.get("status") == "success":
                             tool_data["success"] = True
                             result_value = tool_result.get("value", [])
                             if isinstance(result_value, list):
                                 result_texts = []
                                 for item in result_value:
-                                    if isinstance(item, dict) and item.get("type") == "text":
+                                    if (
+                                        isinstance(item, dict)
+                                        and item.get("type") == "text"
+                                    ):
                                         result_texts.append(item.get("text", ""))
-                                tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value)
+                                tool_data["result"] = (
+                                    "\n".join(result_texts)
+                                    if result_texts
+                                    else str(result_value)
+                                )
                             else:
                                 tool_data["result"] = str(result_value)
                         else:
                             tool_data["success"] = False
-                            tool_data["error"] = tool_result.get("error", "Tool execution failed")
+                            tool_data["error"] = tool_result.get(
+                                "error", "Tool execution failed"
+                            )
                             tool_data["result"] = None
-                        
+
                         tool_use = ToolUse(**tool_data)
                         tool_uses.append(tool_use)
                         del pending_tool_uses[tool_id]
-    
+
     # Verify error handling
     assert len(tool_uses) == 1
     tool_use = tool_uses[0]
@@ -230,11 +241,11 @@ def test_goose_multiple_tools():
                         "status": "success",
                         "value": {
                             "name": "search_tool",
-                            "arguments": {"query": "test"}
-                        }
-                    }
+                            "arguments": {"query": "test"},
+                        },
+                    },
                 }
-            ]
+            ],
         },
         {
             "role": "user",
@@ -244,10 +255,10 @@ def test_goose_multiple_tools():
                     "id": "tool1",
                     "toolResult": {
                         "status": "success",
-                        "value": [{"type": "text", "text": "Search results"}]
-                    }
+                        "value": [{"type": "text", "text": "Search results"}],
+                    },
                 }
-            ]
+            ],
         },
         {
             "role": "assistant",
@@ -259,11 +270,11 @@ def test_goose_multiple_tools():
                         "status": "success",
                         "value": {
                             "name": "fetch_tool",
-                            "arguments": {"url": "http://example.com"}
-                        }
-                    }
+                            "arguments": {"url": "http://example.com"},
+                        },
+                    },
                 }
-            ]
+            ],
         },
         {
             "role": "user",
@@ -273,24 +284,24 @@ def test_goose_multiple_tools():
                    "id": "tool2",
                     "toolResult": {
                         "status": "success",
-                        "value": [{"type": "text", "text": "Fetched content"}]
-                    }
+                        "value": [{"type": "text", "text": "Fetched content"}],
+                    },
                 }
-            ]
-        }
+            ],
+        },
     ]
-    
+
     # Process structured messages
     tool_uses = []
     pending_tool_uses = {}
-    
+
     for message in structured_messages:
         if message.get("role") == "assistant" and "content" in message:
             for content in message.get("content", []):
                 if isinstance(content, dict) and content.get("type") == "toolRequest":
                     tool_id = content.get("id")
                     tool_call = content.get("toolCall", {})
-                    
+
                     if tool_call.get("status") == "success":
                         tool_value = tool_call.get("value", {})
                         pending_tool_uses[tool_id] = {
@@ -298,9 +309,9 @@ def test_goose_multiple_tools():
                             "arguments": tool_value.get("arguments", {}),
                             "success": False,
                             "error": None,
-                            "result": None
+                            "result": None,
                         }
-                    
+
         elif message.get("role") == "user" and "content" in message:
             for content in message.get("content", []):
                 if isinstance(content, dict) and content.get("type") == "toolResponse":
@@ -308,22 +319,29 @@ def test_goose_multiple_tools():
                     if tool_id in pending_tool_uses:
                         tool_data = pending_tool_uses[tool_id]
                         tool_result = content.get("toolResult", {})
-                        
+
                         if tool_result.get("status") == "success":
                             tool_data["success"] = True
                             result_value = tool_result.get("value", [])
                             if isinstance(result_value, list):
                                 result_texts = []
                                 for item in result_value:
-                                    if isinstance(item, dict) and item.get("type") == "text":
+                                    if (
+                                        isinstance(item, dict)
+                                        and item.get("type") == "text"
+                                    ):
                                         result_texts.append(item.get("text", ""))
-                                tool_data["result"] = "\n".join(result_texts) if result_texts else str(result_value)
+                                tool_data["result"] = (
+                                    "\n".join(result_texts)
+                                    if result_texts
+                                    else str(result_value)
+                                )
                             else:
                                 tool_data["result"] = str(result_value)
-                        
+
                         tool_uses.append(ToolUse(**tool_data))
                         del pending_tool_uses[tool_id]
-    
+
     # Verify multiple tools
     assert len(tool_uses) == 2
     assert tool_uses[0].name == "search_tool"
@@ -331,4 +349,4 @@ def test_goose_multiple_tools():
     assert tool_uses[0].result == "Search results"
     assert tool_uses[1].name == "fetch_tool"
     assert tool_uses[1].success is True
-    assert tool_uses[1].result == "Fetched content"
\ No newline at end of file
+    assert tool_uses[1].result == "Fetched content"
diff --git a/tests/test_instructions_option.py b/tests/test_instructions_option.py
index 681e275..ff9cc4e 100644
--- a/tests/test_instructions_option.py
+++ b/tests/test_instructions_option.py
@@ -21,7 +21,7 @@ def test_instructions_option_with_dummy_coder(runner):
         instructions_file = Path(temp_dir) / "test_instructions.md"
         instructions_content = "# Test Instructions\n\nBe helpful and concise."
         instructions_file.write_text(instructions_content)
-        
+
         # Run with instructions
         result = runner.invoke(
             main,
@@ -36,7 +36,7 @@ def test_instructions_option_with_dummy_coder(runner):
                 temp_dir,
             ],
         )
-        
+
         # Check that instructions were loaded
         assert result.exit_code == 0
         assert "Loaded instructions from:" in result.output
@@ -58,7 +58,7 @@ def test_no_instructions_still_works(runner):
                 temp_dir,
             ],
         )
-        
+
         assert result.exit_code == 0
         assert "you said: Hello" in result.output
         assert "Instructions loaded:" not in result.output
@@ -80,7 +80,7 @@ def test_instructions_file_not_found(runner):
                 temp_dir,
             ],
         )
-        
+
         # Should fail with appropriate error
         assert result.exit_code != 0
         assert "does not exist" in result.output
@@ -98,11 +98,11 @@ def test_instructions_with_config(runner):
   extensions: []
 """
         config_file.write_text(config_content)
-        
+
         # Create instructions file
         instructions_file = Path(temp_dir) / "instructions.md"
         instructions_file.write_text("Custom instructions")
-        
+
         result = runner.invoke(
             main,
             [
@@ -118,6 +118,6 @@ def test_instructions_with_config(runner):
                 temp_dir,
             ],
         )
-        
+
         assert result.exit_code == 0
-        assert "Loaded instructions from:" in result.output
\ No newline at end of file
+        assert "Loaded instructions from:" in result.output
diff --git a/tests/test_introspect_mcp.py b/tests/test_introspect_mcp.py
index 2f51ae0..4e5fd59 100644
--- a/tests/test_introspect_mcp.py
+++ b/tests/test_introspect_mcp.py
@@ -6,7 +6,7 @@ def test_introspect_mcp_help():
     """Test introspect-mcp help command."""
     runner = CliRunner()
     result = runner.invoke(cli, ["introspect-mcp", "--help"])
-    
+
     assert result.exit_code == 0
     assert "Introspect an MCP server" in result.output
     assert "MCP_SPEC" in result.output
@@ -17,12 +17,10 @@ def test_introspect_mcp_with_invalid_registry():
     """Test introspect-mcp with non-existent registry MCP."""
     runner = CliRunner()
-    result = runner.invoke(cli, [
-        "introspect-mcp",
-        "nonexistent",
-        "--registry", "metacoder.basics"
-    ])
-    
+    result = runner.invoke(
+        cli, ["introspect-mcp", "nonexistent", "--registry", "metacoder.basics"]
+    )
+
     assert result.exit_code != 0
     assert "not found in registry" in result.output
 
 
@@ -30,12 +28,9 @@ def test_introspect_mcp_with_registry_no_mcp():
     """Test introspect-mcp with invalid registry."""
     runner = CliRunner()
-    result = runner.invoke(cli, [
-        "introspect-mcp",
-        "fetch",
-        "--registry", "metacoder.nonexistent"
-    ])
-    
+    result = runner.invoke(
+        cli, ["introspect-mcp", "fetch", "--registry", "metacoder.nonexistent"]
+    )
+
     assert result.exit_code != 0
     assert "Registry file not found" in result.output
-
diff --git a/tests/test_mcps/test_gemini_mcp.py b/tests/test_mcps/test_gemini_mcp.py
index 25b9c24..288f61c 100644
--- a/tests/test_mcps/test_gemini_mcp.py
+++ b/tests/test_mcps/test_gemini_mcp.py
@@ -13,7 +13,7 @@ def test_gemini_supports_mcp():
 def test_gemini_mcp_config_conversion():
     """Test conversion of MCPConfig to Gemini format."""
     coder = GeminiCoder(workdir="/tmp/test")
-    
+
     # Test stdio MCP
     mcp = MCPConfig(
         name="test_server",
@@ -21,11 +21,11 @@ def test_gemini_mcp_config_conversion():
         args=["-y", "@modelcontextprotocol/server-test"],
         env={"API_KEY": "${TEST_KEY}"},
         enabled=True,
-        type=MCPType.STDIO
+        type=MCPType.STDIO,
     )
-    
+
     result = coder.mcp_config_to_gemini_format(mcp)
-    
+
     assert result["command"] == "npx"
     assert result["args"] == ["-y", "@modelcontextprotocol/server-test"]
     assert result["env"] == {"API_KEY": "${TEST_KEY}"}
@@ -35,13 +35,9 @@ def test_gemini_http_mcp_not_supported():
     """Test that HTTP MCPs raise NotImplementedError."""
     coder = GeminiCoder(workdir="/tmp/test")
-    
-    mcp = MCPConfig(
-        name="http_server",
-        enabled=True,
-        type=MCPType.HTTP
-    )
-    
+
+    mcp = MCPConfig(name="http_server", enabled=True, type=MCPType.HTTP)
+
     with pytest.raises(NotImplementedError, match="HTTP MCPs are not supported"):
         coder.mcp_config_to_gemini_format(mcp)
 
@@ -56,7 +52,7 @@ def test_gemini_mcp_settings_generation():
             MCPConfig(
                 name="filesystem",
                 command="npx",
                 args=["-y", "@modelcontextprotocol/server-filesystem"],
                 enabled=True,
-                type=MCPType.STDIO
+                type=MCPType.STDIO,
             ),
             MCPConfig(
                 name="github",
@@ -64,41 +60,41 @@ def test_gemini_mcp_settings_generation():
                 args=["mcp-github"],
                 env={"GITHUB_TOKEN": "${GITHUB_TOKEN}"},
                 enabled=True,
-                type=MCPType.STDIO
+                type=MCPType.STDIO,
             ),
             MCPConfig(
                 name="disabled_server",
                 command="uvx",
                 args=["mcp-disabled"],
                 enabled=False,
-                type=MCPType.STDIO
+                type=MCPType.STDIO,
             ),
-        ]
+        ],
     )
-    
+
     coder = GeminiCoder(workdir="/tmp/test", config=config)
     config_objects = coder.default_config_objects()
-    
+
     # Should have created settings.json
     assert len(config_objects) == 1
     settings_obj = config_objects[0]
-    
+
     assert settings_obj.relative_path == ".gemini/settings.json"
     assert "mcpServers" in settings_obj.content
-    
+
     mcp_servers = settings_obj.content["mcpServers"]
-    
+
     # Should only include enabled servers
     assert "filesystem" in mcp_servers
     assert "github" in mcp_servers
     assert "disabled_server" not in mcp_servers
-    
+
     # Check filesystem server config
     fs_config = mcp_servers["filesystem"]
     assert fs_config["command"] == "npx"
     assert fs_config["args"] == ["-y", "@modelcontextprotocol/server-filesystem"]
     assert fs_config["timeout"] == 30000
-    
+
     # Check github server config
     gh_config = mcp_servers["github"]
     assert gh_config["command"] == "uvx"
@@ -111,6 +107,6 @@ def test_gemini_no_mcp_no_settings():
     """Test that no settings.json is created when no MCPs are configured."""
     coder = GeminiCoder(workdir="/tmp/test")
     config_objects = coder.default_config_objects()
-    
+
     # Should not create any config files when no MCPs
-    assert len(config_objects) == 0
\ No newline at end of file
+    assert len(config_objects) == 0
diff --git a/tests/test_registry_loading.py b/tests/test_registry_loading.py
index c96781e..c2f67a2 100644
---
+++ b/tests/test_registry_loading.py
@@ -7,15 +7,15 @@
 def test_load_mcp_registry_basics():
     """Test loading basics registry."""
     collection = load_mcp_registry("metacoder.basics")
-    
+
     assert isinstance(collection, MCPCollectionConfig)
     assert len(collection.servers) > 0
-    
+
     # Check that fetch is in basics
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "fetch" in mcp_names
     assert "taskmasterai" in mcp_names
-    
+
     # Check that all are disabled by default
     for mcp in collection.servers:
         assert not mcp.enabled and mcp.enabled is not None
@@ -24,10 +24,10 @@
 def test_load_mcp_registry_scilit():
     """Test loading scilit registry."""
     collection = load_mcp_registry("metacoder.scilit")
-    
+
     assert isinstance(collection, MCPCollectionConfig)
     assert len(collection.servers) > 0
-    
+
     # Check that scilit MCPs are present
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "pdfreader" in mcp_names
@@ -38,9 +38,9 @@
 def test_load_mcp_registry_all():
     """Test loading all registries with 'metacoder'."""
     collection = load_mcp_registry("metacoder")
-    
+
     assert isinstance(collection, MCPCollectionConfig)
-    
+
     # Should have MCPs from both basics and scilit
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "fetch" in mcp_names  # from basics
@@ -50,7 +50,7 @@
 def test_load_mcp_registry_without_prefix():
     """Test loading registry without metacoder prefix."""
     collection = load_mcp_registry("basics")
-    
+
     # Should work the same as with prefix
     mcp_names = [mcp.name for mcp in collection.servers]
     assert "fetch" in mcp_names
@@ -59,17 +59,24 @@
 def test_cli_with_registry():
     """Test CLI with registry option."""
     runner = CliRunner()
-    
+
     # Test with registry and enable specific MCP
-    result = runner.invoke(cli, [
-        "run",
-        "test prompt",
-        "--coder", "dummy",
-        "--registry", "metacoder.basics",
-        "--enable-mcp", "fetch",
-        "--workdir", "test_workdir"
-    ])
-    
+    result = runner.invoke(
+        cli,
+        [
+            "run",
+            "test prompt",
+            "--coder",
+            "dummy",
+            "--registry",
+            "metacoder.basics",
+            "--enable-mcp",
+            "fetch",
+            "--workdir",
+            "test_workdir",
+        ],
+    )
+
     assert result.exit_code == 0
     assert "Loading MCPs from registry: metacoder.basics" in result.output
     assert "Registry MCPs:" in result.output
@@ -79,7 +86,7 @@
 def test_cli_registry_with_mcp_collection():
     """Test CLI with both registry and MCP collection."""
     runner = CliRunner()
-    
+
     # Create a temporary MCP collection file
     with runner.isolated_filesystem():
         with open("test_mcps.yaml", "w") as f:
@@ -91,18 +98,27 @@
                 args: ["test"]
                 enabled: true
             """)
-        
-        result = runner.invoke(cli, [
-            "run",
-            "test prompt",
-            "--coder", "dummy",
-            "--mcp-collection", "test_mcps.yaml",
-            "--registry", "metacoder.basics",
-            "--enable-mcp", "fetch",
-            "--enable-mcp", "custom_mcp",
-            "--workdir", "test_workdir"
-        ])
-        
+
+        result = runner.invoke(
+            cli,
+            [
+                "run",
+                "test prompt",
+                "--coder",
+                "dummy",
+                "--mcp-collection",
+                "test_mcps.yaml",
+                "--registry",
+                "metacoder.basics",
+                "--enable-mcp",
+                "fetch",
+                "--enable-mcp",
+                "custom_mcp",
+                "--workdir",
+                "test_workdir",
+            ],
+        )
+
         assert result.exit_code == 0
         assert "Loading MCP collection from: test_mcps.yaml" in result.output
         assert "Loading MCPs from registry: metacoder.basics" in result.output
@@ -113,5 +129,5 @@ def test_registry_nonexistent():
     """Test loading nonexistent registry."""
     with pytest.raises(Exception) as exc_info:
         load_mcp_registry("metacoder.nonexistent")
-    
-    assert "Registry file not found" in str(exc_info.value)
\ No newline at end of file
+
+    assert "Registry file not found" in str(exc_info.value)
diff --git a/uv.lock b/uv.lock
index 875df9f..b68d2f3 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1515,6 +1515,7 @@ dev = [
     { name = "mkdocstrings-python" },
     { name = "mypy" },
     { name = "pytest" },
+    { name = "ruff" },
     { name = "types-click" },
     { name = "types-pyyaml" },
 ]
@@ -1539,6 +1540,7 @@ dev = [
     { name = "mkdocstrings-python", specifier = ">=1.14.0" },
     { name = "mypy", specifier = ">=1.17.1" },
     { name = "pytest", specifier = ">=8.4.1" },
+    { name = "ruff", specifier = ">=0.12.8" },
     { name = "types-click", specifier = ">=7.1.8" },
     { name = "types-pyyaml", specifier = ">=6.0.12.20250516" },
 ]
@@ -3271,6 +3273,31 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696 },
 ]
 
+[[package]]
+name = "ruff"
+version = "0.12.8"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4b/da/5bd7565be729e86e1442dad2c9a364ceeff82227c2dece7c29697a9795eb/ruff-0.12.8.tar.gz", hash = "sha256:4cb3a45525176e1009b2b64126acf5f9444ea59066262791febf55e40493a033", size = 5242373 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c9/1e/c843bfa8ad1114fab3eb2b78235dda76acd66384c663a4e0415ecc13aa1e/ruff-0.12.8-py3-none-linux_armv6l.whl", hash = "sha256:63cb5a5e933fc913e5823a0dfdc3c99add73f52d139d6cd5cc8639d0e0465513", size = 11675315 },
+    { url = "https://files.pythonhosted.org/packages/24/ee/af6e5c2a8ca3a81676d5480a1025494fd104b8896266502bb4de2a0e8388/ruff-0.12.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:9a9bbe28f9f551accf84a24c366c1aa8774d6748438b47174f8e8565ab9dedbc", size = 12456653 },
+    { url = "https://files.pythonhosted.org/packages/99/9d/e91f84dfe3866fa648c10512904991ecc326fd0b66578b324ee6ecb8f725/ruff-0.12.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2fae54e752a3150f7ee0e09bce2e133caf10ce9d971510a9b925392dc98d2fec", size = 11659690 },
+    { url = "https://files.pythonhosted.org/packages/fe/ac/a363d25ec53040408ebdd4efcee929d48547665858ede0505d1d8041b2e5/ruff-0.12.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0acbcf01206df963d9331b5838fb31f3b44fa979ee7fa368b9b9057d89f4a53", size = 11896923 },
+    { url = "https://files.pythonhosted.org/packages/58/9f/ea356cd87c395f6ade9bb81365bd909ff60860975ca1bc39f0e59de3da37/ruff-0.12.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ae3e7504666ad4c62f9ac8eedb52a93f9ebdeb34742b8b71cd3cccd24912719f", size = 11477612 },
+    { url = "https://files.pythonhosted.org/packages/1a/46/92e8fa3c9dcfd49175225c09053916cb97bb7204f9f899c2f2baca69e450/ruff-0.12.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb82efb5d35d07497813a1c5647867390a7d83304562607f3579602fa3d7d46f", size = 13182745 },
+    { url = "https://files.pythonhosted.org/packages/5e/c4/f2176a310f26e6160deaf661ef60db6c3bb62b7a35e57ae28f27a09a7d63/ruff-0.12.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:dbea798fc0065ad0b84a2947b0aff4233f0cb30f226f00a2c5850ca4393de609", size = 14206885 },
"https://files.pythonhosted.org/packages/87/9d/98e162f3eeeb6689acbedbae5050b4b3220754554526c50c292b611d3a63/ruff-0.12.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:49ebcaccc2bdad86fd51b7864e3d808aad404aab8df33d469b6e65584656263a", size = 13639381 }, + { url = "https://files.pythonhosted.org/packages/81/4e/1b7478b072fcde5161b48f64774d6edd59d6d198e4ba8918d9f4702b8043/ruff-0.12.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ac9c570634b98c71c88cb17badd90f13fc076a472ba6ef1d113d8ed3df109fb", size = 12613271 }, + { url = "https://files.pythonhosted.org/packages/e8/67/0c3c9179a3ad19791ef1b8f7138aa27d4578c78700551c60d9260b2c660d/ruff-0.12.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:560e0cd641e45591a3e42cb50ef61ce07162b9c233786663fdce2d8557d99818", size = 12847783 }, + { url = "https://files.pythonhosted.org/packages/4e/2a/0b6ac3dd045acf8aa229b12c9c17bb35508191b71a14904baf99573a21bd/ruff-0.12.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:71c83121512e7743fba5a8848c261dcc454cafb3ef2934a43f1b7a4eb5a447ea", size = 11702672 }, + { url = "https://files.pythonhosted.org/packages/9d/ee/f9fdc9f341b0430110de8b39a6ee5fa68c5706dc7c0aa940817947d6937e/ruff-0.12.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:de4429ef2ba091ecddedd300f4c3f24bca875d3d8b23340728c3cb0da81072c3", size = 11440626 }, + { url = "https://files.pythonhosted.org/packages/89/fb/b3aa2d482d05f44e4d197d1de5e3863feb13067b22c571b9561085c999dc/ruff-0.12.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a2cab5f60d5b65b50fba39a8950c8746df1627d54ba1197f970763917184b161", size = 12462162 }, + { url = "https://files.pythonhosted.org/packages/18/9f/5c5d93e1d00d854d5013c96e1a92c33b703a0332707a7cdbd0a4880a84fb/ruff-0.12.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:45c32487e14f60b88aad6be9fd5da5093dbefb0e3e1224131cb1d441d7cb7d46", size = 12913212 }, + { url = "https://files.pythonhosted.org/packages/71/13/ab9120add1c0e4604c71bfc2e4ef7d63bebece0cfe617013da289539cef8/ruff-0.12.8-py3-none-win32.whl", hash = "sha256:daf3475060a617fd5bc80638aeaf2f5937f10af3ec44464e280a9d2218e720d3", size = 11694382 }, + { url = "https://files.pythonhosted.org/packages/f6/dc/a2873b7c5001c62f46266685863bee2888caf469d1edac84bf3242074be2/ruff-0.12.8-py3-none-win_amd64.whl", hash = "sha256:7209531f1a1fcfbe8e46bcd7ab30e2f43604d8ba1c49029bb420b103d0b5f76e", size = 12740482 }, + { url = "https://files.pythonhosted.org/packages/cb/5c/799a1efb8b5abab56e8a9f2a0b72d12bd64bb55815e9476c7d0a2887d2f7/ruff-0.12.8-py3-none-win_arm64.whl", hash = "sha256:c90e1a334683ce41b0e7a04f41790c429bf5073b62c1ae701c9dc5b3d14f0749", size = 11884718 }, +] + [[package]] name = "sentry-sdk" version = "2.34.1"