From cd959182ed1ea109cdcb970c8749d227c58de6eb Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 14:58:23 -0400 Subject: [PATCH 01/34] Enforce UTF-8 for Goose session files. --- src/metacoder/coders/goose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 514dc2b..f3e378c 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -165,7 +165,7 @@ def run(self, input_text: str) -> CoderOutput: session_file = Path(session_file_str) break if session_file and session_file.exists(): - with open(session_file, "r") as f: + with open(session_file, "r", encoding="utf-8") as f: ao.structured_messages = [ json.loads(line) for line in f if line.strip() ] From a791ce5875cc759f475b22443ad01f55e8e43e1b Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 15:41:08 -0400 Subject: [PATCH 02/34] Fixes issue #15. Prevents divide by zero errors and cleans up summaries by using consistent printing methods. --- src/metacoder/metacoder.py | 42 ++++++++++++-------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index f62d3df..3ba1e1b 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -421,9 +421,7 @@ def run( coder_config.ai_model.name = model # Show the model configuration - click.echo( - f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})" - ) + click.echo(f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})") if coder_config and coder_config.extensions: for mcp in coder_config.extensions: @@ -481,16 +479,12 @@ def run( click.echo("\nπŸ“‹ Tool uses:") for tool_use in result.tool_uses: success = "βœ…" if tool_use.success else "❌" - click.echo( - f" {success} {tool_use.name} with arguments: {tool_use.arguments}" - ) + click.echo(f" {success} {tool_use.name} with arguments: {tool_use.arguments}") if tool_use.error: click.echo(f" Error: {tool_use.error}") if verbose and result.structured_messages: - click.echo( - f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)" - ) + click.echo(f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)") for i, msg in enumerate(result.structured_messages): click.echo(f" {i + 1}. 
{msg}") @@ -592,38 +586,28 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: # Print summary summary = runner.generate_summary(results) + frac_passed = summary['passed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 + frac_failed = summary['failed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 + click.echo("\nπŸ“ˆ Summary:") click.echo(f" Total: {summary['total_evaluations']}") - click.echo( - f" Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)" - ) - click.echo( - f" Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)" - ) - if summary["errors"] > 0: - click.echo(f" Errors: {summary['errors']} ⚠️") + click.echo(f" Passed: {summary['passed']} ({frac_passed:.1%})") + click.echo(f" Failed: {summary['failed']} ({frac_failed:.1%})") + click.echo(f" Errors: {summary['errors']} ⚠️") if summary["errors"] else None # Print by-coder summary if len(summary["by_coder"]) > 1: click.echo("\n By Coder:") for coder, stats in summary["by_coder"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 - ) - click.echo( - f" {coder}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" - ) + coder_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 + click.echo(f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})") # Print by-model summary if len(summary["by_model"]) > 1: click.echo("\n By Model:") for model, stats in summary["by_model"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 - ) - click.echo( - f" {model}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" - ) + model_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 + click.echo(f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})") click.echo("\nβœ… Evaluation complete!") From 49891a3e01e1dcd43ba9255e933b28a64c84aa7f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 16:19:57 -0400 Subject: [PATCH 03/34] Cleaned up output by using consistent printing methods. 
--- src/metacoder/coders/claude.py | 2 +- src/metacoder/coders/codex.py | 2 +- src/metacoder/coders/gemini.py | 2 +- src/metacoder/coders/goose.py | 2 +- src/metacoder/coders/qwen.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py index ee31b74..1a43295 100644 --- a/src/metacoder/coders/claude.py +++ b/src/metacoder/coders/claude.py @@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]: ao.tool_uses = tool_uses end_time = time.time() - logger.info(f"🤖 Command took {end_time - start_time} seconds") + logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds") ao.total_cost_usd = total_cost_usd ao.success = not is_error if not ao.success: diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 8e9169e..3451ebe 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput: if "result" in message: ao.result_text = message["result"] end_time = time.time() - print(f"🤖 Command took {end_time - start_time} seconds") + print(f"🤖 Command took {end_time - start_time:.2f} seconds") ao.total_cost_usd = total_cost_usd ao.success = not is_error if not ao.success: diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 20564a9..6af35c4 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput: ) end_time = time.time() - logger.info(f"💎 Command took {end_time - start_time} seconds") + logger.info(f"💎 Command took {end_time - start_time:.2f} seconds") # Parse the output ao = CoderOutput(stdout=result.stdout, stderr=result.stderr) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index f3e378c..6b0b5c0 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput: result = self.run_process(command, env) end_time = time.time() ao = CoderOutput(stdout=result.stdout, stderr=result.stderr) - logger.info(f"🦆 Command took {end_time - start_time} seconds") + logger.info(f"🦆 Command took {end_time - start_time:.2f} seconds") # look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl session_file: Optional[Path] = None for line in result.stdout.split("\n"): diff --git a/src/metacoder/coders/qwen.py b/src/metacoder/coders/qwen.py index 43aefb6..b6f4080 100644 --- a/src/metacoder/coders/qwen.py +++ b/src/metacoder/coders/qwen.py @@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput: ) end_time = time.time() - print(f"🤖 Command took {end_time - start_time} seconds") + print(f"🤖 Command took {end_time - start_time:.2f} seconds") # Create output - Qwen CLI doesn't provide structured output ao = CoderOutput( From 46ad344d476b69aabb2023d1f4ff698d0d380452 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 21:47:25 -0400 Subject: [PATCH 04/34] Fixes Issue #18 by implementing metric downgrades to Claude if OpenAI calls fail, and to DummyMetric if Claude fails.
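The downgrade chain introduced here is: OpenAI (GEval's default judge) first, then a Claude judge, then DummyMetric as a last resort. A minimal sketch of that selection logic, assuming the make_geval(), ClaudeJudge, and DummyMetric definitions added below (pick_metric and openai_ok are illustrative names only, not part of this patch):

    def pick_metric(openai_ok: bool) -> BaseMetric:
        # Sketch: mirrors the fallback order implemented in runner.py in this patch.
        if openai_ok:
            return make_geval()  # GEval with its default OpenAI judge
        try:
            # Keep a real LLM judge when Anthropic credentials are available.
            return make_geval(model=ClaudeJudge("claude-3-5-sonnet-20240620"))
        except Exception:
            # Last resort: an always-passing placeholder metric.
            return DummyMetric(threshold=0.5)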
--- src/metacoder/evals/judges.py | 55 ++++++++++++++++ src/metacoder/evals/runner.py | 120 ++++++++++++++++++++++++++-------- 2 files changed, 148 insertions(+), 27 deletions(-) create mode 100644 src/metacoder/evals/judges.py diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py new file mode 100644 index 0000000..7ca74c6 --- /dev/null +++ b/src/metacoder/evals/judges.py @@ -0,0 +1,55 @@ +# metacoder/evals/judges.py + +import os + +from anthropic import Anthropic +from anthropic.types import MessageParam, TextBlockParam, TextBlock +from deepeval.models.base_model import DeepEvalBaseLLM + +class ClaudeJudge(DeepEvalBaseLLM): + """ + Wraps Anthropic's Claude models so they can be used as + the `model` parameter to DeepEval metrics like GEval. + """ + + def __init__( + self, + model_name: str = "claude-3-5-sonnet-20240620", + max_tokens: int = 1024, + temperature: float = 0.0, + ): + super().__init__() + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + raise RuntimeError("ANTHROPIC_API_KEY is not set in environment.") + self.client = Anthropic(api_key = api_key) + self.model_name = model_name + self.max_tokens = max_tokens + self.temperature = temperature + + def load_model(self): + return self + + def generate(self, prompt: str) -> str: + # Build typed content blocks and messages to satisfy the SDK's type hints + content: list[TextBlockParam] = [{"type": "text", "text": prompt}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + resp = self.client.messages.create( + model = self.model_name, + max_tokens = self.max_tokens, + temperature = self.temperature, + messages = messages + ) + # anthropic returns a list of content blocks; collect only the text blocks. + parts: list[str] = [] + for block in resp.content: + if isinstance(block, TextBlock): + parts.append(block.text) + return "".join(parts) + + async def a_generate(self, prompt: str) -> str: + # for now just call the sync path + return self.generate(prompt) + + def get_model_name(self) -> str: + return self.model_name diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 67a9619..e709b8c 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -5,30 +5,33 @@ """ import copy +import functools import importlib import logging +import os import time from pathlib import Path -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type, cast from pydantic import BaseModel import yaml + from deepeval import evaluate -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase -from deepeval.metrics import GEval -from deepeval.test_case import LLMTestCaseParams +from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig +from deepeval.models import DeepEvalBaseLLM +from deepeval.metrics import BaseMetric, GEval +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from openai import APIStatusError +from openai.types.chat import ChatCompletionMessageParam from metacoder.coders.base_coder import BaseCoder, CoderOutput from metacoder.registry import AVAILABLE_CODERS from metacoder.evals.eval_model import EvalCase, EvalDataset from metacoder.configuration import AIModelConfig, CoderConfig - logger = logging.getLogger(__name__) - class DummyMetric(BaseMetric): """A dummy metric that always returns a perfect score for testing.""" @@ -58,27 +61,32 @@ def is_successful(self) -> bool: """Check if the metric passed.""" return self.success +def 
make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: + """Creates a GEval instance with the specified model.""" + return GEval( + name="Correctness", + criteria="Determine whether the actual output is factually correct based on the expected output.", + # NOTE: you can only provide either criteria or evaluation_steps, and not both + evaluation_steps = [ + "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", + "You should also heavily penalize omission of detail", + "Vague language, or contradicting OPINIONS, are OK", + ], + threshold = 0.8, + evaluation_params = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + model = model # may be None (defaults to OpenAI) or a Claude judge + ) + -def get_default_metrics() -> Dict[str, BaseMetric]: - """Get default metrics. Creates instances lazily to avoid network calls during import.""" +def get_default_metrics(model: Optional[DeepEvalBaseLLM] = None) -> Dict[str, BaseMetric]: + """Get default metrics with the specified model. Creates instances lazily to avoid network calls during import.""" return { - "CorrectnessMetric": GEval( - name="Correctness", - criteria="Determine whether the actual output is factually correct based on the expected output.", - # NOTE: you can only provide either criteria or evaluation_steps, and not both - evaluation_steps=[ - "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", - "You should also heavily penalize omission of detail", - "Vague language, or contradicting OPINIONS, are OK", - ], - threshold=0.8, - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - ], - ), - "DummyMetric": DummyMetric(threshold=0.5), + "CorrectnessMetric": make_geval(model = model), # Note: GEval defaults to OpenAI if no model is specified. + "DummyMetric": DummyMetric(threshold = 0.5) } @@ -123,6 +131,8 @@ class EvalRunner: def __init__(self, verbose: bool = False): self.verbose = verbose + self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. + if verbose: logging.basicConfig(level=logging.DEBUG) else: @@ -183,6 +193,40 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase: additional_metadata=case.additional_metadata, ) + @functools.lru_cache(maxsize=1) + def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: + if not os.getenv("OPENAI_API_KEY"): + logger.warning("OPENAI_API_KEY is not set.") + return False + """ + Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate. + Fast probe of the /chat/completions endpoint (the one GEval uses). + Returns False on 429 (insufficient_quota) or any exception. 
+ """ + try: + from openai import OpenAI + # turn off SDK retries for the check so it returns fast + client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail + # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}]) + raw = [{"role": "user", "content": "ping"}] + messages = cast(List[ChatCompletionMessageParam], raw) + client.chat.completions.create( + model = model, + messages = messages, + max_tokens = 1, + temperature = 0, + ) + return True + except APIStatusError as e: + # 429 insufficient_quota, or other status codes + if e.status_code == 429: + return False + return False + except Exception as e: + # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. + logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") + return False + def run_single_eval( self, model_name: str, @@ -236,7 +280,29 @@ def run_single_eval( # Evaluate logger.info(f"Evaluating with {metric_name}") - eval_results = evaluate([test_case], [metric]) + + if isinstance(metric, GEval): + # Assume GEval will hit OpenAI unless we replace it. + if self.use_openai and not self._openai_quota_ok(): + self.use_openai = False + logger.warning("OpenAI quota exhausted; downgrading to Claude...") + from metacoder.evals.judges import ClaudeJudge + try: + # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). + metric = make_geval(model = ClaudeJudge("claude-3-5-sonnet-20240620")) + except Exception as e: + # Fallback: if you can't use Claude, downgrade gracefully. + logger.warning("Claude unavailable (%s); downgrading to DummyMetric.", e) + metric = DummyMetric(threshold = 0.5) + + eval_results = evaluate( + [test_case], + [metric], + async_config = AsyncConfig(run_async=False), # disable async + display_config = DisplayConfig(show_indicator=False, print_results=False, verbose_mode=self.verbose), # hide the spinner + cache_config = CacheConfig(use_cache=False, write_cache=False), + error_config = ErrorConfig(ignore_errors=False, skip_on_missing_params=True) # actually fail on failure + ) # Extract results - the structure varies by deepeval version test_result = eval_results.test_results[0] From fc7ba41bcda2124e2ab1a6034c3896bd4b937975 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 22:15:47 -0400 Subject: [PATCH 05/34] Satisfied ruff's bizarre rules. 
--- src/metacoder/evals/judges.py | 12 +++++--- src/metacoder/evals/runner.py | 57 +++++++++++++++++++++++------------ src/metacoder/metacoder.py | 40 ++++++++++++++++++------ 3 files changed, 76 insertions(+), 33 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index 7ca74c6..dc21724 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -4,8 +4,10 @@ from anthropic import Anthropic from anthropic.types import MessageParam, TextBlockParam, TextBlock + from deepeval.models.base_model import DeepEvalBaseLLM + class ClaudeJudge(DeepEvalBaseLLM): """ Wraps Anthropic's Claude models so they can be used as @@ -22,7 +24,7 @@ def __init__( api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: raise RuntimeError("ANTHROPIC_API_KEY is not set in environment.") - self.client = Anthropic(api_key = api_key) + self.client = Anthropic(api_key=api_key) self.model_name = model_name self.max_tokens = max_tokens self.temperature = temperature @@ -35,10 +37,10 @@ def generate(self, prompt: str) -> str: content: list[TextBlockParam] = [{"type": "text", "text": prompt}] messages: list[MessageParam] = [{"role": "user", "content": content}] resp = self.client.messages.create( - model = self.model_name, - max_tokens = self.max_tokens, - temperature = self.temperature, - messages = messages + model=self.model_name, + max_tokens=self.max_tokens, + temperature=self.temperature, + messages=messages, ) # anthropic returns a list of content blocks; collect only the text blocks. parts: list[str] = [] diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index e709b8c..40edd3b 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -32,6 +32,7 @@ logger = logging.getLogger(__name__) + class DummyMetric(BaseMetric): """A dummy metric that always returns a perfect score for testing.""" @@ -61,32 +62,37 @@ def is_successful(self) -> bool: """Check if the metric passed.""" return self.success + def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: """Creates a GEval instance with the specified model.""" return GEval( name="Correctness", criteria="Determine whether the actual output is factually correct based on the expected output.", # NOTE: you can only provide either criteria or evaluation_steps, and not both - evaluation_steps = [ + evaluation_steps=[ "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", "You should also heavily penalize omission of detail", "Vague language, or contradicting OPINIONS, are OK", ], - threshold = 0.8, - evaluation_params = [ + threshold=0.8, + evaluation_params=[ LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - model = model # may be None (defaults to OpenAI) or a Claude judge + model=model, # may be None (defaults to OpenAI) or a Claude judge ) -def get_default_metrics(model: Optional[DeepEvalBaseLLM] = None) -> Dict[str, BaseMetric]: +def get_default_metrics( + model: Optional[DeepEvalBaseLLM] = None, +) -> Dict[str, BaseMetric]: """Get default metrics with the specified model. Creates instances lazily to avoid network calls during import.""" return { - "CorrectnessMetric": make_geval(model = model), # Note: GEval defaults to OpenAI if no model is specified. - "DummyMetric": DummyMetric(threshold = 0.5) + "CorrectnessMetric": make_geval( + model=model # Note: GEval defaults to OpenAI if no model is specified. 
+ ), + "DummyMetric": DummyMetric(threshold=0.5), } @@ -131,7 +137,7 @@ class EvalRunner: def __init__(self, verbose: bool = False): self.verbose = verbose - self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. + self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. if verbose: logging.basicConfig(level=logging.DEBUG) @@ -205,16 +211,17 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: """ try: from openai import OpenAI + # turn off SDK retries for the check so it returns fast client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}]) raw = [{"role": "user", "content": "ping"}] messages = cast(List[ChatCompletionMessageParam], raw) client.chat.completions.create( - model = model, - messages = messages, - max_tokens = 1, - temperature = 0, + model=model, + messages=messages, + max_tokens=1, + temperature=0, ) return True except APIStatusError as e: @@ -287,21 +294,33 @@ def run_single_eval( self.use_openai = False logger.warning("OpenAI quota exhausted; downgrading to Claude...") from metacoder.evals.judges import ClaudeJudge + try: # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). - metric = make_geval(model = ClaudeJudge("claude-3-5-sonnet-20240620")) + metric = make_geval( + model=ClaudeJudge("claude-3-5-sonnet-20240620") + ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. - logger.warning("Claude unavailable (%s); downgrading to DummyMetric.", e) - metric = DummyMetric(threshold = 0.5) + logger.warning( + "Claude unavailable (%s); downgrading to DummyMetric.", e + ) + metric = DummyMetric(threshold=0.5) eval_results = evaluate( [test_case], [metric], - async_config = AsyncConfig(run_async=False), # disable async - display_config = DisplayConfig(show_indicator=False, print_results=False, verbose_mode=self.verbose), # hide the spinner - cache_config = CacheConfig(use_cache=False, write_cache=False), - error_config = ErrorConfig(ignore_errors=False, skip_on_missing_params=True) # actually fail on failure + async_config=AsyncConfig(run_async=False), # disable async + display_config=DisplayConfig( + show_indicator=False, # hide the progress meter + print_results=False, + verbose_mode=self.verbose, + ), + cache_config=CacheConfig(use_cache=False, write_cache=False), + error_config=ErrorConfig( + ignore_errors=False, # actually fail on failure + skip_on_missing_params=True, + ), ) # Extract results - the structure varies by deepeval version diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index 3ba1e1b..28113ec 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -421,7 +421,9 @@ def run( coder_config.ai_model.name = model # Show the model configuration - click.echo(f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})") + click.echo( + f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})" + ) if coder_config and coder_config.extensions: for mcp in coder_config.extensions: @@ -479,12 +481,16 @@ def run( click.echo("\nπŸ“‹ Tool uses:") for tool_use in result.tool_uses: success = "βœ…" if tool_use.success else "❌" - click.echo(f" {success} {tool_use.name} with arguments: {tool_use.arguments}") + click.echo( + f" {success} {tool_use.name} 
with arguments: {tool_use.arguments}" + ) if tool_use.error: click.echo(f" Error: {tool_use.error}") if verbose and result.structured_messages: - click.echo(f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)") + click.echo( + f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)" + ) for i, msg in enumerate(result.structured_messages): click.echo(f" {i + 1}. {msg}") @@ -586,8 +592,16 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: # Print summary summary = runner.generate_summary(results) - frac_passed = summary['passed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 - frac_failed = summary['failed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 + frac_passed = ( + summary["passed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 + ) + frac_failed = ( + summary["failed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 + ) click.echo("\nπŸ“ˆ Summary:") click.echo(f" Total: {summary['total_evaluations']}") @@ -599,15 +613,23 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: if len(summary["by_coder"]) > 1: click.echo("\n By Coder:") for coder, stats in summary["by_coder"].items(): - coder_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 - click.echo(f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})") + coder_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 + ) + click.echo( + f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})" + ) # Print by-model summary if len(summary["by_model"]) > 1: click.echo("\n By Model:") for model, stats in summary["by_model"].items(): - model_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 - click.echo(f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})") + model_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 + ) + click.echo( + f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})" + ) click.echo("\nβœ… Evaluation complete!") From 54dd3d3f4faf9207dd4f233496f77a488f5075ad Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 23:38:11 -0400 Subject: [PATCH 06/34] Added extra logging and test for goose UTF-8 handling. --- src/metacoder/evals/runner.py | 2 ++ tests/test_coders/test_coders_basic.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 40edd3b..bc40f38 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -227,7 +227,9 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: except APIStatusError as e: # 429 insufficient_quota, or other status codes if e.status_code == 429: + logger.warning(f"OpenAI API Key has insufficient quota: {e}") return False + logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") return False except Exception as e: # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py index a9498b6..5d9daf1 100644 --- a/tests/test_coders/test_coders_basic.py +++ b/tests/test_coders/test_coders_basic.py @@ -3,6 +3,7 @@ These tests check that each coder can handle a simple arithmetic question. 
""" +import json import tempfile import pytest @@ -164,3 +165,16 @@ def test_dummy_coder_always_works(): assert result is not None assert result.result_text == "you said: Hello, world!" assert result.stdout == "you said: Hello, world!" + + +@pytest.mark.integration +def test_goose_utf8_session_file(tmp_path): + """Test session files with UTF-8 content are read correctly.""" + session_content = '{"role": "assistant", "content": "ζ΅‹θ―• rΓ©sumΓ© πŸš€"}\n' + session_file = tmp_path / "test_session.jsonl" + session_file.write_text(session_content, encoding="utf-8") + + with open(session_file, "r", encoding="utf-8") as f: + messages = [json.loads(line) for line in f if line.strip()] + assert len(messages) == 1 + assert "ζ΅‹θ―•" in messages[0]["content"] From 72f586c5d8c0f8745aafcbae7f5c290d5eb2cf5a Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 10:29:22 -0400 Subject: [PATCH 07/34] Added metacoder configuration test cases for claude downgrade and no server combinations to support Issues #18, #19, and #20. --- .../goose_eval_claude_downgrade_test.yaml | 30 +++++++++++++++++++ tests/input/goose_no_server_test.yaml | 30 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 tests/input/goose_eval_claude_downgrade_test.yaml create mode 100644 tests/input/goose_no_server_test.yaml diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml new file mode 100644 index 0000000..6f0eb31 --- /dev/null +++ b/tests/input/goose_eval_claude_downgrade_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +server_combinations: + - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml new file mode 100644 index 0000000..2dc5551 --- /dev/null +++ b/tests/input/goose_no_server_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +#server_combinations: +# - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 From d7beb19baa08632b61f08fa31596324338e77541 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 12:07:02 -0400 Subject: [PATCH 08/34] Added unit test for claude downgrade to support Issue #18. Cleaned up logging in runner.py. Added test configuration to support log capture for assertions that downgrade was successful. 
--- src/metacoder/evals/runner.py | 13 +++++++------ tests/conftest.py | 10 ++++++++++ tests/test_evals/test_claude_judge.py | 26 ++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_evals/test_claude_judge.py diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index bc40f38..f31173e 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -202,7 +202,7 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase: @functools.lru_cache(maxsize=1) def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: if not os.getenv("OPENAI_API_KEY"): - logger.warning("OPENAI_API_KEY is not set.") + logger.info("OPENAI_API_KEY is not set.") return False """ Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate. @@ -227,13 +227,13 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: except APIStatusError as e: # 429 insufficient_quota, or other status codes if e.status_code == 429: - logger.warning(f"OpenAI API Key has insufficient quota: {e}") + logger.info(f"OpenAI API Key has insufficient quota: {e}") return False - logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") + logger.info(f"OpenAI API Status Error; treating as no-quota: {e}") return False except Exception as e: # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. - logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") + logger.info(f"OpenAI preflight failed; treating as no-quota: {e}") return False def run_single_eval( @@ -294,13 +294,14 @@ def run_single_eval( # Assume GEval will hit OpenAI unless we replace it. if self.use_openai and not self._openai_quota_ok(): self.use_openai = False - logger.warning("OpenAI quota exhausted; downgrading to Claude...") + claude_model = "claude-3-5-sonnet-20240620" + logger.warning(f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}") from metacoder.evals.judges import ClaudeJudge try: # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). metric = make_geval( - model=ClaudeJudge("claude-3-5-sonnet-20240620") + model=ClaudeJudge(claude_model) ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..95f4c37 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +import logging +import sys + + +def pytest_configure(config): + logging.basicConfig( + level=logging.WARNING, + format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", + stream=sys.stdout, + ) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py new file mode 100644 index 0000000..2ddc2ce --- /dev/null +++ b/tests/test_evals/test_claude_judge.py @@ -0,0 +1,26 @@ +import logging +from pathlib import Path + +from metacoder.evals.runner import EvalRunner + + +def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that ClaudeJudge is used when OpenAI is disabled.""" + + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. 
+ # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + runner = EvalRunner() + + try: + dataset = runner.load_dataset(Path("tests/input/goose_eval_claude_downgrade_test.yaml")) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. + with caplog.at_level(logging.WARNING): + results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) + assert "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" in caplog.text + + finally: + pass From d88ca905eb356ff589e0a2f17ae82e5d4735376f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 12:09:41 -0400 Subject: [PATCH 09/34] Added unit test for claude downgrade to support Issue #18. Cleaned up logging in runner.py. Added test configuration to support log capture for assertions that downgrade was successful. Addressed ruff warnings. --- src/metacoder/evals/runner.py | 8 ++++---- tests/test_evals/test_claude_judge.py | 9 +++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index f31173e..bf231e9 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -295,14 +295,14 @@ def run_single_eval( if self.use_openai and not self._openai_quota_ok(): self.use_openai = False claude_model = "claude-3-5-sonnet-20240620" - logger.warning(f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}") + logger.warning( + f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}" + ) from metacoder.evals.judges import ClaudeJudge try: # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). - metric = make_geval( - model=ClaudeJudge(claude_model) - ) + metric = make_geval(model=ClaudeJudge(claude_model)) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. logger.warning( diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 2ddc2ce..42047bc 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -14,13 +14,18 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): runner = EvalRunner() try: - dataset = runner.load_dataset(Path("tests/input/goose_eval_claude_downgrade_test.yaml")) + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) - assert "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" in caplog.text + assert ( + "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" + in caplog.text + ) finally: pass From e7bba401faddb0b93316f3e3f8d7c0f8341c7c39 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 12:31:12 -0400 Subject: [PATCH 10/34] Added assertion to confirm that ClaudeJudge completed scoring the metric after the downgrade. 
--- tests/test_evals/test_claude_judge.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 42047bc..8e1fbac 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -3,6 +3,8 @@ from metacoder.evals.runner import EvalRunner +logger = logging.getLogger(__name__) + def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" @@ -19,13 +21,21 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) + + # Verfiy that the downgrade happened. assert ( "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" in caplog.text ) + # Verify that the eval completed by checking for a non-zero score. + assert results[0].score > 0, ( + f"Expected ClaudeJudge to score {results[0].metric_name} for {results[0].case_name}" + ) + finally: pass From d27277b49111a4ea6399ed14ade1cded6cbf1db2 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 13:54:27 -0400 Subject: [PATCH 11/34] Added assertion to force test to fail on Exception. Increased logging verbosity temporarily to debug Claude judge unit test on build server. Adjusted logic to work when multiple coders are specified. Improved log messages. --- src/metacoder/evals/runner.py | 46 +++++++++++++++++++++------ tests/conftest.py | 2 +- tests/test_evals/test_claude_judge.py | 31 +++++++++++++++--- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index bf231e9..d0ae3c6 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -10,6 +10,7 @@ import logging import os import time +import traceback from pathlib import Path from typing import Any, Dict, List, Optional, Type, cast @@ -225,15 +226,20 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: ) return True except APIStatusError as e: - # 429 insufficient_quota, or other status codes + # 429 insufficient quota or too many requests if e.status_code == 429: - logger.info(f"OpenAI API Key has insufficient quota: {e}") + logger.warning(f"OpenAI API Key has insufficient quota: {e}") return False - logger.info(f"OpenAI API Status Error; treating as no-quota: {e}") + # 401 authentication problem, including invalid API key + if e.status_code == 401: + logger.warning(f"OpenAI API Authentication Error: {e}") + return False + # all other errors + logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") return False except Exception as e: - # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. - logger.info(f"OpenAI preflight failed; treating as no-quota: {e}") + # includes network issues, etc. 
+ logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") return False def run_single_eval( @@ -288,27 +294,47 @@ def run_single_eval( test_case = self.create_test_case(case, actual_output) # Evaluate - logger.info(f"Evaluating with {metric_name}") + logger.info( + f"Evaluating {metric_name} using model {metric.model.model_name}" + ) if isinstance(metric, GEval): - # Assume GEval will hit OpenAI unless we replace it. + # Assume GEval will use OpenAI until is disabled. if self.use_openai and not self._openai_quota_ok(): + logger.warning( + f"OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + ) self.use_openai = False + + # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. + if not self.use_openai: + from metacoder.evals.judges import ClaudeJudge + claude_model = "claude-3-5-sonnet-20240620" logger.warning( - f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}" + f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." ) - from metacoder.evals.judges import ClaudeJudge try: - # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). + # Downgrade metric model to Claude judge. metric = make_geval(model=ClaudeJudge(claude_model)) + logger.warning( + f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. + logging.error(traceback.format_exc()) logger.warning( "Claude unavailable (%s); downgrading to DummyMetric.", e ) metric = DummyMetric(threshold=0.5) + logger.warning( + f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + ) + + logger.warning( + f"Actual {metric_name} model used: {metric.model.model_name}" + ) eval_results = evaluate( [test_case], diff --git a/tests/conftest.py b/tests/conftest.py index 95f4c37..2416094 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ def pytest_configure(config): logging.basicConfig( - level=logging.WARNING, + level=logging.DEBUG, format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", stream=sys.stdout, ) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 8e1fbac..9b798df 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -1,4 +1,6 @@ import logging +import os +import traceback from pathlib import Path from metacoder.evals.runner import EvalRunner @@ -9,9 +11,9 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" - # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. - # (no need to reset, `monkeypatch` automatically reverts after the test) - monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + # # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # # (no need to reset, `monkeypatch` automatically reverts after the test) + # monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") runner = EvalRunner() @@ -24,11 +26,25 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. 
with caplog.at_level(logging.WARNING): - results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + # Save the original OPENAI_API_KEY if it exists + # original_api_key = os.getenv("OPENAI_API_KEY") + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + results = runner.run_all_evals( + dataset, workdir=tmp_path, coders=["goose", "dummy"] + ) + + # # Revert the OPENAI_API_KEY to its original value + # if original_api_key is not None: + # monkeypatch.setenv("OPENAI_API_KEY", original_api_key) + # else: + # monkeypatch.delenv("OPENAI_API_KEY", raising=False) # Verfiy that the downgrade happened. assert ( - "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." in caplog.text ) @@ -37,5 +53,10 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): f"Expected ClaudeJudge to score {results[0].metric_name} for {results[0].case_name}" ) + except Exception as e: + logger.error(f"An error occurred: {e}") + # traceback.print_exc() + logging.error(traceback.format_exc()) + assert False # force test to fail if an exception is caught here finally: pass From 3f22fc6cfecb87782130b4e83d86664d8c3e67cb Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 14:08:12 -0400 Subject: [PATCH 12/34] Fixed runtime issues related to metric downgrade from CorrectnessMetric to DummyMetric. --- src/metacoder/evals/judges.py | 2 +- src/metacoder/evals/runner.py | 11 +++++------ tests/test_evals/test_claude_judge.py | 14 -------------- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index dc21724..4cff6c3 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -23,7 +23,7 @@ def __init__( super().__init__() api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: - raise RuntimeError("ANTHROPIC_API_KEY is not set in environment.") + raise Exception("ANTHROPIC_API_KEY is not set in environment") self.client = Anthropic(api_key=api_key) self.model_name = model_name self.max_tokens = max_tokens diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index d0ae3c6..10b1b09 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -302,7 +302,7 @@ def run_single_eval( # Assume GEval will use OpenAI until is disabled. if self.use_openai and not self._openai_quota_ok(): logger.warning( - f"OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." ) self.use_openai = False @@ -325,16 +325,15 @@ def run_single_eval( # Fallback: if you can't use Claude, downgrade gracefully. logging.error(traceback.format_exc()) logger.warning( - "Claude unavailable (%s); downgrading to DummyMetric.", e + "Claude unavailable (%s); downgrading {metric_name} to DummyMetric.", + e, ) metric = DummyMetric(threshold=0.5) logger.warning( - f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + f"Successfully downgraded {metric_name} to {metric.name}." 
) - logger.warning( - f"Actual {metric_name} model used: {metric.model.model_name}" - ) + logger.warning(f"Actual metric used: {metric.name}.") eval_results = evaluate( [test_case], diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 9b798df..0c14861 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -1,5 +1,4 @@ import logging -import os import traceback from pathlib import Path @@ -11,10 +10,6 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" - # # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. - # # (no need to reset, `monkeypatch` automatically reverts after the test) - # monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") - runner = EvalRunner() try: @@ -28,20 +23,12 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): with caplog.at_level(logging.WARNING): # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. # (no need to reset, `monkeypatch` automatically reverts after the test) - # Save the original OPENAI_API_KEY if it exists - # original_api_key = os.getenv("OPENAI_API_KEY") monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") results = runner.run_all_evals( dataset, workdir=tmp_path, coders=["goose", "dummy"] ) - # # Revert the OPENAI_API_KEY to its original value - # if original_api_key is not None: - # monkeypatch.setenv("OPENAI_API_KEY", original_api_key) - # else: - # monkeypatch.delenv("OPENAI_API_KEY", raising=False) - # Verfiy that the downgrade happened. assert ( "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." @@ -55,7 +42,6 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): except Exception as e: logger.error(f"An error occurred: {e}") - # traceback.print_exc() logging.error(traceback.format_exc()) assert False # force test to fail if an exception is caught here finally: From d6e1e448d1cea5b0c3d0dae6bbeec890536bca59 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 14:20:02 -0400 Subject: [PATCH 13/34] Added test coverage of new evaluation judge functionality. Added test for the quota exhaustion fallback logic. --- tests/test_evals/test_claude_judge.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 0c14861..70bfa07 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -29,20 +29,27 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): dataset, workdir=tmp_path, coders=["goose", "dummy"] ) - # Verfiy that the downgrade happened. + # Test that the quota exhaustion fallback logic worked as expected. assert ( "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." in caplog.text ) - # Verify that the eval completed by checking for a non-zero score. + # Test that the new evaluation judge was correctly selected for the metric model downgrade. + assert ( + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + in caplog.text + ) + + # Test that the eval completed by checking for a non-zero score. 
assert results[0].score > 0, ( - f"Expected ClaudeJudge to score {results[0].metric_name} for {results[0].case_name}" + f"Expected a {results[0].metric_name} score for {results[0].case_name}." ) except Exception as e: + # Test that fallback logic does not result in an Exception. logger.error(f"An error occurred: {e}") logging.error(traceback.format_exc()) - assert False # force test to fail if an exception is caught here + assert False # This assertion will fail if an Exception is caught here. finally: pass From 882a3d91379835b9e213dbfb7d5744ecbfd2c91b Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 12:51:41 -0400 Subject: [PATCH 14/34] Reduced logging verbosity. Added Anthropic quota check. Added automatic downgrade to DummyMetric on quota check failure. Added notes on potential improvements to unit tests. --- src/metacoder/evals/judges.py | 30 +++++++++- src/metacoder/evals/runner.py | 27 +++++---- tests/conftest.py | 2 +- tests/test_evals/test_claude_judge.py | 84 +++++++++++++++++++++++++++ 4 files changed, 129 insertions(+), 14 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index 4cff6c3..a8b8d48 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -1,5 +1,5 @@ # metacoder/evals/judges.py - +import logging import os from anthropic import Anthropic @@ -7,6 +7,8 @@ from deepeval.models.base_model import DeepEvalBaseLLM +logger = logging.getLogger(__name__) + class ClaudeJudge(DeepEvalBaseLLM): """ @@ -55,3 +57,29 @@ async def a_generate(self, prompt: str) -> str: def get_model_name(self) -> str: return self.model_name + + def has_available_quota(self) -> bool: + """ + Try a very lightweight request to check if quota is available. + Returns True if quota exists, False if Anthropic responds with + quota-related errors. + """ + try: + # Use a minimal "ping" request + content: list[TextBlockParam] = [{"type": "text", "text": "ping"}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + self.client.messages.create( + model=self.model_name, + max_tokens=1, # cheapest possible + temperature=0.0, + messages=messages, + ) + return True + except Exception as e: + msg = str(e).lower() + # Check for insufficient quota: + # 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits. + if "credit balance is too low" in msg or "400" in msg: + logger.warning(f"ClaudeJudge quota check failed: {e}") + return False + raise diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 10b1b09..0a8dc42 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -308,8 +308,6 @@ def run_single_eval( # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. if not self.use_openai: - from metacoder.evals.judges import ClaudeJudge - claude_model = "claude-3-5-sonnet-20240620" logger.warning( f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." @@ -317,23 +315,28 @@ def run_single_eval( try: # Downgrade metric model to Claude judge. - metric = make_geval(model=ClaudeJudge(claude_model)) - logger.warning( + from metacoder.evals.judges import ClaudeJudge + + judge = ClaudeJudge(claude_model) + + if not judge.has_available_quota(): + raise Exception( + "No Anthropic credits available for ClaudeJudge." 
+ ) + + metric = make_geval(model=judge) + logger.info( f"Successfully downgraded {metric_name} model to {metric.model.model_name}." ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. - logging.error(traceback.format_exc()) + logging.debug(traceback.format_exc()) + logger.debug(e) logger.warning( - "Claude unavailable (%s); downgrading {metric_name} to DummyMetric.", - e, + f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric." ) metric = DummyMetric(threshold=0.5) - logger.warning( - f"Successfully downgraded {metric_name} to {metric.name}." - ) - - logger.warning(f"Actual metric used: {metric.name}.") + logger.warning(f"Downgraded {metric_name} to {metric.name}.") eval_results = evaluate( [test_case], diff --git a/tests/conftest.py b/tests/conftest.py index 2416094..95f4c37 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ def pytest_configure(config): logging.basicConfig( - level=logging.DEBUG, + level=logging.WARNING, format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", stream=sys.stdout, ) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 70bfa07..9ed23a6 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -9,6 +9,8 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" + # TODO: This test should avoid running the coder and only perform the eval step. + # Otherwise, it is impossible to get to the eval step if no valid API key is present or no quota is available (testing the wrong part of the process). runner = EvalRunner() @@ -53,3 +55,85 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): assert False # This assertion will fail if an Exception is caught here. finally: pass + + +def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that the CorrectnessMatric is successfully downgraded to DummyMetric if no model is available.""" + + runner = EvalRunner() + + try: + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. + with caplog.at_level(logging.WARNING): + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + # Delete the Anthropic API Key from the environment to force ClaudeJudge instantiation to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + + # One more OpenAI API test case also needs to be handled (401 errors): + # Temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail. + # monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + # One more Anthropic API test case also needs to be handled (401 errors): + # Temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail. 
+ # monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-api-key-for-testing") + + # TODO: Also need to test this for Anthropic: + # Provider + # request + # failed + # with status: 400 + # Bad + # Request.Payload: Some(Object + # {"error": Object {"message": String("Your credit balance is too low + # to access the Anthropic API.Please go to Plans & Billing to upgrade or purchase + # credits."), "type": String("invalid_request_error")}, "request_id": String(" + # req_011CSeQZTjJvmcxzrhXuPES4"), "type": Strin + # g("error")}).Returning + # error: RequestFailed( + # "Request failed with status: 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits." + + results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["dummy"]) + + # Test that the quota exhaustion fallback logic worked as expected. + assert ( + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + in caplog.text + ) + + # Test that the new evaluation judge was correctly selected for the metric model downgrade. + assert ( + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + in caplog.text + ) + + # Test that the ClaudeJudge was unable to be used as the model for the CorrectnessMetric. + assert ( + "Claude unavailable (ANTHROPIC_API_KEY is not set in environment); downgrading CorrectnessMetric to DummyMetric." + in caplog.text + ) + + # Test that the CorrectnessMetric was successfully downgraded to DummyMetric. + assert "Downgraded CorrectnessMetric to DummyMetric." in caplog.text + + # Test that the eval completed by checking for a non-zero score. + assert results[0].score > 0, ( + f"Expected a {results[0].metric_name} score for {results[0].case_name}." + ) + + except Exception as e: + # Test that fallback logic does not result in an Exception. + logger.error(f"An error occurred: {e}") + logging.error(traceback.format_exc()) + assert False # This assertion will fail if an Exception is caught here. + finally: + pass From c98c9d7b2645838c404185f8d4baf97fd27f4269 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 18:32:26 -0400 Subject: [PATCH 15/34] Fixed issue #23. Forced processes to be launched with UTF-8 encoding to avoid default encoding errors. --- src/metacoder/coders/base_coder.py | 14 ++++++++- tests/input/goose_eval_test.yaml | 9 ++++-- tests/input/literature_mcp_encoding_test.yaml | 29 +++++++++++++++++++ 3 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 tests/input/literature_mcp_encoding_test.yaml diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index b44c6ec..f19d146 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -173,11 +173,15 @@ def run_process( """ if env is None: env = self.expand_env(self.env) + + # Decode the child process output as UTF-8 (instead of default encoding) process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", # avoid crashes on the occasional bad byte env=env, bufsize=1, universal_newlines=True, @@ -189,7 +193,15 @@ def run_process( # check verbosity level quiet_mode = logger.getEffectiveLevel() <= logging.INFO - def stream_output(pipe, output_lines, stream): + # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do). 
+ for s in (sys.stdout, sys.stderr): + try: + s.reconfigure(encoding="utf-8", errors="replace") # Python 3.7+ + except Exception as e: + logger.info(f"{e}") + pass # OK if not available (e.g., redirected or older Python) + + def stream_output(pipe, output_lines, stream): # lines are already str decoded as UTF-8 for line in iter(pipe.readline, ""): if not quiet_mode: print(line.rstrip(), file=stream) diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml index 1037215..f41e249 100644 --- a/tests/input/goose_eval_test.yaml +++ b/tests/input/goose_eval_test.yaml @@ -7,7 +7,7 @@ coders: goose: {} models: - gpt-4o: + claude-sonnet: provider: anthropic name: claude-sonnet-4-20250514 @@ -34,6 +34,9 @@ cases: MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 - + - name: character_encoding_test + metrics: [CorrectnessMetric] + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml new file mode 100644 index 0000000..d0fea1b --- /dev/null +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -0,0 +1,29 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + ols: + name: ols + command: uvx + args: [mcp-ols] + +server_combinations: + - [simple-pubmed] + +cases: +- name: character_encoding_test + metrics: + - CorrectnessMetric + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 From 4761d1977b7763b840257cb863aea307ebc2a8ed Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 18:36:29 -0400 Subject: [PATCH 16/34] Addressed ruff formatting issue. --- src/metacoder/coders/base_coder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index f19d146..f2fa7d9 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -201,7 +201,8 @@ def run_process( logger.info(f"{e}") pass # OK if not available (e.g., redirected or older Python) - def stream_output(pipe, output_lines, stream): # lines are already str decoded as UTF-8 + # lines are already str decoded as UTF-8 + def stream_output(pipe, output_lines, stream): for line in iter(pipe.readline, ""): if not quiet_mode: print(line.rstrip(), file=stream) From 6b64a794593899d643c553200cf35490c9d98dda Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 19:12:44 -0400 Subject: [PATCH 17/34] Added output file check to fail if the output file already exists. Otherwise, create an empty file as UTF-8. Partially addresses Issue #24. 
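
A minimal sketch of the intended guard, included here only for review context (the authoritative change is in the metacoder.py hunk below; the filename is illustrative and the real command uses the eval command's output path):

    import sys
    from pathlib import Path

    output_path = Path("results.yaml")  # illustrative; the CLI passes the configured output path
    try:
        # "x" mode creates the file as UTF-8 and raises if it already exists
        with output_path.open("x", encoding="utf-8"):
            pass
    except FileExistsError:
        print(f"Error: '{output_path}' already exists.", file=sys.stderr)
        sys.exit(1)
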
--- src/metacoder/metacoder.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index 28113ec..5e1d616 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -1,4 +1,5 @@ import logging +import sys from pathlib import Path from typing import Optional, Union @@ -543,6 +544,17 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: output_path = Path(output) workdir_path = Path(workdir) + try: + # Create the output file only if it doesn't exist; fail if it does + with output_path.open("x", encoding="utf-8") as _: + pass + except FileExistsError: + print( + f"Error: '{output_path}' already exists. Please delete it or specify a different filename.", + file=sys.stderr, + ) + sys.exit(1) + # Convert coders tuple to list (empty tuple if not specified) coders_list = list(coders) if coders else None From c436e7fe4698df2c8e3d3e04fa5af8224ade0f9a Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 19:29:38 -0400 Subject: [PATCH 18/34] Modified save_results to append to existing output file rather than overwrite. Enforced UTF-8 encoding, switched to safe_dump and added document delimiter between records. Also simplified document generation. Fixes issue #24. Added second test case to literature_mcp_encoding_test.yaml for testing. --- src/metacoder/evals/runner.py | 19 +++++++++++-------- tests/input/literature_mcp_encoding_test.yaml | 8 ++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 0a8dc42..21e2671 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -518,18 +518,21 @@ def run_all_evals( def save_results(self, results: List[EvalResult], output_path: Path): """Save evaluation results to file.""" - # Convert to list of dicts - results_data = [] - for result in results: - results_data.append(result.model_dump()) + # output_path.parent.mkdir(parents=True, exist_ok=True) # Not sure if the folder should be created here + data = { + "results": [r.model_dump() for r in results], + "summary": self.generate_summary(results), + } - # Save as YAML - with open(output_path, "w") as f: - yaml.dump( - {"results": results_data, "summary": self.generate_summary(results)}, + # Append a new YAML document to the output file. + with open(output_path, "a", encoding="utf-8", newline="") as f: + yaml.safe_dump( + data, f, + explicit_start=True, # writes '---' to mark a new document default_flow_style=False, sort_keys=False, + allow_unicode=True, ) def generate_summary(self, results: List[EvalResult]) -> Dict[str, Any]: diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml index d0fea1b..f7b5b75 100644 --- a/tests/input/literature_mcp_encoding_test.yaml +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -27,3 +27,11 @@ cases: input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? expected_output: 'The paper says No but it is retracted so the results should not be trusted.' threshold: 0.9 +- name: "disease" + metrics: [CorrectnessMetric] + input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? 
Give me disease names and MONDO IDs" + expected_output: | + MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) + MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) + MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) + threshold: 0.7 From b0b1c8b0b8833ee02cc9ac7c12ce7943e7b24516 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Wed, 3 Sep 2025 14:28:44 -0400 Subject: [PATCH 19/34] Updated ClaudeJudge model to claude-sonnet-4-20250514. --- src/metacoder/evals/judges.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index a8b8d48..24b4277 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -16,9 +16,17 @@ class ClaudeJudge(DeepEvalBaseLLM): the `model` parameter to DeepEval metrics like GEval. """ + # Note: Anthropic models can be listed via: + # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01" + # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]} + # Current list (September 3, 2025): + # claude-opus-4-1-20250805, claude-opus-4-20250514, claude-sonnet-4-20250514, claude-3-7-sonnet-20250219, + # claude-3-5-sonnet-20241022, claude-3-5-haiku-20241022, claude-3-5-sonnet-20240620, claude-3-haiku-20240307, + # claude-3-opus-20240229 + def __init__( self, - model_name: str = "claude-3-5-sonnet-20240620", + model_name: str = "claude-sonnet-4-20250514", max_tokens: int = 1024, temperature: float = 0.0, ): From a7e71e3f4e2d177cccefb53788a94eddefb9b34d Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Wed, 3 Sep 2025 17:02:19 -0400 Subject: [PATCH 20/34] Revert "Modified save_results to append to existing output file rather than overwrite. Enforced UTF-8 encoding, switched to safe_dump and added document delimiter between records. Also simplified document generation. Fixes issue #24. Added second test case to literature_mcp_encoding_test.yaml for testing." This reverts commit c436e7fe4698df2c8e3d3e04fa5af8224ade0f9a. --- src/metacoder/evals/runner.py | 19 ++++++++----------- tests/input/literature_mcp_encoding_test.yaml | 8 -------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 21e2671..0a8dc42 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -518,21 +518,18 @@ def run_all_evals( def save_results(self, results: List[EvalResult], output_path: Path): """Save evaluation results to file.""" - # output_path.parent.mkdir(parents=True, exist_ok=True) # Not sure if the folder should be created here - data = { - "results": [r.model_dump() for r in results], - "summary": self.generate_summary(results), - } + # Convert to list of dicts + results_data = [] + for result in results: + results_data.append(result.model_dump()) - # Append a new YAML document to the output file. 
- with open(output_path, "a", encoding="utf-8", newline="") as f: - yaml.safe_dump( - data, + # Save as YAML + with open(output_path, "w") as f: + yaml.dump( + {"results": results_data, "summary": self.generate_summary(results)}, f, - explicit_start=True, # writes '---' to mark a new document default_flow_style=False, sort_keys=False, - allow_unicode=True, ) def generate_summary(self, results: List[EvalResult]) -> Dict[str, Any]: diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml index f7b5b75..d0fea1b 100644 --- a/tests/input/literature_mcp_encoding_test.yaml +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -27,11 +27,3 @@ cases: input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? expected_output: 'The paper says No but it is retracted so the results should not be trusted.' threshold: 0.9 -- name: "disease" - metrics: [CorrectnessMetric] - input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? Give me disease names and MONDO IDs" - expected_output: | - MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) - MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) - MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 From 7e143da3479909ed45f766cafbfb2190b9d3b4ab Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 13:31:22 -0400 Subject: [PATCH 21/34] Added UTF-8 encoding to prevent character mangling during YAML export on Windows (where the default codepage is cp1252). --- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 0a8dc42..f1c7126 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -524,7 +524,7 @@ def save_results(self, results: List[EvalResult], output_path: Path): results_data.append(result.model_dump()) # Save as YAML - with open(output_path, "w") as f: + with open(output_path, "w", encoding="utf-8") as f: yaml.dump( {"results": results_data, "summary": self.generate_summary(results)}, f, From 37cbb2f38ecbc672cda8c765d4ed0129ba7e839f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:02:15 -0400 Subject: [PATCH 22/34] Added support for grouping test case eval results with 'group' key in config. Fixes Issue #27. --- src/metacoder/evals/eval_model.py | 3 ++ src/metacoder/evals/runner.py | 2 + .../goose_eval_claude_downgrade_test.yaml | 2 +- tests/input/goose_eval_test.yaml | 45 ++++++++++++------- tests/input/goose_no_server_test.yaml | 2 +- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py index d7dab3e..471c13d 100644 --- a/src/metacoder/evals/eval_model.py +++ b/src/metacoder/evals/eval_model.py @@ -21,6 +21,9 @@ class EvalCase(BaseModel): """ name: str = Field(..., description="Unique identifier for the test case") + group: Optional[str] = Field( + default="Default", description="Test category for result grouping." 
+ ) metrics: List[str] = Field( ..., description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)", diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index f1c7126..95c74bf 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -121,6 +121,7 @@ class EvalResult(BaseModel): model: str coder: str case_name: str + case_group: str metric_name: str score: float passed: bool @@ -386,6 +387,7 @@ def run_single_eval( model=model_name, coder=coder_name, case_name=case.name, + case_group=case.group, metric_name=metric_name, score=score if score is not None else 0.0, passed=passed, diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml index 6f0eb31..7d305ce 100644 --- a/tests/input/goose_eval_claude_downgrade_test.yaml +++ b/tests/input/goose_eval_claude_downgrade_test.yaml @@ -16,7 +16,7 @@ servers: command: uvx args: [mcp-simple-pubmed] env: - PUBMED_EMAIL: ctparker@lbl.gov + PUBMED_EMAIL: cjmungall@lbl.gov server_combinations: - [mcp-simple-pubmed] diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml index f41e249..73b0615 100644 --- a/tests/input/goose_eval_test.yaml +++ b/tests/input/goose_eval_test.yaml @@ -11,32 +11,45 @@ models: provider: anthropic name: claude-sonnet-4-20250514 +# Refer to metacoder/src/mcps/registry/scilit.yaml for the list of available MCPs. servers: - mcp-simple-pubmed: + artl: + name: artl + command: uvx + args: [artl-mcp] + simple-pubmed: name: pubmed command: uvx args: [mcp-simple-pubmed] env: PUBMED_EMAIL: cjmungall@lbl.gov - ols-mcp: + ols: name: ols command: uvx args: [mcp-ols] server_combinations: - - [mcp-simple-pubmed, ols-mcp] +# - [artl, simple-pubmed, ols] + - [artl] + - [simple-pubmed] +# - [ols] cases: - - name: "disease" - metrics: [CorrectnessMetric] - input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? Give me disease names and MONDO IDs" - expected_output: | - MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) - MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) - MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 - - name: character_encoding_test - metrics: [CorrectnessMetric] - input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? - expected_output: 'The paper says No but it is retracted so the results should not be trusted.' - threshold: 0.9 +- name: PMID_28027860_Full_Text +# group: Text extraction # should default to "Default" + metrics: + - CorrectnessMetric + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: "Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 + +# Per convo with Charles, Justin, Mark: this test case is kind of tricky and it seems +# like an extremely difficult case that even a good LLM + MCP might not pass. We've +# made some edits to give the LLM + MCP a fair chance +- name: PMC8086273_Retraction + group: Summarization + metrics: + - CorrectnessMetric + input: "Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses?" + expected_output: "The paper says No but it is retracted so the results should not be trusted." 
+ threshold: 0.9 diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml index 2dc5551..a027f80 100644 --- a/tests/input/goose_no_server_test.yaml +++ b/tests/input/goose_no_server_test.yaml @@ -16,7 +16,7 @@ servers: command: uvx args: [mcp-simple-pubmed] env: - PUBMED_EMAIL: ctparker@lbl.gov + PUBMED_EMAIL: cjmungall@lbl.gov #server_combinations: # - [mcp-simple-pubmed] From bdec2e3667f45dbf5c3ac3443564fab6ec73669c Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:13:35 -0400 Subject: [PATCH 23/34] Updated test_runner.py to include Default case_group in EvalResults to address validation errors in test suite. --- tests/test_evals/test_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_evals/test_runner.py b/tests/test_evals/test_runner.py index d1f0c3e..838ab60 100644 --- a/tests/test_evals/test_runner.py +++ b/tests/test_evals/test_runner.py @@ -174,6 +174,7 @@ def test_generate_summary(self): model="model1", coder="coder1", case_name="case1", + case_group="Default", metric_name="metric1", score=0.9, passed=True, @@ -182,6 +183,7 @@ def test_generate_summary(self): model="model1", coder="coder1", case_name="case2", + case_group="Default", metric_name="metric1", score=0.3, passed=False, @@ -190,6 +192,7 @@ def test_generate_summary(self): model="model2", coder="coder1", case_name="case1", + case_group="Default", metric_name="metric1", score=0.8, passed=True, @@ -225,6 +228,7 @@ def test_save_and_load_results(self, tmp_path): model="model1", coder="coder1", case_name="case1", + case_group="Default", metric_name="metric1", score=0.9, passed=True, From 93860972bf83ff4e716ddcacfa91d1d7c90c7cd4 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:18:35 -0400 Subject: [PATCH 24/34] Updated Anthropic fallback mode from claude-3-5-sonnet-20240620 to claude-sonnet-4-20250514. --- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 95c74bf..45f5a2b 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -309,7 +309,7 @@ def run_single_eval( # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. if not self.use_openai: - claude_model = "claude-3-5-sonnet-20240620" + claude_model = "claude-sonnet-4-20250514" logger.warning( f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." ) From 0d855bc03e47e50948ab896e21f93481641eb8a4 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:23:27 -0400 Subject: [PATCH 25/34] Corrected test cases to match the expected Anthropic model. --- src/metacoder/evals/judges.py | 4 ---- tests/test_evals/test_claude_judge.py | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index 24b4277..cc20e32 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -19,10 +19,6 @@ class ClaudeJudge(DeepEvalBaseLLM): # Note: Anthropic models can be listed via: # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01" # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... 
]} - # Current list (September 3, 2025): - # claude-opus-4-1-20250805, claude-opus-4-20250514, claude-sonnet-4-20250514, claude-3-7-sonnet-20250219, - # claude-3-5-sonnet-20241022, claude-3-5-haiku-20241022, claude-3-5-sonnet-20240620, claude-3-haiku-20240307, - # claude-3-opus-20240229 def __init__( self, diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 9ed23a6..f33f2a5 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -20,7 +20,7 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. - # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. @@ -39,7 +39,7 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): # Test that the new evaluation judge was correctly selected for the metric model downgrade. assert ( - "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-" in caplog.text ) @@ -68,7 +68,7 @@ def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. - # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. @@ -112,7 +112,7 @@ def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): # Test that the new evaluation judge was correctly selected for the metric model downgrade. assert ( - "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-" in caplog.text ) From 9d9bca0467d1e4a9074e7a9264148b95151d9d09 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:26:12 -0400 Subject: [PATCH 26/34] Removed unnecessary duplicate path element in work directory. Readability improvement to support fix for Issue #29. Adding as individual commit in case it needs to be rolled back. --- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 45f5a2b..de3323c 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -258,7 +258,7 @@ def run_single_eval( # Create coder instance coder = create_coder( coder_name, - workdir=str(workdir / f"{model_name}_{coder_name}_{case.name}"), + workdir=str(workdir), config=coder_config, ) From bd474c9e614bbc539cac8cb49b0762ebee74a77d Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:42:57 -0400 Subject: [PATCH 27/34] Fix Issue #30. 
Goose supports an environment variable to disable using the system keyring for secrets (GOOSE_DISABLE_KEYRING). --- src/metacoder/coders/goose.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 6b0b5c0..d8723ea 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -145,6 +145,8 @@ def run(self, input_text: str) -> CoderOutput: env = self.expand_env(self.env) self.prepare_workdir() with change_directory(self.workdir): + # disable keyring (prevents errors on MacOS and Linux) + env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand env["HOME"] = "." From 142b8b8afca15be90fdd9a392fabb3c1ebf4de1f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:47:56 -0400 Subject: [PATCH 28/34] Partially addresses Issue #29 Windows compatibility. Uses os.cwd() instead of unix-specific "." to specify current working directory. --- src/metacoder/coders/goose.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index d8723ea..ea91efc 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path import time import logging @@ -149,7 +150,7 @@ def run(self, input_text: str) -> CoderOutput: env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand - env["HOME"] = "." + env["HOME"] = os.getcwd() text = self.expand_prompt(input_text) command = ["goose", "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") From b5faef3f22b75a5f552e99c0640f5ff07146d80c Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:51:05 -0400 Subject: [PATCH 29/34] Uses safer XDG_CONFIG_HOME instead of changing HOME environment variable to avoid interfering with unix environment (shell history, etc.). Separate commit in case this needs to be rolled back. --- src/metacoder/coders/goose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index ea91efc..991ba09 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -150,7 +150,7 @@ def run(self, input_text: str) -> CoderOutput: env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand - env["HOME"] = os.getcwd() + env["XDG_CONFIG_HOME"] = os.getcwd() text = self.expand_prompt(input_text) command = ["goose", "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") From 6d6ba8d59c756677e72f8c53dd9dc1914aa06de6 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 18:01:32 -0400 Subject: [PATCH 30/34] Changed informational log message to make it clear that a directory path is not being referenced, but rather a server combination. 
--- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index de3323c..a80060a 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -484,7 +484,7 @@ def run_all_evals( else " (no servers)" ) logger.info( - f"Progress: {current}/{total_combinations} - {coder_name}/{model_name}/{case.name}{server_desc}" + f"Progress: {current}/{total_combinations} ({coder_name} | {model_name} | {case.name}{server_desc})" ) # Create unique workdir for this combination From 80772c2d99f5930f5f5bc3a29a6dbce5abf889c0 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 18:14:51 -0400 Subject: [PATCH 31/34] The Goose executable is now detected in a cross-platform way, and the full path information is propagated into the logs for easier debugging of the environment. --- src/metacoder/coders/goose.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 991ba09..931ef12 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -20,6 +20,13 @@ logger = logging.getLogger(__name__) +def find_goose() -> Path: + loc = shutil.which("goose") + if not loc: + raise FileNotFoundError("goose not found on PATH") + return Path(loc).resolve() + + class GooseCoder(BaseCoder): """ Note that running goose involves simulating a home directory in @@ -146,13 +153,16 @@ def run(self, input_text: str) -> CoderOutput: env = self.expand_env(self.env) self.prepare_workdir() with change_directory(self.workdir): + goose_path = find_goose() + logger.info(f"Using goose executable at: {goose_path}") + # disable keyring (prevents errors on MacOS and Linux) env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand env["XDG_CONFIG_HOME"] = os.getcwd() text = self.expand_prompt(input_text) - command = ["goose", "run", "-t", text] + command = [str(goose_path), "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") # time the command start_time = time.time() From ef6337c187ca8a45df4130be6994f1fb42209902 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 18:25:53 -0400 Subject: [PATCH 32/34] Moved hard-coded values into variables in preparation for cross-platform support. Adjusted log level and cleaned up comment for readability. --- src/metacoder/coders/goose.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 931ef12..e4ad2a3 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -154,13 +154,18 @@ def run(self, input_text: str) -> CoderOutput: self.prepare_workdir() with change_directory(self.workdir): goose_path = find_goose() - logger.info(f"Using goose executable at: {goose_path}") + logger.debug(f"Using goose executable at: {goose_path}") # disable keyring (prevents errors on MacOS and Linux) env["GOOSE_DISABLE_KEYRING"] = "1" - # important - ensure that only local config files are used - # we assue chdir has been called beforehand - env["XDG_CONFIG_HOME"] = os.getcwd() + # Important: + # (1) ensure that only local config files are used; + # (2) assume chdir has been called beforehand. 
+ cwd = os.getcwd() + local_home_path = Path(cwd) + home_env_var = "XDG_CONFIG_HOME" + env[home_env_var] = str(local_home_path) + text = self.expand_prompt(input_text) command = [str(goose_path), "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") From 87d556d1133e96c4e6cafd0a6c9272b920f37a1e Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 19:03:08 -0400 Subject: [PATCH 33/34] Added OS-specific Goose config folder structures. Replaced hard-coded paths with function call to generate correct path. Added OS-specific home directory environment variables. Added logging of Goose config path for confirmation with base coder. --- src/metacoder/coders/goose.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index e4ad2a3..382542c 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -1,5 +1,6 @@ import json import os +import platform from pathlib import Path import time import logging @@ -27,6 +28,14 @@ def find_goose() -> Path: return Path(loc).resolve() +def get_goose_config_path() -> str: + # OS-specific config layout + if platform.system().lower().startswith("win"): + return "Block\\goose\\config\\" + + return ".config/goose/" + + class GooseCoder(BaseCoder): """ Note that running goose involves simulating a home directory in @@ -140,7 +149,7 @@ def default_config_objects(self) -> list[CoderConfigObject]: return [ CoderConfigObject( file_type=FileType.YAML, - relative_path=".config/goose/config.yaml", + relative_path=get_goose_config_path() + "config.yaml", content=config_content, ) ] @@ -156,14 +165,29 @@ def run(self, input_text: str) -> CoderOutput: goose_path = find_goose() logger.debug(f"Using goose executable at: {goose_path}") + # Build environment with redirected config + # disable keyring (prevents errors on MacOS and Linux) env["GOOSE_DISABLE_KEYRING"] = "1" + # Important: # (1) ensure that only local config files are used; # (2) assume chdir has been called beforehand. cwd = os.getcwd() local_home_path = Path(cwd) - home_env_var = "XDG_CONFIG_HOME" + + # OS-specific config layout + goose_config_dir = local_home_path / get_goose_config_path() + # OS-specific home directory environment variable + if platform.system().lower().startswith("win"): + home_env_var = "APPDATA" + else: + home_env_var = "XDG_CONFIG_HOME" + + goose_cfg_path = goose_config_dir / "config.yaml" + + logger.info(f"Goose config: {goose_cfg_path}\n") + env[home_env_var] = str(local_home_path) text = self.expand_prompt(input_text) From a9113817bf69d0f2d6b06386a24f006037fa7805 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 18 Sep 2025 14:10:47 -0700 Subject: [PATCH 34/34] Refactored OS environment detection to create relative paths for coder configs consistently. 
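
The OS-specific mapping this refactor encodes, condensed into a sketch for review (the authoritative versions are get_home_env_var() and get_goose_config_path() in the goose.py hunk below; the standalone names in this sketch are illustrative):

    import os
    import platform
    from pathlib import Path

    def home_env_var() -> str:
        # Windows: Goose reads %APPDATA%\Block\goose\config\config.yaml
        if platform.system().lower().startswith("win"):
            return "APPDATA"
        # Unix-like: prefer $XDG_CONFIG_HOME/goose/, else fall back to $HOME/.config/goose/
        return "XDG_CONFIG_HOME" if os.environ.get("XDG_CONFIG_HOME") else "HOME"

    RELATIVE_CONFIG_DIR = {
        "APPDATA": Path("Block/goose/config"),
        "XDG_CONFIG_HOME": Path("goose"),
        "HOME": Path(".config/goose"),
    }
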
--- src/metacoder/coders/base_coder.py | 8 ++- src/metacoder/coders/goose.py | 91 ++++++++++++++++++++----- tests/test_goose_paths.py | 105 +++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 17 deletions(-) create mode 100644 tests/test_goose_paths.py diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index f2fa7d9..408a794 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -365,7 +365,8 @@ def prepare_workdir(self): if self.config_objects is None: self.config_objects = self.default_config_objects() - logger.info(f"πŸ“ Preparing workdir: {self.workdir}") + logger.info(f"πŸ“ Preparing workdir (relative): {self.workdir}") + logger.info(f" (resolved): {Path(self.workdir).resolve()}") with change_directory(self.workdir): # clear old config objects for path, _type in self.default_config_paths().items(): @@ -379,7 +380,10 @@ def prepare_workdir(self): path.unlink() logger.debug(f"πŸ”§ Writing config objects: {self.config_objects}") for config_object in self.config_objects: - path = Path(config_object.relative_path) + rel = Path(config_object.relative_path) + if rel.is_absolute(): + raise ValueError(f"Config object path must be relative: {rel}") + path = rel path.parent.mkdir(parents=True, exist_ok=True) logger.info( f"πŸ”§ Writing config object: {config_object.relative_path} type={config_object.file_type}" diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 382542c..d6c4b35 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -28,12 +28,71 @@ def find_goose() -> Path: return Path(loc).resolve() -def get_goose_config_path() -> str: - # OS-specific config layout +def get_home_env_var() -> str: + """ + Determine the environment variable Goose should treat as "home" + for locating configuration files. + + Windows: + Goose expects its configuration under: + %APPDATA%\\Block\\goose\\config\\ + Therefore, we override APPDATA to point into the working directory. + + Unix-like (Linux, macOS): + Goose follows the XDG Base Directory spec: + - If $XDG_CONFIG_HOME is set, config goes under: + $XDG_CONFIG_HOME/goose/config.yaml + - Otherwise it falls back to: + $HOME/.config/goose/config.yaml + + We mirror this behavior by checking whether XDG_CONFIG_HOME is set + in the environment. If it is set, return "XDG_CONFIG_HOME"; + otherwise, return "HOME". + + Returns: + str: The environment variable name that should be overridden to + redirect Goose’s config into the working directory. + """ if platform.system().lower().startswith("win"): - return "Block\\goose\\config\\" + return "APPDATA" + + if "XDG_CONFIG_HOME" in os.environ and os.environ["XDG_CONFIG_HOME"]: + return "XDG_CONFIG_HOME" + return "HOME" + + +def get_goose_config_path() -> Path: + """ + Get the relative config path (from the simulated home directory) + where Goose expects its configuration, based on the home + environment variable chosen by get_home_env_var(). + + Returns: + pathlib.Path: The relative config directory path. 
+ + Behavior: + - If get_home_env_var() == "APPDATA": + Path -> "Block/goose/config/" + (matches %APPDATA%\\Block\\goose\\config\\ on Windows) + + - If get_home_env_var() == "HOME": + Path -> ".config/goose/" + (matches $HOME/.config/goose/ on Unix-like systems) + + - If get_home_env_var() == "XDG_CONFIG_HOME": + Path -> "goose/" + (matches $XDG_CONFIG_HOME/goose/ on Unix-like systems) + """ + home_env_var = get_home_env_var() - return ".config/goose/" + if home_env_var == "APPDATA": + return Path("Block/goose/config/") + elif home_env_var == "HOME": + return Path(".config/goose/") + elif home_env_var == "XDG_CONFIG_HOME": + return Path("goose/") + else: + raise RuntimeError(f"Unhandled home env var: {home_env_var}") class GooseCoder(BaseCoder): @@ -66,6 +125,11 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict: "type": "stdio" if mcp.type == MCPType.STDIO else mcp.type.value, } + is_stdio = mcp.type == MCPType.STDIO + + if is_stdio and not mcp.command: + raise ValueError("STDIO MCP configuration requires 'command'.") + if mcp.description: extension["description"] = mcp.description @@ -146,10 +210,12 @@ def default_config_objects(self) -> list[CoderConfigObject]: config_content["extensions"] = extensions + cfg_rel = get_goose_config_path() / "config.yaml" + return [ CoderConfigObject( file_type=FileType.YAML, - relative_path=get_goose_config_path() + "config.yaml", + relative_path=str(cfg_rel), content=config_content, ) ] @@ -177,18 +243,13 @@ def run(self, input_text: str) -> CoderOutput: local_home_path = Path(cwd) # OS-specific config layout - goose_config_dir = local_home_path / get_goose_config_path() - # OS-specific home directory environment variable - if platform.system().lower().startswith("win"): - home_env_var = "APPDATA" - else: - home_env_var = "XDG_CONFIG_HOME" + home_env_var = get_home_env_var() + env[home_env_var] = str(local_home_path) + goose_config_dir = local_home_path / get_goose_config_path() goose_cfg_path = goose_config_dir / "config.yaml" - - logger.info(f"Goose config: {goose_cfg_path}\n") - - env[home_env_var] = str(local_home_path) + logger.info(f"Goose home var: {home_env_var} -> {env[home_env_var]}") + logger.info(f"Goose config (expected at): {goose_cfg_path}") text = self.expand_prompt(input_text) command = [str(goose_path), "run", "-t", text] diff --git a/tests/test_goose_paths.py b/tests/test_goose_paths.py new file mode 100644 index 0000000..5a5fb53 --- /dev/null +++ b/tests/test_goose_paths.py @@ -0,0 +1,105 @@ +from pathlib import Path +import pytest + +from metacoder.coders.goose import get_home_env_var, get_goose_config_path + + +def _norm(p: Path | str) -> str: + """Normalize path separators & strip trailing slashes for stable compares.""" + s = str(p).replace("\\", "/") + return s[:-1] if s.endswith("/") else s + + +@pytest.mark.parametrize( + "platform_name, xdg_value, expected_env", + [ + ("Windows", None, "APPDATA"), + ("Linux", None, "HOME"), + ("Darwin", None, "HOME"), + ("Linux", "/custom/xdg", "XDG_CONFIG_HOME"), + ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME"), + ], +) +def test_env_var_selection(monkeypatch, platform_name, xdg_value, expected_env): + # Simulate platform + import platform as _platform + + monkeypatch.setattr(_platform, "system", lambda: platform_name) + + # Simulate XDG presence/absence + if xdg_value is not None: + monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value) + else: + monkeypatch.delenv("XDG_CONFIG_HOME", raising=False) + + actual = get_home_env_var() + assert actual == expected_env + + 
+@pytest.mark.parametrize( + "platform_name, xdg_value, expected_env, expected_rel_dir", + [ + ("Windows", None, "APPDATA", "Block/goose/config"), + ("Linux", None, "HOME", ".config/goose"), + ("Darwin", None, "HOME", ".config/goose"), + ("Linux", "/custom/xdg", "XDG_CONFIG_HOME", "goose"), + ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME", "goose"), + ], +) +def test_config_path_matches_env( + monkeypatch, platform_name, xdg_value, expected_env, expected_rel_dir +): + import platform as _platform + + monkeypatch.setattr(_platform, "system", lambda: platform_name) + + if xdg_value is not None: + monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value) + else: + monkeypatch.delenv("XDG_CONFIG_HOME", raising=False) + + env_var = get_home_env_var() + rel_path = get_goose_config_path() + + assert env_var == expected_env + assert _norm(rel_path) == expected_rel_dir + + +@pytest.mark.parametrize( + "platform_name, xdg_value, workdir, expected_effective_dir", + [ + ("Windows", None, "C:/tmp/work", "C:/tmp/work/Block/goose/config/config.yaml"), + ("Linux", None, "/tmp/work", "/tmp/work/.config/goose/config.yaml"), + ( + "Darwin", + None, + "/Users/alice/work", + "/Users/alice/work/.config/goose/config.yaml", + ), + ("Linux", "/custom/xdg", "/tmp/work", "/tmp/work/goose/config.yaml"), + ( + "Darwin", + "/Users/alice/.conf", + "/Users/alice/work", + "/Users/alice/work/goose/config.yaml", + ), + ], +) +def test_effective_config_location( + monkeypatch, platform_name, xdg_value, workdir, expected_effective_dir +): + import platform as _platform + + monkeypatch.setattr(_platform, "system", lambda: platform_name) + + if xdg_value is not None: + monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value) + else: + monkeypatch.delenv("XDG_CONFIG_HOME", raising=False) + + local_home_path = Path(workdir) + + goose_config_dir = local_home_path / get_goose_config_path() + goose_cfg_file = goose_config_dir / "config.yaml" + + assert _norm(goose_cfg_file) == _norm(expected_effective_dir)
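
For a quick local sanity check of the helpers introduced in this series, something like the following can be run once the package is importable in the current environment (the working directory below is illustrative):

    from pathlib import Path
    from metacoder.coders.goose import get_home_env_var, get_goose_config_path

    workdir = Path("/tmp/metacoder-session")   # illustrative working directory
    env_var = get_home_env_var()               # "APPDATA", "XDG_CONFIG_HOME", or "HOME"
    cfg = workdir / get_goose_config_path() / "config.yaml"
    print(env_var, "->", cfg)
    # e.g. HOME -> /tmp/metacoder-session/.config/goose/config.yaml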