diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py
index b44c6ec..408a794 100644
--- a/src/metacoder/coders/base_coder.py
+++ b/src/metacoder/coders/base_coder.py
@@ -173,11 +173,15 @@ def run_process(
         """
         if env is None:
             env = self.expand_env(self.env)
+
+        # Decode the child process output as UTF-8 (instead of default encoding)
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
+            encoding="utf-8",
+            errors="replace",  # avoid crashes on the occasional bad byte
             env=env,
             bufsize=1,
             universal_newlines=True,
@@ -189,6 +193,15 @@ def run_process(
         # check verbosity level
         quiet_mode = logger.getEffectiveLevel() <= logging.INFO

+        # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do).
+        for s in (sys.stdout, sys.stderr):
+            try:
+                s.reconfigure(encoding="utf-8", errors="replace")  # Python 3.7+
+            except Exception as e:
+                logger.info(f"{e}")
+                pass  # OK if not available (e.g., redirected or older Python)
+
+        # lines are already str decoded as UTF-8
         def stream_output(pipe, output_lines, stream):
             for line in iter(pipe.readline, ""):
                 if not quiet_mode:
@@ -352,7 +365,8 @@ def prepare_workdir(self):
         if self.config_objects is None:
             self.config_objects = self.default_config_objects()

-        logger.info(f"πŸ“ Preparing workdir: {self.workdir}")
+        logger.info(f"πŸ“ Preparing workdir (relative): {self.workdir}")
+        logger.info(f"   (resolved): {Path(self.workdir).resolve()}")
         with change_directory(self.workdir):
             # clear old config objects
             for path, _type in self.default_config_paths().items():
@@ -366,7 +380,10 @@ def prepare_workdir(self):
                     path.unlink()
             logger.debug(f"πŸ”§ Writing config objects: {self.config_objects}")
             for config_object in self.config_objects:
-                path = Path(config_object.relative_path)
+                rel = Path(config_object.relative_path)
+                if rel.is_absolute():
+                    raise ValueError(f"Config object path must be relative: {rel}")
+                path = rel
                 path.parent.mkdir(parents=True, exist_ok=True)
                 logger.info(
                     f"πŸ”§ Writing config object: {config_object.relative_path} type={config_object.file_type}"
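For context on the `encoding="utf-8", errors="replace"` arguments added above, here is a minimal sketch (not part of the patch) of how strict versus replacement decoding behaves on a byte stream that is not valid UTF-8; `subprocess.Popen` has accepted `encoding`/`errors` since Python 3.6.

```python
# Not part of the patch: strict vs. replacement decoding of bytes that are not valid UTF-8,
# which is what Popen(..., encoding="utf-8", errors="replace") opts into for child output.
raw = b"caf\xe9 \xf0\x9f\xa6\x86"  # Latin-1 'Γ©' followed by a valid UTF-8 duck emoji

try:
    raw.decode("utf-8")  # strict decoding raises on the stray 0xe9 byte
except UnicodeDecodeError as exc:
    print(f"strict decode fails: {exc.reason}")

print(raw.decode("utf-8", errors="replace"))  # the bad byte becomes U+FFFD, no crash
```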
diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
index ee31b74..1a43295 100644
--- a/src/metacoder/coders/claude.py
+++ b/src/metacoder/coders/claude.py
@@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
         ao.tool_uses = tool_uses

         end_time = time.time()
-        logger.info(f"πŸ€– Command took {end_time - start_time} seconds")
+        logger.info(f"πŸ€– Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:
diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py
index 8e9169e..3451ebe 100644
--- a/src/metacoder/coders/codex.py
+++ b/src/metacoder/coders/codex.py
@@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
                 if "result" in message:
                     ao.result_text = message["result"]
         end_time = time.time()
-        print(f"πŸ€– Command took {end_time - start_time} seconds")
+        print(f"πŸ€– Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:
diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py
index 20564a9..6af35c4 100644
--- a/src/metacoder/coders/gemini.py
+++ b/src/metacoder/coders/gemini.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
         )

         end_time = time.time()
-        logger.info(f"πŸ’Ž Command took {end_time - start_time} seconds")
+        logger.info(f"πŸ’Ž Command took {end_time - start_time:.2f} seconds")

         # Parse the output
         ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py
index 514dc2b..d6c4b35 100644
--- a/src/metacoder/coders/goose.py
+++ b/src/metacoder/coders/goose.py
@@ -1,4 +1,6 @@
 import json
+import os
+import platform
 from pathlib import Path
 import time
 import logging
@@ -19,6 +21,80 @@
 logger = logging.getLogger(__name__)


+def find_goose() -> Path:
+    loc = shutil.which("goose")
+    if not loc:
+        raise FileNotFoundError("goose not found on PATH")
+    return Path(loc).resolve()
+
+
+def get_home_env_var() -> str:
+    """
+    Determine the environment variable Goose should treat as "home"
+    for locating configuration files.
+
+    Windows:
+        Goose expects its configuration under:
+            %APPDATA%\\Block\\goose\\config\\
+        Therefore, we override APPDATA to point into the working directory.
+
+    Unix-like (Linux, macOS):
+        Goose follows the XDG Base Directory spec:
+        - If $XDG_CONFIG_HOME is set, config goes under:
+              $XDG_CONFIG_HOME/goose/config.yaml
+        - Otherwise it falls back to:
+              $HOME/.config/goose/config.yaml
+
+    We mirror this behavior by checking whether XDG_CONFIG_HOME is set
+    in the environment. If it is set, return "XDG_CONFIG_HOME";
+    otherwise, return "HOME".
+
+    Returns:
+        str: The environment variable name that should be overridden to
+        redirect Goose's config into the working directory.
+    """
+    if platform.system().lower().startswith("win"):
+        return "APPDATA"
+
+    if "XDG_CONFIG_HOME" in os.environ and os.environ["XDG_CONFIG_HOME"]:
+        return "XDG_CONFIG_HOME"
+    return "HOME"
+
+
+def get_goose_config_path() -> Path:
+    """
+    Get the relative config path (from the simulated home directory)
+    where Goose expects its configuration, based on the home
+    environment variable chosen by get_home_env_var().
+
+    Returns:
+        pathlib.Path: The relative config directory path.
+
+    Behavior:
+        - If get_home_env_var() == "APPDATA":
+              Path -> "Block/goose/config/"
+              (matches %APPDATA%\\Block\\goose\\config\\ on Windows)
+
+        - If get_home_env_var() == "HOME":
+              Path -> ".config/goose/"
+              (matches $HOME/.config/goose/ on Unix-like systems)
+
+        - If get_home_env_var() == "XDG_CONFIG_HOME":
+              Path -> "goose/"
+              (matches $XDG_CONFIG_HOME/goose/ on Unix-like systems)
+    """
+    home_env_var = get_home_env_var()
+
+    if home_env_var == "APPDATA":
+        return Path("Block/goose/config/")
+    elif home_env_var == "HOME":
+        return Path(".config/goose/")
+    elif home_env_var == "XDG_CONFIG_HOME":
+        return Path("goose/")
+    else:
+        raise RuntimeError(f"Unhandled home env var: {home_env_var}")
+
+
 class GooseCoder(BaseCoder):
     """
     Note that running goose involves simulating a home directory in
@@ -49,6 +125,11 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict:
             "type": "stdio" if mcp.type == MCPType.STDIO else mcp.type.value,
         }

+        is_stdio = mcp.type == MCPType.STDIO
+
+        if is_stdio and not mcp.command:
+            raise ValueError("STDIO MCP configuration requires 'command'.")
+
         if mcp.description:
             extension["description"] = mcp.description

@@ -129,10 +210,12 @@ def default_config_objects(self) -> list[CoderConfigObject]:

         config_content["extensions"] = extensions

+        cfg_rel = get_goose_config_path() / "config.yaml"
+
         return [
             CoderConfigObject(
                 file_type=FileType.YAML,
-                relative_path=".config/goose/config.yaml",
+                relative_path=str(cfg_rel),
                 content=config_content,
             )
         ]
@@ -145,18 +228,38 @@ def run(self, input_text: str) -> CoderOutput:
         env = self.expand_env(self.env)
         self.prepare_workdir()
         with change_directory(self.workdir):
-            # important - ensure that only local config files are used
-            # we assue chdir has been called beforehand
-            env["HOME"] = "."
+            goose_path = find_goose()
+            logger.debug(f"Using goose executable at: {goose_path}")
+
+            # Build environment with redirected config
+
+            # disable keyring (prevents errors on MacOS and Linux)
+            env["GOOSE_DISABLE_KEYRING"] = "1"
+
+            # Important:
+            # (1) ensure that only local config files are used;
+            # (2) assume chdir has been called beforehand.
+            cwd = os.getcwd()
+            local_home_path = Path(cwd)
+
+            # OS-specific config layout
+            home_env_var = get_home_env_var()
+            env[home_env_var] = str(local_home_path)
+
+            goose_config_dir = local_home_path / get_goose_config_path()
+            goose_cfg_path = goose_config_dir / "config.yaml"
+            logger.info(f"Goose home var: {home_env_var} -> {env[home_env_var]}")
+            logger.info(f"Goose config (expected at): {goose_cfg_path}")
+
             text = self.expand_prompt(input_text)
-            command = ["goose", "run", "-t", text]
+            command = [str(goose_path), "run", "-t", text]
             logger.info(f"πŸ¦† Running command: {' '.join(command)}")
             # time the command
             start_time = time.time()
             result = self.run_process(command, env)
             end_time = time.time()
             ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
-            logger.info(f"πŸ¦† Command took {end_time - start_time} seconds")
+            logger.info(f"πŸ¦† Command took {end_time - start_time:.2f} seconds")
             # look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
             session_file: Optional[Path] = None
             for line in result.stdout.split("\n"):
@@ -165,7 +268,7 @@
                     session_file = Path(session_file_str)
                     break
             if session_file and session_file.exists():
-                with open(session_file, "r") as f:
+                with open(session_file, "r", encoding="utf-8") as f:
                     ao.structured_messages = [
                         json.loads(line) for line in f if line.strip()
                     ]
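To make the redirection above concrete, here is an illustrative sketch (not part of the patch) of how the overridden home variable and the relative config directory combine. The mapping is hand-written from the docstrings above, and `/tmp/work` stands in for the coder's working directory.

```python
# Illustration only: mirrors get_home_env_var() / get_goose_config_path() for a
# hypothetical working directory; the mapping is copied from the docstrings above.
from pathlib import Path

CASES = {
    # (platform family, XDG_CONFIG_HOME set?) -> (env var overridden, relative config dir)
    ("Windows", False): ("APPDATA", Path("Block/goose/config")),
    ("Linux/macOS", False): ("HOME", Path(".config/goose")),
    ("Linux/macOS", True): ("XDG_CONFIG_HOME", Path("goose")),
}

workdir = Path("/tmp/work")  # hypothetical working directory
for (system, has_xdg), (env_var, rel_dir) in CASES.items():
    print(f"{system:12} xdg_set={has_xdg!s:5} {env_var:15} -> {workdir / rel_dir / 'config.yaml'}")
```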
diff --git a/src/metacoder/coders/qwen.py b/src/metacoder/coders/qwen.py
index 43aefb6..b6f4080 100644
--- a/src/metacoder/coders/qwen.py
+++ b/src/metacoder/coders/qwen.py
@@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
         )

         end_time = time.time()
-        print(f"πŸ€– Command took {end_time - start_time} seconds")
+        print(f"πŸ€– Command took {end_time - start_time:.2f} seconds")

         # Create output - Qwen CLI doesn't provide structured output
         ao = CoderOutput(
diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py
index d7dab3e..471c13d 100644
--- a/src/metacoder/evals/eval_model.py
+++ b/src/metacoder/evals/eval_model.py
@@ -21,6 +21,9 @@ class EvalCase(BaseModel):
     """

     name: str = Field(..., description="Unique identifier for the test case")
+    group: Optional[str] = Field(
+        default="Default", description="Test category for result grouping."
+    )
     metrics: List[str] = Field(
         ...,
         description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)",
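The new `group` field surfaces later as `case_group` on each `EvalResult`. As a sketch of what that enables, here is one possible per-group rollup; `summarize_by_group` is a hypothetical helper, not part of the patch, and it assumes only the `EvalResult` attributes visible in this diff.

```python
# Hypothetical helper (not in the patch): count passed/total per case_group.
from collections import defaultdict


def summarize_by_group(results) -> dict:
    """`results` is any iterable of EvalResult-like objects with .case_group and .passed."""
    groups: dict = defaultdict(lambda: {"passed": 0, "total": 0})
    for r in results:
        groups[r.case_group]["total"] += 1
        if r.passed:
            groups[r.case_group]["passed"] += 1
    return dict(groups)
```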
diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py
new file mode 100644
index 0000000..cc20e32
--- /dev/null
+++ b/src/metacoder/evals/judges.py
@@ -0,0 +1,89 @@
+# metacoder/evals/judges.py
+import logging
+import os
+
+from anthropic import Anthropic
+from anthropic.types import MessageParam, TextBlockParam, TextBlock
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+logger = logging.getLogger(__name__)
+
+
+class ClaudeJudge(DeepEvalBaseLLM):
+    """
+    Wraps Anthropic's Claude models so they can be used as
+    the `model` parameter to DeepEval metrics like GEval.
+    """
+
+    # Note: Anthropic models can be listed via:
+    #   curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01"
+    #   {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]}
+
+    def __init__(
+        self,
+        model_name: str = "claude-sonnet-4-20250514",
+        max_tokens: int = 1024,
+        temperature: float = 0.0,
+    ):
+        super().__init__()
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise Exception("ANTHROPIC_API_KEY is not set in environment")
+        self.client = Anthropic(api_key=api_key)
+        self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+    def load_model(self):
+        return self
+
+    def generate(self, prompt: str) -> str:
+        # Build typed content blocks and messages to satisfy the SDK's type hints
+        content: list[TextBlockParam] = [{"type": "text", "text": prompt}]
+        messages: list[MessageParam] = [{"role": "user", "content": content}]
+        resp = self.client.messages.create(
+            model=self.model_name,
+            max_tokens=self.max_tokens,
+            temperature=self.temperature,
+            messages=messages,
+        )
+        # anthropic returns a list of content blocks; collect only the text blocks.
+        parts: list[str] = []
+        for block in resp.content:
+            if isinstance(block, TextBlock):
+                parts.append(block.text)
+        return "".join(parts)
+
+    async def a_generate(self, prompt: str) -> str:
+        # for now just call the sync path
+        return self.generate(prompt)
+
+    def get_model_name(self) -> str:
+        return self.model_name
+
+    def has_available_quota(self) -> bool:
+        """
+        Try a very lightweight request to check if quota is available.
+        Returns True if quota exists, False if Anthropic responds with
+        quota-related errors.
+        """
+        try:
+            # Use a minimal "ping" request
+            content: list[TextBlockParam] = [{"type": "text", "text": "ping"}]
+            messages: list[MessageParam] = [{"role": "user", "content": content}]
+            self.client.messages.create(
+                model=self.model_name,
+                max_tokens=1,  # cheapest possible
+                temperature=0.0,
+                messages=messages,
+            )
+            return True
+        except Exception as e:
+            msg = str(e).lower()
+            # Check for insufficient quota:
+            #   400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
+            if "credit balance is too low" in msg or "400" in msg:
+                logger.warning(f"ClaudeJudge quota check failed: {e}")
+                return False
+            raise
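A usage sketch for the judge above, mirroring how `make_geval()` in `runner.py` accepts it. This assumes the module paths introduced in this PR; `ANTHROPIC_API_KEY` must be set in the environment or the `ClaudeJudge` constructor raises.

```python
# Usage sketch, not part of the patch.
from metacoder.evals.judges import ClaudeJudge
from metacoder.evals.runner import make_geval

judge = ClaudeJudge(model_name="claude-sonnet-4-20250514", max_tokens=512)
if judge.has_available_quota():
    correctness = make_geval(model=judge)  # GEval scored by Claude instead of OpenAI
else:
    correctness = make_geval()  # fall back to DeepEval's default (OpenAI-backed) judge
```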
diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py
index 67a9619..a80060a 100644
--- a/src/metacoder/evals/runner.py
+++ b/src/metacoder/evals/runner.py
@@ -5,27 +5,32 @@
 """

 import copy
+import functools
 import importlib
 import logging
+import os
 import time
+import traceback
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Optional, Type, cast

 from pydantic import BaseModel
 import yaml
+
 from deepeval import evaluate
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase
-from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics import BaseMetric, GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from openai import APIStatusError
+from openai.types.chat import ChatCompletionMessageParam

 from metacoder.coders.base_coder import BaseCoder, CoderOutput
 from metacoder.registry import AVAILABLE_CODERS
 from metacoder.evals.eval_model import EvalCase, EvalDataset
 from metacoder.configuration import AIModelConfig, CoderConfig

-
 logger = logging.getLogger(__name__)
@@ -59,24 +64,34 @@ def is_successful(self) -> bool:
         return self.success


-def get_default_metrics() -> Dict[str, BaseMetric]:
-    """Get default metrics. Creates instances lazily to avoid network calls during import."""
+def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
+    """Creates a GEval instance with the specified model."""
+    return GEval(
+        name="Correctness",
+        criteria="Determine whether the actual output is factually correct based on the expected output.",
+        # NOTE: you can only provide either criteria or evaluation_steps, and not both
+        evaluation_steps=[
+            "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
+            "You should also heavily penalize omission of detail",
+            "Vague language, or contradicting OPINIONS, are OK",
+        ],
+        threshold=0.8,
+        evaluation_params=[
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+            LLMTestCaseParams.EXPECTED_OUTPUT,
+        ],
+        model=model,  # may be None (defaults to OpenAI) or a Claude judge
+    )
+
+
+def get_default_metrics(
+    model: Optional[DeepEvalBaseLLM] = None,
+) -> Dict[str, BaseMetric]:
+    """Get default metrics with the specified model. Creates instances lazily to avoid network calls during import."""
     return {
-        "CorrectnessMetric": GEval(
-            name="Correctness",
-            criteria="Determine whether the actual output is factually correct based on the expected output.",
-            # NOTE: you can only provide either criteria or evaluation_steps, and not both
-            evaluation_steps=[
-                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-                "You should also heavily penalize omission of detail",
-                "Vague language, or contradicting OPINIONS, are OK",
-            ],
-            threshold=0.8,
-            evaluation_params=[
-                LLMTestCaseParams.INPUT,
-                LLMTestCaseParams.ACTUAL_OUTPUT,
-                LLMTestCaseParams.EXPECTED_OUTPUT,
-            ],
+        "CorrectnessMetric": make_geval(
+            model=model  # Note: GEval defaults to OpenAI if no model is specified.
         ),
         "DummyMetric": DummyMetric(threshold=0.5),
     }
@@ -106,6 +121,7 @@ class EvalResult(BaseModel):
     model: str
     coder: str
     case_name: str
+    case_group: str
     metric_name: str
     score: float
     passed: bool
@@ -123,6 +139,8 @@ class EvalRunner:

     def __init__(self, verbose: bool = False):
         self.verbose = verbose
+        self.use_openai = True  # GEval will default to OpenAI; avoid it and downgrade to another provider or metric if quota runs out.
+
         if verbose:
             logging.basicConfig(level=logging.DEBUG)
         else:
@@ -183,6 +201,48 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase:
             additional_metadata=case.additional_metadata,
         )

+    @functools.lru_cache(maxsize=1)
+    def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool:
+        """
+        Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate.
+        Fast probe of the /chat/completions endpoint (the one GEval uses).
+        Returns False on 429 (insufficient_quota) or any exception.
+        """
+        if not os.getenv("OPENAI_API_KEY"):
+            logger.info("OPENAI_API_KEY is not set.")
+            return False
+        try:
+            from openai import OpenAI
+
+            # turn off SDK retries for the check so it returns fast
+            client = OpenAI(max_retries=0, timeout=8)  # NO retries, quick fail
+            # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}])
+            raw = [{"role": "user", "content": "ping"}]
+            messages = cast(List[ChatCompletionMessageParam], raw)
+            client.chat.completions.create(
+                model=model,
+                messages=messages,
+                max_tokens=1,
+                temperature=0,
+            )
+            return True
+        except APIStatusError as e:
+            # 429 insufficient quota or too many requests
+            if e.status_code == 429:
+                logger.warning(f"OpenAI API Key has insufficient quota: {e}")
+                return False
+            # 401 authentication problem, including invalid API key
+            if e.status_code == 401:
+                logger.warning(f"OpenAI API Authentication Error: {e}")
+                return False
+            # all other errors
+            logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}")
+            return False
+        except Exception as e:
+            # includes network issues, etc.
+            logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}")
+            return False
+
     def run_single_eval(
         self,
         model_name: str,
@@ -198,7 +258,7 @@ def run_single_eval(
         # Create coder instance
         coder = create_coder(
             coder_name,
-            workdir=str(workdir / f"{model_name}_{coder_name}_{case.name}"),
+            workdir=str(workdir),
            config=coder_config,
         )

@@ -235,8 +295,65 @@
             test_case = self.create_test_case(case, actual_output)

             # Evaluate
-            logger.info(f"Evaluating with {metric_name}")
-            eval_results = evaluate([test_case], [metric])
+            logger.info(
+                f"Evaluating {metric_name} using model {metric.model.model_name}"
+            )
+
+            if isinstance(metric, GEval):
+                # Assume GEval will use OpenAI until it is disabled.
+                if self.use_openai and not self._openai_quota_ok():
+                    logger.warning(
+                        "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                    )
+                    self.use_openai = False
+
+                # Note: This will downgrade a metric, if needed, each time it is about to be used, without modifying the default metrics.
+                if not self.use_openai:
+                    claude_model = "claude-sonnet-4-20250514"
+                    logger.warning(
+                        f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}."
+                    )
+
+                    try:
+                        # Downgrade metric model to Claude judge.
+                        from metacoder.evals.judges import ClaudeJudge
+
+                        judge = ClaudeJudge(claude_model)
+
+                        if not judge.has_available_quota():
+                            raise Exception(
+                                "No Anthropic credits available for ClaudeJudge."
+                            )
+
+                        metric = make_geval(model=judge)
+                        logger.info(
+                            f"Successfully downgraded {metric_name} model to {metric.model.model_name}."
+                        )
+                    except Exception as e:
+                        # Fallback: if we can't use Claude, downgrade gracefully.
+                        logging.debug(traceback.format_exc())
+                        logger.debug(e)
+                        logger.warning(
+                            f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric."
+                        )
+                        metric = DummyMetric(threshold=0.5)
+                        logger.warning(f"Downgraded {metric_name} to {metric.name}.")
+
+            eval_results = evaluate(
+                [test_case],
+                [metric],
+                async_config=AsyncConfig(run_async=False),  # disable async
+                display_config=DisplayConfig(
+                    show_indicator=False,  # hide the progress meter
+                    print_results=False,
+                    verbose_mode=self.verbose,
+                ),
+                cache_config=CacheConfig(use_cache=False, write_cache=False),
+                error_config=ErrorConfig(
+                    ignore_errors=False,  # actually fail on failure
+                    skip_on_missing_params=True,
+                ),
+            )

             # Extract results - the structure varies by deepeval version
             test_result = eval_results.test_results[0]
@@ -270,6 +387,7 @@
                 model=model_name,
                 coder=coder_name,
                 case_name=case.name,
+                case_group=case.group,
                 metric_name=metric_name,
                 score=score if score is not None else 0.0,
                 passed=passed,
@@ -366,7 +484,7 @@
                     else " (no servers)"
                 )
                 logger.info(
-                    f"Progress: {current}/{total_combinations} - {coder_name}/{model_name}/{case.name}{server_desc}"
+                    f"Progress: {current}/{total_combinations} ({coder_name} | {model_name} | {case.name}{server_desc})"
                 )

                 # Create unique workdir for this combination
@@ -408,7 +526,7 @@
             results_data.append(result.model_dump())

         # Save as YAML
-        with open(output_path, "w") as f:
+        with open(output_path, "w", encoding="utf-8") as f:
             yaml.dump(
                 {"results": results_data, "summary": self.generate_summary(results)},
                 f,
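The fallback logic added to `run_single_eval` above amounts to a three-step preference order. Below is a condensed, illustrative sketch only; the real code mutates `metric` in place, and `DummyMetric` is assumed to be importable from `metacoder.evals.runner`, where it is referenced in this diff.

```python
# Illustrative only: the preference order implemented in run_single_eval above.
from metacoder.evals.judges import ClaudeJudge
from metacoder.evals.runner import DummyMetric, make_geval  # DummyMetric assumed importable


def pick_correctness_metric(runner):
    if runner.use_openai and runner._openai_quota_ok():
        return make_geval()  # default: GEval judged by OpenAI
    runner.use_openai = False
    try:
        judge = ClaudeJudge("claude-sonnet-4-20250514")
        if not judge.has_available_quota():
            raise RuntimeError("no Anthropic credits available")
        return make_geval(model=judge)  # GEval judged by Claude
    except Exception:
        return DummyMetric(threshold=0.5)  # last resort: trivial metric
```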
diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py
index f62d3df..5e1d616 100644
--- a/src/metacoder/metacoder.py
+++ b/src/metacoder/metacoder.py
@@ -1,4 +1,5 @@
 import logging
+import sys
 from pathlib import Path
 from typing import Optional, Union

@@ -543,6 +544,17 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose:
     output_path = Path(output)
     workdir_path = Path(workdir)

+    try:
+        # Create the output file only if it doesn't exist; fail if it does
+        with output_path.open("x", encoding="utf-8") as _:
+            pass
+    except FileExistsError:
+        print(
+            f"Error: '{output_path}' already exists. Please delete it or specify a different filename.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
     # Convert coders tuple to list (empty tuple if not specified)
     coders_list = list(coders) if coders else None

@@ -592,37 +604,43 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose:
     # Print summary
     summary = runner.generate_summary(results)
-    click.echo("\nπŸ“ˆ Summary:")
-    click.echo(f"  Total: {summary['total_evaluations']}")
-    click.echo(
-        f"  Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)"
+    frac_passed = (
+        summary["passed"] / summary["total_evaluations"]
+        if summary["total_evaluations"]
+        else 0
     )
-    click.echo(
-        f"  Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)"
+    frac_failed = (
+        summary["failed"] / summary["total_evaluations"]
+        if summary["total_evaluations"]
+        else 0
     )
-    if summary["errors"] > 0:
-        click.echo(f"  Errors: {summary['errors']} ⚠️")
+
+    click.echo("\nπŸ“ˆ Summary:")
+    click.echo(f"  Total: {summary['total_evaluations']}")
+    click.echo(f"  Passed: {summary['passed']} ({frac_passed:.1%})")
+    click.echo(f"  Failed: {summary['failed']} ({frac_failed:.1%})")
+    if summary["errors"]:
+        click.echo(f"  Errors: {summary['errors']} ⚠️")

     # Print by-coder summary
     if len(summary["by_coder"]) > 1:
         click.echo("\n  By Coder:")
         for coder, stats in summary["by_coder"].items():
-            pass_rate = (
-                stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0
+            coder_frac_passed = (
+                stats["passed"] / stats["total"] if stats["total"] else 0
             )
             click.echo(
-                f"    {coder}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)"
+                f"    {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})"
             )

     # Print by-model summary
     if len(summary["by_model"]) > 1:
         click.echo("\n  By Model:")
         for model, stats in summary["by_model"].items():
-            pass_rate = (
-                stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0
+            model_frac_passed = (
+                stats["passed"] / stats["total"] if stats["total"] else 0
             )
             click.echo(
-                f"    {model}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)"
+                f"    {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})"
             )

     click.echo("\nβœ… Evaluation complete!")
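For reference, the exclusive-create (`"x"`) mode used above fails atomically if the path already exists, which is what lets the command refuse to overwrite earlier results. A minimal standalone sketch, using a hypothetical `results.yaml` path:

```python
# Minimal sketch of open(..., "x"); results.yaml is a hypothetical output path.
from pathlib import Path

out = Path("results.yaml")
try:
    with out.open("x", encoding="utf-8"):
        pass  # file created empty; the eval run can now write to it safely
except FileExistsError:
    print(f"'{out}' already exists; delete it or pass a different --output path")
```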
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..95f4c37
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,10 @@
+import logging
+import sys
+
+
+def pytest_configure(config):
+    logging.basicConfig(
+        level=logging.WARNING,
+        format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        stream=sys.stdout,
+    )
diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml
new file mode 100644
index 0000000..7d305ce
--- /dev/null
+++ b/tests/input/goose_eval_claude_downgrade_test.yaml
@@ -0,0 +1,30 @@
+name: pubmed tools evals
+description: |
+  Evaluations for multiple pubmed MCPs
+
+coders:
+  goose: {}
+
+models:
+  claude-sonnet:
+    provider: anthropic
+    name: claude-sonnet-4-20250514
+
+servers:
+  mcp-simple-pubmed:
+    name: pubmed
+    command: uvx
+    args: [mcp-simple-pubmed]
+    env:
+      PUBMED_EMAIL: cjmungall@lbl.gov
+
+server_combinations:
+  - [mcp-simple-pubmed]
+
+cases:
+- name: PMID_28027860_Full_Text
+  metrics: [CorrectnessMetric]
+  input: "What is the first sentence of section 2 in PMID: 28027860?"
+  expected_output: |
+    Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial.
+  threshold: 0.9
diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml
index 1037215..73b0615 100644
--- a/tests/input/goose_eval_test.yaml
+++ b/tests/input/goose_eval_test.yaml
@@ -7,33 +7,49 @@ coders:
   goose: {}

 models:
-  gpt-4o:
+  claude-sonnet:
     provider: anthropic
     name: claude-sonnet-4-20250514

+# Refer to metacoder/src/mcps/registry/scilit.yaml for the list of available MCPs.
 servers:
-  mcp-simple-pubmed:
+  artl:
+    name: artl
+    command: uvx
+    args: [artl-mcp]
+  simple-pubmed:
     name: pubmed
     command: uvx
     args: [mcp-simple-pubmed]
     env:
       PUBMED_EMAIL: cjmungall@lbl.gov
-  ols-mcp:
+  ols:
     name: ols
     command: uvx
     args: [mcp-ols]

 server_combinations:
-  - [mcp-simple-pubmed, ols-mcp]
+#  - [artl, simple-pubmed, ols]
+  - [artl]
+  - [simple-pubmed]
+#  - [ols]

 cases:
-  - name: "disease"
-    metrics: [CorrectnessMetric]
-    input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? Give me disease names and MONDO IDs"
-    expected_output: |
-      MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15)
-      MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29)
-      MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome)
+- name: PMID_28027860_Full_Text
+#  group: Text extraction  # should default to "Default"
+  metrics:
+    - CorrectnessMetric
+  input: "What is the first sentence of section 2 in PMID: 28027860?"
+  expected_output: "Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial."
+  threshold: 0.9
-    threshold: 0.7
-
+
+# Per convo with Charles, Justin, Mark: this test case is kind of tricky and it seems
+# like an extremely difficult case that even a good LLM + MCP might not pass. We've
+# made some edits to give the LLM + MCP a fair chance
+- name: PMC8086273_Retraction
+  group: Summarization
+  metrics:
+    - CorrectnessMetric
+  input: "Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses?"
+  expected_output: "The paper says No but it is retracted so the results should not be trusted."
+  threshold: 0.9
diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml
new file mode 100644
index 0000000..a027f80
--- /dev/null
+++ b/tests/input/goose_no_server_test.yaml
@@ -0,0 +1,30 @@
+name: pubmed tools evals
+description: |
+  Evaluations for multiple pubmed MCPs
+
+coders:
+  goose: {}
+
+models:
+  claude-sonnet:
+    provider: anthropic
+    name: claude-sonnet-4-20250514
+
+servers:
+  mcp-simple-pubmed:
+    name: pubmed
+    command: uvx
+    args: [mcp-simple-pubmed]
+    env:
+      PUBMED_EMAIL: cjmungall@lbl.gov
+
+#server_combinations:
+#  - [mcp-simple-pubmed]
+
+cases:
+- name: PMID_28027860_Full_Text
+  metrics: [CorrectnessMetric]
+  input: "What is the first sentence of section 2 in PMID: 28027860?"
+  expected_output: |
+    Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial.
+  threshold: 0.9
diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml
new file mode 100644
index 0000000..d0fea1b
--- /dev/null
+++ b/tests/input/literature_mcp_encoding_test.yaml
@@ -0,0 +1,29 @@
+name: pubmed tools evals
+description: |
+  Evaluations for multiple pubmed MCPs
+
+
+coders:
+  goose: {}
+
+models:
+  claude-sonnet:
+    provider: anthropic
+    name: claude-sonnet-4-20250514
+
+servers:
+  ols:
+    name: ols
+    command: uvx
+    args: [mcp-ols]
+
+server_combinations:
+  - [simple-pubmed]
+
+cases:
+- name: character_encoding_test
+  metrics:
+    - CorrectnessMetric
+  input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses?
+  expected_output: 'The paper says No but it is retracted so the results should not be trusted.'
+  threshold: 0.9
diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py
index a9498b6..5d9daf1 100644
--- a/tests/test_coders/test_coders_basic.py
+++ b/tests/test_coders/test_coders_basic.py
@@ -3,6 +3,7 @@
 These tests check that each coder can handle a simple arithmetic question.
 """

+import json
 import tempfile

 import pytest
@@ -164,3 +165,16 @@ def test_dummy_coder_always_works():
     assert result is not None
     assert result.result_text == "you said: Hello, world!"
     assert result.stdout == "you said: Hello, world!"
+
+
+@pytest.mark.integration
+def test_goose_utf8_session_file(tmp_path):
+    """Test session files with UTF-8 content are read correctly."""
+    session_content = '{"role": "assistant", "content": "ζ΅‹θ―• rΓ©sumΓ© πŸš€"}\n'
+    session_file = tmp_path / "test_session.jsonl"
+    session_file.write_text(session_content, encoding="utf-8")
+
+    with open(session_file, "r", encoding="utf-8") as f:
+        messages = [json.loads(line) for line in f if line.strip()]
+    assert len(messages) == 1
+    assert "ζ΅‹θ―•" in messages[0]["content"]
diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py
new file mode 100644
index 0000000..f33f2a5
--- /dev/null
+++ b/tests/test_evals/test_claude_judge.py
@@ -0,0 +1,139 @@
+import logging
+import traceback
+from pathlib import Path
+
+from metacoder.evals.runner import EvalRunner
+
+logger = logging.getLogger(__name__)
+
+
+def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch):
+    """Test that ClaudeJudge is used when OpenAI is disabled."""
+    # TODO: This test should avoid running the coder and only perform the eval step.
+    # Otherwise, it is impossible to get to the eval step if no valid API key is present or no quota is available (testing the wrong part of the process).
+
+    runner = EvalRunner()
+
+    try:
+        dataset = runner.load_dataset(
+            Path("tests/input/goose_eval_claude_downgrade_test.yaml")
+        )
+
+        # Unfortunately, there is nothing available in the eval results that indicates which model DeepEval used.
+        # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time.
+        # Instead, resort to capturing the WARNING logs for assertions related to the downgrade.
+        with caplog.at_level(logging.WARNING):
+            # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing")
+
+            results = runner.run_all_evals(
+                dataset, workdir=tmp_path, coders=["goose", "dummy"]
+            )
+
+            # Test that the quota exhaustion fallback logic worked as expected.
+            assert (
+                "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                in caplog.text
+            )
+
+            # Test that the new evaluation judge was correctly selected for the metric model downgrade.
+            assert (
+                "Downgrading CorrectnessMetric model from gpt-4.1 to claude-"
+                in caplog.text
+            )
+
+            # Test that the eval completed by checking for a non-zero score.
+            assert results[0].score > 0, (
+                f"Expected a {results[0].metric_name} score for {results[0].case_name}."
+            )
+
+    except Exception as e:
+        # Test that fallback logic does not result in an Exception.
+        logger.error(f"An error occurred: {e}")
+        logging.error(traceback.format_exc())
+        assert False  # This assertion will fail if an Exception is caught here.
+    finally:
+        pass
+
+
+def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch):
+    """Test that the CorrectnessMetric is successfully downgraded to DummyMetric if no model is available."""
+
+    runner = EvalRunner()
+
+    try:
+        dataset = runner.load_dataset(
+            Path("tests/input/goose_eval_claude_downgrade_test.yaml")
+        )
+
+        # Unfortunately, there is nothing available in the eval results that indicates which model DeepEval used.
+        # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time.
+        # Instead, resort to capturing the WARNING logs for assertions related to the downgrade.
+        with caplog.at_level(logging.WARNING):
+            # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing")
+
+            # Delete the Anthropic API key from the environment to force ClaudeJudge instantiation to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+
+            # One more OpenAI API test case also needs to be handled (401 errors):
+            # Temporarily unset OPENAI_API_KEY in order to force OpenAI authentication to fail.
+            # monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+
+            # One more Anthropic API test case also needs to be handled (401 errors):
+            # Temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail.
+            # monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-api-key-for-testing")
+
+            # TODO: Also need to test this for Anthropic:
+            #   Provider request failed with status: 400 Bad Request. Payload: Some(Object
+            #   {"error": Object {"message": String("Your credit balance is too low to access the
+            #   Anthropic API. Please go to Plans & Billing to upgrade or purchase credits."),
+            #   "type": String("invalid_request_error")}, "request_id": String("req_011CSeQZTjJvmcxzrhXuPES4"),
+            #   "type": String("error")}). Returning error: RequestFailed(
+            #   "Request failed with status: 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits."
+
+            results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["dummy"])
+
+            # Test that the quota exhaustion fallback logic worked as expected.
+            assert (
+                "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                in caplog.text
+            )
+
+            # Test that the new evaluation judge was correctly selected for the metric model downgrade.
+            assert (
+                "Downgrading CorrectnessMetric model from gpt-4.1 to claude-"
+                in caplog.text
+            )
+
+            # Test that the ClaudeJudge was unable to be used as the model for the CorrectnessMetric.
+            assert (
+                "Claude unavailable (ANTHROPIC_API_KEY is not set in environment); downgrading CorrectnessMetric to DummyMetric."
+                in caplog.text
+            )
+
+            # Test that the CorrectnessMetric was successfully downgraded to DummyMetric.
+            assert "Downgraded CorrectnessMetric to DummyMetric." in caplog.text
+
+            # Test that the eval completed by checking for a non-zero score.
+            assert results[0].score > 0, (
+                f"Expected a {results[0].metric_name} score for {results[0].case_name}."
+            )
+
+    except Exception as e:
+        # Test that fallback logic does not result in an Exception.
+        logger.error(f"An error occurred: {e}")
+        logging.error(traceback.format_exc())
+        assert False  # This assertion will fail if an Exception is caught here.
+    finally:
+        pass
diff --git a/tests/test_evals/test_runner.py b/tests/test_evals/test_runner.py
index d1f0c3e..838ab60 100644
--- a/tests/test_evals/test_runner.py
+++ b/tests/test_evals/test_runner.py
@@ -174,6 +174,7 @@ def test_generate_summary(self):
                 model="model1",
                 coder="coder1",
                 case_name="case1",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.9,
                 passed=True,
@@ -182,6 +183,7 @@ def test_generate_summary(self):
                 model="model1",
                 coder="coder1",
                 case_name="case2",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.3,
                 passed=False,
@@ -190,6 +192,7 @@ def test_generate_summary(self):
                 model="model2",
                 coder="coder1",
                 case_name="case1",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.8,
                 passed=True,
@@ -225,6 +228,7 @@ def test_save_and_load_results(self, tmp_path):
                 model="model1",
                 coder="coder1",
                 case_name="case1",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.9,
                 passed=True,
diff --git a/tests/test_goose_paths.py b/tests/test_goose_paths.py
new file mode 100644
index 0000000..5a5fb53
--- /dev/null
+++ b/tests/test_goose_paths.py
@@ -0,0 +1,105 @@
+from pathlib import Path
+import pytest
+
+from metacoder.coders.goose import get_home_env_var, get_goose_config_path
+
+
+def _norm(p: Path | str) -> str:
+    """Normalize path separators & strip trailing slashes for stable compares."""
+    s = str(p).replace("\\", "/")
+    return s[:-1] if s.endswith("/") else s
+
+
+@pytest.mark.parametrize(
+    "platform_name, xdg_value, expected_env",
+    [
+        ("Windows", None, "APPDATA"),
+        ("Linux", None, "HOME"),
+        ("Darwin", None, "HOME"),
+        ("Linux", "/custom/xdg", "XDG_CONFIG_HOME"),
+        ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME"),
+    ],
+)
+def test_env_var_selection(monkeypatch, platform_name, xdg_value, expected_env):
+    # Simulate platform
+    import platform as _platform
+
+    monkeypatch.setattr(_platform, "system", lambda: platform_name)
+
+    # Simulate XDG presence/absence
+    if xdg_value is not None:
+        monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value)
+    else:
+        monkeypatch.delenv("XDG_CONFIG_HOME", raising=False)
+
+    actual = get_home_env_var()
+    assert actual == expected_env
+
+
+@pytest.mark.parametrize(
+    "platform_name, xdg_value, expected_env, expected_rel_dir",
+    [
+        ("Windows", None, "APPDATA", "Block/goose/config"),
+        ("Linux", None, "HOME", ".config/goose"),
+        ("Darwin", None, "HOME", ".config/goose"),
+        ("Linux", "/custom/xdg", "XDG_CONFIG_HOME", "goose"),
+        ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME", "goose"),
+    ],
+)
+def test_config_path_matches_env(
+    monkeypatch, platform_name, xdg_value, expected_env, expected_rel_dir
+):
+    import platform as _platform
+
+    monkeypatch.setattr(_platform, "system", lambda: platform_name)
+
+    if xdg_value is not None:
+        monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value)
+    else:
+        monkeypatch.delenv("XDG_CONFIG_HOME", raising=False)
+
+    env_var = get_home_env_var()
+    rel_path = get_goose_config_path()
+
+    assert env_var == expected_env
+    assert _norm(rel_path) == expected_rel_dir
+
+
+@pytest.mark.parametrize(
+    "platform_name, xdg_value, workdir, expected_effective_dir",
+    [
+        ("Windows", None, "C:/tmp/work", "C:/tmp/work/Block/goose/config/config.yaml"),
+        ("Linux", None, "/tmp/work", "/tmp/work/.config/goose/config.yaml"),
+        (
+            "Darwin",
+            None,
+            "/Users/alice/work",
+            "/Users/alice/work/.config/goose/config.yaml",
+        ),
+        ("Linux", "/custom/xdg", "/tmp/work", "/tmp/work/goose/config.yaml"),
+        (
+            "Darwin",
+            "/Users/alice/.conf",
+            "/Users/alice/work",
+            "/Users/alice/work/goose/config.yaml",
+        ),
+    ],
+)
+def test_effective_config_location(
+    monkeypatch, platform_name, xdg_value, workdir, expected_effective_dir
+):
+    import platform as _platform
+
+    monkeypatch.setattr(_platform, "system", lambda: platform_name)
+
+    if xdg_value is not None:
+        monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value)
+    else:
+        monkeypatch.delenv("XDG_CONFIG_HOME", raising=False)
+
+    local_home_path = Path(workdir)
+
+    goose_config_dir = local_home_path / get_goose_config_path()
+    goose_cfg_file = goose_config_dir / "config.yaml"
+
+    assert _norm(goose_cfg_file) == _norm(expected_effective_dir)