ai4curation · valerie-autumn-skye · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025 · Aug 30, 2025
diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py
@@ -173,11 +173,15 @@ def run_process(
         """
         if env is None:
             env = self.expand_env(self.env)
+
+        # Decode the child process output as UTF-8 (instead of default encoding)
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
+            encoding="utf-8",
+            errors="replace",  # avoid crashes on the occasional bad byte
             env=env,
             bufsize=1,
             universal_newlines=True,
@@ -189,6 +193,15 @@ def run_process(
         # check verbosity level
         quiet_mode = logger.getEffectiveLevel() <= logging.INFO
 
+        # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do).
+        for s in (sys.stdout, sys.stderr):
+            try:
+                s.reconfigure(encoding="utf-8", errors="replace")  # Python 3.7+
+            except Exception as e:
+                logger.info(f"{e}")
+                pass  # OK if not available (e.g., redirected or older Python)
+
+        # lines are already str decoded as UTF-8
         def stream_output(pipe, output_lines, stream):
             for line in iter(pipe.readline, ""):
                 if not quiet_mode:
@@ -352,7 +365,8 @@ def prepare_workdir(self):
 
         if self.config_objects is None:
             self.config_objects = self.default_config_objects()
-        logger.info(f"📁 Preparing workdir: {self.workdir}")
+        logger.info(f"📁 Preparing workdir (relative): {self.workdir}")
+        logger.info(f"                     (resolved): {Path(self.workdir).resolve()}")
         with change_directory(self.workdir):
             # clear old config objects
             for path, _type in self.default_config_paths().items():
@@ -366,7 +380,10 @@ def prepare_workdir(self):
                         path.unlink()
             logger.debug(f"🔧 Writing config objects: {self.config_objects}")
             for config_object in self.config_objects:
-                path = Path(config_object.relative_path)
+                rel = Path(config_object.relative_path)
+                if rel.is_absolute():
+                    raise ValueError(f"Config object path must be relative: {rel}")
+                path = rel
                 path.parent.mkdir(parents=True, exist_ok=True)
                 logger.info(
                     f"🔧 Writing config object: {config_object.relative_path} type={config_object.file_type}"

diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
@@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
                 ao.tool_uses = tool_uses
 
             end_time = time.time()
-            logger.info(f"🤖 Command took {end_time - start_time} seconds")
+            logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds")
             ao.total_cost_usd = total_cost_usd
             ao.success = not is_error
             if not ao.success:

diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py
@@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
             if "result" in message:
                 ao.result_text = message["result"]
         end_time = time.time()
-        print(f"🤖 Command took {end_time - start_time} seconds")
+        print(f"🤖 Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:

diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
                 )
 
             end_time = time.time()
-            logger.info(f"💎 Command took {end_time - start_time} seconds")
+            logger.info(f"💎 Command took {end_time - start_time:.2f} seconds")
 
             # Parse the output
             ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)

diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py
@@ -1,4 +1,6 @@
 import json
+import os
+import platform
 from pathlib import Path
 import time
 import logging
@@ -19,6 +21,80 @@
 logger = logging.getLogger(__name__)
 
 
+def find_goose() -> Path:
+    """Locate ``goose`` on PATH and return its resolved path, else raise FileNotFoundError."""
+    if not (located := shutil.which("goose")):
+        raise FileNotFoundError("goose not found on PATH")
+    return Path(located).resolve()
+
+
+def get_home_env_var() -> str:
+    """
+    Pick the environment variable to override so that Goose looks for
+    its configuration inside the working directory.
+
+    Windows:
+        Goose reads its configuration from:
+            %APPDATA%\\Block\\goose\\config\\
+        so APPDATA is the variable to redirect.
+
+    Unix-like (Linux, macOS):
+        Goose follows the XDG Base Directory spec:
+            - with $XDG_CONFIG_HOME set, the config file is
+                  $XDG_CONFIG_HOME/goose/config.yaml
+            - otherwise the fallback location is
+                  $HOME/.config/goose/config.yaml
+
+        The current process environment decides which case applies: a
+        set, non-empty XDG_CONFIG_HOME selects "XDG_CONFIG_HOME";
+        an unset or empty one selects "HOME".
+
+    Returns:
+        str: One of "APPDATA", "XDG_CONFIG_HOME" or "HOME".
+    """
+    on_windows = platform.system().lower().startswith("win")
+    if on_windows:
+        return "APPDATA"
+
+    # An empty XDG_CONFIG_HOME is treated the same as an unset one.
+    xdg_config_home = os.environ.get("XDG_CONFIG_HOME")
+    return "XDG_CONFIG_HOME" if xdg_config_home else "HOME"
+
+
+def get_goose_config_path() -> Path:
+    """
+    Return the directory, relative to the simulated home directory,
+    in which Goose expects to find its configuration.
+
+    The layout depends on the variable named by get_home_env_var():
+
+        - "APPDATA"         -> "Block/goose/config/"
+          (matches %APPDATA%\\Block\\goose\\config\\ on Windows)
+        - "HOME"            -> ".config/goose/"
+          (matches $HOME/.config/goose/ on Unix-like systems)
+        - "XDG_CONFIG_HOME" -> "goose/"
+          (matches $XDG_CONFIG_HOME/goose/ on Unix-like systems)
+
+    Returns:
+        pathlib.Path: The relative config directory path.
+
+    Raises:
+        RuntimeError: If get_home_env_var() returns an unknown name.
+    """
+    # Relative config directory for each supported home variable.
+    layouts = {
+        "APPDATA": "Block/goose/config/",
+        "HOME": ".config/goose/",
+        "XDG_CONFIG_HOME": "goose/",
+    }
+    home_env_var = get_home_env_var()
+    relative_dir = layouts.get(home_env_var)
+    if relative_dir is None:
+        raise RuntimeError(f"Unhandled home env var: {home_env_var}")
+
+    return Path(relative_dir)
+
+
 class GooseCoder(BaseCoder):
     """
     Note that running goose involves simulating a home directory in
@@ -49,6 +125,11 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict:
             "type": "stdio" if mcp.type == MCPType.STDIO else mcp.type.value,
         }
 
+        is_stdio = mcp.type == MCPType.STDIO
+
+        if is_stdio and not mcp.command:
+            raise ValueError("STDIO MCP configuration requires 'command'.")
+
         if mcp.description:
             extension["description"] = mcp.description
 
@@ -129,10 +210,12 @@ def default_config_objects(self) -> list[CoderConfigObject]:
 
         config_content["extensions"] = extensions
 
+        cfg_rel = get_goose_config_path() / "config.yaml"
+
         return [
             CoderConfigObject(
                 file_type=FileType.YAML,
-                relative_path=".config/goose/config.yaml",
+                relative_path=str(cfg_rel),
                 content=config_content,
             )
         ]
@@ -145,18 +228,38 @@ def run(self, input_text: str) -> CoderOutput:
         env = self.expand_env(self.env)
         self.prepare_workdir()
         with change_directory(self.workdir):
-            # important - ensure that only local config files are used
-            # we assue chdir has been called beforehand
-            env["HOME"] = "."
+            goose_path = find_goose()
+            logger.debug(f"Using goose executable at: {goose_path}")
+
+            # Build environment with redirected config
+
+            # disable keyring (prevents errors on MacOS and Linux)
+            env["GOOSE_DISABLE_KEYRING"] = "1"
+
+            # Important:
+            # (1) ensure that only local config files are used;
+            # (2) assume chdir has been called beforehand.
+            cwd = os.getcwd()
+            local_home_path = Path(cwd)
+
+            # OS-specific config layout
+            home_env_var = get_home_env_var()
+            env[home_env_var] = str(local_home_path)
+
+            goose_config_dir = local_home_path / get_goose_config_path()
+            goose_cfg_path = goose_config_dir / "config.yaml"
+            logger.info(f"Goose home var: {home_env_var} -> {env[home_env_var]}")
+            logger.info(f"Goose config (expected at): {goose_cfg_path}")
+
             text = self.expand_prompt(input_text)
-            command = ["goose", "run", "-t", text]
+            command = [str(goose_path), "run", "-t", text]
             logger.info(f"🦆 Running command: {' '.join(command)}")
             # time the command
             start_time = time.time()
             result = self.run_process(command, env)
             end_time = time.time()
             ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
-            logger.info(f"🦆 Command took {end_time - start_time} seconds")
+            logger.info(f"🦆 Command took {end_time - start_time:.2f} seconds")
             # look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
             session_file: Optional[Path] = None
             for line in result.stdout.split("\n"):
@@ -165,7 +268,7 @@ def run(self, input_text: str) -> CoderOutput:
                     session_file = Path(session_file_str)
                     break
             if session_file and session_file.exists():
-                with open(session_file, "r") as f:
+                with open(session_file, "r", encoding="utf-8") as f:
                     ao.structured_messages = [
                         json.loads(line) for line in f if line.strip()
                     ]

diff --git a/src/metacoder/coders/qwen.py b/src/metacoder/coders/qwen.py
@@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
                 )
 
             end_time = time.time()
-            print(f"🤖 Command took {end_time - start_time} seconds")
+            print(f"🤖 Command took {end_time - start_time:.2f} seconds")
 
             # Create output - Qwen CLI doesn't provide structured output
             ao = CoderOutput(

diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py
@@ -21,6 +21,9 @@ class EvalCase(BaseModel):
     """
 
     name: str = Field(..., description="Unique identifier for the test case")
+    group: Optional[str] = Field(
+        default="Default", description="Test category for result grouping."
+    )
     metrics: List[str] = Field(
         ...,
         description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)",

diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py
@@ -0,0 +1,89 @@
+# metacoder/evals/judges.py
+import logging
+import os
+
+from anthropic import Anthropic
+from anthropic.types import MessageParam, TextBlockParam, TextBlock
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+logger = logging.getLogger(__name__)
+
+
+class ClaudeJudge(DeepEvalBaseLLM):
+    """
+    Wraps Anthropic's Claude models so they can be used as
+    the `model` parameter to DeepEval metrics like GEval.
+    """
+
+    # Note: Anthropic models can be listed via:
+    # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01"
+    # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]}
+
+    def __init__(
+        self,
+        model_name: str = "claude-sonnet-4-20250514",
+        max_tokens: int = 1024,
+        temperature: float = 0.0,
+    ):
+        """Create the Anthropic client; raises RuntimeError if ANTHROPIC_API_KEY is unset."""
+        super().__init__()
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise RuntimeError("ANTHROPIC_API_KEY is not set in environment")
+        self.client = Anthropic(api_key=api_key)
+        self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+    def load_model(self):
+        """Return this wrapper itself as the loaded model."""
+        return self
+
+    def generate(self, prompt: str) -> str:
+        """Send `prompt` as one user message and return the concatenated reply text."""
+        # Build typed content blocks and messages to satisfy the SDK's type hints
+        content: list[TextBlockParam] = [{"type": "text", "text": prompt}]
+        messages: list[MessageParam] = [{"role": "user", "content": content}]
+        resp = self.client.messages.create(
+            model=self.model_name,
+            max_tokens=self.max_tokens,
+            temperature=self.temperature,
+            messages=messages,
+        )
+        # anthropic returns a list of content blocks; collect only the text blocks.
+        return "".join(b.text for b in resp.content if isinstance(b, TextBlock))
+
+    async def a_generate(self, prompt: str) -> str:
+        """Async entry point; for now it just calls the blocking generate()."""
+        return self.generate(prompt)
+
+    def get_model_name(self) -> str:
+        """Return the model name this judge was configured with."""
+        return self.model_name
+
+    def has_available_quota(self) -> bool:
+        """
+        Try a very lightweight request to check if quota is available.
+        Returns True if the request succeeds, False on an HTTP 400 status or a
+        "credit balance is too low" message; any other exception is re-raised.
+        """
+        # Use a minimal "ping" request
+        content: list[TextBlockParam] = [{"type": "text", "text": "ping"}]
+        messages: list[MessageParam] = [{"role": "user", "content": content}]
+        try:
+            self.client.messages.create(
+                model=self.model_name,
+                max_tokens=1,  # cheapest possible
+                temperature=0.0,
+                messages=messages,
+            )
+        except Exception as e:
+            msg = str(e).lower()
+            # Insufficient quota arrives as "400 Bad Request ... Your credit balance is too low ...".
+            # Check the SDK's status_code: a substring test for "400" also hit unrelated messages.
+            if getattr(e, "status_code", None) == 400 or "credit balance is too low" in msg:
+                logger.warning(f"ClaudeJudge quota check failed: {e}")
+                return False
+            raise
+        return True