Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
cd95918
Enforce UTF-8 for Goose session files.
Aug 29, 2025
a791ce5
Fixes issue #15. Prevents divide by zero errors and cleans up summari…
Aug 29, 2025
49891a3
Cleaned up output by using consistent printing methods.
Aug 29, 2025
46ad344
Fixes Issue #18 by implementing metric downgrades to Claude if OpenAP…
Aug 30, 2025
fc7ba41
Satisfied ruff's bizarre rules.
Aug 30, 2025
54dd3d3
Added extra logging and test for goose UTF-8 handling.
Aug 30, 2025
72f586c
Added metacoder configuration test cases for claude downgrade and no …
Aug 30, 2025
d7beb19
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
d88ca90
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
e7bba40
Added assertion to confirm that ClaudeJudge completed scoring the met…
Aug 30, 2025
d27277b
Added assertion to force test to fail on Exception. Increased logging…
Aug 30, 2025
3f22fc6
Fixed runtime issues related to metric downgrade from CorrectnessMetr…
Aug 30, 2025
d6e1e44
Added test coverage of new evaluation judge functionality. Added test…
Aug 30, 2025
882a3d9
Reduced logging verbosity. Added Anthropic quota check. Added automat…
Sep 2, 2025
c98c9d7
Fixed issue #23. Forced processes to be launched with UTF-8 encoding …
Sep 2, 2025
4761d19
Addressed ruff formatting issue.
Sep 2, 2025
6b64a79
Added output file check to fail if the output file already exists. Ot…
Sep 2, 2025
c436e7f
Modified save_results to append to existing output file rather than o…
Sep 2, 2025
b0b1c8b
Updated ClaudeJudge model to claude-sonnet-4-20250514.
Sep 3, 2025
a7e71e3
Revert "Modified save_results to append to existing output file rathe…
Sep 3, 2025
7e143da
Added UTF-8 encoding to prevent character mangling during YAML export…
Sep 4, 2025
37cbb2f
Added support for grouping test case eval results with 'group' key in…
Sep 4, 2025
bdec2e3
Updated test_runner.py to include Default case_group in EvalResults t…
Sep 4, 2025
9386097
Updated Anthropic fallback mode from claude-3-5-sonnet-20240620 to cl…
Sep 4, 2025
0d855bc
Corrected test cases to match the expected Anthropic model.
Sep 4, 2025
9d9bca0
Removed unnecessary duplicate path element in work directory. Readabi…
Sep 5, 2025
bd474c9
Fix Issue #30. Goose supports an environment variable to disable usin…
Sep 5, 2025
142b8b8
Partially addresses Issue #29 Windows compatibility. Uses os.cwd() in…
Sep 5, 2025
b5faef3
Uses safer XDG_CONFIG_HOME instead of changing HOME environment varia…
Sep 5, 2025
6d6ba8d
Changed informational log message to make it clear that a directory p…
Sep 5, 2025
80772c2
The Goose executable is now detected in a cross-platform way, and the…
Sep 5, 2025
ef6337c
Moved hard-coded values into variables in preparation for cross-platf…
Sep 5, 2025
87d556d
Added OS-specific Goose config folder structures. Replaced hard-coded…
Sep 5, 2025
a911381
Refactored OS environment detection to create relative paths for code…
Sep 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions src/metacoder/coders/base_coder.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,15 @@ def run_process(
"""
if env is None:
env = self.expand_env(self.env)

# Decode the child process output as UTF-8 (instead of default encoding)
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding="utf-8",
errors="replace", # avoid crashes on the occasional bad byte
env=env,
bufsize=1,
universal_newlines=True,
Expand All @@ -189,6 +193,15 @@ def run_process(
# check verbosity level
quiet_mode = logger.getEffectiveLevel() <= logging.INFO

# Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do).
for s in (sys.stdout, sys.stderr):
try:
s.reconfigure(encoding="utf-8", errors="replace") # Python 3.7+
except Exception as e:
logger.info(f"{e}")
pass # OK if not available (e.g., redirected or older Python)

# lines are already str decoded as UTF-8
def stream_output(pipe, output_lines, stream):
for line in iter(pipe.readline, ""):
if not quiet_mode:
Expand Down Expand Up @@ -352,7 +365,8 @@ def prepare_workdir(self):

if self.config_objects is None:
self.config_objects = self.default_config_objects()
logger.info(f"📁 Preparing workdir: {self.workdir}")
logger.info(f"📁 Preparing workdir (relative): {self.workdir}")
logger.info(f" (resolved): {Path(self.workdir).resolve()}")
with change_directory(self.workdir):
# clear old config objects
for path, _type in self.default_config_paths().items():
Expand All @@ -366,7 +380,10 @@ def prepare_workdir(self):
path.unlink()
logger.debug(f"🔧 Writing config objects: {self.config_objects}")
for config_object in self.config_objects:
path = Path(config_object.relative_path)
rel = Path(config_object.relative_path)
if rel.is_absolute():
raise ValueError(f"Config object path must be relative: {rel}")
path = rel
path.parent.mkdir(parents=True, exist_ok=True)
logger.info(
f"🔧 Writing config object: {config_object.relative_path} type={config_object.file_type}"
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/claude.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
ao.tool_uses = tool_uses

end_time = time.time()
logger.info(f"🤖 Command took {end_time - start_time} seconds")
logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/codex.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
if "result" in message:
ao.result_text = message["result"]
end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
logger.info(f"💎 Command took {end_time - start_time} seconds")
logger.info(f"💎 Command took {end_time - start_time:.2f} seconds")

# Parse the output
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
Expand Down
117 changes: 110 additions & 7 deletions src/metacoder/coders/goose.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import os
import platform
from pathlib import Path
import time
import logging
Expand All @@ -19,6 +21,80 @@
logger = logging.getLogger(__name__)


def find_goose() -> Path:
    """Locate the ``goose`` executable on the current PATH.

    Returns:
        Path: The resolved path of the executable reported by ``shutil.which``.

    Raises:
        FileNotFoundError: If no ``goose`` executable can be found on PATH.
    """
    located = shutil.which("goose")
    if located:
        return Path(located).resolve()
    raise FileNotFoundError("goose not found on PATH")


def get_home_env_var() -> str:
    """
    Pick the environment variable to override so that Goose looks for
    its configuration inside the working directory.

    Windows:
        Goose reads its configuration from ``%APPDATA%\\Block\\goose\\config``,
        so ``APPDATA`` is the variable to redirect.

    Unix-like (Linux, macOS):
        Goose follows the XDG Base Directory spec:
        - with $XDG_CONFIG_HOME set, the config lives at
          $XDG_CONFIG_HOME/goose/config.yaml
        - otherwise it lives at $HOME/.config/goose/config.yaml

        The same rule is applied here: a non-empty XDG_CONFIG_HOME in the
        current environment selects "XDG_CONFIG_HOME"; an unset or empty
        value selects "HOME".

    Returns:
        str: Name of the environment variable to override
        ("APPDATA", "XDG_CONFIG_HOME" or "HOME").
    """
    on_windows = platform.system().lower().startswith("win")
    if on_windows:
        return "APPDATA"

    # An empty string counts as "not set", exactly like a missing variable.
    xdg_config_home = os.environ.get("XDG_CONFIG_HOME")
    return "XDG_CONFIG_HOME" if xdg_config_home else "HOME"


def get_goose_config_path() -> Path:
    """
    Return the config directory Goose expects, relative to the simulated
    home directory, for the variable chosen by get_home_env_var().

    Mapping:
        - "APPDATA"         -> "Block/goose/config/"
          (matches %APPDATA%\\Block\\goose\\config on Windows)
        - "HOME"            -> ".config/goose/"
          (matches $HOME/.config/goose/ on Unix-like systems)
        - "XDG_CONFIG_HOME" -> "goose/"
          (matches $XDG_CONFIG_HOME/goose/ on Unix-like systems)

    Returns:
        pathlib.Path: The relative config directory path.

    Raises:
        RuntimeError: If get_home_env_var() returns a name with no known layout.
    """
    layout_by_env_var = {
        "APPDATA": "Block/goose/config/",
        "HOME": ".config/goose/",
        "XDG_CONFIG_HOME": "goose/",
    }

    home_env_var = get_home_env_var()
    relative_dir = layout_by_env_var.get(home_env_var)
    if relative_dir is None:
        raise RuntimeError(f"Unhandled home env var: {home_env_var}")
    return Path(relative_dir)


class GooseCoder(BaseCoder):
"""
Note that running goose involves simulating a home directory in
Expand Down Expand Up @@ -49,6 +125,11 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict:
"type": "stdio" if mcp.type == MCPType.STDIO else mcp.type.value,
}

is_stdio = mcp.type == MCPType.STDIO

if is_stdio and not mcp.command:
raise ValueError("STDIO MCP configuration requires 'command'.")

if mcp.description:
extension["description"] = mcp.description

Expand Down Expand Up @@ -129,10 +210,12 @@ def default_config_objects(self) -> list[CoderConfigObject]:

config_content["extensions"] = extensions

cfg_rel = get_goose_config_path() / "config.yaml"

return [
CoderConfigObject(
file_type=FileType.YAML,
relative_path=".config/goose/config.yaml",
relative_path=str(cfg_rel),
content=config_content,
)
]
Expand All @@ -145,18 +228,38 @@ def run(self, input_text: str) -> CoderOutput:
env = self.expand_env(self.env)
self.prepare_workdir()
with change_directory(self.workdir):
# important - ensure that only local config files are used
# we assume chdir has been called beforehand
env["HOME"] = "."
goose_path = find_goose()
logger.debug(f"Using goose executable at: {goose_path}")

# Build environment with redirected config

# disable keyring (prevents errors on MacOS and Linux)
env["GOOSE_DISABLE_KEYRING"] = "1"

# Important:
# (1) ensure that only local config files are used;
# (2) assume chdir has been called beforehand.
cwd = os.getcwd()
local_home_path = Path(cwd)

# OS-specific config layout
home_env_var = get_home_env_var()
env[home_env_var] = str(local_home_path)

goose_config_dir = local_home_path / get_goose_config_path()
goose_cfg_path = goose_config_dir / "config.yaml"
logger.info(f"Goose home var: {home_env_var} -> {env[home_env_var]}")
logger.info(f"Goose config (expected at): {goose_cfg_path}")

text = self.expand_prompt(input_text)
command = ["goose", "run", "-t", text]
command = [str(goose_path), "run", "-t", text]
logger.info(f"🦆 Running command: {' '.join(command)}")
# time the command
start_time = time.time()
result = self.run_process(command, env)
end_time = time.time()
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
logger.info(f"🦆 Command took {end_time - start_time} seconds")
logger.info(f"🦆 Command took {end_time - start_time:.2f} seconds")
# look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
session_file: Optional[Path] = None
for line in result.stdout.split("\n"):
Expand All @@ -165,7 +268,7 @@ def run(self, input_text: str) -> CoderOutput:
session_file = Path(session_file_str)
break
if session_file and session_file.exists():
with open(session_file, "r") as f:
with open(session_file, "r", encoding="utf-8") as f:
ao.structured_messages = [
json.loads(line) for line in f if line.strip()
]
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")

# Create output - Qwen CLI doesn't provide structured output
ao = CoderOutput(
Expand Down
3 changes: 3 additions & 0 deletions src/metacoder/evals/eval_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ class EvalCase(BaseModel):
"""

name: str = Field(..., description="Unique identifier for the test case")
group: Optional[str] = Field(
default="Default", description="Test category for result grouping."
)
metrics: List[str] = Field(
...,
description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)",
Expand Down
89 changes: 89 additions & 0 deletions src/metacoder/evals/judges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# metacoder/evals/judges.py
import logging
import os

from anthropic import Anthropic
from anthropic.types import MessageParam, TextBlockParam, TextBlock

from deepeval.models.base_model import DeepEvalBaseLLM

logger = logging.getLogger(__name__)


class ClaudeJudge(DeepEvalBaseLLM):
    """
    Wraps Anthropic's Claude models so they can be used as
    the `model` parameter to DeepEval metrics like GEval.

    The API key is read from the ANTHROPIC_API_KEY environment variable
    when the judge is constructed.
    """

    # Note: Anthropic models can be listed via:
    # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01"
    # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]}

    def __init__(
        self,
        model_name: str = "claude-sonnet-4-20250514",
        max_tokens: int = 1024,
        temperature: float = 0.0,
    ):
        """
        Create a judge bound to one Claude model.

        Args:
            model_name: Anthropic model id sent with every request.
            max_tokens: Response token limit used by generate().
            temperature: Sampling temperature used by generate().

        Raises:
            RuntimeError: If ANTHROPIC_API_KEY is missing or empty.
        """
        super().__init__()
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            raise RuntimeError("ANTHROPIC_API_KEY is not set in environment")
        self.client = Anthropic(api_key=api_key)
        # Assigned after super().__init__() so the values given here are the
        # ones that end up on the instance.
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature

    @staticmethod
    def _user_messages(text: str) -> list[MessageParam]:
        """Build a single-turn user message list in the SDK's typed form."""
        content: list[TextBlockParam] = [{"type": "text", "text": text}]
        return [{"role": "user", "content": content}]

    @staticmethod
    def _is_quota_error(exc: Exception) -> bool:
        """
        Decide whether *exc* is the 400 / low-credit failure that
        has_available_quota() reports as "no quota".

        The status code is read from the exception when it carries one;
        otherwise the message text is inspected. "400" must appear as a
        standalone token so that ids or counts that merely contain those
        digits are not mistaken for an HTTP 400.
        """
        import re  # local import: only needed on this error path

        if getattr(exc, "status_code", None) == 400:
            return True
        msg = str(exc).lower()
        # Check for insufficient quota:
        # 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
        if "credit balance is too low" in msg:
            return True
        return re.search(r"\b400\b", msg) is not None

    def load_model(self):
        """Return this object itself as the loaded model."""
        return self

    def generate(self, prompt: str) -> str:
        """
        Send *prompt* to Claude as a single user message.

        Returns:
            The text of all text blocks in the response, concatenated in
            order; non-text blocks are ignored.
        """
        resp = self.client.messages.create(
            model=self.model_name,
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            messages=self._user_messages(prompt),
        )
        # anthropic returns a list of content blocks; collect only the text blocks.
        return "".join(
            block.text for block in resp.content if isinstance(block, TextBlock)
        )

    async def a_generate(self, prompt: str) -> str:
        """
        Async entry point; for now it simply runs the synchronous path,
        so the call blocks until the request completes.
        """
        return self.generate(prompt)

    def get_model_name(self) -> str:
        """Return the Anthropic model id this judge was configured with."""
        return self.model_name

    def has_available_quota(self) -> bool:
        """
        Try a very lightweight request to check if quota is available.

        Returns:
            True if the request succeeds, False if Anthropic responds with
            a quota-related (400 / low credit balance) error.

        Raises:
            Exception: Any other failure from the request is re-raised
            unchanged.
        """
        try:
            # Use a minimal "ping" request
            self.client.messages.create(
                model=self.model_name,
                max_tokens=1,  # cheapest possible
                temperature=0.0,
                messages=self._user_messages("ping"),
            )
        except Exception as e:
            if self._is_quota_error(e):
                logger.warning("ClaudeJudge quota check failed: %s", e)
                return False
            raise
        return True
Loading