diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py
index b44c6ec..408a794 100644
--- a/src/metacoder/coders/base_coder.py
+++ b/src/metacoder/coders/base_coder.py
@@ -173,11 +173,15 @@ def run_process(
         """
         if env is None:
             env = self.expand_env(self.env)
+
+        # Decode the child process output as UTF-8 (instead of default encoding)
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
+            encoding="utf-8",
+            errors="replace",  # avoid crashes on the occasional bad byte
             env=env,
             bufsize=1,
             universal_newlines=True,
@@ -189,6 +193,15 @@ def run_process(
         # check verbosity level
         quiet_mode = logger.getEffectiveLevel() <= logging.INFO

+        # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do).
+        for s in (sys.stdout, sys.stderr):
+            try:
+                s.reconfigure(encoding="utf-8", errors="replace")  # Python 3.7+
+            except Exception as e:
+                logger.info(f"{e}")
+                pass  # OK if not available (e.g., redirected or older Python)
+
+        # lines are already str decoded as UTF-8
         def stream_output(pipe, output_lines, stream):
             for line in iter(pipe.readline, ""):
                 if not quiet_mode:
@@ -352,7 +365,8 @@ def prepare_workdir(self):
         if self.config_objects is None:
             self.config_objects = self.default_config_objects()

-        logger.info(f"πŸ“ Preparing workdir: {self.workdir}")
+        logger.info(f"πŸ“ Preparing workdir (relative): {self.workdir}")
+        logger.info(f"   (resolved): {Path(self.workdir).resolve()}")
         with change_directory(self.workdir):
             # clear old config objects
             for path, _type in self.default_config_paths().items():
@@ -366,7 +380,10 @@ def prepare_workdir(self):
                     path.unlink()
             logger.debug(f"πŸ”§ Writing config objects: {self.config_objects}")
             for config_object in self.config_objects:
-                path = Path(config_object.relative_path)
+                rel = Path(config_object.relative_path)
+                if rel.is_absolute():
+                    raise ValueError(f"Config object path must be relative: {rel}")
+                path = rel
                 path.parent.mkdir(parents=True, exist_ok=True)
                 logger.info(
                     f"πŸ”§ Writing config object: {config_object.relative_path} type={config_object.file_type}"
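For context on the `encoding="utf-8", errors="replace"` arguments added above, here is a minimal sketch (not part of the patch) of how strict versus replacement decoding behaves on a byte stream that is not valid UTF-8; `subprocess.Popen` has accepted `encoding`/`errors` since Python 3.6.

```python
# Not part of the patch: strict vs. replacement decoding of bytes that are not valid UTF-8,
# which is what Popen(..., encoding="utf-8", errors="replace") opts into for child output.
raw = b"caf\xe9 \xf0\x9f\xa6\x86"  # Latin-1 'Γ©' followed by a valid UTF-8 duck emoji

try:
    raw.decode("utf-8")  # strict decoding raises on the stray 0xe9 byte
except UnicodeDecodeError as exc:
    print(f"strict decode fails: {exc.reason}")

print(raw.decode("utf-8", errors="replace"))  # the bad byte becomes U+FFFD, no crash
```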
diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
index ee31b74..1a43295 100644
--- a/src/metacoder/coders/claude.py
+++ b/src/metacoder/coders/claude.py
@@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
         ao.tool_uses = tool_uses

         end_time = time.time()
-        logger.info(f"πŸ€– Command took {end_time - start_time} seconds")
+        logger.info(f"πŸ€– Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:
diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py
index 8e9169e..3451ebe 100644
--- a/src/metacoder/coders/codex.py
+++ b/src/metacoder/coders/codex.py
@@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
                 if "result" in message:
                     ao.result_text = message["result"]
         end_time = time.time()
-        print(f"πŸ€– Command took {end_time - start_time} seconds")
+        print(f"πŸ€– Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:
diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py
index 20564a9..6af35c4 100644
--- a/src/metacoder/coders/gemini.py
+++ b/src/metacoder/coders/gemini.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
         )

         end_time = time.time()
-        logger.info(f"πŸ’Ž Command took {end_time - start_time} seconds")
+        logger.info(f"πŸ’Ž Command took {end_time - start_time:.2f} seconds")

         # Parse the output
         ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py
index 514dc2b..d6c4b35 100644
--- a/src/metacoder/coders/goose.py
+++ b/src/metacoder/coders/goose.py
@@ -1,4 +1,6 @@
 import json
+import os
+import platform
 from pathlib import Path
 import time
 import logging
@@ -19,6 +21,80 @@
 logger = logging.getLogger(__name__)


+def find_goose() -> Path:
+    loc = shutil.which("goose")
+    if not loc:
+        raise FileNotFoundError("goose not found on PATH")
+    return Path(loc).resolve()
+
+
+def get_home_env_var() -> str:
+    """
+    Determine the environment variable Goose should treat as "home"
+    for locating configuration files.
+
+    Windows:
+        Goose expects its configuration under:
+            %APPDATA%\\Block\\goose\\config\\
+        Therefore, we override APPDATA to point into the working directory.
+
+    Unix-like (Linux, macOS):
+        Goose follows the XDG Base Directory spec:
+        - If $XDG_CONFIG_HOME is set, config goes under:
+              $XDG_CONFIG_HOME/goose/config.yaml
+        - Otherwise it falls back to:
+              $HOME/.config/goose/config.yaml
+
+    We mirror this behavior by checking whether XDG_CONFIG_HOME is set
+    in the environment. If it is set, return "XDG_CONFIG_HOME";
+    otherwise, return "HOME".
+
+    Returns:
+        str: The environment variable name that should be overridden to
+        redirect Goose's config into the working directory.
+    """
+    if platform.system().lower().startswith("win"):
+        return "APPDATA"
+
+    if "XDG_CONFIG_HOME" in os.environ and os.environ["XDG_CONFIG_HOME"]:
+        return "XDG_CONFIG_HOME"
+    return "HOME"
+
+
+def get_goose_config_path() -> Path:
+    """
+    Get the relative config path (from the simulated home directory)
+    where Goose expects its configuration, based on the home
+    environment variable chosen by get_home_env_var().
+
+    Returns:
+        pathlib.Path: The relative config directory path.
+
+    Behavior:
+        - If get_home_env_var() == "APPDATA":
+              Path -> "Block/goose/config/"
+              (matches %APPDATA%\\Block\\goose\\config\\ on Windows)
+
+        - If get_home_env_var() == "HOME":
+              Path -> ".config/goose/"
+              (matches $HOME/.config/goose/ on Unix-like systems)
+
+        - If get_home_env_var() == "XDG_CONFIG_HOME":
+              Path -> "goose/"
+              (matches $XDG_CONFIG_HOME/goose/ on Unix-like systems)
+    """
+    home_env_var = get_home_env_var()
+
+    if home_env_var == "APPDATA":
+        return Path("Block/goose/config/")
+    elif home_env_var == "HOME":
+        return Path(".config/goose/")
+    elif home_env_var == "XDG_CONFIG_HOME":
+        return Path("goose/")
+    else:
+        raise RuntimeError(f"Unhandled home env var: {home_env_var}")
+
+
 class GooseCoder(BaseCoder):
     """
     Note that running goose involves simulating a home directory in
@@ -49,6 +125,11 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict:
             "type": "stdio" if mcp.type == MCPType.STDIO else mcp.type.value,
         }

+        is_stdio = mcp.type == MCPType.STDIO
+
+        if is_stdio and not mcp.command:
+            raise ValueError("STDIO MCP configuration requires 'command'.")
+
         if mcp.description:
             extension["description"] = mcp.description

@@ -129,10 +210,12 @@ def default_config_objects(self) -> list[CoderConfigObject]:

         config_content["extensions"] = extensions

+        cfg_rel = get_goose_config_path() / "config.yaml"
+
         return [
             CoderConfigObject(
                 file_type=FileType.YAML,
-                relative_path=".config/goose/config.yaml",
+                relative_path=str(cfg_rel),
                 content=config_content,
             )
         ]
@@ -145,18 +228,38 @@ def run(self, input_text: str) -> CoderOutput:
         env = self.expand_env(self.env)
         self.prepare_workdir()
         with change_directory(self.workdir):
-            # important - ensure that only local config files are used
-            # we assue chdir has been called beforehand
-            env["HOME"] = "."
+            goose_path = find_goose()
+            logger.debug(f"Using goose executable at: {goose_path}")
+
+            # Build environment with redirected config
+
+            # disable keyring (prevents errors on MacOS and Linux)
+            env["GOOSE_DISABLE_KEYRING"] = "1"
+
+            # Important:
+            # (1) ensure that only local config files are used;
+            # (2) assume chdir has been called beforehand.
+            cwd = os.getcwd()
+            local_home_path = Path(cwd)
+
+            # OS-specific config layout
+            home_env_var = get_home_env_var()
+            env[home_env_var] = str(local_home_path)
+
+            goose_config_dir = local_home_path / get_goose_config_path()
+            goose_cfg_path = goose_config_dir / "config.yaml"
+            logger.info(f"Goose home var: {home_env_var} -> {env[home_env_var]}")
+            logger.info(f"Goose config (expected at): {goose_cfg_path}")
+
             text = self.expand_prompt(input_text)
-            command = ["goose", "run", "-t", text]
+            command = [str(goose_path), "run", "-t", text]
             logger.info(f"πŸ¦† Running command: {' '.join(command)}")
             # time the command
             start_time = time.time()
             result = self.run_process(command, env)
             end_time = time.time()
             ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
-            logger.info(f"πŸ¦† Command took {end_time - start_time} seconds")
+            logger.info(f"πŸ¦† Command took {end_time - start_time:.2f} seconds")
             # look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
             session_file: Optional[Path] = None
             for line in result.stdout.split("\n"):
@@ -165,7 +268,7 @@
                     session_file = Path(session_file_str)
                     break
             if session_file and session_file.exists():
-                with open(session_file, "r") as f:
+                with open(session_file, "r", encoding="utf-8") as f:
                     ao.structured_messages = [
                         json.loads(line) for line in f if line.strip()
                     ]
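To make the redirection above concrete, here is an illustrative sketch (not part of the patch) of how the overridden home variable and the relative config directory combine. The mapping is hand-written from the docstrings above, and `/tmp/work` stands in for the coder's working directory.

```python
# Illustration only: mirrors get_home_env_var() / get_goose_config_path() for a
# hypothetical working directory; the mapping is copied from the docstrings above.
from pathlib import Path

CASES = {
    # (platform family, XDG_CONFIG_HOME set?) -> (env var overridden, relative config dir)
    ("Windows", False): ("APPDATA", Path("Block/goose/config")),
    ("Linux/macOS", False): ("HOME", Path(".config/goose")),
    ("Linux/macOS", True): ("XDG_CONFIG_HOME", Path("goose")),
}

workdir = Path("/tmp/work")  # hypothetical working directory
for (system, has_xdg), (env_var, rel_dir) in CASES.items():
    print(f"{system:12} xdg_set={has_xdg!s:5} {env_var:15} -> {workdir / rel_dir / 'config.yaml'}")
```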
diff --git a/src/metacoder/coders/qwen.py b/src/metacoder/coders/qwen.py
index 43aefb6..b6f4080 100644
--- a/src/metacoder/coders/qwen.py
+++ b/src/metacoder/coders/qwen.py
@@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
         )

         end_time = time.time()
-        print(f"πŸ€– Command took {end_time - start_time} seconds")
+        print(f"πŸ€– Command took {end_time - start_time:.2f} seconds")

         # Create output - Qwen CLI doesn't provide structured output
         ao = CoderOutput(
diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py
index d7dab3e..471c13d 100644
--- a/src/metacoder/evals/eval_model.py
+++ b/src/metacoder/evals/eval_model.py
@@ -21,6 +21,9 @@ class EvalCase(BaseModel):
     """

     name: str = Field(..., description="Unique identifier for the test case")
+    group: Optional[str] = Field(
+        default="Default", description="Test category for result grouping."
+    )
     metrics: List[str] = Field(
         ...,
         description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)",
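The new `group` field surfaces later as `case_group` on each `EvalResult`. As a sketch of what that enables, here is one possible per-group rollup; `summarize_by_group` is a hypothetical helper, not part of the patch, and it assumes only the `EvalResult` attributes visible in this diff.

```python
# Hypothetical helper (not in the patch): count passed/total per case_group.
from collections import defaultdict


def summarize_by_group(results) -> dict:
    """`results` is any iterable of EvalResult-like objects with .case_group and .passed."""
    groups: dict = defaultdict(lambda: {"passed": 0, "total": 0})
    for r in results:
        groups[r.case_group]["total"] += 1
        if r.passed:
            groups[r.case_group]["passed"] += 1
    return dict(groups)
```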
diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py
new file mode 100644
index 0000000..cc20e32
--- /dev/null
+++ b/src/metacoder/evals/judges.py
@@ -0,0 +1,89 @@
+# metacoder/evals/judges.py
+import logging
+import os
+
+from anthropic import Anthropic
+from anthropic.types import MessageParam, TextBlockParam, TextBlock
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+
+logger = logging.getLogger(__name__)
+
+
+class ClaudeJudge(DeepEvalBaseLLM):
+    """
+    Wraps Anthropic's Claude models so they can be used as
+    the `model` parameter to DeepEval metrics like GEval.
+    """
+
+    # Note: Anthropic models can be listed via:
+    #   curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01"
+    #   {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]}
+
+    def __init__(
+        self,
+        model_name: str = "claude-sonnet-4-20250514",
+        max_tokens: int = 1024,
+        temperature: float = 0.0,
+    ):
+        super().__init__()
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise Exception("ANTHROPIC_API_KEY is not set in environment")
+        self.client = Anthropic(api_key=api_key)
+        self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+
+    def load_model(self):
+        return self
+
+    def generate(self, prompt: str) -> str:
+        # Build typed content blocks and messages to satisfy the SDK's type hints
+        content: list[TextBlockParam] = [{"type": "text", "text": prompt}]
+        messages: list[MessageParam] = [{"role": "user", "content": content}]
+        resp = self.client.messages.create(
+            model=self.model_name,
+            max_tokens=self.max_tokens,
+            temperature=self.temperature,
+            messages=messages,
+        )
+        # anthropic returns a list of content blocks; collect only the text blocks.
+        parts: list[str] = []
+        for block in resp.content:
+            if isinstance(block, TextBlock):
+                parts.append(block.text)
+        return "".join(parts)
+
+    async def a_generate(self, prompt: str) -> str:
+        # for now just call the sync path
+        return self.generate(prompt)
+
+    def get_model_name(self) -> str:
+        return self.model_name
+
+    def has_available_quota(self) -> bool:
+        """
+        Try a very lightweight request to check if quota is available.
+        Returns True if quota exists, False if Anthropic responds with
+        quota-related errors.
+        """
+        try:
+            # Use a minimal "ping" request
+            content: list[TextBlockParam] = [{"type": "text", "text": "ping"}]
+            messages: list[MessageParam] = [{"role": "user", "content": content}]
+            self.client.messages.create(
+                model=self.model_name,
+                max_tokens=1,  # cheapest possible
+                temperature=0.0,
+                messages=messages,
+            )
+            return True
+        except Exception as e:
+            msg = str(e).lower()
+            # Check for insufficient quota:
+            #   400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
+            if "credit balance is too low" in msg or "400" in msg:
+                logger.warning(f"ClaudeJudge quota check failed: {e}")
+                return False
+            raise
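A usage sketch for the judge above, mirroring how `make_geval()` in `runner.py` accepts it. This assumes the module paths introduced in this PR; `ANTHROPIC_API_KEY` must be set in the environment or the `ClaudeJudge` constructor raises.

```python
# Usage sketch, not part of the patch.
from metacoder.evals.judges import ClaudeJudge
from metacoder.evals.runner import make_geval

judge = ClaudeJudge(model_name="claude-sonnet-4-20250514", max_tokens=512)
if judge.has_available_quota():
    correctness = make_geval(model=judge)  # GEval scored by Claude instead of OpenAI
else:
    correctness = make_geval()  # fall back to DeepEval's default (OpenAI-backed) judge
```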
diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py
index 67a9619..a80060a 100644
--- a/src/metacoder/evals/runner.py
+++ b/src/metacoder/evals/runner.py
@@ -5,27 +5,32 @@
 """

 import copy
+import functools
 import importlib
 import logging
+import os
 import time
+import traceback
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Optional, Type, cast

 from pydantic import BaseModel
 import yaml
+
 from deepeval import evaluate
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase
-from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics import BaseMetric, GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from openai import APIStatusError
+from openai.types.chat import ChatCompletionMessageParam

 from metacoder.coders.base_coder import BaseCoder, CoderOutput
 from metacoder.registry import AVAILABLE_CODERS
 from metacoder.evals.eval_model import EvalCase, EvalDataset
 from metacoder.configuration import AIModelConfig, CoderConfig

-
 logger = logging.getLogger(__name__)
@@ -59,24 +64,34 @@ def is_successful(self) -> bool:
         return self.success


-def get_default_metrics() -> Dict[str, BaseMetric]:
-    """Get default metrics. Creates instances lazily to avoid network calls during import."""
+def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
+    """Creates a GEval instance with the specified model."""
+    return GEval(
+        name="Correctness",
+        criteria="Determine whether the actual output is factually correct based on the expected output.",
+        # NOTE: you can only provide either criteria or evaluation_steps, and not both
+        evaluation_steps=[
+            "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
+            "You should also heavily penalize omission of detail",
+            "Vague language, or contradicting OPINIONS, are OK",
+        ],
+        threshold=0.8,
+        evaluation_params=[
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+            LLMTestCaseParams.EXPECTED_OUTPUT,
+        ],
+        model=model,  # may be None (defaults to OpenAI) or a Claude judge
+    )
+
+
+def get_default_metrics(
+    model: Optional[DeepEvalBaseLLM] = None,
+) -> Dict[str, BaseMetric]:
+    """Get default metrics with the specified model. Creates instances lazily to avoid network calls during import."""
     return {
-        "CorrectnessMetric": GEval(
-            name="Correctness",
-            criteria="Determine whether the actual output is factually correct based on the expected output.",
-            # NOTE: you can only provide either criteria or evaluation_steps, and not both
-            evaluation_steps=[
-                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
-                "You should also heavily penalize omission of detail",
-                "Vague language, or contradicting OPINIONS, are OK",
-            ],
-            threshold=0.8,
-            evaluation_params=[
-                LLMTestCaseParams.INPUT,
-                LLMTestCaseParams.ACTUAL_OUTPUT,
-                LLMTestCaseParams.EXPECTED_OUTPUT,
-            ],
+        "CorrectnessMetric": make_geval(
+            model=model  # Note: GEval defaults to OpenAI if no model is specified.
         ),
         "DummyMetric": DummyMetric(threshold=0.5),
     }
@@ -106,6 +121,7 @@ class EvalResult(BaseModel):
     model: str
     coder: str
     case_name: str
+    case_group: str
     metric_name: str
     score: float
     passed: bool
@@ -123,6 +139,8 @@ class EvalRunner:

     def __init__(self, verbose: bool = False):
         self.verbose = verbose
+        self.use_openai = True  # GEval will default to OpenAI; avoid it and downgrade to another provider or metric if quota runs out.
+
         if verbose:
             logging.basicConfig(level=logging.DEBUG)
         else:
@@ -183,6 +201,48 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase:
             additional_metadata=case.additional_metadata,
         )

+    @functools.lru_cache(maxsize=1)
+    def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool:
+        """
+        Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate.
+        Fast probe of the /chat/completions endpoint (the one GEval uses).
+        Returns False on 429 (insufficient_quota) or any exception.
+        """
+        if not os.getenv("OPENAI_API_KEY"):
+            logger.info("OPENAI_API_KEY is not set.")
+            return False
+        try:
+            from openai import OpenAI
+
+            # turn off SDK retries for the check so it returns fast
+            client = OpenAI(max_retries=0, timeout=8)  # NO retries, quick fail
+            # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}])
+            raw = [{"role": "user", "content": "ping"}]
+            messages = cast(List[ChatCompletionMessageParam], raw)
+            client.chat.completions.create(
+                model=model,
+                messages=messages,
+                max_tokens=1,
+                temperature=0,
+            )
+            return True
+        except APIStatusError as e:
+            # 429 insufficient quota or too many requests
+            if e.status_code == 429:
+                logger.warning(f"OpenAI API Key has insufficient quota: {e}")
+                return False
+            # 401 authentication problem, including invalid API key
+            if e.status_code == 401:
+                logger.warning(f"OpenAI API Authentication Error: {e}")
+                return False
+            # all other errors
+            logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}")
+            return False
+        except Exception as e:
+            # includes network issues, etc.
+            logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}")
+            return False
+
     def run_single_eval(
         self,
         model_name: str,
@@ -198,7 +258,7 @@ def run_single_eval(
         # Create coder instance
         coder = create_coder(
             coder_name,
-            workdir=str(workdir / f"{model_name}_{coder_name}_{case.name}"),
+            workdir=str(workdir),
            config=coder_config,
         )

@@ -235,8 +295,65 @@
             test_case = self.create_test_case(case, actual_output)

             # Evaluate
-            logger.info(f"Evaluating with {metric_name}")
-            eval_results = evaluate([test_case], [metric])
+            logger.info(
+                f"Evaluating {metric_name} using model {metric.model.model_name}"
+            )
+
+            if isinstance(metric, GEval):
+                # Assume GEval will use OpenAI until it is disabled.
+                if self.use_openai and not self._openai_quota_ok():
+                    logger.warning(
+                        "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                    )
+                    self.use_openai = False
+
+                # Note: This will downgrade a metric, if needed, each time it is about to be used, without modifying the default metrics.
+                if not self.use_openai:
+                    claude_model = "claude-sonnet-4-20250514"
+                    logger.warning(
+                        f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}."
+                    )
+
+                    try:
+                        # Downgrade metric model to Claude judge.
+                        from metacoder.evals.judges import ClaudeJudge
+
+                        judge = ClaudeJudge(claude_model)
+
+                        if not judge.has_available_quota():
+                            raise Exception(
+                                "No Anthropic credits available for ClaudeJudge."
+                            )
+
+                        metric = make_geval(model=judge)
+                        logger.info(
+                            f"Successfully downgraded {metric_name} model to {metric.model.model_name}."
+                        )
+                    except Exception as e:
+                        # Fallback: if we can't use Claude, downgrade gracefully.
+                        logging.debug(traceback.format_exc())
+                        logger.debug(e)
+                        logger.warning(
+                            f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric."
+                        )
+                        metric = DummyMetric(threshold=0.5)
+                        logger.warning(f"Downgraded {metric_name} to {metric.name}.")
+
+            eval_results = evaluate(
+                [test_case],
+                [metric],
+                async_config=AsyncConfig(run_async=False),  # disable async
+                display_config=DisplayConfig(
+                    show_indicator=False,  # hide the progress meter
+                    print_results=False,
+                    verbose_mode=self.verbose,
+                ),
+                cache_config=CacheConfig(use_cache=False, write_cache=False),
+                error_config=ErrorConfig(
+                    ignore_errors=False,  # actually fail on failure
+                    skip_on_missing_params=True,
+                ),
+            )

             # Extract results - the structure varies by deepeval version
             test_result = eval_results.test_results[0]
@@ -270,6 +387,7 @@
                 model=model_name,
                 coder=coder_name,
                 case_name=case.name,
+                case_group=case.group,
                 metric_name=metric_name,
                 score=score if score is not None else 0.0,
                 passed=passed,
@@ -366,7 +484,7 @@
                     else " (no servers)"
                 )
                 logger.info(
-                    f"Progress: {current}/{total_combinations} - {coder_name}/{model_name}/{case.name}{server_desc}"
+                    f"Progress: {current}/{total_combinations} ({coder_name} | {model_name} | {case.name}{server_desc})"
                 )

                 # Create unique workdir for this combination
@@ -408,7 +526,7 @@
             results_data.append(result.model_dump())

         # Save as YAML
-        with open(output_path, "w") as f:
+        with open(output_path, "w", encoding="utf-8") as f:
             yaml.dump(
                 {"results": results_data, "summary": self.generate_summary(results)},
                 f,
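The fallback logic added to `run_single_eval` above amounts to a three-step preference order. Below is a condensed, illustrative sketch only; the real code mutates `metric` in place, and `DummyMetric` is assumed to be importable from `metacoder.evals.runner`, where it is referenced in this diff.

```python
# Illustrative only: the preference order implemented in run_single_eval above.
from metacoder.evals.judges import ClaudeJudge
from metacoder.evals.runner import DummyMetric, make_geval  # DummyMetric assumed importable


def pick_correctness_metric(runner):
    if runner.use_openai and runner._openai_quota_ok():
        return make_geval()  # default: GEval judged by OpenAI
    runner.use_openai = False
    try:
        judge = ClaudeJudge("claude-sonnet-4-20250514")
        if not judge.has_available_quota():
            raise RuntimeError("no Anthropic credits available")
        return make_geval(model=judge)  # GEval judged by Claude
    except Exception:
        return DummyMetric(threshold=0.5)  # last resort: trivial metric
```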
diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py
index f62d3df..5e1d616 100644
--- a/src/metacoder/metacoder.py
+++ b/src/metacoder/metacoder.py
@@ -1,4 +1,5 @@
 import logging
+import sys
 from pathlib import Path
 from typing import Optional, Union

@@ -543,6 +544,17 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose:
     output_path = Path(output)
     workdir_path = Path(workdir)

+    try:
+        # Create the output file only if it doesn't exist; fail if it does
+        with output_path.open("x", encoding="utf-8") as _:
+            pass
+    except FileExistsError:
+        print(
+            f"Error: '{output_path}' already exists. Please delete it or specify a different filename.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
     # Convert coders tuple to list (empty tuple if not specified)
     coders_list = list(coders) if coders else None

@@ -592,37 +604,43 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose:
     # Print summary
     summary = runner.generate_summary(results)
-    click.echo("\nπŸ“ˆ Summary:")
-    click.echo(f"  Total: {summary['total_evaluations']}")
-    click.echo(
-        f"  Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)"
+    frac_passed = (
+        summary["passed"] / summary["total_evaluations"]
+        if summary["total_evaluations"]
+        else 0
     )
-    click.echo(
-        f"  Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)"
+    frac_failed = (
+        summary["failed"] / summary["total_evaluations"]
+        if summary["total_evaluations"]
+        else 0
     )
-    if summary["errors"] > 0:
-        click.echo(f"  Errors: {summary['errors']} ⚠️")
+
+    click.echo("\nπŸ“ˆ Summary:")
+    click.echo(f"  Total: {summary['total_evaluations']}")
+    click.echo(f"  Passed: {summary['passed']} ({frac_passed:.1%})")
+    click.echo(f"  Failed: {summary['failed']} ({frac_failed:.1%})")
+    if summary["errors"]:
+        click.echo(f"  Errors: {summary['errors']} ⚠️")

     # Print by-coder summary
     if len(summary["by_coder"]) > 1:
         click.echo("\n  By Coder:")
         for coder, stats in summary["by_coder"].items():
-            pass_rate = (
-                stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0
+            coder_frac_passed = (
+                stats["passed"] / stats["total"] if stats["total"] else 0
             )
             click.echo(
-                f"    {coder}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)"
+                f"    {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})"
             )

     # Print by-model summary
     if len(summary["by_model"]) > 1:
         click.echo("\n  By Model:")
         for model, stats in summary["by_model"].items():
-            pass_rate = (
-                stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0
+            model_frac_passed = (
+                stats["passed"] / stats["total"] if stats["total"] else 0
             )
             click.echo(
-                f"    {model}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)"
+                f"    {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})"
             )

     click.echo("\nβœ… Evaluation complete!")
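For reference, the exclusive-create (`"x"`) mode used above fails atomically if the path already exists, which is what lets the command refuse to overwrite earlier results. A minimal standalone sketch, using a hypothetical `results.yaml` path:

```python
# Minimal sketch of open(..., "x"); results.yaml is a hypothetical output path.
from pathlib import Path

out = Path("results.yaml")
try:
    with out.open("x", encoding="utf-8"):
        pass  # file created empty; the eval run can now write to it safely
except FileExistsError:
    print(f"'{out}' already exists; delete it or pass a different --output path")
```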
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..95f4c37
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,10 @@
+import logging
+import sys
+
+
+def pytest_configure(config):
+    logging.basicConfig(
+        level=logging.WARNING,
+        format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        stream=sys.stdout,
+    )
diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml
new file mode 100644
index 0000000..7d305ce
--- /dev/null
+++ b/tests/input/goose_eval_claude_downgrade_test.yaml
@@ -0,0 +1,30 @@
+name: pubmed tools evals
+description: |
+  Evaluations for multiple pubmed MCPs
+
+coders:
+  goose: {}
+
+models:
+  claude-sonnet:
+    provider: anthropic
+    name: claude-sonnet-4-20250514
+
+servers:
+  mcp-simple-pubmed:
+    name: pubmed
+    command: uvx
+    args: [mcp-simple-pubmed]
+    env:
+      PUBMED_EMAIL: cjmungall@lbl.gov
+
+server_combinations:
+  - [mcp-simple-pubmed]
+
+cases:
+- name: PMID_28027860_Full_Text
+  metrics: [CorrectnessMetric]
+  input: "What is the first sentence of section 2 in PMID: 28027860?"
+  expected_output: |
+    Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial.
+  threshold: 0.9
diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml
index 1037215..73b0615 100644
--- a/tests/input/goose_eval_test.yaml
+++ b/tests/input/goose_eval_test.yaml
@@ -7,33 +7,49 @@ coders:
   goose: {}

 models:
-  gpt-4o:
+  claude-sonnet:
     provider: anthropic
     name: claude-sonnet-4-20250514

+# Refer to metacoder/src/mcps/registry/scilit.yaml for the list of available MCPs.
 servers:
-  mcp-simple-pubmed:
+  artl:
+    name: artl
+    command: uvx
+    args: [artl-mcp]
+  simple-pubmed:
     name: pubmed
     command: uvx
     args: [mcp-simple-pubmed]
     env:
       PUBMED_EMAIL: cjmungall@lbl.gov
-  ols-mcp:
+  ols:
     name: ols
     command: uvx
     args: [mcp-ols]

 server_combinations:
-  - [mcp-simple-pubmed, ols-mcp]
+#  - [artl, simple-pubmed, ols]
+  - [artl]
+  - [simple-pubmed]
+#  - [ols]

 cases:
-  - name: "disease"
-    metrics: [CorrectnessMetric]
-    input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? Give me disease names and MONDO IDs"
-    expected_output: |
-      MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15)
-      MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29)
-      MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome)
+- name: PMID_28027860_Full_Text
+#  group: Text extraction  # should default to "Default"
+  metrics:
+    - CorrectnessMetric
+  input: "What is the first sentence of section 2 in PMID: 28027860?"
+  expected_output: "Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial."
+  threshold: 0.9
-    threshold: 0.7
-
+
+# Per convo with Charles, Justin, Mark: this test case is kind of tricky and it seems
+# like an extremely difficult case that even a good LLM + MCP might not pass. We've
+# made some edits to give the LLM + MCP a fair chance
+- name: PMC8086273_Retraction
+  group: Summarization
+  metrics:
+    - CorrectnessMetric
+  input: "Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses?"
+  expected_output: "The paper says No but it is retracted so the results should not be trusted."
+  threshold: 0.9
diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml
new file mode 100644
index 0000000..a027f80
--- /dev/null
+++ b/tests/input/goose_no_server_test.yaml
@@ -0,0 +1,30 @@
+name: pubmed tools evals
+description: |
+  Evaluations for multiple pubmed MCPs
+
+coders:
+  goose: {}
+
+models:
+  claude-sonnet:
+    provider: anthropic
+    name: claude-sonnet-4-20250514
+
+servers:
+  mcp-simple-pubmed:
+    name: pubmed
+    command: uvx
+    args: [mcp-simple-pubmed]
+    env:
+      PUBMED_EMAIL: cjmungall@lbl.gov
+
+#server_combinations:
+#  - [mcp-simple-pubmed]
+
+cases:
+- name: PMID_28027860_Full_Text
+  metrics: [CorrectnessMetric]
+  input: "What is the first sentence of section 2 in PMID: 28027860?"
+  expected_output: |
+    Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial.
+  threshold: 0.9
diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml
new file mode 100644
index 0000000..d0fea1b
--- /dev/null
+++ b/tests/input/literature_mcp_encoding_test.yaml
@@ -0,0 +1,29 @@
+name: pubmed tools evals
+description: |
+  Evaluations for multiple pubmed MCPs
+
+
+coders:
+  goose: {}
+
+models:
+  claude-sonnet:
+    provider: anthropic
+    name: claude-sonnet-4-20250514
+
+servers:
+  ols:
+    name: ols
+    command: uvx
+    args: [mcp-ols]
+
+server_combinations:
+  - [simple-pubmed]
+
+cases:
+- name: character_encoding_test
+  metrics:
+    - CorrectnessMetric
+  input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses?
+  expected_output: 'The paper says No but it is retracted so the results should not be trusted.'
+  threshold: 0.9
diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py
index a9498b6..5d9daf1 100644
--- a/tests/test_coders/test_coders_basic.py
+++ b/tests/test_coders/test_coders_basic.py
@@ -3,6 +3,7 @@
 These tests check that each coder can handle a simple arithmetic question.
 """

+import json
 import tempfile

 import pytest
@@ -164,3 +165,16 @@ def test_dummy_coder_always_works():
     assert result is not None
     assert result.result_text == "you said: Hello, world!"
     assert result.stdout == "you said: Hello, world!"
+
+
+@pytest.mark.integration
+def test_goose_utf8_session_file(tmp_path):
+    """Test session files with UTF-8 content are read correctly."""
+    session_content = '{"role": "assistant", "content": "ζ΅‹θ―• rΓ©sumΓ© πŸš€"}\n'
+    session_file = tmp_path / "test_session.jsonl"
+    session_file.write_text(session_content, encoding="utf-8")
+
+    with open(session_file, "r", encoding="utf-8") as f:
+        messages = [json.loads(line) for line in f if line.strip()]
+    assert len(messages) == 1
+    assert "ζ΅‹θ―•" in messages[0]["content"]
diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py
new file mode 100644
index 0000000..f33f2a5
--- /dev/null
+++ b/tests/test_evals/test_claude_judge.py
@@ -0,0 +1,139 @@
+import logging
+import traceback
+from pathlib import Path
+
+from metacoder.evals.runner import EvalRunner
+
+logger = logging.getLogger(__name__)
+
+
+def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch):
+    """Test that ClaudeJudge is used when OpenAI is disabled."""
+    # TODO: This test should avoid running the coder and only perform the eval step.
+    # Otherwise, it is impossible to get to the eval step if no valid API key is present or no quota is available (testing the wrong part of the process).
+
+    runner = EvalRunner()
+
+    try:
+        dataset = runner.load_dataset(
+            Path("tests/input/goose_eval_claude_downgrade_test.yaml")
+        )
+
+        # Unfortunately, there is nothing available in the eval results that indicates which model DeepEval used.
+        # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time.
+        # Instead, resort to capturing the WARNING logs for assertions related to the downgrade.
+        with caplog.at_level(logging.WARNING):
+            # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing")
+
+            results = runner.run_all_evals(
+                dataset, workdir=tmp_path, coders=["goose", "dummy"]
+            )
+
+            # Test that the quota exhaustion fallback logic worked as expected.
+            assert (
+                "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                in caplog.text
+            )
+
+            # Test that the new evaluation judge was correctly selected for the metric model downgrade.
+            assert (
+                "Downgrading CorrectnessMetric model from gpt-4.1 to claude-"
+                in caplog.text
+            )
+
+            # Test that the eval completed by checking for a non-zero score.
+            assert results[0].score > 0, (
+                f"Expected a {results[0].metric_name} score for {results[0].case_name}."
+            )
+
+    except Exception as e:
+        # Test that fallback logic does not result in an Exception.
+        logger.error(f"An error occurred: {e}")
+        logging.error(traceback.format_exc())
+        assert False  # This assertion will fail if an Exception is caught here.
+    finally:
+        pass
+
+
+def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch):
+    """Test that the CorrectnessMetric is successfully downgraded to DummyMetric if no model is available."""
+
+    runner = EvalRunner()
+
+    try:
+        dataset = runner.load_dataset(
+            Path("tests/input/goose_eval_claude_downgrade_test.yaml")
+        )
+
+        # Unfortunately, there is nothing available in the eval results that indicates which model DeepEval used.
+        # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time.
+        # Instead, resort to capturing the WARNING logs for assertions related to the downgrade.
+        with caplog.at_level(logging.WARNING):
+            # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing")
+
+            # Delete the Anthropic API key from the environment to force ClaudeJudge instantiation to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+
+            # One more OpenAI API test case also needs to be handled (401 errors):
+            # Temporarily unset OPENAI_API_KEY in order to force OpenAI authentication to fail.
+            # monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+
+            # One more Anthropic API test case also needs to be handled (401 errors):
+            # Temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail.
+            # monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-api-key-for-testing")
+
+            # TODO: Also need to test this for Anthropic:
+            #   Provider request failed with status: 400 Bad Request. Payload: Some(Object
+            #   {"error": Object {"message": String("Your credit balance is too low to access the
+            #   Anthropic API. Please go to Plans & Billing to upgrade or purchase credits."),
+            #   "type": String("invalid_request_error")}, "request_id": String("req_011CSeQZTjJvmcxzrhXuPES4"),
+            #   "type": String("error")}). Returning error: RequestFailed(
+            #   "Request failed with status: 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits."
+
+            results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["dummy"])
+
+            # Test that the quota exhaustion fallback logic worked as expected.
+            assert (
+                "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                in caplog.text
+            )
+
+            # Test that the new evaluation judge was correctly selected for the metric model downgrade.
+            assert (
+                "Downgrading CorrectnessMetric model from gpt-4.1 to claude-"
+                in caplog.text
+            )
+
+            # Test that the ClaudeJudge was unable to be used as the model for the CorrectnessMetric.
+            assert (
+                "Claude unavailable (ANTHROPIC_API_KEY is not set in environment); downgrading CorrectnessMetric to DummyMetric."
+                in caplog.text
+            )
+
+            # Test that the CorrectnessMetric was successfully downgraded to DummyMetric.
+            assert "Downgraded CorrectnessMetric to DummyMetric." in caplog.text
+
+            # Test that the eval completed by checking for a non-zero score.
+            assert results[0].score > 0, (
+                f"Expected a {results[0].metric_name} score for {results[0].case_name}."
+            )
+
+    except Exception as e:
+        # Test that fallback logic does not result in an Exception.
+        logger.error(f"An error occurred: {e}")
+        logging.error(traceback.format_exc())
+        assert False  # This assertion will fail if an Exception is caught here.
+    finally:
+        pass
diff --git a/tests/test_evals/test_runner.py b/tests/test_evals/test_runner.py
index d1f0c3e..838ab60 100644
--- a/tests/test_evals/test_runner.py
+++ b/tests/test_evals/test_runner.py
@@ -174,6 +174,7 @@ def test_generate_summary(self):
                 model="model1",
                 coder="coder1",
                 case_name="case1",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.9,
                 passed=True,
@@ -182,6 +183,7 @@ def test_generate_summary(self):
                 model="model1",
                 coder="coder1",
                 case_name="case2",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.3,
                 passed=False,
@@ -190,6 +192,7 @@ def test_generate_summary(self):
                 model="model2",
                 coder="coder1",
                 case_name="case1",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.8,
                 passed=True,
@@ -225,6 +228,7 @@ def test_save_and_load_results(self, tmp_path):
                 model="model1",
                 coder="coder1",
                 case_name="case1",
+                case_group="Default",
                 metric_name="metric1",
                 score=0.9,
                 passed=True,
diff --git a/tests/test_goose_paths.py b/tests/test_goose_paths.py
new file mode 100644
index 0000000..5a5fb53
--- /dev/null
+++ b/tests/test_goose_paths.py
@@ -0,0 +1,105 @@
+from pathlib import Path
+import pytest
+
+from metacoder.coders.goose import get_home_env_var, get_goose_config_path
+
+
+def _norm(p: Path | str) -> str:
+    """Normalize path separators & strip trailing slashes for stable compares."""
+    s = str(p).replace("\\", "/")
+    return s[:-1] if s.endswith("/") else s
+
+
+@pytest.mark.parametrize(
+    "platform_name, xdg_value, expected_env",
+    [
+        ("Windows", None, "APPDATA"),
+        ("Linux", None, "HOME"),
+        ("Darwin", None, "HOME"),
+        ("Linux", "/custom/xdg", "XDG_CONFIG_HOME"),
+        ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME"),
+    ],
+)
+def test_env_var_selection(monkeypatch, platform_name, xdg_value, expected_env):
+    # Simulate platform
+    import platform as _platform
+
+    monkeypatch.setattr(_platform, "system", lambda: platform_name)
+
+    # Simulate XDG presence/absence
+    if xdg_value is not None:
+        monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value)
+    else:
+        monkeypatch.delenv("XDG_CONFIG_HOME", raising=False)
+
+    actual = get_home_env_var()
+    assert actual == expected_env
+
+
+@pytest.mark.parametrize(
+    "platform_name, xdg_value, expected_env, expected_rel_dir",
+    [
+        ("Windows", None, "APPDATA", "Block/goose/config"),
+        ("Linux", None, "HOME", ".config/goose"),
+        ("Darwin", None, "HOME", ".config/goose"),
+        ("Linux", "/custom/xdg", "XDG_CONFIG_HOME", "goose"),
+        ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME", "goose"),
+    ],
+)
+def test_config_path_matches_env(
+    monkeypatch, platform_name, xdg_value, expected_env, expected_rel_dir
+):
+    import platform as _platform
+
+    monkeypatch.setattr(_platform, "system", lambda: platform_name)
+
+    if xdg_value is not None:
+        monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value)
+    else:
+        monkeypatch.delenv("XDG_CONFIG_HOME", raising=False)
+
+    env_var = get_home_env_var()
+    rel_path = get_goose_config_path()
+
+    assert env_var == expected_env
+    assert _norm(rel_path) == expected_rel_dir
+
+
+@pytest.mark.parametrize(
+    "platform_name, xdg_value, workdir, expected_effective_dir",
+    [
+        ("Windows", None, "C:/tmp/work", "C:/tmp/work/Block/goose/config/config.yaml"),
+        ("Linux", None, "/tmp/work", "/tmp/work/.config/goose/config.yaml"),
+        (
+            "Darwin",
+            None,
+            "/Users/alice/work",
+            "/Users/alice/work/.config/goose/config.yaml",
+        ),
+        ("Linux", "/custom/xdg", "/tmp/work", "/tmp/work/goose/config.yaml"),
+        (
+            "Darwin",
+            "/Users/alice/.conf",
+            "/Users/alice/work",
+            "/Users/alice/work/goose/config.yaml",
+        ),
+    ],
+)
+def test_effective_config_location(
+    monkeypatch, platform_name, xdg_value, workdir, expected_effective_dir
+):
+    import platform as _platform
+
+    monkeypatch.setattr(_platform, "system", lambda: platform_name)
+
+    if xdg_value is not None:
+        monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value)
+    else:
+        monkeypatch.delenv("XDG_CONFIG_HOME", raising=False)
+
+    local_home_path = Path(workdir)
+
+    goose_config_dir = local_home_path / get_goose_config_path()
+    goose_cfg_file = goose_config_dir / "config.yaml"
+
+    assert _norm(goose_cfg_file) == _norm(expected_effective_dir)