Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/agentready/assessors/documentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,8 +453,7 @@ def _create_remediation(self) -> Remediation:
],
tools=[],
commands=[],
examples=[
"""# Project Name
examples=["""# Project Name

## Overview
What this project does and why it exists.
Expand All @@ -477,8 +476,7 @@ def _create_remediation(self) -> Remediation:
# Format code
black .
```
"""
],
"""],
citations=[
Citation(
source="GitHub",
Expand Down
6 changes: 2 additions & 4 deletions src/agentready/assessors/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,7 @@ def _create_remediation(self) -> Remediation:
"pre-commit install",
"pre-commit run --all-files",
],
examples=[
"""# .pre-commit-config.yaml
examples=["""# .pre-commit-config.yaml
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
Expand All @@ -306,8 +305,7 @@ def _create_remediation(self) -> Remediation:
rev: 5.12.0
hooks:
- id: isort
"""
],
"""],
citations=[
Citation(
source="pre-commit.com",
Expand Down
51 changes: 40 additions & 11 deletions src/agentready/cli/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import click

from ..services.eval_harness.harbor_config import HarborConfig
from ..services.eval_harness.harbor_config import ALLOWED_MODELS, HarborConfig
from ..services.eval_harness.tbench_runner import _real_tbench_result
from ..services.harbor.agent_toggler import AssessorStateToggler
from ..services.harbor.comparer import compare_assessor_impact
Expand All @@ -27,10 +27,16 @@
default=None,
help="Benchmark subset (tbench: smoketest/full)",
)
@click.option(
"--agent",
type=click.Choice(["claude-code", "cursor-cli"]),
default="claude-code",
help="Agent for evaluation",
)
@click.option(
"--model",
type=click.Choice(["claude-haiku-4-5", "claude-sonnet-4-5"]),
default="claude-haiku-4-5",
type=click.Choice(list(ALLOWED_MODELS)),
default="anthropic/claude-haiku-4-5",
help="Model for evaluation",
)
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
Expand All @@ -53,7 +59,15 @@
help="Skip dependency checks (for advanced users)",
)
def benchmark(
repository, harness, subset, model, verbose, timeout, output_dir, skip_preflight
repository,
harness,
subset,
agent,
model,
verbose,
timeout,
output_dir,
skip_preflight,
):
"""Run agent coding benchmarks.

Expand Down Expand Up @@ -81,14 +95,23 @@ def benchmark(
# Route to appropriate harness
if harness == "tbench":
_run_tbench(
repo_path, subset, model, verbose, timeout, output_dir, skip_preflight
repo_path,
subset,
agent,
model,
verbose,
timeout,
output_dir,
skip_preflight,
)
else:
click.echo(f"Unknown harness: {harness}", err=True)
raise click.Abort()


def _run_tbench(repo_path, subset, model, verbose, timeout, output_dir, skip_preflight):
def _run_tbench(
repo_path, subset, agent, model, verbose, timeout, output_dir, skip_preflight
):
"""Run Terminal-Bench evaluation."""
# Default subset to 'full' if not specified
if subset is None:
Expand All @@ -107,6 +130,7 @@ def _run_tbench(repo_path, subset, model, verbose, timeout, output_dir, skip_pre
click.echo("AgentReady Terminal-Bench Benchmark")
click.echo(f"{'=' * 50}\n")
click.echo(f"Repository: {repo_path}")
click.echo(f"Agent: {agent}")
click.echo(f"Model: {model}")
click.echo(f"Subset: {subset} ({'1-2 tasks' if smoketest else '89 tasks'})")
click.echo(f"Timeout: {timeout}s\n")
Expand Down Expand Up @@ -135,19 +159,24 @@ def _run_tbench(repo_path, subset, model, verbose, timeout, output_dir, skip_pre
raise click.Abort()

# Validate API key BEFORE creating HarborConfig
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
if agent == "claude-code":
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
elif agent == "cursor-cli":
api_key = os.environ.get("CURSOR_API_KEY", "")

if not api_key:
key_name = "ANTHROPIC_API_KEY" if agent == "claude-code" else "CURSOR_API_KEY"
click.echo(
"Error: ANTHROPIC_API_KEY environment variable not set.\n"
"Set it with: export ANTHROPIC_API_KEY=your-key-here",
f"Error: {key_name} environment variable not set.\n"
f"Set it with: export {key_name}=your-key-here",
err=True,
)
raise click.Abort()

# Create HarborConfig (will not raise ValueError now)
harbor_config = HarborConfig(
model=f"anthropic/{model}",
agent="claude-code",
model=model,
agent=agent,
jobs_dir=Path(tempfile.mkdtemp()),
api_key=api_key,
timeout=timeout,
Expand Down
18 changes: 6 additions & 12 deletions src/agentready/services/assessment_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def _initialize_db(self) -> None:
"""Initialize database schema."""
try:
with sqlite3.connect(self.db_path) as conn:
conn.execute(
"""
conn.execute("""
CREATE TABLE IF NOT EXISTS assessments (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_url TEXT NOT NULL,
Expand All @@ -45,23 +44,18 @@ def _initialize_db(self) -> None:
expires_at TIMESTAMP,
UNIQUE(repository_url, commit_hash)
)
"""
)
""")

# Create index for faster queries
conn.execute(
"""
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_repo_commit
ON assessments(repository_url, commit_hash)
"""
)
""")

conn.execute(
"""
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_expires_at
ON assessments(expires_at)
"""
)
""")

conn.commit()
except sqlite3.Error as e:
Expand Down
13 changes: 13 additions & 0 deletions src/agentready/services/eval_harness/harbor_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,27 @@
from typing import Optional

# Allowed models: the set of "<provider>/<model>" identifiers accepted for
# evaluation runs. The benchmark CLI builds its --model choices from this set.
# Original note: excludes opus due to cost.
# NOTE(review): "cursor/opus-4.5" IS listed below, so the opus exclusion
# appears to apply only to the anthropic/* entries -- confirm intent.
# NOTE(review): this is a set, so converting it with list(...) yields no
# guaranteed order; sort at the call site if a stable order matters.
# Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview
# Cursor models: https://cursor.com/docs/models
ALLOWED_MODELS = {
"anthropic/claude-haiku-4-5",
"anthropic/claude-sonnet-4-5",
"cursor/composer-1",
"cursor/gpt-5.2-codex",
"cursor/gpt-5.2-codex-fast",
"cursor/gemini-3-pro",
"cursor/opus-4.5",
"cursor/sonnet-4.5",
"cursor/sonnet-4.5-thinking",
"cursor/gpt-5.1-high",
"cursor/gemini-3-flash",
}

# Allowed agents: the agent identifiers accepted for evaluation runs
# (excludes oracle as it's not relevant for real-world assessment).
# Harbor supported agents: https://github.com/laude-institute/harbor/blob/main/src/harbor/agents/factory.py
ALLOWED_AGENTS = {
"claude-code",
"cursor-cli",
}


Expand Down
57 changes: 37 additions & 20 deletions src/agentready/services/eval_harness/tbench_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,31 +125,48 @@
# Pass through current environment but ensure API key is set
# Harbor's claude-code agent has MiniMax API hardcoded - override it
clean_env = os.environ.copy()
clean_env["ANTHROPIC_API_KEY"] = config.api_key
clean_env["ANTHROPIC_AUTH_TOKEN"] = config.api_key # Harbor uses this
clean_env["ANTHROPIC_BASE_URL"] = "https://api.anthropic.com" # Override MiniMax
clean_env["ANTHROPIC_API_BASE"] = "https://api.anthropic.com" # Alternative var

# Define agent-specific environment variable configurations
# Structure: (Env Key, Env Value, Is Sensitive)
agent_env_configs = {
"claude-code": [
("ANTHROPIC_API_KEY", config.api_key, True),
("ANTHROPIC_AUTH_TOKEN", config.api_key, True),
("ANTHROPIC_BASE_URL", "https://api.anthropic.com", False),
("ANTHROPIC_API_BASE", "https://api.anthropic.com", False),
],
"cursor-cli": [
("CURSOR_API_KEY", config.api_key, True),
],
}

if config.agent not in agent_env_configs:
raise ValueError(f"Invalid agent: {config.agent}")

# Set environment variables and build display/copyable lists
env_vars_display = []
env_vars_copyable = []

for var_name, var_value, is_sensitive in agent_env_configs[config.agent]:
clean_env[var_name] = var_value

# Build display string (sensitive values are omitted entirely, not truncated)
if not is_sensitive:
display_value = var_value
env_vars_display.append(f"{var_name}={display_value}")

# Build copyable string (use variable reference for sensitive values)
if is_sensitive:
copyable_value = f"${var_name}"
else:
copyable_value = var_value
env_vars_copyable.append(f"{var_name}={copyable_value}")

# Clear MiniMax settings if present
clean_env.pop("MINIMAX_API_KEY", None)

# Print Harbor command for debugging and manual execution
shell_cmd = " ".join(shlex.quote(arg) for arg in cmd)

# Prepare environment variable strings (truncate API key for security in display)
env_vars_display = [
f"ANTHROPIC_API_KEY={config.api_key[:20]}...", # Truncated for display
f"ANTHROPIC_AUTH_TOKEN={config.api_key[:20]}...",
f"ANTHROPIC_BASE_URL={clean_env['ANTHROPIC_BASE_URL']}",
f"ANTHROPIC_API_BASE={clean_env['ANTHROPIC_API_BASE']}",
]

# Full command for copy/paste (use $ANTHROPIC_API_KEY to avoid exposing key)
env_vars_copyable = [
"ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY",
"ANTHROPIC_AUTH_TOKEN=$ANTHROPIC_API_KEY",
f"ANTHROPIC_BASE_URL={clean_env['ANTHROPIC_BASE_URL']}",
f"ANTHROPIC_API_BASE={clean_env['ANTHROPIC_API_BASE']}",
]
full_cmd_copyable = " ".join(env_vars_copyable) + " " + shell_cmd

print(f"\n{'=' * 70}")
Expand All @@ -157,7 +174,7 @@
print(f"{'=' * 70}")
print(f"\n{full_cmd_copyable}\n")
print(f"{'=' * 70}")
print("Command Breakdown:")

Check failure

Code scanning / CodeQL

Clear-text logging of sensitive information High

This expression logs
sensitive data (password)
as clear text.
This expression logs
sensitive data (password)
as clear text.
This expression logs
sensitive data (password)
as clear text.
print(f"{'=' * 70}")
print(f"\nCommand: {shell_cmd}\n")
print("Environment Variables:")
Expand Down
6 changes: 2 additions & 4 deletions tests/e2e/test_critical_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,14 +276,12 @@ def test_assess_with_valid_config(self):
with tempfile.TemporaryDirectory() as tmp_dir:
# Create valid config file
config_file = Path(tmp_dir) / "config.yaml"
config_file.write_text(
"""
config_file.write_text("""
weights:
claude_md: 2.0
excluded_attributes:
- repomix_config
"""
)
""")

output_dir = Path(tmp_dir) / "output"

Expand Down
6 changes: 2 additions & 4 deletions tests/e2e/test_critical_paths_simplified.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,14 +219,12 @@ def test_valid_config_application(self, temp_output_dir):
with tempfile.TemporaryDirectory() as tmp_dir:
# Create valid config
config_file = Path(tmp_dir) / "config.yaml"
config_file.write_text(
"""
config_file.write_text("""
weights:
claude_md: 2.0
excluded_attributes:
- repomix_config
"""
)
""")

# Run assessment with config
result = helper.run_assessment(
Expand Down
6 changes: 2 additions & 4 deletions tests/unit/cli/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,14 +355,12 @@ class TestConfigLoading:
def test_load_config_valid_yaml(self, tmp_path):
"""Test loading valid config file."""
config_file = tmp_path / "config.yaml"
config_file.write_text(
"""
config_file.write_text("""
weights:
claude_md_file: 2.0
excluded_attributes:
- test_attribute
"""
)
""")

config = load_config(config_file)

Expand Down
Loading
Loading