From 1300c82dfa8b33b891a6fa2ca78afefcc4228625 Mon Sep 17 00:00:00 2001 From: Bill Murdock Date: Wed, 6 May 2026 15:07:49 -0400 Subject: [PATCH] feat: remove benchmark and harbor commands The benchmark and harbor commands measured Claude Code performance on generic Terminal-Bench tasks unrelated to AgentReady, so they validated nothing about the tool's core claims. They also had statistical flaws (independence violations, insufficient power for plausible effect sizes). Also removes the unregistered eval-harness CLI and its services, which shared the same tbench-based approach and was already inaccessible dead code. Closes #394 Co-Authored-By: Claude Sonnet 4.6 --- README.md | 21 - docs/harbor-comparison-guide.md | 400 --------- patches/harbor-task-filtering-fix.patch | 75 -- repos-for-benchmark.txt | 8 - .../DOUBLEAGENT_IMPACT.md | 246 ------ .../checklists/requirements.md | 45 - .../contracts/aggregation-output-schema.json | 60 -- .../contracts/harbor-results-schema.json | 75 -- .../002-harbor-real-integration/data-model.md | 471 ----------- specs/002-harbor-real-integration/plan.md | 701 ---------------- .../002-harbor-real-integration/quickstart.md | 282 ------- specs/002-harbor-real-integration/research.md | 421 ---------- specs/002-harbor-real-integration/spec.md | 204 ----- specs/002-harbor-real-integration/tasks.md | 467 ----------- src/agentready/cli/benchmark.py | 473 ----------- src/agentready/cli/eval_harness.py | 787 ------------------ src/agentready/cli/harbor.py | 361 -------- src/agentready/cli/main.py | 4 - src/agentready/models/harbor.py | 300 ------- src/agentready/reporters/harbor_markdown.py | 260 ------ .../services/eval_harness/__init__.py | 1 - .../services/eval_harness/aggregator.py | 83 -- .../services/eval_harness/assessor_tester.py | 191 ----- .../services/eval_harness/baseline.py | 102 --- .../services/eval_harness/batch_runner.py | 67 -- .../eval_harness/dashboard_generator.py | 180 ---- .../services/eval_harness/harbor_config.py | 86 -- .../services/eval_harness/tbench_runner.py | 285 ------- src/agentready/services/harbor/__init__.py | 6 - .../services/harbor/agent_toggler.py | 255 ------ src/agentready/services/harbor/comparer.py | 349 -------- .../services/harbor/dashboard_generator.py | 169 ---- .../services/harbor/result_parser.py | 78 -- src/agentready/services/harbor/runner.py | 192 ----- .../templates/harbor_comparison.html.j2 | 510 ------------ src/agentready/utils/__init__.py | 4 - src/agentready/utils/preflight.py | 132 --- tests/unit/services/harbor/__init__.py | 0 .../harbor/test_assessor_state_toggler.py | 240 ------ tests/unit/test_cli_benchmark.py | 673 --------------- tests/unit/test_cli_harbor.py | 725 ---------------- tests/unit/test_eval_harness_cli.py | 142 ---- tests/unit/test_eval_harness_services.py | 391 --------- tests/unit/test_harbor_config.py | 260 ------ tests/unit/test_harbor_models.py | 297 ------- tests/unit/test_harbor_services.py | 255 ------ tests/unit/utils/test_preflight.py | 109 --- 47 files changed, 11443 deletions(-) delete mode 100644 docs/harbor-comparison-guide.md delete mode 100644 patches/harbor-task-filtering-fix.patch delete mode 100644 repos-for-benchmark.txt delete mode 100644 specs/002-harbor-real-integration/DOUBLEAGENT_IMPACT.md delete mode 100644 specs/002-harbor-real-integration/checklists/requirements.md delete mode 100644 specs/002-harbor-real-integration/contracts/aggregation-output-schema.json delete mode 100644 specs/002-harbor-real-integration/contracts/harbor-results-schema.json delete mode 100644 
specs/002-harbor-real-integration/data-model.md delete mode 100644 specs/002-harbor-real-integration/plan.md delete mode 100644 specs/002-harbor-real-integration/quickstart.md delete mode 100644 specs/002-harbor-real-integration/research.md delete mode 100644 specs/002-harbor-real-integration/spec.md delete mode 100644 specs/002-harbor-real-integration/tasks.md delete mode 100644 src/agentready/cli/benchmark.py delete mode 100644 src/agentready/cli/eval_harness.py delete mode 100644 src/agentready/cli/harbor.py delete mode 100644 src/agentready/models/harbor.py delete mode 100644 src/agentready/reporters/harbor_markdown.py delete mode 100644 src/agentready/services/eval_harness/__init__.py delete mode 100644 src/agentready/services/eval_harness/aggregator.py delete mode 100644 src/agentready/services/eval_harness/assessor_tester.py delete mode 100644 src/agentready/services/eval_harness/baseline.py delete mode 100644 src/agentready/services/eval_harness/batch_runner.py delete mode 100644 src/agentready/services/eval_harness/dashboard_generator.py delete mode 100644 src/agentready/services/eval_harness/harbor_config.py delete mode 100644 src/agentready/services/eval_harness/tbench_runner.py delete mode 100644 src/agentready/services/harbor/__init__.py delete mode 100644 src/agentready/services/harbor/agent_toggler.py delete mode 100644 src/agentready/services/harbor/comparer.py delete mode 100644 src/agentready/services/harbor/dashboard_generator.py delete mode 100644 src/agentready/services/harbor/result_parser.py delete mode 100644 src/agentready/services/harbor/runner.py delete mode 100644 src/agentready/templates/harbor_comparison.html.j2 delete mode 100644 src/agentready/utils/preflight.py delete mode 100644 tests/unit/services/harbor/__init__.py delete mode 100644 tests/unit/services/harbor/test_assessor_state_toggler.py delete mode 100644 tests/unit/test_cli_benchmark.py delete mode 100644 tests/unit/test_cli_harbor.py delete mode 100644 tests/unit/test_eval_harness_cli.py delete mode 100644 tests/unit/test_eval_harness_services.py delete mode 100644 tests/unit/test_harbor_config.py delete mode 100644 tests/unit/test_harbor_models.py delete mode 100644 tests/unit/test_harbor_services.py delete mode 100644 tests/unit/utils/test_preflight.py diff --git a/README.md b/README.md index 74afaca8..2beaf90d 100644 --- a/README.md +++ b/README.md @@ -90,27 +90,6 @@ After installing globally: agentready assess . ``` -### Harbor CLI (for Benchmarks) - -Harbor is required for running Terminal-Bench evaluations: - -```bash -# AgentReady will prompt to install automatically, or install manually: -uv tool install harbor - -# Alternative: Use pip if uv is not available -pip install harbor - -# Verify installation -harbor --version -``` - -**Skip automatic checks**: If you prefer to skip the automatic Harbor check (for advanced users): - -```bash -agentready benchmark --skip-preflight --subset smoketest -``` - ### Assessment Only For one-time analysis without infrastructure changes: diff --git a/docs/harbor-comparison-guide.md b/docs/harbor-comparison-guide.md deleted file mode 100644 index 15228fe1..00000000 --- a/docs/harbor-comparison-guide.md +++ /dev/null @@ -1,400 +0,0 @@ -# Harbor Benchmark Comparison Guide - -**Purpose**: Measure the empirical impact of `.claude/agents/doubleagent.md` on Claude Code performance using Harbor's Terminal-Bench. 
- -**Created**: 2025-12-10 -**AgentReady Version**: 2.10.0+ - ---- - -## Overview - -The Harbor comparison tool automates A/B testing of Claude Code performance with and without the `doubleagent.md` agent file. It runs Terminal-Bench tasks twice—once with the agent file disabled, once with it enabled—and generates comprehensive comparison reports. - -### What Gets Measured - -- **Success Rate**: Percentage of tasks completed successfully -- **Duration**: Average time to complete tasks -- **Statistical Significance**: T-tests and effect sizes (Cohen's d) -- **Per-Task Impact**: Individual task improvements/regressions - ---- - -## Quick Start - -### Prerequisites - -1. **Harbor Framework** installed: - ```bash - uv tool install harbor - ``` - -2. **AgentReady** with harbor support: - ```bash - uv pip install -e . - ``` - -3. **Agent file** exists at `.claude/agents/doubleagent.md` - -### Basic Usage - -Compare performance on 3 tasks: - -```bash -agentready harbor compare \ - -t adaptive-rejection-sampler \ - -t async-http-client \ - -t terminal-file-browser \ - --verbose -``` - -This will: -1. Run tasks WITHOUT doubleagent.md (agent file disabled) -2. Run tasks WITH doubleagent.md (agent file enabled) -3. Generate comparison reports (JSON, Markdown, HTML) -4. Print summary to console - -**Expected Duration**: 10-20 minutes per task (30-60 min total for 3 tasks) - ---- - -## Command Reference - -### `agentready harbor compare` - -Run Harbor benchmark comparison. - -**Options**: -- `-t, --task TASK_NAME` - Task to benchmark (required, repeatable) -- `--model MODEL` - Model identifier (default: `anthropic/claude-sonnet-4-5`) -- `--agent-file PATH` - Path to agent file (default: `.claude/agents/doubleagent.md`) -- `--output-dir DIR` - Output directory (default: `.agentready/harbor_comparisons`) -- `--verbose` - Print detailed Harbor output -- `--open-dashboard` - Open HTML dashboard after completion - -**Example**: -```bash -agentready harbor compare \ - -t adaptive-rejection-sampler \ - -t async-http-client \ - --model anthropic/claude-sonnet-4-5 \ - --verbose \ - --open-dashboard -``` - -### `agentready harbor list` - -List all Harbor comparisons in output directory. - -**Example**: -```bash -agentready harbor list -``` - -**Output**: -``` -Harbor comparisons in .agentready/harbor_comparisons: - - run_20251210_143022/ - Created: 2025-12-10T14:30:22 - Success Δ: +33.3% - Duration Δ: -21.2% - - run_20251209_091545/ - Created: 2025-12-09T09:15:45 - Success Δ: +16.7% - Duration Δ: -12.5% -``` - -### `agentready harbor view` - -View a specific comparison. 
- -**Usage**: -```bash -agentready harbor view .agentready/harbor_comparisons/comparison_latest.json -``` - -**Options**: -- `--format summary` - Print summary (default) -- `--format full` - Print full JSON - ---- - -## Output Files - -Each comparison generates multiple files in `.agentready/harbor_comparisons/run_TIMESTAMP/`: - -### Directory Structure - -``` -.agentready/harbor_comparisons/ -├── run_20251210_143022/ -│ ├── without_agent/ # Harbor results without doubleagent.md -│ │ └── [task results] -│ ├── with_agent/ # Harbor results with doubleagent.md -│ │ └── [task results] -│ ├── comparison_20251210_143022.json # Machine-readable data -│ ├── comparison_20251210_143022.md # GitHub-Flavored Markdown -│ └── comparison_20251210_143022.html # Interactive dashboard -├── comparison_latest.json # Symlink to most recent JSON -├── comparison_latest.md # Symlink to most recent Markdown -└── comparison_latest.html # Symlink to most recent HTML -``` - -### JSON Report - -Machine-readable comparison data for further analysis: - -```json -{ - "without_agent": { - "run_id": "without_20251210_143022", - "agent_file_enabled": false, - "success_rate": 66.7, - "avg_duration_sec": 312.5, - "total_tasks": 3, - "successful_tasks": 2 - }, - "with_agent": { - "run_id": "with_20251210_143022", - "agent_file_enabled": true, - "success_rate": 100.0, - "avg_duration_sec": 246.3, - "total_tasks": 3, - "successful_tasks": 3 - }, - "deltas": { - "success_rate_delta": 33.3, - "avg_duration_delta_pct": -21.2 - }, - "statistical_significance": { - "success_rate_significant": true, - "success_rate_p_value": 0.0421, - "duration_significant": true, - "duration_p_value": 0.0312, - "duration_cohens_d": -0.87 - } -} -``` - -### Markdown Report - -GitHub-friendly report perfect for git commits and PRs: - -```markdown -# Harbor Benchmark Comparison - -**Created**: 2025-12-10T14:30:22 -**Tasks**: 3 (adaptive-rejection-sampler, async-http-client, terminal-file-browser) - -## Summary - -| Metric | Without Agent | With Agent | Delta | Significant? | -|--------|--------------|------------|-------|--------------| -| Success Rate | 66.7% | 100.0% | +33.3% | ✓ (p=0.0421) | -| Avg Duration | 5.2 min | 4.1 min | -21.2% | ✓ (p=0.0312) | - -## Per-Task Results - -### adaptive-rejection-sampler -- **Without Agent**: ✗ Failed (timeout) -- **With Agent**: ✓ Success (3.8 min) -- **Impact**: +100% success (fixed failure) - -... - -## Conclusion - -The `doubleagent.md` agent file shows **statistically significant improvement** -in both success rate (+33.3%, p=0.04) and execution speed (-21.2%, p=0.03). - -**Recommendation**: ✅ **Include `doubleagent.md`** in AgentReady development workflows. 
-``` - -### HTML Dashboard - -Interactive visualization with Chart.js: - -- Side-by-side bar charts (success rates, durations) -- Per-task breakdown table -- Statistical significance indicators -- Self-contained (no external dependencies) - -Open with: -```bash -open .agentready/harbor_comparisons/comparison_latest.html -``` - ---- - -## Interpreting Results - -### Statistical Significance - -**P-value < 0.05**: Statistically significant difference (95% confidence) -- ✓ Indicates real improvement, not random variation -- ✗ Difference could be due to chance - -**Cohen's d (Effect Size)**: -- **0.2 ≤ |d| < 0.5**: Small effect -- **0.5 ≤ |d| < 0.8**: Medium effect -- **|d| ≥ 0.8**: Large effect - -### Sample Size Requirements - -- **Minimum**: 3 tasks for statistical tests -- **Recommended**: 5-10 tasks for reliable results -- **Comprehensive**: 20+ tasks for production validation - -Small samples (n<3) will show warning about statistical validity. - -### Recommendations - -Based on comparison results: - -| Outcome | Recommendation | -|---------|---------------| -| ✅ Success ↑, p<0.05 | **Include agent file** - Proven improvement | -| ⚠️ Success ↑, p≥0.05 | **Consider including** - Validate with larger sample | -| ❌ No improvement | **Agent file may not help** for tested tasks | - ---- - -## Advanced Usage - -### Custom Agent File - -Test a different agent file: - -```bash -agentready harbor compare \ - -t task1 -t task2 \ - --agent-file .claude/agents/custom-agent.md -``` - -### Custom Output Directory - -Store results in specific location: - -```bash -agentready harbor compare \ - -t task1 -t task2 \ - --output-dir experiments/harbor_results -``` - -### Different Model - -Test with Claude Opus: - -```bash -agentready harbor compare \ - -t task1 -t task2 \ - --model anthropic/claude-opus-4-5 -``` - ---- - -## Troubleshooting - -### Harbor Not Installed - -**Error**: `Harbor framework not installed` - -**Solution**: -```bash -uv tool install harbor -``` - -### Agent File Not Found - -**Error**: `Agent file not found: .claude/agents/doubleagent.md` - -**Solution**: Ensure agent file exists or specify custom path with `--agent-file` - -### No Tasks Specified - -**Error**: `At least one task must be specified with -t/--task` - -**Solution**: Add tasks with `-t` flag: -```bash -agentready harbor compare -t adaptive-rejection-sampler -``` - -### Sample Size Too Small - -**Warning**: `Sample size too small (n<3). Statistical tests may not be reliable.` - -**Solution**: Run more tasks (5-10 recommended) for valid statistical analysis - -### Task Timeout - -Some tasks may timeout (30-60 min). This is normal for complex tasks. The comparison will continue with partial results. - ---- - -## FAQ - -**Q: How long does a comparison take?** - -A: Approximately 10-20 minutes per task. For 3 tasks, expect 30-60 minutes total. - -**Q: Can I run comparisons in parallel?** - -A: Not currently supported. Future versions may support concurrent Harbor execution via Daytona/Modal. - -**Q: What if some tasks fail?** - -A: Comparison continues with partial results. Failed tasks are marked in reports and excluded from duration averages. - -**Q: Can I compare more than 2 configurations?** - -A: Currently supports only with/without agent file. Multi-configuration comparison is planned for future versions. - -**Q: Where are results stored?** - -A: `.agentready/harbor_comparisons/` (gitignored). Reports can be committed for reference, but raw Harbor results are excluded. 
- -**Q: How do I share results with my team?** - -A: Commit the Markdown report (`.md` file) or share the HTML dashboard. JSON files are machine-readable for further analysis. - ---- - -## Related Documentation - -- **Harbor Framework**: https://harborframework.com/docs -- **Terminal-Bench**: https://terminal-bench.com -- **AgentReady CLAUDE.md**: See "Harbor Comparison" section -- **Plan**: `.claude/plans/vivid-knitting-codd.md` (implementation details) - ---- - -## Quickstart Example - -```bash -# Install Harbor -uv tool install harbor - -# Run comparison (3 tasks, ~30-60 min) -agentready harbor compare \ - -t adaptive-rejection-sampler \ - -t async-http-client \ - -t terminal-file-browser \ - --verbose \ - --open-dashboard - -# View summary -agentready harbor view .agentready/harbor_comparisons/comparison_latest.json - -# List all comparisons -agentready harbor list - -# Open latest dashboard -open .agentready/harbor_comparisons/comparison_latest.html -``` - ---- - -**Last Updated**: 2025-12-10 -**AgentReady Version**: 2.10.0+ diff --git a/patches/harbor-task-filtering-fix.patch b/patches/harbor-task-filtering-fix.patch deleted file mode 100644 index a6de10bf..00000000 --- a/patches/harbor-task-filtering-fix.patch +++ /dev/null @@ -1,75 +0,0 @@ -From f9e6d2e10c72d33373012294c36fd4938c45c26c Mon Sep 17 00:00:00 2001 -From: Alex Shaw -Date: Fri Dec 12 21:21:27 2025 -0800 -Subject: [PATCH] Fix task filtering to use polymorphic get_name() method - -Fix for Harbor task filtering bug where -t/--task-name flags were ignored. - -This patch is already merged in Harbor main (commit f9e6d2e) but not yet -released to PyPI. Latest version 0.1.23 (Dec 11, 2025) still has the bug. - -Apply this patch to your local Harbor installation if you can't install -from main branch. 
- -See: https://github.com/laude-institute/harbor/commit/f9e6d2e - ---- - src/harbor/models/job/config.py | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/src/harbor/models/job/config.py b/src/harbor/models/job/config.py -index 4a35f1f..f7a0ec9 100644 ---- a/src/harbor/models/job/config.py -+++ b/src/harbor/models/job/config.py -@@ -39,7 +39,7 @@ class BaseDatasetConfig(BaseModel, ABC): - task_id - for task_id in filtered_ids - if any( -- fnmatch(task_id.path.name, pattern_id) -+ fnmatch(task_id.get_name(), pattern_id) - for pattern_id in self.task_names - ) - ] -@@ -49,7 +49,7 @@ class BaseDatasetConfig(BaseModel, ABC): - task_id - for task_id in filtered_ids - if not any( -- fnmatch(task_id.path.name, pattern_id) -+ fnmatch(task_id.get_name(), pattern_id) - for pattern_id in self.exclude_task_names - ) - ] -@@ -73,7 +73,7 @@ class LocalDatasetConfig(BaseDatasetConfig): - ] - filtered_task_ids = self._filter_task_ids(task_ids) - return [ -- TaskConfig(path=task_id.path, source=self.path.name) -+ TaskConfig(path=task_id.path, source=self.path.expanduser().resolve().name) - for task_id in filtered_task_ids - ] - --- -2.47.1 - -USAGE: ------- - -Option 1: Install Harbor from main (recommended) - pip uninstall harbor - pip install git+https://github.com/laude-institute/harbor.git - -Option 2: Apply this patch to your local Harbor installation - # Find your Harbor installation - python -c "import harbor; print(harbor.__file__)" - # Output example: /path/to/site-packages/harbor/__init__.py - - # Navigate to Harbor package directory - cd /path/to/site-packages/harbor - - # Apply patch - git apply /path/to/agentready/patches/harbor-task-filtering-fix.patch - - # Or manually edit src/harbor/models/job/config.py: - # Line 42: task_id.path.name -> task_id.get_name() - # Line 52: task_id.path.name -> task_id.get_name() - # Line 76: self.path.name -> self.path.expanduser().resolve().name diff --git a/repos-for-benchmark.txt b/repos-for-benchmark.txt deleted file mode 100644 index 1c9c2bc4..00000000 --- a/repos-for-benchmark.txt +++ /dev/null @@ -1,8 +0,0 @@ -https://github.com/opendatahub-io/odh-dashboard -https://github.com/vllm-project/vllm -https://github.com/github/spec-kit -https://github.com/ambient-code/agentready -https://github.com/ambient-code/platform -https://github.com/pytorch/pytorch -https://github.com/ai-dynamo/dynamo -https://github.com/kubernetes/kubernetes diff --git a/specs/002-harbor-real-integration/DOUBLEAGENT_IMPACT.md b/specs/002-harbor-real-integration/DOUBLEAGENT_IMPACT.md deleted file mode 100644 index 950d9c04..00000000 --- a/specs/002-harbor-real-integration/DOUBLEAGENT_IMPACT.md +++ /dev/null @@ -1,246 +0,0 @@ -# DoubleAgent.md Impact Report - Harbor Real Integration Specification - -**Feature**: Harbor Framework Real Integration for Terminal-Bench Eval Harness -**Specification Date**: 2025-12-09 -**Agent Documentation**: `.claude/agents/doubleagent.md` - ---- - -## Executive Summary - -The doubleagent.md agent documentation had **HIGH IMPACT** on this specification, providing critical architectural context, design patterns, security principles, and quality standards that shaped the specification structure, scope decisions, and requirement prioritization. 
- -**Key Contributions**: -- ✅ Informed security requirements (API key exposure, command injection prevention) -- ✅ Guided proportional scoring approach for assessor effectiveness measurement -- ✅ Influenced graceful degradation pattern (mocked vs real integration toggle) -- ✅ Shaped testing strategy (unit + integration coverage expectations) -- ✅ Reinforced simplicity principles (76% code reduction aligned with anti-patterns) - ---- - -## Specific Impacts by Section - -### 1. Architecture & Design Patterns - -**Source**: `doubleagent.md:28-68` (Architecture & Design section) - -**Impact on Specification**: - -| doubleagent.md Principle | How Applied in Spec | -|--------------------------|---------------------| -| Library-First Philosophy: "No global state, all components are stateless" | **FR-007**: Environment variable toggle (`TBENCH_USE_REAL=1`) instead of global configuration state | -| Strategy Pattern: "Each assessor is independent" | **FR-010**: Aggregation treats assessors independently with per-assessor statistics (mean/median/std) | -| Dependency Injection: "Dependency injection for configuration" | **HarborConfig** entity defined with injectable API credentials, model, agent, timeout settings | -| Fail Gracefully: "Missing tools → skip, don't crash" | **FR-012**: Harbor framework errors handled gracefully with clear error messages and installation guidance | - -**Evidence**: The specification explicitly avoids stateful configuration and instead uses environment variables and dependency injection patterns, directly mirroring doubleagent.md's library-first philosophy. - ---- - -### 2. Security & Vulnerability Prevention - -**Source**: `doubleagent.md:232-238` (Constitutional Principles - "Fail Gracefully") - -**Impact on Specification**: - -| doubleagent.md Anti-Pattern | How Prevented in Spec | -|-----------------------------|-----------------------| -| ❌ "Crash on missing tools" | **FR-012**: Graceful error handling with installation guidance | -| ❌ "Hard-code paths or assumptions" | **FR-005**: JSON output validation with path sanitization before file reading | -| ❌ Implicit: API key exposure risks | **FR-004**: Only pass required environment variables (API key, PATH, HOME) to subprocess - addresses automated review security finding | -| ❌ Implicit: Command injection vulnerabilities | **FR-002, FR-003**: Allowlist validation for model/agent parameters before subprocess execution - addresses automated review security finding | - -**Evidence**: User Story 3 (Priority P1) elevated security to same priority as core functionality, directly influenced by doubleagent.md's emphasis on security and the automated review findings. - ---- - -### 3. 
Scoring & Assessment Patterns - -**Source**: `doubleagent.md:73-97` (Assessment Workflow - Scoring Algorithm) - -**Impact on Specification**: - -| doubleagent.md Pattern | How Applied in Spec | -|------------------------|---------------------| -| Proportional Scoring: `calculate_proportional_score(passed, total, attribute)` | **FR-010**: Aggregated statistics use mean/median/std to identify proportional assessor effectiveness across repositories | -| Statistical Significance: Tier-based weighting (50/30/15/5) | **FR-011**: Statistical significance indicators (confidence intervals, p-values) for aggregated results | -| Finding Status Types: `pass/fail/partial/skipped/error/not_applicable` | **TbenchResult** entity includes is_mocked flag to distinguish real vs mocked results | - -**Evidence**: The specification's aggregation requirements (FR-010, FR-011) mirror doubleagent.md's emphasis on proportional scoring and statistical validity for assessor effectiveness. - ---- - -### 4. Testing Strategy & Coverage - -**Source**: `doubleagent.md:171-217` (Test Structure & Coverage) - -**Impact on Specification**: - -| doubleagent.md Guidance | How Applied in Spec | -|-------------------------|---------------------| -| "Test individual assessor logic" | In-Scope: Integration tests with subprocess mocking for Harbor calls | -| "Target: >80% coverage for new code" | Success Criteria: Implies test coverage requirement for new Harbor integration code | -| "Edge case coverage (empty repos, missing files, errors)" | Edge Cases section: 6 scenarios covering auth failures, network issues, timeout, size limits, non-JSON output, partial failures | -| Test Fixtures: Mock repository setup | Independent Test criteria for each user story define testable acceptance scenarios | - -**Evidence**: The specification's edge case identification (6 comprehensive scenarios) and user story testability directly reflect doubleagent.md's testing philosophy. - ---- - -### 5. Simplification & Anti-Over-Engineering - -**Source**: `doubleagent.md:502-523` (Anti-Patterns to Avoid) - -**Impact on Specification**: - -| doubleagent.md Anti-Pattern | How Avoided in Spec | -|-----------------------------|---------------------| -| ❌ "Add external dependencies without justification" | Out of Scope: No custom exception classes (7 removed), no separate aggregator service (inline with pandas) | -| ❌ "Break backwards compatibility" | **FR-014**: Preserve backward compatibility with existing mocked integration for testing/development | -| ❌ "Over-engineer solutions" | Non-Functional Requirement: ~120 lines of code (not 507) following simplified approach - 76% reduction | -| ✅ "Use proportional scoring for partial compliance" | **FR-010**: Aggregation uses statistical measures (mean/median/std) to assess proportional assessor impact | -| ✅ "Follow library-first architecture" | Assumptions: Default behavior remains mocked unless explicitly toggled (safe default for CI/CD) | - -**Evidence**: The "Out of Scope" section explicitly lists components removed based on simplified approach, directly aligned with doubleagent.md's anti-over-engineering principles and the automated review's 76% code reduction recommendation. - ---- - -### 6. 
User-Focused Remediation - -**Source**: `doubleagent.md:239-243` (Constitutional Principle 4 - "User-Focused Remediation") - -**Impact on Specification**: - -| doubleagent.md Principle | How Applied in Spec | -|--------------------------|---------------------| -| "Provide actionable steps (specific commands, tools, examples)" | **FR-012**: Clear error messages with installation guidance when Harbor framework missing | -| "Include citations to documentation/standards" | Dependencies section: Links to Harbor framework, Terminal-Bench, API documentation | -| "Explain the 'why' behind recommendations" | **FR-013**: Document recommendations for assessor tier changes with empirical justification | - -**Evidence**: The specification's emphasis on actionable error messages (SC-008: 95% of errors provide clear guidance) mirrors doubleagent.md's user-focused remediation philosophy. - ---- - -## Quantified Impact Metrics - -| Metric | Value | doubleagent.md Influence | -|--------|-------|--------------------------| -| User Stories with Independent Testability | 4/4 (100%) | Mirrors doubleagent.md's "Test individual assessor logic" principle | -| Security Requirements Prioritized to P1 | 1/4 stories (25%) | Elevated based on doubleagent.md security anti-patterns | -| Code Simplification (Out of Scope items) | 5 components removed | Directly addresses doubleagent.md's "avoid over-engineering" guidance | -| Edge Cases Identified | 6 comprehensive scenarios | Reflects doubleagent.md's "edge case coverage" testing standard | -| Functional Requirements with Security Focus | 3/14 (21%) | FR-002, FR-003, FR-004 address API key exposure and command injection | - ---- - -## Key Insights & Patterns Applied - -### Pattern 1: Graceful Degradation -**Source**: `doubleagent.md:134-146` (Graceful Degradation pattern) - -**Application**: -- **FR-007**: Environment variable toggle allows fallback to mocked integration -- **FR-012**: Clear error handling when Harbor framework unavailable -- **FR-014**: Backward compatibility preserves existing mocked behavior - -**Quote from doubleagent.md**: -> "Missing tools → `skipped` status, not crashes" - -This pattern directly informed the specification's approach to handling missing Harbor framework installation and API credential errors. - ---- - -### Pattern 2: Proportional Scoring for Assessor Effectiveness -**Source**: `doubleagent.md:120-133` (Proportional Scoring pattern) - -**Application**: -- **FR-010**: Aggregation uses mean/median/std to measure proportional impact -- **FR-011**: Statistical significance indicators (confidence intervals, p-values) -- **SC-006**: Identify top 5 and bottom 5 assessors based on measured delta improvement - -**Quote from doubleagent.md**: -> "Proportional Scoring (for partial compliance): calculate_proportional_score(passed=7, total=10, attribute=self.attribute)" - -This pattern shaped the specification's approach to measuring assessor effectiveness across diverse repositories with statistical rigor. 
- ---- - -### Pattern 3: Library-First Architecture -**Source**: `doubleagent.md:30-36` (Library-First Philosophy) - -**Application**: -- No global state: Environment variable toggle instead of configuration singleton -- Stateless components: HarborConfig entity with dependency injection -- Independent assessors: Aggregation treats each assessor independently - -**Quote from doubleagent.md**: -> "No global state, all components are stateless" - -This architectural principle prevented the specification from introducing stateful configuration or global Harbor framework clients. - ---- - -## Impact on Success Criteria - -| Success Criterion | doubleagent.md Influence | -|-------------------|--------------------------| -| **SC-003**: 100% accuracy blocking invalid params | Security anti-patterns: "prevent command injection vulnerabilities" | -| **SC-004**: Zero API credentials exposed | Security anti-patterns: "API key exposure prevention" | -| **SC-006**: Identify top 5 assessors | Proportional scoring pattern for measuring assessor effectiveness | -| **SC-008**: 95% of errors provide clear guidance | User-focused remediation principle: "actionable steps" | -| **SC-010**: 100% backward compatibility | Anti-pattern: "Don't break backwards compatibility" | - ---- - -## Documentation Quality Impact - -**Source**: `doubleagent.md:375-398` (Key Design Documents) - -**Impact**: The specification structure mirrors doubleagent.md's recommended documentation pattern: - -| doubleagent.md Document | Specification Equivalent | -|-------------------------|--------------------------| -| Feature specifications (`specs/001-agentready-scorer/spec.md`) | This spec: `specs/001-harbor-real-integration/spec.md` | -| Design decisions (`specs/001-agentready-scorer/plan.md`) | Next phase: Planning document will follow same pattern | -| Contracts & Schemas (`contracts/assessment-schema.json`) | Key Entities section defines TbenchResult, BenchmarkRun, AggregatedResult schemas | -| Reference Implementations (`src/agentready/assessors/documentation.py`) | Assumptions section references existing eval harness implementation | - ---- - -## Learnings & Recommendations - -### What Worked Well - -1. **Constitutional Principles as Design Filter**: Using doubleagent.md's 5 constitutional principles (Library-First, Strategy Pattern, Fail Gracefully, User-Focused Remediation, Test-Driven) as a checklist during specification creation prevented over-engineering and security vulnerabilities. - -2. **Anti-Patterns as Negative Requirements**: The "DON'T" section (`doubleagent.md:504-513`) directly informed the "Out of Scope" section, resulting in 76% code reduction by explicitly excluding components that would violate simplicity principles. - -3. **Security Patterns from Agent Documentation**: The automated review's security findings (API key exposure, command injection) were already anticipated and addressed in the specification because doubleagent.md explicitly warns against these patterns. - -### Recommendations for Future Specifications - -1. **Always Consult doubleagent.md Early**: Review relevant sections during initial specification drafting, not just during implementation. This prevents architectural rework. - -2. **Map Patterns to Requirements**: Create explicit traceability from doubleagent.md patterns (e.g., proportional scoring, graceful degradation) to functional requirements to ensure consistency. - -3. 
**Use Anti-Patterns for Scope Reduction**: The "DON'T" section is invaluable for identifying what to exclude from scope, leading to simpler, more maintainable implementations. - ---- - -## Conclusion - -The doubleagent.md agent documentation had **HIGH IMPACT** on this specification, contributing to: - -- ✅ **Security**: 3 functional requirements directly address API key exposure and command injection vulnerabilities flagged by automated review and anticipated by doubleagent.md's security principles -- ✅ **Simplicity**: 76% code reduction (507 → ~120 lines) by excluding components that violate doubleagent.md's anti-over-engineering guidance -- ✅ **Testing**: 6 comprehensive edge cases and 100% independently testable user stories reflecting doubleagent.md's testing philosophy -- ✅ **Architecture**: Library-first design with stateless components, dependency injection, and graceful degradation patterns - -**Overall Impact Rating**: **9/10** - doubleagent.md provided critical architectural guardrails, security awareness, and simplicity principles that shaped nearly every aspect of this specification. - ---- - -**Document Created**: 2025-12-09 -**Author**: Claude (AgentReady Development Agent) -**Purpose**: Track and quantify the specific impact of `.claude/agents/doubleagent.md` on the Harbor Real Integration specification diff --git a/specs/002-harbor-real-integration/checklists/requirements.md b/specs/002-harbor-real-integration/checklists/requirements.md deleted file mode 100644 index 22ef04db..00000000 --- a/specs/002-harbor-real-integration/checklists/requirements.md +++ /dev/null @@ -1,45 +0,0 @@ -# Specification Quality Checklist: Harbor Framework Real Integration for Terminal-Bench Eval Harness - -**Purpose**: Validate specification completeness and quality before proceeding to planning -**Created**: 2025-12-09 -**Feature**: [spec.md](../spec.md) - -## Content Quality - -- [x] No implementation details (languages, frameworks, APIs) -- [x] Focused on user value and business needs -- [x] Written for non-technical stakeholders -- [x] All mandatory sections completed - -## Requirement Completeness - -- [x] No [NEEDS CLARIFICATION] markers remain -- [x] Requirements are testable and unambiguous -- [x] Success criteria are measurable -- [x] Success criteria are technology-agnostic (no implementation details) -- [x] All acceptance scenarios are defined -- [x] Edge cases are identified -- [x] Scope is clearly bounded -- [x] Dependencies and assumptions identified - -## Feature Readiness - -- [x] All functional requirements have clear acceptance criteria -- [x] User scenarios cover primary flows -- [x] Feature meets measurable outcomes defined in Success Criteria -- [x] No implementation details leak into specification - -## Notes - -**Validation Results**: ✅ All checklist items pass - -**Specification Quality**: Excellent -- Clear prioritization of user stories (P1/P2) with independent testability -- Security concerns elevated to P1 priority based on automated review feedback -- Success criteria are measurable and technology-agnostic (e.g., "100% accuracy blocking invalid params" vs "use allowlist validation") -- Scope clearly distinguishes in-scope vs out-of-scope following simplified approach (76% code reduction) -- Edge cases comprehensively identified (6 scenarios covering auth, network, timeout, size limits, output parsing, partial failures) -- Assumptions explicitly documented (package availability, execution time estimates, sample size adequacy) -- Risks and mitigations address key 
uncertainties (API differences, performance estimates, statistical confidence) - -**Ready for Next Phase**: ✅ Proceed to `/speckit.plan` diff --git a/specs/002-harbor-real-integration/contracts/aggregation-output-schema.json b/specs/002-harbor-real-integration/contracts/aggregation-output-schema.json deleted file mode 100644 index 743ba412..00000000 --- a/specs/002-harbor-real-integration/contracts/aggregation-output-schema.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "https://agentready.dev/schemas/aggregation-output.json", - "title": "AgentReady Assessor Effectiveness Aggregation Schema", - "description": "Output schema for multi-repository assessor effectiveness aggregation", - "type": "object", - "required": ["aggregation_date", "total_repositories", "assessors"], - "properties": { - "aggregation_date": { - "type": "string", - "format": "date-time", - "description": "Timestamp when aggregation was performed" - }, - "total_repositories": { - "type": "integer", - "description": "Total number of repositories analyzed", - "minimum": 1 - }, - "assessors": { - "type": "array", - "description": "Assessor effectiveness statistics", - "items": { - "type": "object", - "required": ["assessor_id", "mean_delta", "median_delta", "std_delta", "sample_size", "significant"], - "properties": { - "assessor_id": { - "type": "string", - "description": "Unique assessor identifier" - }, - "mean_delta": { - "type": "number", - "description": "Average score improvement (can be negative)", - "minimum": -1.0, - "maximum": 1.0 - }, - "median_delta": { - "type": "number", - "description": "Median score improvement (can be negative)", - "minimum": -1.0, - "maximum": 1.0 - }, - "std_delta": { - "type": "number", - "description": "Standard deviation of delta scores", - "minimum": 0.0 - }, - "sample_size": { - "type": "integer", - "description": "Number of repositories tested with this assessor", - "minimum": 1 - }, - "significant": { - "type": "boolean", - "description": "Statistical significance indicator (p < 0.05)" - } - } - } - } - } -} diff --git a/specs/002-harbor-real-integration/contracts/harbor-results-schema.json b/specs/002-harbor-real-integration/contracts/harbor-results-schema.json deleted file mode 100644 index c1baed63..00000000 --- a/specs/002-harbor-real-integration/contracts/harbor-results-schema.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "https://agentready.dev/schemas/harbor-results.json", - "title": "Harbor Framework Terminal-Bench Results Schema", - "description": "Expected output schema from Harbor framework 'harbor run' command for Terminal-Bench benchmarks", - "type": "object", - "required": ["summary", "tasks"], - "properties": { - "summary": { - "type": "object", - "description": "Aggregate metrics for the benchmark run", - "required": ["resolved_trials", "unresolved_trials", "accuracy", "pass@1", "pass@3"], - "properties": { - "resolved_trials": { - "type": "integer", - "description": "Number of successfully completed tasks", - "minimum": 0 - }, - "unresolved_trials": { - "type": "integer", - "description": "Number of failed tasks", - "minimum": 0 - }, - "accuracy": { - "type": "number", - "description": "Overall success rate (0.0 to 1.0)", - "minimum": 0.0, - "maximum": 1.0 - }, - "pass@1": { - "type": "number", - "description": "Single-attempt success rate", - "minimum": 0.0, - "maximum": 1.0 - }, - "pass@3": { - "type": "number", - "description": "Success rate within 3 
attempts", - "minimum": 0.0, - "maximum": 1.0 - } - } - }, - "tasks": { - "type": "array", - "description": "Individual task results", - "items": { - "type": "object", - "required": ["task_id", "status", "score", "attempts"], - "properties": { - "task_id": { - "type": "string", - "description": "Unique task identifier" - }, - "status": { - "type": "string", - "description": "Task completion status", - "enum": ["resolved", "unresolved"] - }, - "score": { - "type": "number", - "description": "Task success score (0.0 to 1.0)", - "minimum": 0.0, - "maximum": 1.0 - }, - "attempts": { - "type": "integer", - "description": "Number of attempts made", - "minimum": 1 - } - } - } - } - } -} diff --git a/specs/002-harbor-real-integration/data-model.md b/specs/002-harbor-real-integration/data-model.md deleted file mode 100644 index 40b766b7..00000000 --- a/specs/002-harbor-real-integration/data-model.md +++ /dev/null @@ -1,471 +0,0 @@ -# Data Model: Harbor Framework Integration - -**Feature**: Harbor Framework Real Integration for Terminal-Bench Eval Harness -**Date**: 2025-12-09 -**Status**: Complete - ---- - -## Overview - -This document defines the data models for Harbor framework integration in the AgentReady eval harness. All models follow AgentReady's existing patterns from `src/agentready/models/` and maintain backward compatibility with the Phase 1 mocked implementation. - ---- - -## Core Entities - -### 1. TbenchResult (Existing - Extended) - -**Purpose**: Represents the outcome of a single Terminal-Bench evaluation (baseline or assessor test). - -**Location**: `src/agentready/services/eval_harness/tbench_runner.py` (dataclass within module) - -**Fields**: - -| Field | Type | Description | Validation Rules | -|-------|------|-------------|------------------| -| `score` | `float` | Benchmark accuracy score (0.0 to 1.0) | Must be >= 0.0 and <= 1.0 | -| `task_solved` | `bool` | Whether any tasks were successfully resolved | True if resolved_trials > 0 | -| `is_mocked` | `bool` | Indicates if result is from mocked or real Harbor run | True for mocked, False for real | -| `resolved_trials` | `int` (new) | Number of successfully completed tasks | Must be >= 0 | -| `unresolved_trials` | `int` (new) | Number of failed tasks | Must be >= 0 | -| `pass_at_1` | `float` (new) | Single-attempt success rate | Must be >= 0.0 and <= 1.0 | -| `pass_at_3` | `float` (new) | Success rate within 3 attempts | Must be >= 0.0 and <= 1.0 | - -**Example**: -```python -@dataclass -class TbenchResult: - score: float # Maps to Harbor's "accuracy" field - task_solved: bool - is_mocked: bool - resolved_trials: int = 0 - unresolved_trials: int = 0 - pass_at_1: float = 0.0 - pass_at_3: float = 0.0 - - def __post_init__(self): - if not (0.0 <= self.score <= 1.0): - raise ValueError(f"Score must be 0.0-1.0, got {self.score}") - if self.resolved_trials < 0 or self.unresolved_trials < 0: - raise ValueError("Trial counts cannot be negative") -``` - -**Backward Compatibility**: Existing Phase 1 code creates `TbenchResult(score, task_solved, is_mocked=True)` - new fields have defaults, so this remains valid. - ---- - -### 2. HarborConfig (New) - -**Purpose**: Configuration for Harbor framework subprocess execution. 
- -**Location**: `src/agentready/services/eval_harness/harbor_config.py` (new file) - -**Fields**: - -| Field | Type | Description | Validation Rules | -|-------|------|-------------|------------------| -| `model` | `str` | LLM model identifier | Must be in ALLOWED_MODELS set | -| `agent` | `str` | Agent identifier | Must be in ALLOWED_AGENTS set | -| `jobs_dir` | `Path` | Output directory for results | Must be absolute path, writable | -| `timeout` | `int` | Subprocess timeout in seconds | Must be > 0, default 3600 | -| `n_concurrent` | `int` | Harbor's internal concurrency | Must be >= 1, default 1 | -| `api_key` | `str` | Anthropic API key | Must not be empty | - -**Validation Constants**: -```python -ALLOWED_MODELS = { - "anthropic/claude-haiku-4-5", - "anthropic/claude-sonnet-4-5", -} - -ALLOWED_AGENTS = { - "claude-code", -} -``` - -**Example**: -```python -@dataclass -class HarborConfig: - model: str - agent: str - jobs_dir: Path - api_key: str - timeout: int = 3600 - n_concurrent: int = 1 - - def __post_init__(self): - if self.model not in ALLOWED_MODELS: - raise ValueError(f"Invalid model: {self.model}") - if self.agent not in ALLOWED_AGENTS: - raise ValueError(f"Invalid agent: {self.agent}") - if not self.api_key: - raise ValueError("API key cannot be empty") - if self.timeout <= 0: - raise ValueError("Timeout must be positive") - self.jobs_dir = Path(self.jobs_dir).resolve() -``` - -**Usage**: -```python -config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/tbench-results"), - api_key=os.environ.get("ANTHROPIC_API_KEY"), -) -``` - ---- - -### 3. BenchmarkRun (New - Optional, for future batch tracking) - -**Purpose**: Metadata for a single benchmark execution (used for aggregation and debugging). - -**Location**: `src/agentready/services/eval_harness/models.py` (new file, or inline in CLI) - -**Fields**: - -| Field | Type | Description | Validation Rules | -|-------|------|-------------|------------------| -| `run_id` | `str` | Unique identifier (UUID) | Generated automatically | -| `repository_path` | `Path` | Path to repository being benchmarked | Must exist | -| `assessor_id` | `str \| None` | Assessor ID (None for baseline) | Optional | -| `result` | `TbenchResult` | Benchmark result | Required | -| `timestamp` | `datetime` | When benchmark was executed | Generated automatically | -| `duration_seconds` | `float` | Execution time | Must be >= 0 | -| `error` | `str \| None` | Error message if benchmark failed | Optional | - -**Example**: -```python -@dataclass -class BenchmarkRun: - repository_path: Path - assessor_id: str | None - result: TbenchResult - run_id: str = field(default_factory=lambda: str(uuid.uuid4())) - timestamp: datetime = field(default_factory=datetime.now) - duration_seconds: float = 0.0 - error: str | None = None -``` - -**Usage** (Future - Phase 3 historical tracking): -```python -run = BenchmarkRun( - repository_path=Path("/path/to/repo"), - assessor_id="claude_md", - result=TbenchResult(score=0.85, task_solved=True, is_mocked=False), - duration_seconds=347.2, -) -``` - -**Note**: This entity is **optional** for Phase 2. Current implementation can inline this data in CLI commands without formal model. Include only if needed for batch result storage. - ---- - -### 4. AggregatedResult (New) - -**Purpose**: Statistical summary of assessor effectiveness across multiple repositories. 
- -**Location**: Inline in `src/agentready/cli/eval_harness.py` (summarize command) using pandas DataFrame - -**Fields** (conceptual - represented as pandas DataFrame columns): - -| Column | Type | Description | Validation Rules | -|--------|------|-------------|------------------| -| `assessor_id` | `str` | Assessor identifier | Required | -| `mean_delta` | `float` | Average score improvement | Can be negative (regression) | -| `median_delta` | `float` | Median score improvement | Can be negative | -| `std_delta` | `float` | Standard deviation of deltas | Must be >= 0 | -| `sample_size` | `int` | Number of repositories tested | Must be > 0 | -| `significant` | `bool` | Statistical significance indicator | True if p-value < 0.05 (placeholder) | - -**Example** (pandas DataFrame): -```python -import pandas as pd - -# Aggregation logic (inline in summarize command) -df = pd.DataFrame(results) # results = List[Dict[str, Any]] -summary = df.groupby("assessor_id").agg({ - "delta_score": ["mean", "median", "std", "count"], -}).round(2) -summary.columns = ["mean_delta", "median_delta", "std_delta", "sample_size"] -summary["significant"] = summary["mean_delta"].abs() > 0.05 # Placeholder significance test -``` - -**Output Format** (for reports): -``` -Assessor ID | Mean Δ | Median Δ | Std Δ | Sample Size | Significant? -------------------|--------|----------|-------|-------------|------------- -claude_md | +0.12 | +0.10 | 0.05 | 15 | ✅ Yes -test_coverage | +0.08 | +0.07 | 0.06 | 15 | ✅ Yes -dependency_pinning| +0.02 | +0.01 | 0.08 | 12 | ❌ No -``` - -**Note**: No formal Python class needed - pandas DataFrame provides all aggregation functionality inline. - ---- - -## Data Flow - -```text -Repository Path - ↓ -HarborConfig (validation) - ↓ -harbor run subprocess (CLI call) - ↓ -Harbor Output: results.json - ↓ -Parse results.json → TbenchResult - ↓ -(Optional) BenchmarkRun (metadata wrapping) - ↓ -Aggregation (pandas) → AggregatedResult DataFrame - ↓ -Report Generation (markdown/JSON) -``` - ---- - -## State Transitions - -### TbenchResult State - -**States**: -1. **Pending**: Not yet executed (not modeled - implicit) -2. **Mocked** (`is_mocked=True`): Result from Phase 1 deterministic mock -3. **Real** (`is_mocked=False`): Result from actual Harbor framework execution -4. **Failed** (modeled via `error` field in BenchmarkRun, not TbenchResult itself) - -**Transition Rules**: -- Mocked results cannot transition to Real (different execution paths) -- Failed benchmarks do not create TbenchResult (exception raised or error logged) - ---- - -## Validation Rules - -### 1. Score Ranges -- All probability scores (score, pass_at_1, pass_at_3) must be [0.0, 1.0] -- Trial counts (resolved_trials, unresolved_trials) must be non-negative integers -- Delta scores in aggregation can be negative (indicating regression) - -### 2. Path Validation -- All file paths (jobs_dir, repository_path) must be resolved to absolute paths -- Results JSON path must be validated as relative to jobs_dir (prevent path traversal) - -### 3. Temporal Constraints -- Benchmark duration_seconds must be non-negative -- Timeout must be positive (enforced in HarborConfig) - -### 4. 
Security Constraints -- Model and agent parameters validated against allowlists before subprocess execution -- API key must not be empty (enforced in HarborConfig) -- Environment variable sanitization (only ANTHROPIC_API_KEY, PATH, HOME exposed) - ---- - -## Integration with Existing Models - -### Existing AgentReady Models (src/agentready/models/) - -**Not Modified**: -- `Repository`: Represents scanned repository (no changes needed) -- `Attribute`: Quality attribute definition (no changes needed) -- `Finding`: Assessment result (not used in eval harness) -- `Assessment`: Complete assessment report (not used in eval harness) - -**Eval Harness Models**: -- `TbenchResult`: Extended with new fields (backward compatible) -- `HarborConfig`: New, self-contained -- `BenchmarkRun`: New, optional -- `AggregatedResult`: Conceptual (pandas DataFrame, no formal model) - ---- - -## JSON Schemas - -### Harbor Output Schema (results.json) - -**Expected Structure** (from Harbor framework): -```json -{ - "summary": { - "resolved_trials": 42, - "unresolved_trials": 8, - "accuracy": 0.84, - "pass@1": 0.78, - "pass@3": 0.84 - }, - "tasks": [ - { - "task_id": "string", - "status": "resolved" | "unresolved", - "score": 0.0 to 1.0, - "attempts": integer - } - ] -} -``` - -**Parsing Logic**: -```python -def parse_harbor_results(results_path: Path) -> TbenchResult: - with open(results_path) as f: - data = json.load(f) - - summary = data["summary"] - return TbenchResult( - score=summary["accuracy"], - task_solved=summary["resolved_trials"] > 0, - is_mocked=False, - resolved_trials=summary["resolved_trials"], - unresolved_trials=summary["unresolved_trials"], - pass_at_1=summary["pass@1"], - pass_at_3=summary["pass@3"], - ) -``` - ---- - -### AgentReady Aggregation Output Schema (JSON export) - -**For Machine Consumption**: -```json -{ - "aggregation_date": "2025-12-09T10:30:00Z", - "total_repositories": 15, - "assessors": [ - { - "assessor_id": "claude_md", - "mean_delta": 0.12, - "median_delta": 0.10, - "std_delta": 0.05, - "sample_size": 15, - "significant": true - } - ] -} -``` - ---- - -## Examples - -### Example 1: Real Harbor Benchmark Execution - -```python -# 1. Create configuration -config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/tbench-results"), - api_key=os.environ["ANTHROPIC_API_KEY"], -) - -# 2. Execute benchmark (subprocess call) -result = run_harbor_benchmark(repo_path, config) - -# 3. Result object -print(result) -# TbenchResult( -# score=0.84, -# task_solved=True, -# is_mocked=False, -# resolved_trials=42, -# unresolved_trials=8, -# pass_at_1=0.78, -# pass_at_3=0.84 -# ) -``` - ---- - -### Example 2: Aggregation Across Repositories - -```python -# 1. Collect results from multiple benchmarks -results = [ - {"assessor_id": "claude_md", "delta_score": 0.10}, - {"assessor_id": "claude_md", "delta_score": 0.12}, - {"assessor_id": "claude_md", "delta_score": 0.15}, - {"assessor_id": "test_coverage", "delta_score": 0.05}, - {"assessor_id": "test_coverage", "delta_score": 0.08}, -] - -# 2. Aggregate with pandas -import pandas as pd -df = pd.DataFrame(results) -summary = df.groupby("assessor_id").agg({ - "delta_score": ["mean", "median", "std", "count"] -}).round(2) - -# 3. 
Output -print(summary) -# delta_score -# mean median std count -# assessor_id -# claude_md 0.12 0.12 0.03 3 -# test_coverage 0.07 0.07 0.02 2 -``` - ---- - -## Design Decisions - -### Decision 1: Extend TbenchResult vs Create New Model - -**Chosen**: Extend existing `TbenchResult` with new optional fields - -**Rationale**: -- Maintains backward compatibility (new fields have defaults) -- Avoids model proliferation (simpler codebase) -- Natural mapping to Harbor's output schema - -**Alternative Rejected**: Create separate `HarborTbenchResult` model -- Reason: Unnecessary abstraction, increases complexity - ---- - -### Decision 2: Inline Aggregation vs Separate Service - -**Chosen**: Inline pandas aggregation in CLI `summarize` command - -**Rationale**: -- Aggregation logic is <30 lines with pandas -- No need for separate service class (violates doubleagent.md anti-patterns) -- Simplified approach (76% code reduction goal) - -**Alternative Rejected**: Create `CrossRepoAggregator` service class -- Reason: Over-engineering for simple DataFrame operations - ---- - -### Decision 3: BenchmarkRun Model - Optional vs Required - -**Chosen**: Optional (can be deferred to Phase 3) - -**Rationale**: -- Phase 2 focus: Real Harbor integration and aggregation -- BenchmarkRun metadata useful for historical tracking (Phase 3 feature) -- Current implementation can work without formal model (inline dict/dataclass) - -**Alternative Rejected**: Implement immediately -- Reason: Not required for Phase 2 MVP, adds complexity - ---- - -## Next Steps - -1. ✅ Data models designed -2. ⏭️ Create JSON schema contracts in `contracts/` directory -3. ⏭️ Generate quickstart guide for Harbor setup and first benchmark run -4. ⏭️ Update agent context with new models and Harbor integration patterns - ---- - -**Document Status**: Complete -**Last Updated**: 2025-12-09 -**Ready for Contracts Phase**: ✅ Yes diff --git a/specs/002-harbor-real-integration/plan.md b/specs/002-harbor-real-integration/plan.md deleted file mode 100644 index 68636e11..00000000 --- a/specs/002-harbor-real-integration/plan.md +++ /dev/null @@ -1,701 +0,0 @@ -# Implementation Plan: Harbor Framework Real Integration for Terminal-Bench Eval Harness - -**Branch**: `002-harbor-real-integration` | **Date**: 2025-12-09 | **Spec**: [spec.md](./spec.md) -**Input**: Feature specification from `/specs/002-harbor-real-integration/spec.md` - -## Summary - -Replace the mocked Terminal-Bench integration with real Harbor framework subprocess calls to enable empirical validation of AgentReady assessor effectiveness. Use real benchmark data from 10-20 diverse repositories to identify high-impact vs low-impact assessors, then document recommendations for assessor list refinement. 
- -**Technical Approach**: -- Install `harbor` Python package (Laude Institute's official CLI) -- Replace `_real_tbench_result()` NotImplementedError with subprocess call to `harbor run` -- Parse JSON output from `/results.json` -- Add security validations (API key sanitization, model/agent allowlists, path validation) -- Implement pandas-based aggregation inline in `summarize` CLI command -- Maintain backward compatibility with existing mocked integration via `TBENCH_USE_REAL` environment variable toggle - ---- - -## Technical Context - -**Language/Version**: Python 3.11+ (AgentReady standard, aligns with "N and N-1" policy) -**Primary Dependencies**: -- `harbor` (Laude Institute CLI, installed via `uv pip install harbor`) -- `pandas` (already in dependencies, for aggregation) -- `subprocess` (stdlib, for Harbor CLI calls) -- `json` (stdlib, for results parsing) -- `pathlib` (stdlib, for path validation) - -**Storage**: File-based (Harbor outputs to `--jobs-dir`, JSON results parsed from filesystem) -**Testing**: pytest (existing AgentReady standard), subprocess mocking for Harbor calls -**Target Platform**: Linux/macOS (Docker required for Harbor framework) -**Project Type**: Single (extends existing `src/agentready/` structure) -**Performance Goals**: -- Individual benchmark: 5-10 minutes average execution time -- Batch (8 repos × 35 assessors = 280 runs): Complete in <24 hours with 4-worker parallelism -- Timeout: 1 hour (3600s) per individual benchmark run - -**Constraints**: -- Docker required (Harbor executes benchmarks in containers) -- `ANTHROPIC_API_KEY` environment variable required -- Subprocess timeout: 3600 seconds (1 hour) per benchmark -- Memory: <2GB for parallel execution (4 workers) -- File handles: <1024 concurrent (enforced by 4-worker limit) - -**Scale/Scope**: -- Phase 2: 10-20 repositories for empirical assessor validation -- 25 assessors to evaluate (current AgentReady assessment suite) -- ~120 lines of new code (76% reduction from original 507-line plan) - ---- - -## Constitution Check - -*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* - -### Pre-Research Check (Phase 0 Entry Gate) - -| Principle | Compliance | Evidence | -|-----------|------------|----------| -| **I. Evidence-Based Design** | ✅ Pass | Specification cites automated review findings (API key exposure, command injection), Harbor framework documentation (harborframework.com), Terminal-Bench research (Laude Institute GitHub) | -| **II. Measurable Quality** | ✅ Pass | Success criteria include quantifiable metrics: "100% accuracy blocking invalid params" (SC-003), "95% of errors provide clear guidance" (SC-008), "completes in <24 hours" (SC-009) | -| **III. Tool-First Mindset** | ✅ Pass | Harbor integration uses subprocess library interface (text-based I/O), maintains existing eval harness library structure in `src/agentready/services/eval_harness/` | -| **IV. Test-Driven Development** | ⚠️ Deferred | TDD workflow will be enforced during Phase 2 (Tasks) implementation. Tests must be written FIRST before Harbor integration code. | -| **V. Structured Output** | ✅ Pass | Harbor outputs JSON (`results.json`), AgentReady aggregation supports JSON export via pandas, human-readable markdown reports via `summarize` command | -| **VI. Incremental Delivery** | ✅ Pass | User stories prioritized P1/P2, MVP = User Story 1 (real benchmark execution) + User Story 3 (security), can be deployed independently | -| **VII. 
Documentation as Code** | ✅ Pass | Quickstart guide created (`quickstart.md`), research documented (`research.md`), data models defined (`data-model.md`), contracts specified (`contracts/*.json`) | - -**Quality Gates**: -1. ✅ **Linting**: Will enforce black, isort, flake8 (no line length limit per CLAUDE.md) -2. ✅ **Tests**: Target >80% coverage for new Harbor integration code (per Constitution) -3. ✅ **Security**: Allowlist validation (models, agents), environment variable sanitization, path validation (addresses critical security review findings) -4. ✅ **Documentation**: README updated with Harbor setup, quickstart guide provided - -**Violations**: None identified - ---- - -### Post-Design Check (Phase 1 Exit Gate) - -| Principle | Compliance | Evidence | -|-----------|------------|----------| -| **I. Evidence-Based Design** | ✅ Pass | Research document cites 6 authoritative sources (Harbor framework docs, GitHub repos, industry articles from Snorkel AI, VentureBeat) | -| **II. Measurable Quality** | ✅ Pass | Data models include validation rules (score ∈ [0.0, 1.0], trial counts ≥ 0), JSON schemas define expected formats, aggregation metrics (mean, median, std) are quantifiable | -| **III. Tool-First Mindset** | ✅ Pass | HarborConfig dataclass is self-contained, TbenchResult is independently testable, aggregation uses pandas library (not custom implementation) | -| **IV. Test-Driven Development** | ⚠️ Pending | Tests will be written FIRST during Phase 2 implementation (red-green-refactor workflow enforced) | -| **V. Structured Output** | ✅ Pass | JSON schemas defined for Harbor results (`harbor-results-schema.json`) and aggregation output (`aggregation-output-schema.json`), pandas DataFrame supports both JSON export and markdown tables | -| **VI. Incremental Delivery** | ✅ Pass | Phase 0 (research) complete independently, Phase 1 (design) complete independently, Phase 2 (implementation) can deliver User Story 1 (real benchmarks) before User Story 2 (aggregation) | -| **VII. 
Documentation as Code** | ✅ Pass | Quickstart guide provides <10 minute setup, data model document explains all entities, contracts define expected formats, research document captures all technical decisions | - -**Complexity Limits Check**: -- **File Size**: No files exceed 300 lines (TbenchResult extension adds 7 fields, HarborConfig is ~40 lines, aggregation inline in CLI) -- **Function Length**: Subprocess call function estimated <50 lines, JSON parsing function <30 lines -- **Cyclomatic Complexity**: Simple conditionals (model validation, path checks) stay well below 10 -- **Dependencies**: Harbor package is only new external dependency (pandas already in dependencies) - -**Re-Check Result**: ✅ **PASS** - All principles compliant, ready for Phase 2 (Tasks) - ---- - -## Project Structure - -### Documentation (this feature) - -```text -specs/002-harbor-real-integration/ -├── plan.md # This file -├── spec.md # Feature specification -├── research.md # Phase 0 research (complete) -├── data-model.md # Phase 1 data models (complete) -├── quickstart.md # Phase 1 quickstart guide (complete) -├── contracts/ # Phase 1 JSON schemas (complete) -│ ├── harbor-results-schema.json # Harbor framework output schema -│ └── aggregation-output-schema.json # AgentReady aggregation output schema -├── checklists/ # Specification quality checklist -│ └── requirements.md # Validation checklist (all items passed) -├── DOUBLEAGENT_IMPACT.md # doubleagent.md influence analysis -└── tasks.md # Phase 2 output (/speckit.tasks command - NOT created yet) -``` - -### Source Code (repository root) - -```text -src/agentready/ -├── services/ -│ └── eval_harness/ -│ ├── __init__.py # Existing -│ ├── tbench_runner.py # **MODIFY**: Replace _real_tbench_result() NotImplementedError -│ ├── harbor_config.py # **NEW**: HarborConfig dataclass with validation -│ └── models.py # **NEW** (optional): BenchmarkRun metadata (Phase 3) -├── cli/ -│ └── eval_harness.py # **MODIFY**: Add pandas aggregation to 'summarize' command -└── models/ # No changes (existing models not used in eval harness) - -tests/ -├── unit/ -│ ├── test_eval_harness_services.py # **MODIFY**: Add Harbor integration tests with subprocess mocking -│ └── test_harbor_config.py # **NEW**: HarborConfig validation tests -└── integration/ - └── test_eval_harness_e2e.py # **MODIFY**: Add end-to-end test with mock Harbor subprocess - -contracts/ # No changes (eval harness doesn't use existing contracts) -docs/ -└── tbench/ - ├── methodology.md # **MODIFY**: Add Phase 2 real-world validation section - └── assessor-refinement-results.md # **NEW**: Empirical assessor effectiveness findings -``` - -**Structure Decision**: Extends existing single-project structure (`src/agentready/`). No new top-level directories needed. Eval harness is already modular within `src/agentready/services/eval_harness/`, new Harbor integration fits naturally here. Follows AgentReady's established pattern of service modules + CLI commands + tests. - ---- - -## Complexity Tracking - -**No violations identified** - Constitution Check passed with no complexity limit violations. 
- -All design decisions align with simplicity principles: -- ✅ No custom exception classes (use RuntimeError) -- ✅ No separate aggregator service (inline pandas operations) -- ✅ No pre-flight checks (trust Harbor validation) -- ✅ ~120 lines of implementation (76% reduction from original 507-line plan) - ---- - -## Implementation Phases - -### Phase 0: Research (Complete ✅) - -**Deliverable**: `research.md` with all technical unknowns resolved - -**Questions Resolved**: -1. ✅ Harbor package installation: `uv pip install harbor` -2. ✅ Authentication: `ANTHROPIC_API_KEY` environment variable -3. ✅ CLI syntax: `harbor run --dataset terminal-bench@2.0 --agent claude-code --model --jobs-dir ` -4. ✅ Output format: JSON at `/results.json` with accuracy, pass@k metrics -5. ✅ Execution times: 5-10 minutes average, 1-hour timeout provides 6x buffer -6. ✅ Model/agent validation: Allowlists defined (haiku-4-5, sonnet-4-5 for models; claude-code for agents) -7. ✅ Docker dependency: Required for local execution, trust Harbor's validation - -**Research Document**: [research.md](./research.md) - ---- - -### Phase 1: Design & Contracts (Complete ✅) - -**Deliverables**: -1. ✅ `data-model.md` - Entity definitions (TbenchResult extended, HarborConfig new, AggregatedResult conceptual) -2. ✅ `contracts/harbor-results-schema.json` - JSON schema for Harbor output validation -3. ✅ `contracts/aggregation-output-schema.json` - JSON schema for AgentReady aggregation export -4. ✅ `quickstart.md` - 10-minute setup guide with troubleshooting - -**Key Design Decisions**: -- **TbenchResult**: Extended with 4 new optional fields (resolved_trials, unresolved_trials, pass_at_1, pass_at_3) -- **HarborConfig**: New dataclass with validation (model/agent allowlists, path resolution, API key requirement) -- **Aggregation**: Inline pandas DataFrame operations in `summarize` command (not separate service) -- **BenchmarkRun**: Optional metadata model (defer to Phase 3 for historical tracking) - -**Agent Context Update**: Pending (will run `.specify/scripts/bash/update-agent-context.sh claude` after Phase 1 complete) - ---- - -### Phase 2: Tasks (Next - To be generated by `/speckit.tasks`) - -**Purpose**: Generate dependency-ordered task list from design artifacts - -**Expected Tasks** (preview - actual tasks will be generated by command): - -**Priority 1 (MVP - User Story 1 + 3)**: -1. Write tests for HarborConfig validation (TDD: red phase) -2. Implement HarborConfig dataclass with allowlist validation -3. Write tests for _real_tbench_result() subprocess call (TDD: red phase, mock subprocess) -4. Implement _real_tbench_result() with sanitized environment variables -5. Write tests for JSON parsing with path validation (TDD: red phase) -6. Implement parse_harbor_results() function -7. Write integration test for full real benchmark workflow (TDD: red phase) -8. Verify all tests pass (TDD: green phase) - -**Priority 2 (User Story 2 + 4)**: -9. Write tests for pandas aggregation logic (TDD: red phase) -10. Implement aggregation in `summarize` command (inline with pandas) -11. Add parallel execution limits (ProcessPoolExecutor with max_workers=4) -12. Add timeout enforcement (3600s per benchmark) - -**Priority 3 (Documentation & Polish)**: -13. Update README.md with Harbor setup instructions -14. Create `docs/tbench/assessor-refinement-results.md` template -15. Update `docs/tbench/methodology.md` with Phase 2 validation section -16. Run linters (black, isort, flake8) and fix issues -17. 
Run full test suite, verify >80% coverage for new code - -**Task Document**: Will be generated by `/speckit.tasks` command (not created by `/speckit.plan`) - ---- - -### Phase 3: Implementation (Future - To be executed by `/speckit.implement`) - -**Not covered by `/speckit.plan` command** - see Phase 2 tasks for work breakdown - ---- - -## File-Level Implementation Details - -### File 1: `src/agentready/services/eval_harness/harbor_config.py` (NEW) - -**Purpose**: Configuration and validation for Harbor framework subprocess execution - -**Estimated Lines**: ~40 - -**Key Components**: -```python -from dataclasses import dataclass -from pathlib import Path - -ALLOWED_MODELS = { - "anthropic/claude-haiku-4-5", - "anthropic/claude-sonnet-4-5", -} - -ALLOWED_AGENTS = { - "claude-code", -} - -@dataclass -class HarborConfig: - model: str - agent: str - jobs_dir: Path - api_key: str - timeout: int = 3600 - n_concurrent: int = 1 - - def __post_init__(self): - # Validation logic (model allowlist, agent allowlist, API key not empty, timeout positive) - # Path resolution (jobs_dir.resolve()) -``` - -**Testing**: `tests/unit/test_harbor_config.py` (allowlist validation, path resolution, API key requirement) - ---- - -### File 2: `src/agentready/services/eval_harness/tbench_runner.py` (MODIFY) - -**Purpose**: Replace NotImplementedError in `_real_tbench_result()` with functional Harbor subprocess integration - -**Estimated Lines Added**: ~50 - -**Changes**: - -**Before** (Current Phase 1 Implementation): -```python -def _real_tbench_result(self, repo_path: Path) -> TbenchResult: - """Execute real Terminal-Bench via Harbor framework.""" - raise NotImplementedError("Phase 2: Harbor framework integration pending") -``` - -**After** (Phase 2 Implementation): -```python -def _real_tbench_result(self, repo_path: Path) -> TbenchResult: - """Execute real Terminal-Bench via Harbor framework.""" - # 1. Create HarborConfig with validation - config = HarborConfig( - model=os.environ.get("TBENCH_MODEL", "anthropic/claude-haiku-4-5"), - agent="claude-code", - jobs_dir=Path(tempfile.mkdtemp()), - api_key=os.environ.get("ANTHROPIC_API_KEY"), - ) - - # 2. Build harbor run command - cmd = [ - "harbor", "run", - "--dataset", "terminal-bench@2.0", - "--agent", config.agent, - "--model", config.model, - "--jobs-dir", str(config.jobs_dir), - "--n-concurrent", "1", - ] - - # 3. Sanitize environment variables (SECURITY: FR-004) - clean_env = { - "ANTHROPIC_API_KEY": config.api_key, - "PATH": os.environ.get("PATH"), - "HOME": os.environ.get("HOME"), - } - - # 4. Execute subprocess with timeout - try: - subprocess.run(cmd, env=clean_env, timeout=config.timeout, check=True) - except subprocess.TimeoutExpired: - raise RuntimeError(f"Benchmark timed out after {config.timeout}s") - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Harbor command failed: {e}") - - # 5. 
Parse results.json with path validation (SECURITY: FR-005) - results_path = config.jobs_dir / "results.json" - if not results_path.is_relative_to(config.jobs_dir): - raise ValueError(f"Invalid results path: {results_path}") - - return parse_harbor_results(results_path) - -def parse_harbor_results(results_path: Path) -> TbenchResult: - """Parse Harbor framework JSON output.""" - with open(results_path) as f: - data = json.load(f) - - summary = data["summary"] - return TbenchResult( - score=summary["accuracy"], - task_solved=summary["resolved_trials"] > 0, - is_mocked=False, - resolved_trials=summary["resolved_trials"], - unresolved_trials=summary["unresolved_trials"], - pass_at_1=summary["pass@1"], - pass_at_3=summary["pass@3"], - ) -``` - -**Testing**: `tests/unit/test_eval_harness_services.py` (subprocess mocking, JSON parsing, error handling, path validation) - ---- - -### File 3: `src/agentready/cli/eval_harness.py` (MODIFY) - -**Purpose**: Add pandas-based aggregation to `summarize` command - -**Estimated Lines Added**: ~30 - -**Changes**: - -**Add to existing `summarize` command**: -```python -@click.command() -def summarize(): - """Summarize assessor effectiveness across repositories.""" - # 1. Load results from previous benchmark runs (implementation detail TBD - file-based storage?) - results = load_benchmark_results() # Returns List[Dict[str, Any]] - - # 2. Aggregate with pandas - import pandas as pd - df = pd.DataFrame(results) - summary = df.groupby("assessor_id").agg({ - "delta_score": ["mean", "median", "std", "count"], - }).round(2) - summary.columns = ["mean_delta", "median_delta", "std_delta", "sample_size"] - - # 3. Add statistical significance placeholder - summary["significant"] = summary["mean_delta"].abs() > 0.05 - - # 4. Sort by mean_delta descending and display - summary = summary.sort_values("mean_delta", ascending=False) - click.echo(summary.to_markdown()) - - # 5. 
Export JSON for machine consumption - summary.to_json("aggregation-results.json", orient="records") -``` - -**Testing**: `tests/unit/test_eval_harness_cli.py` (pandas aggregation logic, JSON export, markdown output) - ---- - -### File 4: `tests/unit/test_eval_harness_services.py` (MODIFY) - -**Purpose**: Add integration tests for Harbor subprocess calls with mocking - -**Estimated Lines Added**: ~40 - -**New Tests**: -```python -from unittest.mock import patch, MagicMock - -def test_real_tbench_result_subprocess_call(): - """Test Harbor subprocess called with correct parameters.""" - with patch("subprocess.run") as mock_run, \ - patch("builtins.open", mock_open(read_data='{"summary": {...}}')): - runner = TbenchRunner(use_real=True) - result = runner._real_tbench_result(Path("/fake/repo")) - - # Verify subprocess.run called with sanitized env - mock_run.assert_called_once() - call_args = mock_run.call_args - assert "harbor" in call_args[0][0] - assert call_args[1]["env"]["ANTHROPIC_API_KEY"] is not None - assert "JAVA_HOME" not in call_args[1]["env"] # Env sanitization check - -def test_harbor_config_validation_invalid_model(): - """Test HarborConfig rejects invalid model.""" - with pytest.raises(ValueError, match="Invalid model"): - HarborConfig( - model="invalid/model", - agent="claude-code", - jobs_dir=Path("/tmp"), - api_key="test-key", - ) -``` - -**Coverage Target**: >80% for new Harbor integration code - ---- - -### File 5: `docs/tbench/assessor-refinement-results.md` (NEW) - -**Purpose**: Document empirical assessor effectiveness findings from real benchmarks - -**Estimated Lines**: ~100 (template, will be filled with actual data after benchmarks run) - -**Structure**: -```markdown -# Assessor Refinement Results - Phase 2 Empirical Validation - -## Methodology -- 15 diverse repositories tested (Python, JavaScript, TypeScript, mixed) -- 25 assessors evaluated -- Metrics: mean delta, median delta, std delta, statistical significance (p < 0.05) - -## High-Impact Assessors (Keep/Promote) -1. **claude_md**: +12% mean improvement, statistically significant (p=0.001) -2. **test_coverage**: +8% mean improvement, statistically significant (p=0.01) -... - -## Low/No-Impact Assessors (Review/Demote) -23. **dependency_pinning**: +2% mean improvement, NOT statistically significant (p=0.42) -... - -## Recommendations -- ✅ Keep Tier 1: claude_md, test_coverage, gitignore (empirically validated high impact) -- ⚠️ Demote to Tier 3: dependency_pinning (no significant measured impact) -... -``` - ---- - -## Security Considerations - -**Addressed from Automated Review Findings**: - -### 1. API Key Exposure (Critical) -**Problem**: Passing all environment variables to subprocess via `os.environ.copy()` exposes API keys -**Solution**: Sanitize environment variables, pass only required: `ANTHROPIC_API_KEY`, `PATH`, `HOME` -**Code**: `clean_env = {k: os.environ.get(k) for k in ["ANTHROPIC_API_KEY", "PATH", "HOME"]}` -**Verification**: Unit test checks env dict keys, excludes non-required variables - -### 2. Command Injection (Critical) -**Problem**: Unvalidated model/agent parameters passed to subprocess -**Solution**: Allowlist validation in HarborConfig.__post_init__() -**Code**: `if model not in ALLOWED_MODELS: raise ValueError(f"Invalid model: {model}")` -**Verification**: Unit test attempts malicious input (e.g., `model="$(rm -rf /)"`) and verifies rejection - -### 3. 
Path Traversal (Medium) -**Problem**: Harbor output path not validated before reading -**Solution**: Validate results_path is relative to jobs_dir -**Code**: `if not results_path.is_relative_to(jobs_dir): raise ValueError(...)` -**Verification**: Unit test attempts path traversal (e.g., `../../etc/passwd`) and verifies rejection - ---- - -## Dependencies & Installation - -### New Dependencies - -**Harbor Framework**: -```toml -# pyproject.toml -dependencies = [ - # ... existing dependencies ... - "harbor>=2.0.0", # Laude Institute Terminal-Bench harness -] -``` - -**Install Command**: -```bash -uv pip install harbor -``` - -### System Requirements - -**Docker** (required for Harbor): -- Docker Desktop (Mac/Windows) or Docker Engine (Linux) -- Minimum 4GB RAM, 2GB free disk space -- Docker daemon must be running before executing benchmarks - -**Verification**: -```bash -docker --version # Should show Docker version 20.10+ -docker ps # Should connect without error -``` - ---- - -## Testing Strategy - -### Unit Tests (TDD Red-Green-Refactor) - -**Phase 1: Write Tests FIRST (Red)** -1. `test_harbor_config_validation()` - Allowlist enforcement -2. `test_real_tbench_result_subprocess_call()` - Subprocess mocking -3. `test_parse_harbor_results()` - JSON parsing -4. `test_environment_sanitization()` - Env var filtering -5. `test_path_validation()` - Path traversal prevention - -**Phase 2: Implement to Pass (Green)** -- Implement HarborConfig, _real_tbench_result(), parse_harbor_results() -- All tests should pass - -**Phase 3: Refactor (Refactor)** -- Extract constants (ALLOWED_MODELS, ALLOWED_AGENTS) -- Simplify subprocess call logic -- Add docstrings - -**Coverage Target**: >80% for new code - ---- - -### Integration Tests - -**End-to-End Workflow**: -```python -def test_full_benchmark_workflow_mocked(): - """Test complete benchmark with mocked Harbor subprocess.""" - with patch("subprocess.run") as mock_run: - # Setup mock to return success - mock_run.return_value = MagicMock(returncode=0) - - # Run benchmark - result = run_benchmark(repo_path, assessor_id="claude_md") - - # Verify subprocess called correctly - assert mock_run.called - # Verify result parsed correctly - assert result.is_mocked == False - assert 0.0 <= result.score <= 1.0 -``` - ---- - -## Documentation Updates - -### 1. README.md - -**Section to Add**: "Running Real Terminal-Bench Evaluations (Phase 2)" - -**Content**: -```markdown -## Running Real Terminal-Bench Evaluations - -### Prerequisites -- Docker installed and running -- Anthropic API key (get from https://console.anthropic.com) - -### Setup -```bash -# Install Harbor framework -uv pip install harbor - -# Set environment variables -export ANTHROPIC_API_KEY="sk-ant-api03-..." -export TBENCH_USE_REAL=1 - -# Run baseline benchmark -agentready tbench baseline /path/to/repo -``` - -See [Quickstart Guide](specs/002-harbor-real-integration/quickstart.md) for detailed instructions. -``` - ---- - -### 2. docs/tbench/methodology.md - -**Section to Add**: "Phase 2: Real-World Validation" - -**Content**: -- Harbor framework integration details -- Real vs mocked benchmark comparison -- Statistical significance testing approach -- Sample size rationale (10-20 repositories) - ---- - -### 3. 
docs/tbench/assessor-refinement-results.md (NEW) - -**Purpose**: Document empirical findings from Phase 2 benchmarks - -**Structure**: -- Methodology (sample size, repository diversity, metrics) -- High-impact assessors (keep/promote based on data) -- Low/no-impact assessors (review/demote based on data) -- Recommendations (tier reassignments, assessor improvements) -- Appendix (raw data, statistical tests) - ---- - -## Risks & Mitigations - -### Risk 1: Harbor framework API changes between versions - -**Impact**: Breaking changes in Harbor CLI could break our integration -**Likelihood**: Low (Harbor is in active development but API appears stable) -**Mitigation**: -- Pin Harbor version in dependencies (`harbor>=2.0.0,<3.0.0`) -- Add integration tests that fail if Harbor output format changes -- Document Harbor version tested with - ---- - -### Risk 2: Docker unavailable on CI/CD - -**Impact**: Real benchmarks cannot run in GitHub Actions (no Docker in standard runners) -**Likelihood**: Medium (GitHub Actions free tier doesn't support Docker-in-Docker well) -**Mitigation**: -- Default to mocked integration in CI/CD (`TBENCH_USE_REAL=0` by default) -- Document that real benchmarks require local execution or self-hosted runners -- Consider GitHub Actions self-hosted runners with Docker for future automation - ---- - -### Risk 3: Benchmark execution costs exceed budget - -**Impact**: Running 280+ benchmarks (8 repos × 35 assessors) could cost $100-$200 USD -**Likelihood**: Medium (depending on repository complexity and Claude API pricing) -**Mitigation**: -- Start with small sample (5 repos × 10 assessors) to validate approach -- Use Claude Haiku (cheaper) for initial validation, Sonnet only for final confirmation -- Document cost per benchmark in README to help users budget - ---- - -### Risk 4: Statistical sample size insufficient for significance testing - -**Impact**: 10-20 repositories may not provide statistical power for significance tests -**Likelihood**: Medium (depends on effect size and variance) -**Mitigation**: -- Document confidence intervals and p-values with sample size caveats -- Use conservative significance threshold (p < 0.05) -- Recommend larger sample sizes for critical decisions (e.g., removing Tier 1 assessors) - ---- - -## Next Steps - -1. ✅ **Phase 0 Complete**: Research document with all unknowns resolved -2. ✅ **Phase 1 Complete**: Data models, contracts, quickstart guide -3. ⏭️ **Update Agent Context**: Run `.specify/scripts/bash/update-agent-context.sh claude` -4. ⏭️ **Phase 2**: Generate tasks with `/speckit.tasks` command -5. 
⏭️ **Phase 3**: Execute tasks with `/speckit.implement` command - ---- - -## Appendix: References - -- [Harbor Framework Documentation](https://harborframework.com/docs/running-tbench) -- [Harbor GitHub Repository](https://github.com/laude-institute/harbor) -- [Terminal-Bench GitHub Repository](https://github.com/laude-institute/terminal-bench) -- [Terminal-Bench 2.0 Article - Snorkel AI](https://snorkel.ai/blog/terminal-bench-2-0-raising-the-bar-for-ai-agent-evaluation/) -- [AgentReady Constitution](.specify/memory/constitution.md) -- [DoubleAgent.md Impact Analysis](./DOUBLEAGENT_IMPACT.md) -- [Automated Code Review Findings (GitHub Issue #190)](https://github.com/ambient-code/agentready/issues/190) - ---- - -**Document Status**: Complete -**Last Updated**: 2025-12-09 -**Ready for Phase 2**: ✅ Yes (pending agent context update) diff --git a/specs/002-harbor-real-integration/quickstart.md b/specs/002-harbor-real-integration/quickstart.md deleted file mode 100644 index 001479d1..00000000 --- a/specs/002-harbor-real-integration/quickstart.md +++ /dev/null @@ -1,282 +0,0 @@ -# Quickstart: Harbor Framework Integration - -**Feature**: Harbor Framework Real Integration for Terminal-Bench Eval Harness -**Target Audience**: Developers and researchers using AgentReady eval harness -**Time to Complete**: ~10 minutes - ---- - -## Prerequisites - -- ✅ Python 3.11+ installed -- ✅ Docker installed and running (`docker --version`) -- ✅ Anthropic API key (get from https://console.anthropic.com) -- ✅ AgentReady installed (`agentready --version`) - ---- - -## Step 1: Install Harbor Framework - -```bash -# Install Harbor CLI (preferred method) -uv tool install harbor - -# Alternative: pip install -pip install harbor - -# Verify installation -harbor --version -``` - -**Expected Output**: -``` -Harbor v2.0.0 -``` - ---- - -## Step 2: Configure API Authentication - -```bash -# Set your Anthropic API key -export ANTHROPIC_API_KEY="sk-ant-api03-..." - -# Verify Docker is running -docker ps - -# Enable real Harbor integration (instead of mocked) -export TBENCH_USE_REAL=1 -``` - -**Important**: Keep your API key secure. Never commit it to git. Consider using `.env` files or secret managers. - ---- - -## Step 3: Run Your First Baseline Benchmark - -```bash -# Run baseline evaluation on a repository -agentready tbench baseline /path/to/your/repository - -# Example with a specific repository -agentready tbench baseline ~/repos/my-python-project -``` - -**What Happens**: -1. AgentReady calls Harbor framework via subprocess -2. Harbor launches Docker container with your repository -3. Terminal-Bench runs coding tasks using Claude Code agent -4. Results are parsed and displayed - -**Expected Output**: -``` -Running Terminal-Bench baseline for /path/to/your/repository... -Using model: anthropic/claude-haiku-4-5 -Using agent: claude-code - -Benchmark Results: - Score: 0.78 (78% accuracy) - Resolved: 39 tasks - Unresolved: 11 tasks - Pass@1: 0.72 - Pass@3: 0.78 - -Duration: 8m 32s -``` - -**Time Estimate**: 5-10 minutes for typical repositories (<10k files) - ---- - -## Step 4: Test an Assessor's Impact - -```bash -# Test if adding CLAUDE.md improves benchmark score -agentready tbench test-assessor --assessor claude_md ~/repos/my-python-project -``` - -**What Happens**: -1. Runs baseline benchmark (no changes) -2. Applies assessor fix (adds CLAUDE.md if missing) -3. Runs delta benchmark (with CLAUDE.md) -4. 
Calculates score improvement - -**Expected Output**: -``` -Testing assessor: claude_md - -Baseline Results: - Score: 0.78 (78% accuracy) - -Applying assessor fix... - ✅ Created CLAUDE.md with project context - -Delta Results: - Score: 0.84 (84% accuracy) - -Improvement: +0.06 (+6 percentage points) -Statistical Significance: ✅ Yes (p < 0.05) -``` - -**Time Estimate**: 10-20 minutes (runs two full benchmarks) - ---- - -## Step 5: Aggregate Results Across Repositories - -```bash -# After running benchmarks on multiple repositories, aggregate results -agentready tbench summarize -``` - -**Expected Output**: -``` -Assessor Effectiveness Summary - -Assessor ID | Mean Δ | Median Δ | Std Δ | Sample Size | Significant? -------------------|--------|----------|-------|-------------|------------- -claude_md | +0.12 | +0.10 | 0.05 | 15 | ✅ Yes -test_coverage | +0.08 | +0.07 | 0.06 | 15 | ✅ Yes -dependency_pinning| +0.02 | +0.01 | 0.08 | 12 | ❌ No - -Top 5 High-Impact Assessors: -1. claude_md (+12% average improvement) -2. test_coverage (+8% average improvement) -3. gitignore (+5% average improvement) -4. readme_structure (+4% average improvement) -5. type_annotations (+3% average improvement) - -Recommended Actions: -- ✅ Keep: claude_md, test_coverage, gitignore (high impact) -- ⚠️ Review: dependency_pinning (no significant impact) -``` - ---- - -## Common Issues & Troubleshooting - -### Issue 1: "Harbor not found" - -**Symptom**: `FileNotFoundError: harbor command not found` - -**Solution**: -```bash -# Ensure Harbor is in PATH -which harbor - -# If not found, reinstall -uv tool install harbor - -# Add to PATH if needed -export PATH="$HOME/.local/bin:$PATH" -``` - ---- - -### Issue 2: "Docker daemon not running" - -**Symptom**: `RuntimeError: Cannot connect to Docker daemon` - -**Solution**: -```bash -# Start Docker Desktop (Mac/Windows) -open -a Docker # Mac -# Or start Docker service (Linux) -sudo systemctl start docker - -# Verify Docker is running -docker ps -``` - ---- - -### Issue 3: "API key invalid" - -**Symptom**: `AuthenticationError: Invalid API key` - -**Solution**: -```bash -# Check API key is set -echo $ANTHROPIC_API_KEY - -# If empty, set it -export ANTHROPIC_API_KEY="sk-ant-api03-..." - -# Verify key format (starts with sk-ant-) -``` - ---- - -### Issue 4: "Benchmark timeout" - -**Symptom**: `TimeoutExpired: Command timed out after 3600 seconds` - -**Solution**: -- Large repositories (>50k files) may exceed 1-hour timeout -- Consider reducing repository size or increasing timeout (future configuration option) -- Check Docker resource limits (Docker Desktop → Preferences → Resources) - ---- - -## Advanced Usage - -### Custom Model Selection - -```bash -# Use Claude Sonnet instead of Haiku (higher quality, slower, more expensive) -export TBENCH_MODEL="anthropic/claude-sonnet-4-5" -agentready tbench baseline ~/repos/my-project -``` - -### Parallel Repository Evaluation - -```bash -# Evaluate multiple repositories in parallel (4 workers) -agentready tbench batch ~/repos/*/ --workers 4 -``` - -**Note**: Parallel batch evaluation is a future enhancement (Phase 3). Current implementation processes repositories sequentially. 
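Until the Phase 3 batch command exists, a small driver script is one way to run the documented `baseline` command over several repositories sequentially. A hedged sketch (repository paths are placeholders; it assumes `agentready` is on PATH, per the prerequisites above):

```python
# Sequential driver for multiple repositories -- a stopgap until batch mode lands.
# Paths are placeholders; replace them with your own repositories.
import subprocess
from pathlib import Path

repos = [
    Path("~/repos/project-a").expanduser(),
    Path("~/repos/project-b").expanduser(),
]

for repo in repos:
    print(f"=== baseline: {repo} ===")
    # Reuses the CLI invocation from Step 3; check=True stops on the first failure.
    subprocess.run(["agentready", "tbench", "baseline", str(repo)], check=True)
```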
- ---- - -## Cost Estimation - -**Per Repository Benchmark**: -- Model: Claude Haiku 4.5 -- Duration: ~10 minutes -- Tasks: ~50 Terminal-Bench tasks -- Estimated Cost: $0.30 - $0.50 USD - -**Batch Evaluation** (10 repositories × 35 assessors): -- Total runs: 350 benchmarks -- Estimated total cost: ~$105 - $175 USD -- Time estimate: ~24 hours with 4-worker parallelism - -**Cost Reduction Tips**: -- Use mocked integration for development/testing (`export TBENCH_USE_REAL=0`) -- Test on smaller repositories first (<5k files) -- Use sample size of 5-10 repositories for initial assessor validation - ---- - -## Next Steps - -1. ✅ Completed quickstart? → Run benchmarks on your repositories -2. ⏭️ Want batch evaluation? → See `docs/tbench/batch-evaluation.md` (Phase 3) -3. ⏭️ Need help? → See `docs/tbench/troubleshooting.md` -4. ⏭️ Contributing? → See `CONTRIBUTING.md` for development setup - ---- - -## Further Reading - -- [Harbor Framework Documentation](https://harborframework.com/docs) -- [Terminal-Bench GitHub](https://github.com/laude-institute/terminal-bench) -- [AgentReady Eval Harness Methodology](../../docs/tbench/methodology.md) -- [Assessor Refinement Results](../../docs/tbench/assessor-refinement-results.md) - ---- - -**Document Status**: Complete -**Last Updated**: 2025-12-09 -**Estimated Time**: 10 minutes setup + 10-20 minutes first benchmark diff --git a/specs/002-harbor-real-integration/research.md b/specs/002-harbor-real-integration/research.md deleted file mode 100644 index d4cd7911..00000000 --- a/specs/002-harbor-real-integration/research.md +++ /dev/null @@ -1,421 +0,0 @@ -# Research Report: Harbor Framework Integration for Terminal-Bench - -**Feature**: Harbor Framework Real Integration for Terminal-Bench Eval Harness -**Date**: 2025-12-09 -**Status**: Complete - ---- - -## Executive Summary - -This research resolves all technical unknowns identified during specification planning for Phase 2 of the Terminal-Bench eval harness. The Harbor framework is a well-documented CLI tool from the Laude Institute that provides straightforward integration via subprocess calls with JSON output. - -**Key Findings**: -- ✅ Harbor framework has clear Python package: `harbor` (installable via pip/uv) -- ✅ Authentication uses standard environment variables (`ANTHROPIC_API_KEY`, optionally `DAYTONA_API_KEY`) -- ✅ CLI interface is simple: `harbor run` with well-defined parameters -- ✅ Output is JSON-based with predictable structure -- ✅ Execution times average 5-10 minutes per repository (align with spec assumptions) - ---- - -## Research Question 1: Harbor Framework Installation - -**Question**: What is the exact Python package name and installation command for Harbor framework? - -**Answer**: `harbor` package via pip/uv - -**Installation Commands**: -```bash -# Preferred (uv) -uv tool install harbor - -# Alternative (pip) -pip install harbor -``` - -**System Requirements**: -- Docker (required for local benchmark execution) -- Python 3.11+ (inferred from Laude Institute's typical stack) - -**Source**: [GitHub - laude-institute/harbor](https://github.com/laude-institute/harbor) - -**Decision**: Use `uv pip install harbor` in dependencies (aligns with AgentReady's existing uv-first approach) - ---- - -## Research Question 2: Authentication & API Keys - -**Question**: What environment variables are needed for Harbor framework authentication? - -**Answer**: Two environment variables required: - -| Variable | Purpose | Required? 
| -|----------|---------|-----------| -| `ANTHROPIC_API_KEY` | Claude API authentication | ✅ Required | -| `DAYTONA_API_KEY` | Cloud environment provider (Daytona) | Optional (only for `--env daytona`) | - -**Authentication Pattern**: -- No username/password authentication -- No Harbor-specific API key -- Uses Claude API key directly (passed to model provider) -- Daytona key only needed if using cloud environments (not for local Docker execution) - -**Source**: [Harbor Framework - Running Terminal-Bench](https://harborframework.com/docs/running-tbench) - -**Decision**: -- Primary use case: Local Docker execution (no Daytona key needed) -- Only expose `ANTHROPIC_API_KEY` to Harbor subprocess -- Document Daytona as optional advanced feature (out of scope for Phase 2) - ---- - -## Research Question 3: CLI Interface & Command Syntax - -**Question**: What is the command-line interface for submitting repositories to Terminal-Bench? - -**Answer**: `harbor run` command with well-defined parameters - -**Basic Syntax**: -```bash -harbor run \ - --dataset terminal-bench@2.0 \ - --agent claude-code \ - --model anthropic/claude-haiku-4-5 \ - --n-concurrent 4 \ - --jobs-dir /path/to/output -``` - -**Key Parameters**: - -| Parameter | Purpose | Values | -|-----------|---------|--------| -| `--dataset` | Benchmark dataset + version | `terminal-bench@2.0` | -| `--agent` | Agent to evaluate | `claude-code`, `oracle` (reference) | -| `--model` | LLM model identifier | `anthropic/claude-haiku-4-5`, `anthropic/claude-sonnet-4-5` | -| `--n-concurrent` | Parallel tasks | Integer (default: 1) | -| `--jobs-dir` | Output directory | Path to write results | -| `--env` | Environment provider | `daytona` (cloud) or omit (local Docker) | - -**Source**: [Harbor Framework Documentation](https://harborframework.com/docs/running-tbench), [GitHub - laude-institute/harbor](https://github.com/laude-institute/harbor) - -**Decision**: -- Use local Docker execution (no `--env` parameter) -- Set `--n-concurrent 1` for AgentReady integration (parallelism managed by our ProcessPoolExecutor, not Harbor) -- Use `--jobs-dir` to control output location for result parsing - ---- - -## Research Question 4: Output Format & Result Parsing - -**Question**: What is the expected output format from Harbor framework? How do we parse results? 
- -**Answer**: JSON-based results file with structured metrics - -**Output Structure**: -- Harbor writes results to `--jobs-dir` location -- Primary file: `results.json` with detailed benchmark data -- Summary metrics available: - - `resolved_trials`: Number of successfully completed tasks - - `unresolved_trials`: Number of failed tasks - - `accuracy`: Overall success rate (0.0 to 1.0) - - `pass@1`: Single-attempt success rate - - `pass@3`: Success rate within 3 attempts - -**Example Results Structure** (inferred from documentation): -```json -{ - "summary": { - "resolved_trials": 42, - "unresolved_trials": 8, - "accuracy": 0.84, - "pass@1": 0.78, - "pass@3": 0.84 - }, - "tasks": [ - { - "task_id": "task_001", - "status": "resolved", - "score": 1.0, - "attempts": 2 - } - ] -} -``` - -**Source**: [Harbor Framework - Running Terminal-Bench](https://harborframework.com/docs/running-tbench), [Terminal-Bench GitHub](https://github.com/laude-institute/terminal-bench) - -**Decision**: -- Parse `results.json` from `--jobs-dir` after benchmark completion -- Extract `accuracy` as primary score metric (maps to our `TbenchResult.score`) -- Validate JSON schema before reading (security: FR-005 path validation) -- Map `resolved_trials > 0` to `TbenchResult.task_solved = True` - ---- - -## Research Question 5: Execution Times & Timeouts - -**Question**: What are typical execution times for Terminal-Bench via Harbor? What timeout should we set? - -**Answer**: Execution times vary by task complexity, averaging 5-10 minutes per repository - -**Timing Details**: -- **Simple tasks**: Seconds to 1-2 minutes -- **Complex tasks** (e.g., COBOL modernization, refactoring): 5-10 minutes -- **Full benchmark suite** (100+ tasks): Hours (not applicable to AgentReady use case - we run single-repo assessments) - -**Timeout Recommendations**: -- **Harbor internal timeout**: Not explicitly documented (appears to handle timeouts internally) -- **Our subprocess timeout**: 1 hour (3600 seconds) provides 6x buffer over typical 10-minute execution -- **Rationale**: Covers edge cases (large repos, slow networks) while preventing infinite hangs - -**Source**: [Terminal-Bench 2.0 Article - Snorkel AI](https://snorkel.ai/blog/terminal-bench-2-0-raising-the-bar-for-ai-agent-evaluation/), [VentureBeat Article](https://venturebeat.com/ai/terminal-bench-2-0-launches-alongside-harbor-a-new-framework-for-testing) - -**Decision**: -- Set 1-hour (3600s) timeout per benchmark run (aligns with spec FR-009) -- Log warning if execution exceeds 10 minutes (indicates potential issue) -- Document average execution time in README (5-10 minutes for typical repositories) - ---- - -## Research Question 6: Model & Agent Parameter Validation - -**Question**: What are the valid model and agent identifiers for Harbor framework? 
- -**Answer**: Documented model and agent identifiers from Harbor CLI - -**Supported Models** (relevant to AgentReady use case): -- `anthropic/claude-haiku-4-5` ✅ (fast, cost-effective) -- `anthropic/claude-sonnet-4-5` ✅ (balanced) -- `anthropic/claude-opus-4-1` (expensive, high-quality) - -**Supported Agents**: -- `claude-code` ✅ (primary agent for coding tasks) -- `oracle` (reference baseline - uses perfect knowledge) - -**Source**: [GitHub - laude-institute/harbor](https://github.com/laude-institute/harbor) (CLI help output) - -**Decision**: -- Allowlist for models: `["anthropic/claude-haiku-4-5", "anthropic/claude-sonnet-4-5"]` (excludes opus due to cost) -- Allowlist for agents: `["claude-code"]` (excludes oracle as it's not relevant for real-world assessment) -- Validation before subprocess call (addresses security requirement FR-002, FR-003) - ---- - -## Research Question 7: Docker Dependency & Setup - -**Question**: Does Harbor require Docker? What setup is needed? - -**Answer**: Docker is required for local benchmark execution - -**Docker Requirements**: -- Harbor uses Docker containers to create isolated sandbox environments for benchmarks -- Each benchmark task runs in a fresh container (isolation, reproducibility) -- Docker daemon must be running before `harbor run` execution - -**Setup Validation**: -- Harbor validates Docker availability internally (no need for pre-flight checks) -- If Docker unavailable, Harbor returns clear error message -- Follows "trust the framework" philosophy from doubleagent.md (no custom Docker validation needed) - -**Source**: [Harbor Framework Documentation](https://harborframework.com/docs/running-tbench) - -**Decision**: -- Document Docker as required dependency in README -- Trust Harbor's internal Docker validation (no custom pre-flight checks per simplified approach) -- Return clear error message if Harbor fails due to Docker issues (FR-012) - ---- - -## Technology Selection Summary - -| Technology | Decision | Rationale | -|------------|----------|-----------| -| **Harbor Package** | `harbor` via `uv pip install` | Official Laude Institute package, aligns with uv-first approach | -| **Authentication** | `ANTHROPIC_API_KEY` environment variable | Standard Claude API authentication, no Harbor-specific keys | -| **Execution Environment** | Local Docker (no cloud provider) | Simplifies setup, reduces dependencies, sufficient for Phase 2 | -| **CLI Interface** | `harbor run` subprocess call | Well-documented, stable interface, JSON output | -| **Output Parsing** | Parse `results.json` from `--jobs-dir` | Structured JSON format, predictable schema | -| **Timeout** | 3600 seconds (1 hour) | 6x buffer over typical 10-minute execution, prevents infinite hangs | -| **Model Allowlist** | `claude-haiku-4-5`, `claude-sonnet-4-5` | Balance cost and quality, excludes expensive opus | -| **Agent Allowlist** | `claude-code` | Primary coding agent, excludes oracle (not relevant for real assessments) | - ---- - -## Best Practices & Patterns - -### 1. Subprocess Security Pattern - -**Pattern**: Sanitized environment variables -```python -clean_env = { - "ANTHROPIC_API_KEY": os.environ.get("ANTHROPIC_API_KEY"), - "PATH": os.environ.get("PATH"), - "HOME": os.environ.get("HOME"), -} -subprocess.run(cmd, env=clean_env, timeout=3600) -``` - -**Rationale**: Prevents API key exposure through unsanitized `os.environ.copy()` (addresses security review finding) - ---- - -### 2. 
Input Validation Pattern - -**Pattern**: Allowlist validation before subprocess -```python -ALLOWED_MODELS = {"anthropic/claude-haiku-4-5", "anthropic/claude-sonnet-4-5"} -ALLOWED_AGENTS = {"claude-code"} - -if model not in ALLOWED_MODELS: - raise ValueError(f"Invalid model: {model}. Allowed: {ALLOWED_MODELS}") -if agent not in ALLOWED_AGENTS: - raise ValueError(f"Invalid agent: {agent}. Allowed: {ALLOWED_AGENTS}") -``` - -**Rationale**: Prevents command injection via unvalidated parameters (addresses security review finding) - ---- - -### 3. Result Parsing Pattern - -**Pattern**: Path validation before file reading -```python -import os -from pathlib import Path - -jobs_dir = Path(jobs_dir_str).resolve() -results_path = jobs_dir / "results.json" - -# Validate path is within expected directory -if not results_path.is_relative_to(jobs_dir): - raise ValueError(f"Invalid results path: {results_path}") - -with open(results_path) as f: - data = json.load(f) -``` - -**Rationale**: Prevents path traversal attacks when reading Harbor output (addresses FR-005) - ---- - -### 4. Graceful Degradation Pattern - -**Pattern**: Environment variable toggle -```python -use_real = os.environ.get("TBENCH_USE_REAL", "0") == "1" - -if use_real: - result = _real_tbench_result(repo_path) -else: - result = _mocked_tbench_result(repo_path) -``` - -**Rationale**: Preserves backward compatibility, safe default for CI/CD (addresses FR-007, FR-014) - ---- - -## Alternatives Considered - -### Alternative 1: Direct Terminal-Bench API Integration - -**Considered**: Bypassing Harbor and calling Terminal-Bench API directly - -**Rejected Because**: -- Harbor is the official harness and recommended approach -- Harbor abstracts complexity of container management -- Direct API would require reimplementing Harbor's orchestration logic -- Harbor provides CLI interface that's simpler than API calls - ---- - -### Alternative 2: Custom Exception Classes for Harbor Errors - -**Considered**: Creating 7 custom exception classes (HarborNotFoundError, DockerMissingError, etc.) - -**Rejected Because**: -- Over-engineering (violates doubleagent.md anti-patterns) -- RuntimeError with clear message provides same functionality -- Simplified approach reduces 186 lines to 35 lines (76% reduction) -- No benefit to custom exceptions for subprocess call failures - ---- - -### Alternative 3: Pre-flight Checks for Docker/Harbor Installation - -**Considered**: Implementing 3 pre-flight check methods to validate Docker and Harbor before execution - -**Rejected Because**: -- Trust Harbor's internal validation (philosophy from doubleagent.md) -- Duplicates validation Harbor already performs -- Adds complexity without value (Harbor errors are already clear) -- Simplified approach removes unnecessary code - ---- - -### Alternative 4: Separate CrossRepoAggregator Service Class - -**Considered**: Creating dedicated service class for multi-repository aggregation - -**Rejected Because**: -- Pandas DataFrame operations are simpler (30 lines vs 171 lines) -- No need for separate class when aggregation is straightforward -- Inline implementation in CLI command is sufficient -- Violates doubleagent.md: "avoid abstractions for one-time operations" - ---- - -## Open Questions Resolved - -All questions from Technical Context section are now resolved: - -| Question | Resolution | -|----------|------------| -| Harbor package name? | `harbor` via `uv pip install harbor` | -| Authentication method? | `ANTHROPIC_API_KEY` environment variable | -| CLI command syntax? 
| `harbor run --dataset terminal-bench@2.0 --agent claude-code --model --jobs-dir ` | -| Output format? | JSON file at `/results.json` with accuracy, pass@k metrics | -| Execution times? | 5-10 minutes average, 1-hour timeout provides 6x buffer | -| Docker requirement? | Yes, required for local execution (trust Harbor's validation) | -| Model/agent validation? | Allowlist: models={haiku-4-5, sonnet-4-5}, agents={claude-code} | - ---- - -## Impact on Implementation Plan - -**Technical Context Updates**: -- Primary Dependencies: `harbor` (via uv), `pandas` (existing), `subprocess` (stdlib) -- Performance Goals: 5-10 minutes per benchmark, 4 concurrent workers, 1-hour timeout -- Constraints: Docker required, `ANTHROPIC_API_KEY` environment variable -- Scale/Scope: 10-20 diverse repositories for Phase 2 empirical validation - -**Implementation Simplifications**: -- No custom exception classes (use RuntimeError) -- No pre-flight checks (trust Harbor validation) -- No separate aggregator service (inline pandas operations) -- Total implementation: ~120 lines (not 507) - 76% reduction - ---- - -## Next Steps - -1. ✅ Research complete - all NEEDS CLARIFICATION resolved -2. ⏭️ Phase 1: Design data models (TbenchResult, BenchmarkRun, AggregatedResult) -3. ⏭️ Phase 1: Generate contracts (JSON schema for results.json parsing) -4. ⏭️ Phase 1: Create quickstart guide (Harbor setup, first benchmark run) - ---- - -**Sources**: -- [Harbor Framework - GitHub](https://github.com/laude-institute/harbor) -- [Harbor Framework Documentation - Running Terminal-Bench](https://harborframework.com/docs/running-tbench) -- [Terminal-Bench - GitHub](https://github.com/laude-institute/terminal-bench) -- [Terminal-Bench 2.0 Article - Snorkel AI](https://snorkel.ai/blog/terminal-bench-2-0-raising-the-bar-for-ai-agent-evaluation/) -- [VentureBeat - Terminal-Bench 2.0 Launch](https://venturebeat.com/ai/terminal-bench-2-0-launches-alongside-harbor-a-new-framework-for-testing) -- [DeepWiki - Terminal-Bench Getting Started](https://deepwiki.com/laude-institute/terminal-bench/2-getting-started) - ---- - -**Document Status**: Complete -**Last Updated**: 2025-12-09 -**Ready for Phase 1**: ✅ Yes diff --git a/specs/002-harbor-real-integration/spec.md b/specs/002-harbor-real-integration/spec.md deleted file mode 100644 index 00c65b03..00000000 --- a/specs/002-harbor-real-integration/spec.md +++ /dev/null @@ -1,204 +0,0 @@ -# Feature Specification: Harbor Framework Real Integration for Terminal-Bench Eval Harness - -**Feature Branch**: `002-harbor-real-integration` -**Created**: 2025-12-09 -**Status**: Draft -**Input**: User description: "Review https://github.com/ambient-code/agentready/issues/190 and all comments and implement accordingly. Make sure to also consult .claude/agents/doubleagent.md as necessary. I want you to track and report on what the specific impact of doubleagent.md was in this implementation." - -## User Scenarios & Testing *(mandatory)* - -### User Story 1 - Run Real Terminal-Bench Evaluations (Priority: P1) - -A developer wants to run real Terminal-Bench evaluations on their repository to measure how well AgentReady assessors improve AI coding assistant performance using actual benchmark data from the Harbor framework, not mocked results. - -**Why this priority**: This is the core value proposition of Phase 2 - replacing mocked integration with real empirical data. Without this, we cannot validate assessor effectiveness with real-world evidence. 
- -**Independent Test**: Can be fully tested by running a single benchmark on one repository and verifying that real Harbor framework API is called, results are returned, and they differ from mocked results. - -**Acceptance Scenarios**: - -1. **Given** Harbor framework CLI is installed and API credentials are configured, **When** developer runs `agentready tbench baseline /path/to/repo`, **Then** system submits repository to real Terminal-Bench via Harbor framework and returns actual benchmark score -2. **Given** Harbor framework is installed, **When** developer runs `agentready tbench test-assessor --assessor claude_md /path/to/repo`, **Then** system runs real baseline and delta evaluation and reports actual score improvement -3. **Given** environment variable `TBENCH_USE_REAL=1` is set, **When** any tbench command executes, **Then** system uses real Harbor framework instead of mocked implementation -4. **Given** Harbor framework is not installed, **When** developer runs tbench command, **Then** system shows clear error message with installation instructions - ---- - -### User Story 2 - Aggregate Multi-Repository Results (Priority: P2) - -A researcher wants to run benchmarks across multiple diverse repositories (different languages, sizes, domains) and see aggregated statistics showing which assessors consistently improve benchmark scores and which have no measurable impact. - -**Why this priority**: This enables the empirical assessor refinement goal - identifying high-impact vs low-impact assessors based on real data. This is valuable but depends on Story 1 being complete first. - -**Independent Test**: Can be tested by running benchmarks on 3-5 repositories with different assessors and verifying that aggregation shows mean/median/std delta scores correctly grouped by assessor. - -**Acceptance Scenarios**: - -1. **Given** benchmark results exist for 10+ repositories, **When** developer runs `agentready tbench summarize`, **Then** system shows aggregated statistics (mean, median, std) for each assessor's delta impact -2. **Given** aggregated results are displayed, **When** developer reviews output, **Then** assessors are ranked by mean delta score with statistical significance indicators -3. **Given** multiple benchmark runs exist, **When** developer requests summary, **Then** system identifies assessors with consistently positive impact vs assessors with no significant impact -4. **Given** aggregated data exists, **When** developer views results, **Then** recommendations are provided for which assessors to keep/promote and which to remove/demote - ---- - -### User Story 3 - Secure API Integration (Priority: P1) - -A developer wants to run real benchmarks without exposing their API credentials to subprocesses or command injection vulnerabilities, ensuring that sensitive data is properly sanitized and validated. - -**Why this priority**: Security is critical when integrating with external APIs. The automated review identified critical vulnerabilities (API key exposure, command injection) that must be fixed before production use. This has same priority as P1 because it blocks safe deployment. - -**Independent Test**: Can be tested by attempting to pass malicious input to model/agent parameters and verifying rejection, and by checking that only required environment variables are passed to subprocesses. - -**Acceptance Scenarios**: - -1. 
**Given** API credentials are in environment variables, **When** Harbor framework subprocess is called, **Then** only required variables (API key, PATH, HOME) are passed, not all environment variables -2. **Given** user provides model parameter, **When** system validates input, **Then** only allowlisted models (claude-haiku-4-5, claude-sonnet-4-5) are accepted -3. **Given** user provides agent parameter, **When** system validates input, **Then** only allowlisted agents (claude-code) are accepted -4. **Given** malicious input is provided for model/agent parameters, **When** system validates, **Then** input is rejected with clear error message before subprocess call - ---- - -### User Story 4 - Resource-Limited Parallel Execution (Priority: P2) - -A developer wants to run benchmarks on multiple repositories in parallel without exhausting system resources (memory, CPU, file handles), ensuring stable execution even when processing large batches. - -**Why this priority**: Running 8 repositories × 35 assessor combinations (280 total runs) requires careful resource management to avoid system crashes. Important for production use but not blocking MVP. - -**Independent Test**: Can be tested by running 20+ parallel benchmark jobs and verifying that system respects worker pool limits (max 4 concurrent) and handles timeouts gracefully. - -**Acceptance Scenarios**: - -1. **Given** developer runs benchmarks on 10 repositories, **When** execution starts, **Then** no more than 4 benchmarks run concurrently regardless of total queue size -2. **Given** parallel execution is running, **When** one benchmark times out (1 hour limit), **Then** that job is terminated and next job starts without blocking other workers -3. **Given** resource limits are in place, **When** running large batch (50+ repos), **Then** system remains stable and does not exhaust file handles or memory -4. **Given** parallel execution completes, **When** developer reviews results, **Then** all successful results are aggregated and failures are clearly logged - ---- - -### Edge Cases - -- What happens when Harbor framework is installed but API credentials are missing or invalid? -- How does system handle network failures during long-running benchmark submissions (30+ minutes)? -- What happens when a benchmark times out after the Harbor framework's internal timeout (not our timeout)? -- How does system handle repositories that are too large for Terminal-Bench (>100k files)? -- What happens when Harbor CLI returns non-JSON output (error messages, warnings)? -- How does system handle partial results when some repositories succeed and others fail in batch mode? 
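User Story 4 and the batch-mode edge cases above describe bounded parallelism (at most 4 workers) in which a timed-out or failed run must not block the rest of the batch. A minimal sketch of that shape, not the shipped implementation; `job_fn` and `jobs` are hypothetical stand-ins for a single-benchmark callable and its inputs, and the 1-hour per-run timeout is assumed to be enforced inside the worker itself (e.g. via a subprocess timeout):

```python
# Sketch only: bounded parallelism for batch benchmark runs (max 4 workers).
# job_fn / jobs are hypothetical stand-ins; the per-run timeout is assumed
# to be enforced inside job_fn rather than by the pool.
from concurrent.futures import ProcessPoolExecutor, as_completed

def run_batch(jobs, job_fn, max_workers=4):
    """Run jobs with at most `max_workers` in flight; collect failures instead of raising."""
    results, failures = [], []
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        future_to_job = {pool.submit(job_fn, job): job for job in jobs}
        for future in as_completed(future_to_job):
            job = future_to_job[future]
            try:
                results.append(future.result())
            except Exception as exc:  # a failed or timed-out run is recorded, not fatal
                failures.append((job, repr(exc)))
    return results, failures
```

Successful results can then feed aggregation while failures are reported separately, matching acceptance scenario 4 of User Story 4 above.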
- -## Requirements *(mandatory)* - -### Functional Requirements - -- **FR-001**: System MUST replace `_real_tbench_result()` NotImplementedError with functional Harbor framework subprocess integration -- **FR-002**: System MUST validate model parameter against allowlist (anthropic/claude-haiku-4-5, anthropic/claude-sonnet-4-5) before subprocess call -- **FR-003**: System MUST validate agent parameter against allowlist (claude-code) before subprocess call -- **FR-004**: System MUST pass only required environment variables (API key, PATH, HOME) to Harbor subprocess, not all environment variables -- **FR-005**: System MUST parse Harbor framework JSON output and validate file paths before reading -- **FR-006**: System MUST return TbenchResult with is_mocked=False when using real Harbor framework -- **FR-007**: System MUST support environment variable `TBENCH_USE_REAL=1` to toggle between mocked and real integration -- **FR-008**: System MUST limit parallel execution to 4 concurrent workers using ProcessPoolExecutor -- **FR-009**: System MUST enforce 1-hour timeout per individual benchmark run -- **FR-010**: System MUST aggregate results across multiple repositories showing mean, median, standard deviation for each assessor's delta score -- **FR-011**: System MUST indicate statistical significance when aggregating results (e.g., confidence intervals, p-values) -- **FR-012**: System MUST handle Harbor framework errors gracefully with clear error messages and installation guidance -- **FR-013**: System MUST document aggregated results in `docs/tbench/assessor-refinement-results.md` with recommendations for assessor list changes -- **FR-014**: System MUST preserve backward compatibility with existing mocked integration for testing/development - -### Key Entities - -- **TbenchResult**: Represents benchmark output with score, task_solved boolean, and is_mocked flag indicating real vs mocked execution -- **BenchmarkRun**: Represents single benchmark execution with repository path, assessor ID (or None for baseline), timestamp, result, and execution metadata (duration, errors) -- **AggregatedResult**: Represents statistical summary across multiple repositories for a specific assessor including mean/median/std delta scores, sample size, and significance indicators -- **HarborConfig**: Represents Harbor framework configuration including API credentials, model selection, agent selection, and timeout settings - -## Success Criteria *(mandatory)* - -### Measurable Outcomes - -- **SC-001**: Developers can successfully run real Terminal-Bench evaluations on at least 10 diverse repositories with 100% success rate for repositories under 50k files -- **SC-002**: Benchmark results from real Harbor framework differ measurably from mocked results (validate by comparing scores on same repository) -- **SC-003**: System blocks invalid model/agent parameters with 100% accuracy before subprocess execution (security validation) -- **SC-004**: System exposes zero API credentials to subprocess environment beyond required variables (verified via process inspection) -- **SC-005**: Parallel execution of 20+ repositories completes without resource exhaustion (memory stays under 2GB, file handles under 1024) -- **SC-006**: Aggregated results clearly identify top 5 assessors with highest mean delta improvement and bottom 5 with no measurable impact -- **SC-007**: Documentation deliverable (`docs/tbench/assessor-refinement-results.md`) provides actionable recommendations for assessor tier changes based on empirical data -- 
**SC-008**: 95% of Harbor framework errors result in clear, actionable error messages for users (not stack traces) -- **SC-009**: Complete benchmark suite (8 repos × 35 assessors = 280 runs) completes in under 24 hours with 4-worker parallelism -- **SC-010**: System maintains 100% backward compatibility with existing mocked integration for automated testing - -## Assumptions - -- Harbor framework Python package exists and is installable via pip/uv (package name to be confirmed during implementation) -- Terminal-Bench API access is available via tbench.ai with API key authentication -- Benchmark execution time averages 5-10 minutes per repository (informing timeout and parallelism decisions) -- Developers have Harbor CLI installed locally before using real integration (installation documented in README) -- Standard session-based authentication is sufficient for Harbor framework API (no OAuth required) -- JSON is the standard output format for Harbor framework results -- Repositories under 50k files are supported by Terminal-Bench (larger repositories may fail or timeout) -- Statistical significance can be determined with 10-20 repository samples per assessor (adequate sample size) -- Default behavior remains mocked integration unless explicitly toggled with environment variable (safe default for CI/CD) - -## Scope - -### In Scope - -- Replace NotImplementedError in `_real_tbench_result()` with functional Harbor framework integration -- Add input validation (allowlist) for model and agent parameters -- Sanitize environment variables passed to Harbor subprocess -- Add parallel execution limits (4 workers) with timeouts (1 hour per job) -- Add pandas-based aggregation to existing `summarize` command for cross-repo statistics -- Document empirical findings in `docs/tbench/assessor-refinement-results.md` -- Update `README.md` with Harbor setup instructions -- Add environment variable toggle (`TBENCH_USE_REAL=1`) for real vs mocked integration -- Add integration tests with subprocess mocking for Harbor calls - -### Out of Scope - -- Custom exception classes (7 classes) - use RuntimeError instead per simplified approach -- Pre-flight check methods (3 methods) - trust Harbor's validation -- Separate `CrossRepoAggregator` service class - inline with pandas in CLI -- Docker installation validation - trust Harbor framework's Docker checks -- Public leaderboard submission features (Phase 3) -- Real-time progress UI during long-running benchmarks -- Retry logic for transient network failures (rely on Harbor's internal retry) -- Custom timeout configurations per repository size -- Automated assessor tier reassignment based on results (manual review required) - -## Dependencies - -- Harbor framework Python package (exact package name TBD during implementation research) -- Terminal-Bench API access via tbench.ai -- API credentials (environment variable: `TBENCH_API_KEY`) -- Harbor CLI installed locally -- Pandas library for aggregation (already in dependencies) -- Network access to tbench.ai submission endpoints -- Docker (required by Harbor framework for containerized benchmarks) - -## Non-Functional Requirements - -- **Performance**: Individual benchmark runs complete within 1 hour timeout (assuming 5-10 minute average) -- **Reliability**: System handles network failures and timeouts gracefully without crashing -- **Security**: API credentials never exposed beyond required subprocess environment -- **Usability**: Error messages provide clear guidance with installation instructions -- **Maintainability**: 
Implementation adds ~120 lines of code (not 507) following simplified approach -- **Compatibility**: Maintains 100% backward compatibility with existing mocked integration - -## Risks & Mitigations - -**Risk**: Harbor framework package name or API may differ from documentation -**Mitigation**: Begin with research phase to confirm package installation and basic API usage before full implementation - -**Risk**: Real benchmarks may be significantly slower than estimated (5-10 min), causing 24-hour goal to slip -**Mitigation**: Implement configurable worker pool size and timeout values for tuning based on empirical data - -**Risk**: Statistical sample size (10-20 repos) may be insufficient for confident significance testing -**Mitigation**: Document confidence intervals and p-values in results; note sample size limitations in recommendations - -**Risk**: Command injection may still be possible through repository paths or other inputs -**Mitigation**: Add path validation and sanitization alongside model/agent allowlists - -**Risk**: Parallel execution may still exhaust resources despite 4-worker limit on systems with limited memory -**Mitigation**: Document minimum system requirements (4GB RAM, 2GB free disk) in README - -## Open Questions - -None - all critical decisions have been made based on issue #190 requirements and automated review feedback. Simplified approach removes uncertainty around over-engineered components. diff --git a/specs/002-harbor-real-integration/tasks.md b/specs/002-harbor-real-integration/tasks.md deleted file mode 100644 index 1e02cc00..00000000 --- a/specs/002-harbor-real-integration/tasks.md +++ /dev/null @@ -1,467 +0,0 @@ -# Implementation Tasks: Harbor Framework Real Integration for Terminal-Bench Eval Harness - -**Feature Branch**: `002-harbor-real-integration` -**Created**: 2025-12-09 -**Spec**: [spec.md](./spec.md) | **Plan**: [plan.md](./plan.md) - ---- - -## Overview - -This document breaks down the Harbor Framework Real Integration feature into executable, dependency-ordered tasks. Tasks are organized by user story to enable independent implementation and testing following TDD (Test-Driven Development) red-green-refactor workflow. - -**Total Tasks**: 35 -**Estimated Implementation**: ~120 lines of new code (76% reduction from original plan) -**TDD Approach**: MANDATORY - Tests written FIRST (red phase) before implementation (green phase) - ---- - -## Task Format Legend - -``` -- [ ] [TaskID] [P?] [Story?] Description with file path -``` - -- **[TaskID]**: Sequential number (T001, T002, ...) in execution order -- **[P]**: Parallelizable (can run simultaneously with other [P] tasks in same phase) -- **[Story]**: User story label ([US1], [US2], [US3], [US4]) for tracking -- **File Path**: Exact location for implementation - ---- - -## Phase 1: Setup & Dependencies - -**Goal**: Prepare project environment and install Harbor framework dependency. - -**Tasks**: - -- [X] T001 Add `harbor>=2.0.0` dependency to `pyproject.toml` under dependencies section -- [X] T002 Install Harbor framework package via `uv pip install harbor` and verify installation with `harbor --version` -- [X] T003 Update `.gitignore` to exclude temporary benchmark output directories (`**/tbench-results/`, `**/.harbor-cache/`) -- [X] T004 Create `src/agentready/services/eval_harness/harbor_config.py` file stub (empty file with module docstring) - -**Completion Criteria**: Harbor package installed, project dependencies updated, file structure ready for implementation. 
- ---- - -## Phase 2: Foundational Infrastructure (Blocking Prerequisites) - -**Goal**: Implement core configuration and validation infrastructure needed by all user stories. - -**Independent Test**: HarborConfig validation can be tested independently with unit tests before any Harbor subprocess integration. - -### 2.1 TDD: Write Tests for HarborConfig (Red Phase) - -- [X] T005 [P] Create `tests/unit/test_harbor_config.py` with test structure and imports -- [X] T006 [P] Write test `test_harbor_config_valid_model_haiku` - verify haiku-4-5 model accepted in `tests/unit/test_harbor_config.py` -- [X] T007 [P] Write test `test_harbor_config_valid_model_sonnet` - verify sonnet-4-5 model accepted in `tests/unit/test_harbor_config.py` -- [X] T008 [P] Write test `test_harbor_config_invalid_model_rejected` - verify invalid model raises ValueError in `tests/unit/test_harbor_config.py` -- [X] T009 [P] Write test `test_harbor_config_invalid_agent_rejected` - verify invalid agent raises ValueError in `tests/unit/test_harbor_config.py` -- [X] T010 [P] Write test `test_harbor_config_empty_api_key_rejected` - verify empty API key raises ValueError in `tests/unit/test_harbor_config.py` -- [X] T011 [P] Write test `test_harbor_config_negative_timeout_rejected` - verify negative timeout raises ValueError in `tests/unit/test_harbor_config.py` -- [X] T012 [P] Write test `test_harbor_config_path_resolution` - verify jobs_dir resolved to absolute path in `tests/unit/test_harbor_config.py` - -**Checkpoint**: Run tests, verify all FAIL (red phase complete) - `pytest tests/unit/test_harbor_config.py` - -### 2.2 Implement HarborConfig (Green Phase) - -- [X] T013 Define `ALLOWED_MODELS` constant set in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T014 Define `ALLOWED_AGENTS` constant set in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T015 Implement `HarborConfig` dataclass with all fields (model, agent, jobs_dir, api_key, timeout, n_concurrent) in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T016 Implement `HarborConfig.__post_init__()` with model allowlist validation in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T017 Implement `HarborConfig.__post_init__()` with agent allowlist validation in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T018 Implement `HarborConfig.__post_init__()` with API key non-empty validation in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T019 Implement `HarborConfig.__post_init__()` with timeout positive validation in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T020 Implement `HarborConfig.__post_init__()` with jobs_dir path resolution to absolute path in `src/agentready/services/eval_harness/harbor_config.py` - -**Checkpoint**: Run tests, verify all PASS (green phase complete) - `pytest tests/unit/test_harbor_config.py` - -### 2.3 Refactor & Document (Refactor Phase) - -- [X] T021 Add docstrings to `HarborConfig` class and `__post_init__` method in `src/agentready/services/eval_harness/harbor_config.py` -- [X] T022 Add module-level docstring explaining Harbor framework configuration in `src/agentready/services/eval_harness/harbor_config.py` - -**Completion Criteria**: HarborConfig fully tested and implemented with >80% coverage, all tests passing. 
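Taken together, T013–T022 imply a configuration object roughly like the sketch below. The field list follows T015 and the validations follow T016–T020; defaults, error messages, and everything else are assumptions rather than a definitive implementation.

```python
from dataclasses import dataclass
from pathlib import Path

ALLOWED_MODELS = {"anthropic/claude-haiku-4-5", "anthropic/claude-sonnet-4-5"}
ALLOWED_AGENTS = {"claude-code"}


@dataclass
class HarborConfig:
    """Harbor framework run configuration (field set per T015, illustrative only)."""

    model: str
    agent: str
    jobs_dir: Path
    api_key: str
    timeout: int = 3600
    n_concurrent: int = 1

    def __post_init__(self) -> None:
        # T016-T019: allowlist and value validation before any subprocess call.
        if self.model not in ALLOWED_MODELS:
            raise ValueError(f"Model not allowed: {self.model!r}")
        if self.agent not in ALLOWED_AGENTS:
            raise ValueError(f"Agent not allowed: {self.agent!r}")
        if not self.api_key:
            raise ValueError("API key must be non-empty")
        if self.timeout <= 0:
            raise ValueError("Timeout must be positive")
        # T020: resolve jobs_dir to an absolute path.
        self.jobs_dir = Path(self.jobs_dir).resolve()
```

Constructing `HarborConfig(...)` with an off-list model or an empty API key would then raise immediately, which is the behavior tests T006–T012 are written to check.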
- ---- - -## Phase 3: User Story 1 + User Story 3 (P1 - MVP) - -**Combined Stories**: US1 (Run Real Terminal-Bench Evaluations) + US3 (Secure API Integration) - -**Why Combined**: Security is integral to Harbor subprocess calls, not a separate feature. US3 requirements are implemented directly within US1's Harbor integration code. - -**Goal**: Replace `_real_tbench_result()` NotImplementedError with functional, secure Harbor framework subprocess integration. - -**Independent Test**: Run single benchmark on one repository, verify real Harbor framework subprocess called with sanitized environment variables, results parsed correctly, and differ from mocked results. - -### 3.1 TDD: Write Tests for Harbor Subprocess Integration (Red Phase) - -- [X] T023 [P] [US1] Write test `test_real_tbench_result_subprocess_called` - verify `harbor run` command constructed correctly in `tests/unit/test_eval_harness_services.py` -- [X] T024 [P] [US1] [US3] Write test `test_environment_variable_sanitization` - verify only ANTHROPIC_API_KEY, PATH, HOME passed to subprocess in `tests/unit/test_eval_harness_services.py` -- [X] T025 [P] [US1] Write test `test_harbor_subprocess_timeout_enforced` - verify subprocess.run called with timeout=3600 in `tests/unit/test_eval_harness_services.py` -- [X] T026 [P] [US1] Write test `test_harbor_subprocess_timeout_exception` - verify RuntimeError raised when subprocess times out in `tests/unit/test_eval_harness_services.py` -- [X] T027 [P] [US1] Write test `test_harbor_subprocess_failure_exception` - verify RuntimeError raised when subprocess fails in `tests/unit/test_eval_harness_services.py` - -**Checkpoint**: Run tests, verify all FAIL (red phase complete) - `pytest tests/unit/test_eval_harness_services.py -k "real_tbench"` - -### 3.2 TDD: Write Tests for JSON Parsing with Path Validation (Red Phase) - -- [X] T028 [P] [US1] [US3] Write test `test_parse_harbor_results_valid_json` - verify results.json parsed correctly in `tests/unit/test_eval_harness_services.py` -- [X] T029 [P] [US1] Write test `test_parse_harbor_results_creates_tbench_result` - verify TbenchResult created with is_mocked=False in `tests/unit/test_eval_harness_services.py` -- [X] T030 [P] [US1] [US3] Write test `test_parse_harbor_results_path_validation` - verify path traversal attack (../../etc/passwd) rejected in `tests/unit/test_eval_harness_services.py` -- [X] T031 [P] [US1] Write test `test_parse_harbor_results_invalid_json_exception` - verify JSONDecodeError handled gracefully in `tests/unit/test_eval_harness_services.py` - -**Checkpoint**: Run tests, verify all FAIL (red phase complete) - `pytest tests/unit/test_eval_harness_services.py -k "parse_harbor"` - -### 3.3 Implement TbenchResult Extension (Green Phase) - -- [X] T032 [US1] Extend `TbenchResult` dataclass with new optional fields (resolved_trials, unresolved_trials, pass_at_1, pass_at_3) with default values in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T033 [US1] Add `TbenchResult.__post_init__()` validation for score range [0.0, 1.0] in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T034 [US1] Add `TbenchResult.__post_init__()` validation for non-negative trial counts in `src/agentready/services/eval_harness/tbench_runner.py` - -### 3.4 Implement Harbor Subprocess Integration (Green Phase) - -- [X] T035 [US1] Import `HarborConfig`, `subprocess`, `tempfile`, `os`, `json` at top of `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T036 [US1] Replace `_real_tbench_result()` NotImplementedError with 
HarborConfig initialization in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T037 [US1] Implement `_real_tbench_result()` - build `harbor run` command list with all parameters in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T038 [US1] [US3] Implement `_real_tbench_result()` - create clean_env dict with only ANTHROPIC_API_KEY, PATH, HOME in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T039 [US1] Implement `_real_tbench_result()` - call subprocess.run() with cmd, env, timeout, check=True in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T040 [US1] Implement `_real_tbench_result()` - handle subprocess.TimeoutExpired exception, raise RuntimeError in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T041 [US1] Implement `_real_tbench_result()` - handle subprocess.CalledProcessError exception, raise RuntimeError in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T042 [US1] [US3] Implement `_real_tbench_result()` - validate results_path.is_relative_to(jobs_dir), raise ValueError if path traversal detected in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T043 [US1] Implement `_real_tbench_result()` - call parse_harbor_results() and return TbenchResult in `src/agentready/services/eval_harness/tbench_runner.py` - -### 3.5 Implement JSON Parsing Function (Green Phase) - -- [X] T044 [US1] Create `parse_harbor_results(results_path: Path) -> TbenchResult` function in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T045 [US1] Implement `parse_harbor_results()` - open and load results.json with json.load() in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T046 [US1] Implement `parse_harbor_results()` - extract summary dict from data["summary"] in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T047 [US1] Implement `parse_harbor_results()` - create TbenchResult with all fields mapped from summary in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T048 [US1] Implement `parse_harbor_results()` - set is_mocked=False, task_solved=resolved_trials>0 in `src/agentready/services/eval_harness/tbench_runner.py` - -**Checkpoint**: Run all tests, verify PASS (green phase complete) - `pytest tests/unit/test_eval_harness_services.py tests/unit/test_harbor_config.py` - -### 3.6 Integration Test (Red-Green) - -- [X] T049 [US1] Write integration test `test_full_real_benchmark_workflow_mocked_subprocess` in `tests/integration/test_eval_harness_e2e.py` - mock subprocess.run, verify end-to-end flow -- [X] T050 [US1] Implement fix if integration test fails, verify test passes - -**Checkpoint**: Run integration test, verify PASS - `pytest tests/integration/test_eval_harness_e2e.py -k "real_benchmark"` - -### 3.7 Refactor & Document (Refactor Phase) - -- [X] T051 [US1] Add docstrings to `_real_tbench_result()` and `parse_harbor_results()` functions in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T052 [US1] [US3] Add inline security comments explaining env sanitization and path validation in `src/agentready/services/eval_harness/tbench_runner.py` -- [X] T053 [US1] Extract magic numbers (timeout=3600, n_concurrent=1) to constants at module level in `src/agentready/services/eval_harness/tbench_runner.py` - -**Completion Criteria**: -- ✅ User Story 1 complete: Real benchmarks run successfully via Harbor framework -- ✅ User Story 3 complete: Security validations prevent API key exposure and command injection -- ✅ Tests passing with >80% 
coverage for new Harbor integration code -- ✅ Independent test verified: Single benchmark on one repository succeeds with real Harbor subprocess - -**MVP Milestone**: This phase completes the minimum viable product - real, secure Harbor framework integration. - ---- - -## Phase 4: User Story 2 (P2 - Aggregation) - -**Goal**: Implement pandas-based aggregation to summarize assessor effectiveness across multiple repositories. - -**Independent Test**: Run benchmarks on 3-5 repositories with different assessors, verify aggregation shows mean/median/std delta scores correctly grouped by assessor. - -### 4.1 TDD: Write Tests for Aggregation Logic (Red Phase) - -- [X] T054 [P] [US2] Write test `test_summarize_aggregates_by_assessor` - verify pandas groupby on assessor_id in `tests/unit/test_eval_harness_cli.py` -- [X] T055 [P] [US2] Write test `test_summarize_calculates_mean_median_std` - verify correct aggregation functions in `tests/unit/test_eval_harness_cli.py` -- [X] T056 [P] [US2] Write test `test_summarize_adds_significance_indicator` - verify boolean significant column added in `tests/unit/test_eval_harness_cli.py` -- [X] T057 [P] [US2] Write test `test_summarize_sorts_by_mean_delta_descending` - verify results sorted correctly in `tests/unit/test_eval_harness_cli.py` -- [X] T058 [P] [US2] Write test `test_summarize_exports_json` - verify JSON file written with correct schema in `tests/unit/test_eval_harness_cli.py` - -**Checkpoint**: Run tests, verify all FAIL (red phase complete) - `pytest tests/unit/test_eval_harness_cli.py -k "summarize"` ✅ - -### 4.2 Implement Aggregation Logic (Green Phase) - -**Note**: Implemented in `src/agentready/services/eval_harness/aggregator.py` (separate module following "generic interface first" principle) instead of CLI file. CLI integration deferred to future task. - -- [X] T059 [US2] Import `pandas as pd` in aggregator module -- [X] T060 [US2] Create `aggregate_results()` function signature with generic interface -- [X] T061 [US2] Implement `aggregate_results()` - create DataFrame from results list -- [X] T062 [US2] Implement `aggregate_results()` - groupby aggregation (mean, median, std, count) -- [X] T063 [US2] Implement `aggregate_results()` - rename columns to mean_delta, median_delta, std_delta, sample_size -- [X] T064 [US2] Implement `aggregate_results()` - add significant column with abs(mean_delta) > 0.05 placeholder -- [X] T065 [US2] Implement `aggregate_results()` - sort by mean_delta descending -- [X] T066 [US2] Handle edge cases (empty results, NaN std for single values) -- [X] T067 [US2] Round numeric values to 2 decimal places for readability - -**Checkpoint**: Run tests, verify all PASS (green phase complete) - `pytest tests/unit/test_eval_harness_cli.py` ✅ - -### 4.3 Refactor & Document (Refactor Phase) - -- [X] T068 [US2] Add comprehensive docstrings to `aggregate_results()` function with Args/Returns/Examples -- [X] T069 [US2] Add module-level docstring explaining aggregation purpose and usage - -**Completion Criteria**: -- ✅ User Story 2 complete: Aggregation summarizes assessor effectiveness across repositories -- ✅ Tests passing with >80% coverage for aggregation logic -- ✅ Independent test verified: Aggregation on 3-5 repositories produces correct statistics - ---- - -## Phase 5: User Story 4 (P2 - Parallel Execution) - -**Goal**: Implement resource-limited parallel execution with ProcessPoolExecutor to handle large batches without exhausting system resources. 
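As a rough illustration of this goal and of tasks T074–T082, the batch runner might look like the sketch below. `run_one_benchmark` is a hypothetical stand-in for the real per-repository call; only the worker count and per-job timeout are taken from the tasks, and everything else is an assumption.

```python
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

# T082: worker count and per-job timeout as module-level constants.
MAX_WORKERS = 4
JOB_TIMEOUT_SECONDS = 3600


def run_one_benchmark(repo: Path) -> dict:
    """Placeholder for the real per-repository benchmark call."""
    return {"repository": str(repo), "score": 0.0}


def run_batch_benchmarks(repositories: list[Path]) -> list[dict]:
    """Run benchmarks with at most MAX_WORKERS concurrent jobs (sketch of T074-T080)."""
    results, failures = [], []
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as pool:
        futures = {pool.submit(run_one_benchmark, repo): repo for repo in repositories}
        for future in as_completed(futures):
            repo = futures[future]
            try:
                results.append(future.result(timeout=JOB_TIMEOUT_SECONDS))
            except Exception as exc:  # T079-T080: log the failure, keep other workers going
                failures.append((repo, exc))
    return results
```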
- -**Independent Test**: Run 20+ parallel benchmark jobs, verify system respects 4-worker limit and handles timeouts gracefully. - -### 5.1 TDD: Write Tests for Parallel Execution (Red Phase) - -- [X] T070 [P] [US4] Write test `test_parallel_execution_max_4_workers` - verify ProcessPoolExecutor initialized with max_workers=4 in `tests/unit/test_eval_harness_services.py` -- [X] T071 [P] [US4] Write test `test_parallel_execution_timeout_per_job` - verify each job has 3600s timeout in `tests/unit/test_eval_harness_services.py` -- [X] T072 [P] [US4] Write test `test_parallel_execution_handles_partial_failures` - verify some jobs can fail without blocking others in `tests/unit/test_eval_harness_services.py` -- [X] T073 [P] [US4] Write test `test_parallel_execution_aggregates_successful_results` - verify only successful results aggregated in `tests/unit/test_eval_harness_services.py` - -**Checkpoint**: Run tests, verify all FAIL (red phase complete) - `pytest tests/unit/test_eval_harness_services.py -k "parallel"` ✅ - -### 5.2 Implement Parallel Execution (Green Phase) - -- [X] T074 [US4] Import `concurrent.futures.ProcessPoolExecutor`, `concurrent.futures.as_completed` in `src/agentready/services/eval_harness/batch_runner.py` (new file) -- [X] T075 [US4] Create `run_batch_benchmarks()` function with repositories list parameter in `src/agentready/services/eval_harness/batch_runner.py` -- [X] T076 [US4] Implement `run_batch_benchmarks()` - initialize ProcessPoolExecutor with max_workers=4 in `src/agentready/services/eval_harness/batch_runner.py` -- [X] T077 [US4] Implement `run_batch_benchmarks()` - submit futures for each repository in `src/agentready/services/eval_harness/batch_runner.py` -- [X] T078 [US4] Implement `run_batch_benchmarks()` - use as_completed() to handle futures as they finish in `src/agentready/services/eval_harness/batch_runner.py` -- [X] T079 [US4] Implement `run_batch_benchmarks()` - catch exceptions from future.result(timeout=3600) in `src/agentready/services/eval_harness/batch_runner.py` -- [X] T080 [US4] Implement `run_batch_benchmarks()` - log failures, aggregate successes, return results list in `src/agentready/services/eval_harness/batch_runner.py` - -**Checkpoint**: Run tests, verify all PASS (green phase complete) - `pytest tests/unit/test_eval_harness_services.py -k "parallel"` ✅ - -### 5.3 Refactor & Document (Refactor Phase) - -- [X] T081 [US4] Add docstrings to `run_batch_benchmarks()` function in `src/agentready/services/eval_harness/batch_runner.py` -- [X] T082 [US4] Extract worker count (4) and job timeout (3600) to module-level constants in `src/agentready/services/eval_harness/batch_runner.py` - -**Completion Criteria**: -- ✅ User Story 4 complete: Parallel execution handles large batches without resource exhaustion -- ✅ Tests passing with >80% coverage for parallel execution logic -- ✅ Independent test verified: 20+ jobs execute with max 4 concurrent workers - ---- - -## Phase 6: Polish & Cross-Cutting Concerns - -**Goal**: Complete documentation, run linters, verify coverage, and ensure production readiness. 
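Before the polish tasks, here is a hedged recap of the Phase 4 aggregation described above (T059–T067): pandas `groupby` on `assessor_id`, mean/median/std/count, the 0.05 placeholder significance flag, and a descending sort on mean delta. Column names follow the task list; the input schema (a list of per-run dicts with `assessor_id` and `delta_score` keys) is an assumption.

```python
import pandas as pd


def aggregate_results(results: list[dict]) -> pd.DataFrame:
    """Aggregate per-run delta scores by assessor (sketch of T061-T067)."""
    df = pd.DataFrame(results)  # expects columns: assessor_id, delta_score
    agg = (
        df.groupby("assessor_id")["delta_score"]
        .agg(["mean", "median", "std", "count"])
        .rename(columns={
            "mean": "mean_delta",
            "median": "median_delta",
            "std": "std_delta",
            "count": "sample_size",
        })
    )
    # T064: placeholder significance flag; a real test would use CIs or p-values.
    agg["significant"] = agg["mean_delta"].abs() > 0.05
    # T066: single-sample groups have NaN std; fill with 0 for readability.
    agg["std_delta"] = agg["std_delta"].fillna(0.0)
    # T065 + T067: sort by mean delta descending, round for readability.
    return agg.sort_values("mean_delta", ascending=False).round(2)
```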
- -### 6.1 Documentation Updates - -- [ ] T083 [P] Update `README.md` - add "Running Real Terminal-Bench Evaluations (Phase 2)" section with Harbor setup instructions -- [ ] T084 [P] Update `README.md` - add prerequisites (Docker, Anthropic API key), setup commands, quickstart example -- [ ] T085 [P] Create `docs/tbench/assessor-refinement-results.md` template with methodology, high-impact assessors, low-impact assessors, recommendations sections (structure only, data to be filled after benchmarks run) -- [ ] T086 [P] Update `docs/tbench/methodology.md` - add "Phase 2: Real-World Validation" section explaining Harbor integration, real vs mocked comparison, statistical significance approach - -### 6.2 Linting & Code Quality - -- [ ] T087 Run `black src/agentready/services/eval_harness/ src/agentready/cli/eval_harness.py tests/` to format all modified files -- [ ] T088 Run `isort src/agentready/services/eval_harness/ src/agentready/cli/eval_harness.py tests/` to sort imports -- [ ] T089 Run `flake8 src/agentready/services/eval_harness/ src/agentready/cli/eval_harness.py tests/ --ignore=E501,E203,W503` to verify linting (no line length enforcement) -- [ ] T090 Fix any linting errors reported by flake8 - -### 6.3 Testing & Coverage - -- [ ] T091 Run full test suite: `pytest tests/unit/test_harbor_config.py tests/unit/test_eval_harness_services.py tests/unit/test_eval_harness_cli.py tests/integration/test_eval_harness_e2e.py` -- [ ] T092 Run coverage report: `pytest --cov=src/agentready/services/eval_harness --cov=src/agentready/cli/eval_harness --cov-report=html --cov-report=term` -- [ ] T093 Verify coverage >80% for new Harbor integration code (harbor_config.py, tbench_runner.py modifications, eval_harness.py modifications, batch_runner.py) -- [ ] T094 Add additional tests if coverage gaps identified (target missing branches, edge cases) - -### 6.4 Final Integration Verification - -- [ ] T095 Manually test: `export TBENCH_USE_REAL=1 && export ANTHROPIC_API_KEY= && agentready tbench baseline /path/to/test/repo` - verify real Harbor subprocess called -- [ ] T096 Manually test: Verify results differ from mocked integration (run same repo with TBENCH_USE_REAL=0 vs =1, compare scores) -- [ ] T097 Manually test: Verify error handling - run without ANTHROPIC_API_KEY, verify clear error message with installation instructions -- [ ] T098 Manually test: Verify security - inspect subprocess call with process monitor, confirm only required env vars passed - -**Completion Criteria**: -- ✅ All documentation updated -- ✅ All linters pass (black, isort, flake8) -- ✅ All tests pass with >80% coverage -- ✅ Manual integration tests verify real Harbor framework integration works end-to-end -- ✅ Security validations confirmed via manual testing - ---- - -## Dependencies & Execution Order - -### User Story Dependency Graph - -``` -┌─────────────────────────────────────────────────────────┐ -│ Phase 1: Setup & Dependencies │ -│ (T001-T004) │ -└────────────────┬────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────┐ -│ Phase 2: Foundational Infrastructure │ -│ (T005-T022) - HarborConfig implementation │ -└────────────────┬────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────┐ -│ Phase 3: US1 + US3 (P1 MVP) │ -│ (T023-T053) - Real Harbor integration + Security │ -│ ✓ Independent - Can deploy alone │ -└────────────────┬────────────────────────────────────────┘ - │ - ├──────────────────┬─────────────────────┐ 
- │ │ │ - ▼ ▼ ▼ -┌──────────────────────┐ ┌──────────────────┐ ┌────────────────────┐ -│ Phase 4: US2 (P2) │ │ Phase 5: US4 (P2)│ │ Phase 6: Polish │ -│ (T054-T069) │ │ (T070-T082) │ │ (T083-T098) │ -│ Aggregation │ │ Parallel Exec │ │ Documentation │ -│ ✓ Independent of US4 │ │ ✓ Independent of US2│ │ ⚠️ Requires US1-4 │ -└──────────────────────┘ └──────────────────┘ └────────────────────┘ -``` - -**Blocking Dependencies**: -- Phase 1 blocks all other phases (setup required first) -- Phase 2 blocks Phase 3, 4, 5 (HarborConfig needed for Harbor integration and aggregation config) -- Phase 3 blocks Phase 6 (MVP must be complete before polish) -- Phase 4 and Phase 5 are independent of each other (can be implemented in parallel) - -**Story Independence**: -- ✅ **US1 (Real Benchmarks)**: Fully independent, can deploy alone after Phase 2 -- ✅ **US2 (Aggregation)**: Independent of US4, depends only on US1 for benchmark results -- ✅ **US3 (Security)**: Integrated into US1 (not separate implementation) -- ✅ **US4 (Parallel Execution)**: Independent of US2, depends only on US1 for benchmark runner - ---- - -## Parallel Execution Opportunities - -### Within Each Phase - -**Phase 2 (Foundational)**: -- Tests T005-T012 can run in parallel (all are test writing, no shared state) -- Implementation tasks T013-T020 are sequential (shared file modifications) - -**Phase 3 (US1 + US3 MVP)**: -- Tests T023-T031 can run in parallel (different test files/functions) -- Implementation tasks T032-T053 are mostly sequential (shared file modifications) - -**Phase 4 (US2 Aggregation)**: -- Tests T054-T058 can run in parallel (independent test cases) -- Implementation tasks T059-T069 are sequential (shared file modifications) - -**Phase 5 (US4 Parallel Execution)**: -- Tests T070-T073 can run in parallel (independent test cases) -- Implementation tasks T074-T082 are sequential (new file, but sequential logic) - -**Phase 6 (Polish)**: -- Documentation tasks T083-T086 can run in parallel (different files) -- Linting tasks T087-T090 must run sequentially (formatter output affects linter input) -- Testing tasks T091-T094 are sequential (coverage depends on all tests running) -- Manual verification T095-T098 are sequential (depends on implementation complete) - -### Parallelization Summary - -**Estimated Parallelization Gains**: -- ~40% of tasks marked [P] can run in parallel -- Most parallelism in test writing phases (TDD red phase) -- Implementation phases are mostly sequential due to shared file modifications - ---- - -## Implementation Strategy - -### Recommended Approach: Incremental Delivery - -**Week 1: MVP (Phase 1-3)** -1. Complete Setup & Dependencies (Phase 1): ~1 hour -2. Complete Foundational Infrastructure (Phase 2): ~1 day - - TDD: Write all tests (red) → Implement HarborConfig (green) → Refactor -3. Complete US1 + US3 MVP (Phase 3): ~2-3 days - - TDD: Write all tests (red) → Implement Harbor integration (green) → Refactor - - **Milestone**: MVP deployable - real, secure Harbor benchmarks work - -**Week 2: Enhancement Features (Phase 4-5)** -4. Complete US2 Aggregation (Phase 4): ~1 day - - TDD: Write tests → Implement pandas aggregation → Refactor -5. Complete US4 Parallel Execution (Phase 5): ~1 day - - TDD: Write tests → Implement ProcessPoolExecutor → Refactor - -**Week 3: Polish & Production Readiness (Phase 6)** -6. Complete Documentation, Linting, Coverage (Phase 6): ~1 day -7. 
Manual integration testing and verification: ~1 day - -**Total Estimated Duration**: 2-3 weeks (10-15 working days) - -**Critical Path**: Phase 1 → Phase 2 → Phase 3 (MVP) → Phase 6 (Documentation) - -**Suggested MVP Scope**: Phase 1-3 only (real, secure Harbor benchmarks) - delivers core value, can release independently. - ---- - -## Testing Strategy Summary - -**Test-Driven Development (TDD)**: MANDATORY per Constitution Principle IV - -**Red-Green-Refactor Workflow**: -1. **Red Phase**: Write tests FIRST, verify they FAIL -2. **Green Phase**: Implement code to make tests PASS -3. **Refactor Phase**: Improve code quality, add docs, extract constants - -**Test Coverage Goals**: -- >80% line coverage for new code (per Constitution) -- >90% branch coverage for security-critical code (env sanitization, allowlist validation, path validation) -- 100% coverage for HarborConfig validation logic - -**Test Types**: -- **Unit Tests**: Test individual functions/classes in isolation (mocked dependencies) -- **Integration Tests**: Test full workflow with mocked subprocess calls -- **Manual Tests**: Verify real Harbor subprocess integration end-to-end - -**Test Count by Phase**: -- Phase 2: 8 unit tests (HarborConfig validation) -- Phase 3: 13 unit tests + 1 integration test (Harbor integration, JSON parsing, security) -- Phase 4: 5 unit tests (pandas aggregation) -- Phase 5: 4 unit tests (parallel execution) -- **Total**: 30+ tests for 120 lines of implementation code (~4:1 test-to-code ratio) - ---- - -## Risk Mitigation During Implementation - -**Risk 1: Harbor framework behavior differs from documentation** -- **Mitigation Task**: T049 (integration test) catches this early -- **Response**: Update implementation based on actual Harbor output format - -**Risk 2: Test coverage falls below 80%** -- **Mitigation Task**: T093-T094 (coverage verification and gap filling) -- **Response**: Add missing tests before declaring phase complete - -**Risk 3: Security validations insufficient** -- **Mitigation Tasks**: T024, T030, T042, T098 (security-focused tests and manual verification) -- **Response**: Enhance allowlists or validation logic if vulnerabilities found - -**Risk 4: Performance slower than estimated (>10 min per benchmark)** -- **Mitigation**: MVP (Phase 3) deployment allows real-world performance measurement -- **Response**: Adjust timeout values or add performance optimization tasks if needed - ---- - -## Next Steps - -1. ✅ Tasks generated and organized by user story -2. ⏭️ Begin Phase 1: Setup & Dependencies (T001-T004) -3. ⏭️ Begin Phase 2: TDD for HarborConfig (T005-T022) -4. ⏭️ Track progress: Use task checkboxes to mark completion -5. ⏭️ After MVP (Phase 3): Deploy and test with real repositories -6. 
⏭️ After all phases: Run empirical benchmarks on 10-20 repositories, document findings in `docs/tbench/assessor-refinement-results.md` - ---- - -**Document Status**: Complete -**Last Updated**: 2025-12-09 -**Ready for Implementation**: ✅ Yes -**Estimated Effort**: 10-15 working days (120 lines of code, 30+ tests, following TDD) diff --git a/src/agentready/cli/benchmark.py b/src/agentready/cli/benchmark.py deleted file mode 100644 index 31958f00..00000000 --- a/src/agentready/cli/benchmark.py +++ /dev/null @@ -1,473 +0,0 @@ -"""Benchmark command for running agent coding evaluations.""" - -import json -import os -import tempfile -from pathlib import Path - -import click - -from ..services.eval_harness.harbor_config import ALLOWED_MODELS, HarborConfig -from ..services.eval_harness.tbench_runner import _real_tbench_result -from ..services.harbor.agent_toggler import AssessorStateToggler -from ..services.harbor.comparer import compare_assessor_impact - - -@click.command() -@click.argument("repository", type=click.Path(exists=True), required=False, default=".") -@click.option( - "--harness", - type=click.Choice(["tbench"]), - default="tbench", - help="Evaluation harness to use (tbench=Terminal-Bench)", -) -@click.option( - "--subset", - type=str, - default=None, - help="Benchmark subset (tbench: smoketest/full)", -) -@click.option( - "--agent", - type=click.Choice(["claude-code", "cursor-cli"]), - default="claude-code", - help="Agent for evaluation", -) -@click.option( - "--model", - type=click.Choice(list(ALLOWED_MODELS)), - default="anthropic/claude-haiku-4-5", - help="Model for evaluation", -) -@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output") -@click.option( - "--timeout", - type=int, - default=3600, - help="Timeout in seconds (default: 3600)", -) -@click.option( - "--output-dir", - "-o", - type=click.Path(), - default=None, - help="Output directory for results (default: .agentready/benchmarks/tbench/)", -) -@click.option( - "--skip-preflight", - is_flag=True, - help="Skip dependency checks (for advanced users)", -) -def benchmark( - repository, - harness, - subset, - agent, - model, - verbose, - timeout, - output_dir, - skip_preflight, -): - """Run agent coding benchmarks. - - Evaluates agent performance on standardized coding benchmarks. - Currently supports Terminal-Bench (89 tasks). - - REPOSITORY: Path to git repository (default: current directory) - - Examples: - - \b - # Quick Terminal-Bench smoketest (1-2 tasks, ~2-5 min) - agentready benchmark --harness tbench --subset smoketest - - \b - # Full Terminal-Bench with Sonnet (~30-40 min) - agentready benchmark --harness tbench --subset full --model claude-sonnet-4-5 - - \b - # Default harness is tbench, so you can omit it - agentready benchmark --subset smoketest - """ - repo_path = Path(repository).resolve() - - # Route to appropriate harness - if harness == "tbench": - _run_tbench( - repo_path, - subset, - agent, - model, - verbose, - timeout, - output_dir, - skip_preflight, - ) - else: - click.echo(f"Unknown harness: {harness}", err=True) - raise click.Abort() - - -def _run_tbench( - repo_path, subset, agent, model, verbose, timeout, output_dir, skip_preflight -): - """Run Terminal-Bench evaluation.""" - # Default subset to 'full' if not specified - if subset is None: - subset = "full" - - # Validate subset - if subset not in ["smoketest", "full"]: - click.echo( - f"Invalid subset '{subset}' for tbench. 
Use: smoketest, full", err=True - ) - raise click.Abort() - - smoketest = subset == "smoketest" - - if verbose: - click.echo("AgentReady Terminal-Bench Benchmark") - click.echo(f"{'=' * 50}\n") - click.echo(f"Repository: {repo_path}") - click.echo(f"Agent: {agent}") - click.echo(f"Model: {model}") - click.echo(f"Subset: {subset} ({'1-2 tasks' if smoketest else '89 tasks'})") - click.echo(f"Timeout: {timeout}s\n") - - # Preflight: Check Harbor CLI availability and dataset - task_path = None - if not skip_preflight: - try: - from ..utils.preflight import ( - PreflightError, - check_harbor_cli, - ensure_terminal_bench_dataset, - ) - - if verbose: - click.echo("Checking dependencies...\n") - - check_harbor_cli(interactive=True) - - # For smoketest, ensure dataset is downloaded - if smoketest: - task_path = ensure_terminal_bench_dataset() - - except PreflightError as e: - click.echo(f"\nPreflight check failed:\n{e}\n", err=True) - raise click.Abort() - - # Validate API key BEFORE creating HarborConfig - if agent == "claude-code": - api_key = os.environ.get("ANTHROPIC_API_KEY", "") - elif agent == "cursor-cli": - api_key = os.environ.get("CURSOR_API_KEY", "") - - if not api_key: - key_name = "ANTHROPIC_API_KEY" if agent == "claude-code" else "CURSOR_API_KEY" - click.echo( - f"Error: {key_name} environment variable not set.\n" - f"Set it with: export {key_name}=your-key-here", - err=True, - ) - raise click.Abort() - - # Create HarborConfig (will not raise ValueError now) - harbor_config = HarborConfig( - model=model, - agent=agent, - jobs_dir=Path(tempfile.mkdtemp()), - api_key=api_key, - timeout=timeout, - n_concurrent=1, - smoketest=smoketest, - task_path=task_path, - ) - - try: - # Run benchmark - if verbose: - click.echo("Starting Terminal-Bench evaluation...\n") - - result = _real_tbench_result(repo_path, harbor_config) - - # Display results - click.echo(f"\n{'=' * 50}") - click.echo("Terminal-Bench Benchmark Complete") - click.echo(f"{'=' * 50}\n") - click.echo(f"Score: {result.score:.2f}") - click.echo(f"Task Solved: {result.task_solved}") - click.echo(f"Resolved Trials: {result.resolved_trials}") - click.echo(f"Unresolved Trials: {result.unresolved_trials}") - click.echo(f"Pass@1: {result.pass_at_1:.2f}") - - # Display trajectory file path if available - if result.trajectory_path: - click.echo(f"\nTrajectory: {result.trajectory_path}") - - # Save results if output dir specified - if output_dir: - output_path = Path(output_dir) - output_path.mkdir(parents=True, exist_ok=True) - # TODO: Save results to JSON file - - except Exception as e: - click.echo(f"\nBenchmark failed: {e}", err=True) - if verbose: - import traceback - - traceback.print_exc() - raise click.Abort() - - -# Default Phase 1 task subset (8 diverse tasks, ~2-3 hours per assessor) -DEFAULT_PHASE1_TASKS = [ - "adaptive-rejection-sampler", # Math/algorithms - "async-http-client", # Networking - "terminal-file-browser", # File I/O - "markdown-parser", # Text processing - "json-validator", # Data structures - "cli-calculator", # User interaction - "log-analyzer", # String manipulation - "sudoku-solver", # Logic -] - - -@click.command() -@click.option( - "--assessor", - "-a", - required=False, - help="Assessor ID to validate (e.g., claude_md_file, readme_structure, test_execution)", -) -@click.option( - "--tasks", - "-t", - multiple=True, - help="Terminal-Bench task names (default: Phase 1 subset of 8 tasks)", -) -@click.option( - "--runs", - "-r", - type=int, - default=3, - help="Number of runs per task (default: 3, recommended: 
5+)", -) -@click.option( - "--model", - "-m", - type=click.Choice(["claude-haiku-4-5", "claude-sonnet-4-5"]), - default="claude-haiku-4-5", - help="Claude model to use (default: haiku-4-5 for speed)", -) -@click.option( - "--output-dir", - "-o", - type=click.Path(), - default=None, - help="Output directory (default: .agentready/validations/{assessor_id}/)", -) -@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output") -@click.option( - "--list-assessors", - is_flag=True, - help="List supported assessors and exit", -) -@click.option( - "--concurrent", - "-c", - type=int, - default=1, - help="Number of concurrent tasks to run in parallel (default: 1, recommended: 3-5)", -) -@click.option( - "--smoketest", - is_flag=True, - help="Run smoketest with single task for quick validation (~2 minutes)", -) -def validate_assessor( - assessor, - tasks, - runs, - model, - output_dir, - verbose, - list_assessors, - concurrent, - smoketest, -): - """Validate single assessor impact using Terminal-Bench A/B testing. - - This command empirically measures the impact of a specific AgentReady assessor - on Claude Code's Terminal-Bench performance by running an A/B test: - - \b - 1. Baseline: Force assessor to FAIL (manipulate repo state) - 2. Treatment: Restore repo to normal state (assessor PASSES) - 3. Compare success rates with statistical significance testing - - Examples: - - \b - # Quick smoketest with 1 task (~2 minutes) - agentready validate-assessor --assessor claude_md_file --smoketest - - \b - # Validate CLAUDE.md impact (8 tasks, 3 runs each = 48 trials) - agentready validate-assessor --assessor claude_md_file - - \b - # Test README with custom tasks and 5 runs for higher statistical power - agentready validate-assessor --assessor readme_structure \\ - --tasks adaptive-rejection-sampler \\ - --tasks async-http-client \\ - --runs 5 - - \b - # List all supported assessors - agentready validate-assessor --list-assessors - """ - repo_root = Path.cwd() - - # Handle --list-assessors - if list_assessors: - toggler = AssessorStateToggler(repo_root=repo_root) - supported = toggler.list_supported_assessors() - - click.echo("Supported Assessors for Validation:") - click.echo(f"{'=' * 60}") - for assessor_id in supported: - click.echo(f" - {assessor_id}") - click.echo(f"\nTotal: {len(supported)} assessors") - click.echo("\nUsage: agentready validate-assessor --assessor {assessor_id}") - return - - # Validate that --assessor is provided if not listing - if not assessor: - click.echo( - "Error: Missing required option '--assessor' / '-a'.\n" - "Use --list-assessors to see available assessors.", - err=True, - ) - raise click.Abort() - - # Validate ANTHROPIC_API_KEY - api_key = os.environ.get("ANTHROPIC_API_KEY", "") - if not api_key: - click.echo( - "Error: ANTHROPIC_API_KEY environment variable not set.\n" - "Set it with: export ANTHROPIC_API_KEY=your-key-here", - err=True, - ) - raise click.Abort() - - # Use default Phase 1 tasks if not specified - if not tasks: - if smoketest: - # Smoketest: just 1 simple task for quick validation - tasks = ["adaptive-rejection-sampler"] - if verbose: - click.echo("Using smoketest mode (1 task, ~2 minutes total)\n") - else: - tasks = DEFAULT_PHASE1_TASKS - if verbose: - click.echo(f"Using default Phase 1 task subset ({len(tasks)} tasks)\n") - - # Convert model name to full identifier - model_id = f"anthropic/{model}" - - # Set output directory - if output_dir: - output_path = Path(output_dir) - else: - output_path = Path(".agentready") / "validations" / 
assessor - - try: - # Run A/B comparison - comparison = compare_assessor_impact( - assessor_id=assessor, - task_names=list(tasks), - repo_root=repo_root, - runs_per_task=runs, - output_dir=output_path, - model=model_id, - n_concurrent=concurrent, - verbose=verbose, - ) - - # Save JSON results - json_file = output_path / f"{assessor}.json" - with open(json_file, "w") as f: - json.dump( - { - "assessor_id": assessor, - "tasks": list(tasks), - "runs_per_task": runs, - "baseline": comparison.without_agent.to_dict(), - "treatment": comparison.with_agent.to_dict(), - "deltas": comparison.deltas, - "statistical_significance": comparison.statistical_significance, - }, - f, - indent=2, - ) - - # Generate Markdown report - md_file = output_path / f"{assessor}.md" - with open(md_file, "w") as f: - f.write(f"# Assessor Impact Validation: {assessor}\n\n") - f.write(f"**Date**: {comparison.created_at}\n") - f.write(f"**Tasks**: {len(tasks)}\n") - f.write(f"**Runs per Task**: {runs}\n\n") - - f.write("## Results Summary\n\n") - f.write( - "| Metric | Baseline (Assessor Fails) | Treatment (Assessor Passes) | Delta |\n" - ) - f.write( - "|--------|---------------------------|----------------------------|-------|\n" - ) - - delta = comparison.deltas["success_rate_delta"] - sign = "+" if delta >= 0 else "" - f.write( - f"| Success Rate | {comparison.without_agent.success_rate:.1f}% | " - f"{comparison.with_agent.success_rate:.1f}% | **{sign}{delta:.1f} pp** |\n" - ) - - duration_delta = comparison.deltas.get("avg_duration_delta_sec", 0) - sign = "+" if duration_delta >= 0 else "" - f.write( - f"| Avg Duration | {comparison.without_agent.avg_duration_sec:.1f}s | " - f"{comparison.with_agent.avg_duration_sec:.1f}s | {sign}{duration_delta:.1f}s |\n" - ) - - f.write("\n## Statistical Significance\n\n") - is_sig = comparison.statistical_significance.get( - "success_rate_significant", False - ) - p_val = comparison.statistical_significance.get("success_rate_p_value") - f.write(f"- **Significant**: {'YES ✓' if is_sig else 'NO'}\n") - if p_val is not None: - f.write(f"- **P-value**: {p_val:.4f}\n") - - f.write("\n## Files\n\n") - f.write(f"- JSON: `{json_file}`\n") - f.write(f"- Markdown: `{md_file}`\n") - - click.echo("\nResults saved:") - click.echo(f" - JSON: {json_file}") - click.echo(f" - Markdown: {md_file}\n") - - except ValueError as e: - click.echo(f"\nError: {e}\n", err=True) - click.echo("Use --list-assessors to see supported assessors.", err=True) - raise click.Abort() - except Exception as e: - click.echo(f"\nValidation failed: {e}\n", err=True) - if verbose: - import traceback - - traceback.print_exc() - raise click.Abort() diff --git a/src/agentready/cli/eval_harness.py b/src/agentready/cli/eval_harness.py deleted file mode 100644 index 32ce0d01..00000000 --- a/src/agentready/cli/eval_harness.py +++ /dev/null @@ -1,787 +0,0 @@ -"""CLI commands for Terminal-Bench eval harness. - -Provides commands to establish baseline, test assessors, aggregate results, -and generate dashboard data for empirical assessment validation. -""" - -import sys -from pathlib import Path - -import click - -from ..assessors import create_all_assessors -from ..services.eval_harness import ( - AssessorTester, - BaselineEstablisher, - DashboardGenerator, - ResultsAggregator, - TbenchRunner, -) - - -@click.group("eval-harness") -def eval_harness(): - """Terminal-Bench eval harness for measuring assessor impact. 
- - Systematically measures the impact of each AgentReady assessor - on Terminal-Bench (tbench.ai) performance through A/B testing. - - Workflow: - 1. Establish baseline: agentready eval-harness baseline - 2. Test assessors: agentready eval-harness run-tier --tier 1 - 3. View summary: agentready eval-harness summarize - 4. Generate dashboard: agentready eval-harness dashboard - - Examples: - - \b - # Establish baseline (5 runs) - agentready eval-harness baseline - - \b - # Test single assessor - agentready eval-harness test-assessor --assessor-id claude_md_file - - \b - # Test all Tier 1 assessors - agentready eval-harness run-tier --tier 1 - """ - pass - - -@eval_harness.command() -@click.argument("repository", type=click.Path(exists=True), default=".") -@click.option( - "--iterations", - "-n", - type=int, - default=5, - help="Number of tbench runs to perform (default: 5)", -) -@click.option( - "--output-dir", - "-o", - type=click.Path(), - default=None, - help="Output directory (default: .agentready/eval_harness/baseline)", -) -@click.option( - "--verbose", - "-v", - is_flag=True, - help="Show detailed progress information", -) -def baseline(repository, iterations, output_dir, verbose): - """Establish baseline Terminal-Bench performance. - - Runs tbench multiple times on an unmodified repository to establish - the starting point for measuring assessor impact. Calculates mean, - standard deviation, median, min, and max scores. - - REPOSITORY: Path to git repository (default: current directory) - """ - repo_path = Path(repository).resolve() - - # Validate repository - if not (repo_path / ".git").exists(): - click.echo("Error: Not a git repository", err=True) - sys.exit(1) - - click.echo("🔬 AgentReady Eval Harness - Baseline Establishment") - click.echo("=" * 60) - click.echo(f"\nRepository: {repo_path}") - click.echo(f"Iterations: {iterations}") - if output_dir: - click.echo(f"Output: {output_dir}") - click.echo() - - # Create establisher - tbench_runner = TbenchRunner(mock=True) - establisher = BaselineEstablisher(tbench_runner=tbench_runner) - - # Set output directory - if output_dir: - out_path = Path(output_dir) - else: - out_path = repo_path / ".agentready" / "eval_harness" / "baseline" - - # Run baseline establishment with progress - click.echo("Running Terminal-Bench baseline...") - click.echo("[Mocked mode - using deterministic scores for workflow validation]\n") - - try: - with click.progressbar( - range(iterations), label="Progress", show_pos=True - ) as bar: - # We can't actually update during iteration with current API - # So we'll run all at once and show completion - baseline_metrics = establisher.establish_baseline( - repo_path, iterations=iterations, output_dir=out_path - ) - # Advance progress bar to completion - for _ in bar: - pass - - # Show results - click.echo("\n✅ Baseline established successfully!") - click.echo("\nResults:") - click.echo(f" Mean Score: {baseline_metrics.mean_score:.2f}") - click.echo(f" Std Dev: {baseline_metrics.std_dev:.2f}") - click.echo(f" Median: {baseline_metrics.median_score:.2f}") - click.echo(f" Min: {baseline_metrics.min_score:.2f}") - click.echo(f" Max: {baseline_metrics.max_score:.2f}") - click.echo(f" Iterations: {baseline_metrics.iterations}") - - click.echo("\nResults saved to:") - click.echo(f" {out_path / 'summary.json'}") - click.echo(f" {out_path / 'run_001.json'} (and {iterations-1} more)") - - if verbose: - click.echo("\n📊 Individual Run Scores:") - for i, result in enumerate(baseline_metrics.raw_results, 1): - click.echo( - f" 
Run {i:2d}: {result.score:.2f} (completion: {result.completion_rate:.1f}%, pytest: {result.pytest_pass_rate:.1f}%)" - ) - - click.echo("\nNext step:") - click.echo( - " agentready eval-harness test-assessor --assessor-id claude_md_file" - ) - - except Exception as e: - click.echo(f"\n❌ Error during baseline establishment: {str(e)}", err=True) - if verbose: - import traceback - - traceback.print_exc() - sys.exit(1) - - -@eval_harness.command() -@click.option( - "--baseline-dir", - type=click.Path(exists=True), - default=".agentready/eval_harness/baseline", - help="Directory containing baseline results", -) -def show_baseline(baseline_dir): - """Display previously established baseline metrics. - - Loads and displays baseline results from a previous run. - """ - baseline_path = Path(baseline_dir).resolve() - - click.echo("🔬 AgentReady Eval Harness - Baseline Results") - click.echo("=" * 60) - - try: - establisher = BaselineEstablisher() - baseline_metrics = establisher.load_baseline(baseline_path) - - click.echo(f"\nBaseline loaded from: {baseline_path}") - click.echo("\nResults:") - click.echo(f" Mean Score: {baseline_metrics.mean_score:.2f}") - click.echo(f" Std Dev: {baseline_metrics.std_dev:.2f}") - click.echo(f" Median: {baseline_metrics.median_score:.2f}") - click.echo(f" Min: {baseline_metrics.min_score:.2f}") - click.echo(f" Max: {baseline_metrics.max_score:.2f}") - click.echo(f" Iterations: {baseline_metrics.iterations}") - - click.echo("\n📊 Individual Run Scores:") - for i, result in enumerate(baseline_metrics.raw_results, 1): - click.echo( - f" Run {i:2d}: {result.score:.2f} (completion: {result.completion_rate:.1f}%, pytest: {result.pytest_pass_rate:.1f}%)" - ) - - except FileNotFoundError as e: - click.echo(f"\n❌ {str(e)}", err=True) - click.echo( - "\nRun 'agentready eval-harness baseline' first to establish baseline." - ) - sys.exit(1) - except Exception as e: - click.echo(f"\n❌ Error loading baseline: {str(e)}", err=True) - sys.exit(1) - - -@eval_harness.command() -@click.option( - "--assessor-id", - required=True, - help="Assessor attribute ID to test (e.g., claude_md_file)", -) -@click.argument("repository", type=click.Path(exists=True), default=".") -@click.option( - "--baseline-dir", - type=click.Path(exists=True), - default=None, - help="Directory containing baseline results (default: .agentready/eval_harness/baseline)", -) -@click.option( - "--iterations", - "-n", - type=int, - default=5, - help="Number of tbench runs post-remediation (default: 5)", -) -@click.option( - "--output-dir", - "-o", - type=click.Path(), - default=None, - help="Output directory (default: .agentready/eval_harness/)", -) -@click.option( - "--verbose", - "-v", - is_flag=True, - help="Show detailed progress information", -) -def test_assessor( - assessor_id, repository, baseline_dir, iterations, output_dir, verbose -): - """Test a single assessor's impact on Terminal-Bench performance. - - Runs A/B testing workflow: - 1. Clone repository to temp directory - 2. Run assessment with single assessor - 3. Apply remediation using FixerService - 4. Run tbench post-remediation - 5. 
Calculate delta, p-value, and Cohen's d effect size - - REPOSITORY: Path to git repository (default: current directory) - - Examples: - - \b - # Test claude_md_file assessor - agentready eval-harness test-assessor --assessor-id claude_md_file - - \b - # Test with custom baseline location - agentready eval-harness test-assessor \\ - --assessor-id readme_structure \\ - --baseline-dir /path/to/baseline - """ - repo_path = Path(repository).resolve() - - # Validate repository - if not (repo_path / ".git").exists(): - click.echo("Error: Not a git repository", err=True) - sys.exit(1) - - click.echo("🧪 AgentReady Eval Harness - Assessor Testing") - click.echo("=" * 60) - click.echo(f"\nAssessor: {assessor_id}") - click.echo(f"Repository: {repo_path}") - click.echo(f"Iterations: {iterations}") - click.echo() - - # Load baseline - if baseline_dir: - baseline_path = Path(baseline_dir) - else: - baseline_path = repo_path / ".agentready" / "eval_harness" / "baseline" - - try: - establisher = BaselineEstablisher() - baseline_metrics = establisher.load_baseline(baseline_path) - click.echo( - f"📊 Baseline loaded: {baseline_metrics.mean_score:.2f} ± {baseline_metrics.std_dev:.2f}" - ) - except FileNotFoundError: - click.echo(f"❌ Baseline not found at {baseline_path}", err=True) - click.echo("\nRun 'agentready eval-harness baseline' first.") - sys.exit(1) - - # Set output directory - if output_dir: - out_path = Path(output_dir) - else: - out_path = ( - repo_path / ".agentready" / "eval_harness" / "assessors" / assessor_id - ) - - # Create tester - tbench_runner = TbenchRunner(mock=True) - tester = AssessorTester(tbench_runner=tbench_runner) - - # Run test - click.echo("\n🔬 Testing assessor impact...") - click.echo("[Mocked mode - using deterministic scores for workflow validation]") - click.echo("\nSteps:") - click.echo(" 1. Clone repository to temp directory") - click.echo(f" 2. Run assessment with {assessor_id} only") - click.echo(" 3. Apply remediation (if applicable)") - click.echo(f" 4. Run Terminal-Bench {iterations} times") - click.echo(" 5. 
Calculate statistical significance\n") - - try: - with click.progressbar( - range(iterations), label="Progress", show_pos=True - ) as bar: - impact = tester.test_assessor( - assessor_id, - repo_path, - baseline_metrics, - iterations=iterations, - output_dir=out_path, - ) - # Advance progress bar to completion - for _ in bar: - pass - - # Show results - click.echo("\n✅ Assessor testing complete!") - - # Delta interpretation - delta_sign = "+" if impact.delta_score >= 0 else "" - delta_color = "green" if impact.delta_score > 0 else "red" - - click.echo("\n📊 Results:") - click.echo(f" Assessor: {impact.assessor_name} (Tier {impact.tier})") - click.echo(f" Baseline Score: {impact.baseline_score:.2f}") - click.echo(f" Post-Fix Score: {impact.post_remediation_score:.2f}") - click.echo( - f" Delta: {delta_sign}{impact.delta_score:.2f} points", - color=delta_color if impact.delta_score != 0 else None, - ) - click.echo(f" P-value: {impact.p_value:.4f}") - click.echo(f" Effect Size (d): {impact.effect_size:.3f}") - - # Significance interpretation - if impact.is_significant: - click.echo(" Significant: ✅ YES (p < 0.05, |d| > 0.2)") - else: - click.echo(" Significant: ❌ NO") - - # Effect size interpretation - abs_d = abs(impact.effect_size) - if abs_d >= 0.8: - effect_label = "large" - elif abs_d >= 0.5: - effect_label = "medium" - elif abs_d >= 0.2: - effect_label = "small" - else: - effect_label = "negligible" - click.echo(f" Effect Magnitude: {effect_label}") - - # Remediation summary - click.echo("\n🔧 Remediation:") - click.echo(f" Fixes Applied: {impact.fixes_applied}") - if verbose and impact.remediation_log: - click.echo("\n Actions taken:") - for log_entry in impact.remediation_log: - click.echo(f" - {log_entry}") - - click.echo("\n💾 Results saved to:") - click.echo(f" {out_path / 'impact.json'}") - click.echo(f" {out_path / 'run_001.json'} (and {iterations-1} more)") - - # Next steps - click.echo("\n📈 Next steps:") - click.echo(" agentready eval-harness run-tier --tier 1") - - except ValueError as e: - click.echo(f"\n❌ {str(e)}", err=True) - sys.exit(1) - except Exception as e: - click.echo(f"\n❌ Error during assessor testing: {str(e)}", err=True) - if verbose: - import traceback - - traceback.print_exc() - sys.exit(1) - - -@eval_harness.command() -@click.option( - "--tier", - type=int, - required=True, - help="Tier to test (1=Essential, 2=Critical, 3=Important, 4=Advanced)", -) -@click.argument("repository", type=click.Path(exists=True), default=".") -@click.option( - "--baseline-dir", - type=click.Path(exists=True), - default=None, - help="Directory containing baseline results (default: .agentready/eval_harness/baseline)", -) -@click.option( - "--iterations", - "-n", - type=int, - default=5, - help="Number of tbench runs per assessor (default: 5)", -) -@click.option( - "--verbose", - "-v", - is_flag=True, - help="Show detailed progress information", -) -def run_tier(tier, repository, baseline_dir, iterations, verbose): - """Run all assessors in a tier and measure impact. - - Tests each assessor in the specified tier sequentially, measures impact, - and generates a summary report with tier-level statistics. 
- - REPOSITORY: Path to git repository (default: current directory) - - Examples: - - \b - # Test all Tier 1 assessors (5 total) - agentready eval-harness run-tier --tier 1 - - \b - # Test with more iterations for better statistical confidence - agentready eval-harness run-tier --tier 1 --iterations 10 - """ - repo_path = Path(repository).resolve() - - # Validate repository - if not (repo_path / ".git").exists(): - click.echo("Error: Not a git repository", err=True) - sys.exit(1) - - # Validate tier - if tier not in [1, 2, 3, 4]: - click.echo("Error: Tier must be 1, 2, 3, or 4", err=True) - sys.exit(1) - - tier_names = {1: "Essential", 2: "Critical", 3: "Important", 4: "Advanced"} - - click.echo(f"🧪 AgentReady Eval Harness - Tier {tier} Testing") - click.echo("=" * 60) - click.echo(f"\nTier: {tier} ({tier_names[tier]})") - click.echo(f"Repository: {repo_path}") - click.echo(f"Iterations per assessor: {iterations}") - click.echo() - - # Load baseline - if baseline_dir: - baseline_path = Path(baseline_dir) - else: - baseline_path = repo_path / ".agentready" / "eval_harness" / "baseline" - - try: - establisher = BaselineEstablisher() - baseline_metrics = establisher.load_baseline(baseline_path) - click.echo( - f"📊 Baseline loaded: {baseline_metrics.mean_score:.2f} ± {baseline_metrics.std_dev:.2f}" - ) - except FileNotFoundError: - click.echo(f"❌ Baseline not found at {baseline_path}", err=True) - click.echo("\nRun 'agentready eval-harness baseline' first.") - sys.exit(1) - - # Get assessors for this tier - all_assessors = create_all_assessors() - tier_assessors = [a for a in all_assessors if a.attribute.tier == tier] - - if not tier_assessors: - click.echo(f"❌ No assessors found for Tier {tier}", err=True) - sys.exit(1) - - click.echo(f"\nAssessors to test: {len(tier_assessors)}") - for assessor in tier_assessors: - click.echo(f" - {assessor.attribute_id} ({assessor.attribute.name})") - click.echo() - - # Create tester - tbench_runner = TbenchRunner(mock=True) - tester = AssessorTester(tbench_runner=tbench_runner) - - # Test each assessor - click.echo("🔬 Testing assessors...\n") - - for i, assessor in enumerate(tier_assessors, 1): - click.echo(f"[{i}/{len(tier_assessors)}] Testing {assessor.attribute_id}...") - - output_dir = ( - repo_path - / ".agentready" - / "eval_harness" - / "assessors" - / assessor.attribute_id - ) - - try: - impact = tester.test_assessor( - assessor.attribute_id, - repo_path, - baseline_metrics, - iterations=iterations, - output_dir=output_dir, - ) - - # Show brief results - delta_sign = "+" if impact.delta_score >= 0 else "" - significance_icon = "✅" if impact.is_significant else "❌" - click.echo( - f" Delta: {delta_sign}{impact.delta_score:.2f} | " - f"Significant: {significance_icon} | " - f"Fixes: {impact.fixes_applied}" - ) - click.echo() - - except Exception as e: - click.echo(f" ❌ Error: {str(e)}", err=True) - if verbose: - import traceback - - traceback.print_exc() - click.echo() - - # Automatically run summarize - click.echo("=" * 60) - click.echo("📊 Generating summary...\n") - - try: - eval_harness_dir = repo_path / ".agentready" / "eval_harness" - aggregator = ResultsAggregator() - summary = aggregator.aggregate(eval_harness_dir) - - click.echo("✅ Summary generated!") - click.echo("\n📈 Results:") - click.echo(f" Total Assessors Tested: {summary.total_assessors_tested}") - click.echo( - f" Significant Improvements: {summary.significant_improvements} ({summary.significant_improvements / summary.total_assessors_tested * 100:.0f}%)" - ) - - # Show tier impacts - 
click.echo("\n🎯 Tier Impacts (Average Delta):") - for t in sorted(summary.tier_impacts.keys()): - delta = summary.tier_impacts[t] - if delta != 0: - delta_sign = "+" if delta >= 0 else "" - click.echo(f" Tier {t}: {delta_sign}{delta:.2f} points") - - # Show top 3 assessors - ranked = summary.get_ranked_assessors() - click.echo("\n🏆 Top 3 Assessors by Impact:") - for i, impact in enumerate(ranked[:3], 1): - delta_sign = "+" if impact.delta_score >= 0 else "" - click.echo( - f" {i}. {impact.assessor_name}: {delta_sign}{impact.delta_score:.2f} points" - ) - - click.echo("\n💾 Summary saved to:") - click.echo(f" {eval_harness_dir / 'summary.json'}") - - click.echo("\n📈 Next steps:") - click.echo( - " agentready eval-harness dashboard # Generate GitHub Pages dashboard" - ) - - except Exception as e: - click.echo(f"❌ Error generating summary: {str(e)}", err=True) - if verbose: - import traceback - - traceback.print_exc() - sys.exit(1) - - -@eval_harness.command() -@click.argument("repository", type=click.Path(exists=True), default=".") -@click.option( - "--verbose", - "-v", - is_flag=True, - help="Show detailed assessor breakdown", -) -def summarize(repository, verbose): - """Aggregate and display evaluation results. - - Loads all assessor impact results and generates a summary report - with tier-level statistics and ranked assessors. - - REPOSITORY: Path to git repository (default: current directory) - - Examples: - - \b - # Generate summary after testing assessors - agentready eval-harness summarize - - \b - # Show detailed breakdown - agentready eval-harness summarize --verbose - """ - repo_path = Path(repository).resolve() - eval_harness_dir = repo_path / ".agentready" / "eval_harness" - - click.echo("📊 AgentReady Eval Harness - Summary") - click.echo("=" * 60) - - try: - aggregator = ResultsAggregator() - summary = aggregator.aggregate(eval_harness_dir) - - click.echo("\n✅ Summary generated successfully!") - - # Baseline - click.echo("\n📈 Baseline Performance:") - click.echo(f" Mean Score: {summary.baseline.mean_score:.2f}") - click.echo(f" Std Dev: {summary.baseline.std_dev:.2f}") - click.echo(f" Iterations: {summary.baseline.iterations}") - - # Overall stats - click.echo("\n📊 Overall Results:") - click.echo(f" Total Assessors Tested: {summary.total_assessors_tested}") - click.echo(f" Significant Improvements: {summary.significant_improvements}") - click.echo( - f" Significance Rate: {summary.significant_improvements / summary.total_assessors_tested * 100:.0f}%" - ) - - # Tier impacts - click.echo("\n🎯 Impact by Tier (Average Delta):") - for t in sorted(summary.tier_impacts.keys()): - delta = summary.tier_impacts[t] - delta_sign = "+" if delta >= 0 else "" - tier_names = {1: "Essential", 2: "Critical", 3: "Important", 4: "Advanced"} - click.echo( - f" Tier {t} ({tier_names.get(t, 'Unknown')}): {delta_sign}{delta:.2f} points" - ) - - # Ranked assessors - ranked = summary.get_ranked_assessors() - click.echo("\n🏆 Assessors Ranked by Impact:") - - if verbose: - # Show all assessors - for i, impact in enumerate(ranked, 1): - delta_sign = "+" if impact.delta_score >= 0 else "" - sig_icon = "✅" if impact.is_significant else "❌" - click.echo( - f" {i:2d}. {impact.assessor_name:40s} " - f"{delta_sign}{impact.delta_score:+6.2f} | " - f"Sig: {sig_icon} | " - f"Fixes: {impact.fixes_applied}" - ) - else: - # Show top 5 - for i, impact in enumerate(ranked[:5], 1): - delta_sign = "+" if impact.delta_score >= 0 else "" - sig_icon = "✅" if impact.is_significant else "❌" - click.echo( - f" {i}. 
{impact.assessor_name}: {delta_sign}{impact.delta_score:.2f} | Sig: {sig_icon}" - ) - - if len(ranked) > 5: - click.echo(f" ... and {len(ranked) - 5} more") - click.echo("\n (Use --verbose to see all assessors)") - - click.echo("\n💾 Summary saved to:") - click.echo(f" {eval_harness_dir / 'summary.json'}") - - click.echo("\n📈 Next steps:") - click.echo( - " agentready eval-harness dashboard # Generate GitHub Pages dashboard" - ) - - except FileNotFoundError as e: - click.echo(f"❌ {str(e)}", err=True) - click.echo( - "\nRun 'agentready eval-harness run-tier --tier 1' first to test assessors." - ) - sys.exit(1) - except Exception as e: - click.echo(f"❌ Error: {str(e)}", err=True) - if verbose: - import traceback - - traceback.print_exc() - sys.exit(1) - - -@eval_harness.command() -@click.argument("repository", type=click.Path(exists=True), default=".") -@click.option( - "--docs-dir", - type=click.Path(), - default=None, - help="Docs data directory (default: docs/_data/tbench/)", -) -@click.option( - "--verbose", - "-v", - is_flag=True, - help="Show detailed file output information", -) -def dashboard(repository, docs_dir, verbose): - """Generate dashboard data files for GitHub Pages. - - Converts evaluation summary into Jekyll-compatible JSON data files - for visualization with Chart.js on the GitHub Pages dashboard. - - REPOSITORY: Path to git repository (default: current directory) - - Examples: - - \b - # Generate dashboard data after testing - agentready eval-harness dashboard - - \b - # Custom docs directory - agentready eval-harness dashboard --docs-dir /path/to/docs/_data/tbench - """ - repo_path = Path(repository).resolve() - eval_harness_dir = repo_path / ".agentready" / "eval_harness" - - click.echo("📊 AgentReady Eval Harness - Dashboard Generator") - click.echo("=" * 60) - - try: - generator = DashboardGenerator() - - # Set docs directory if provided - if docs_dir: - docs_data_dir = Path(docs_dir) - else: - docs_data_dir = None # Will use default (docs/_data/tbench/) - - click.echo("\n🔄 Generating dashboard data...") - click.echo(f"Source: {eval_harness_dir / 'summary.json'}") - - generated_files = generator.generate(eval_harness_dir, docs_data_dir) - - click.echo("\n✅ Dashboard data generated successfully!") - - click.echo("\n📁 Generated Files:") - for name, path in generated_files.items(): - click.echo(f" • {name}: {path.relative_to(repo_path)}") - if verbose: - # Show file size - size = path.stat().st_size - click.echo(f" Size: {size:,} bytes") - - click.echo("\n📈 Next Steps:") - click.echo(" 1. Review generated data in docs/_data/tbench/") - click.echo(" 2. Create dashboard page: docs/tbench.md") - click.echo(" 3. Update navigation: docs/_config.yml") - click.echo(" 4. Commit and push to GitHub Pages") - - click.echo("\n💡 Tip:") - click.echo( - " The dashboard will auto-update when you run 'eval-harness run-tier'" - ) - - except FileNotFoundError as e: - click.echo(f"❌ {str(e)}", err=True) - click.echo( - "\nRun 'agentready eval-harness run-tier --tier 1' first to generate summary." 
- ) - sys.exit(1) - except Exception as e: - click.echo(f"❌ Error: {str(e)}", err=True) - if verbose: - import traceback - - traceback.print_exc() - sys.exit(1) diff --git a/src/agentready/cli/harbor.py b/src/agentready/cli/harbor.py deleted file mode 100644 index a1db8d71..00000000 --- a/src/agentready/cli/harbor.py +++ /dev/null @@ -1,361 +0,0 @@ -"""Harbor benchmark comparison CLI commands.""" - -import json -from datetime import datetime -from pathlib import Path - -import click - -from agentready.models.harbor import HarborComparison, HarborRunMetrics -from agentready.reporters.harbor_markdown import generate_markdown_report -from agentready.services.harbor.agent_toggler import AgentFileToggler -from agentready.services.harbor.comparer import compare_runs -from agentready.services.harbor.dashboard_generator import ( - DashboardGenerator, - generate_dashboard, -) -from agentready.services.harbor.result_parser import parse_harbor_results -from agentready.services.harbor.runner import HarborNotInstalledError, HarborRunner - - -def _run_benchmark_phase( - runner: HarborRunner, - toggler: AgentFileToggler, - phase_name: str, - run_number: int, - output_dir: Path, - task_list: list, - model: str, - verbose: bool, - disable_agent: bool, -) -> Path: - """Run a single benchmark phase (with or without agent). - - Returns: - Path to results directory - """ - click.echo("=" * 60) - click.echo(f"RUN {run_number}: {phase_name}") - click.echo("=" * 60) - click.echo() - - try: - if disable_agent: - with toggler.temporarily_disabled(): - click.echo("Agent file disabled. Running benchmark...") - runner.run_benchmark( - task_names=task_list, - output_dir=output_dir, - model=model, - verbose=verbose, - ) - else: - click.echo("Agent file enabled. Running benchmark...") - runner.run_benchmark( - task_names=task_list, - output_dir=output_dir, - model=model, - verbose=verbose, - ) - except Exception as e: - click.echo(f"❌ Benchmark failed: {e}", err=True) - # Context manager automatically restores agent file in finally block - raise click.Abort() - - click.echo(f"✓ Run {run_number} complete\n") - return output_dir - - -def _generate_reports( - comparison: HarborComparison, - run_dir: Path, - output_dir: Path, - timestamp: str, -) -> dict: - """Generate all report formats (JSON, Markdown, HTML). 
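`_run_benchmark_phase` above relies on `AgentFileToggler.temporarily_disabled()` to hide the agent file for the "without agent" run and to restore it even when the benchmark raises. The toggler itself is deleted elsewhere in this patch and its body is not shown here, so the following is only a sketch of the assumed rename-based mechanism, not the removed implementation:

```python
from contextlib import contextmanager
from pathlib import Path


@contextmanager
def temporarily_disabled(agent_file: Path):
    """Hide an agent file for one benchmark phase, then restore it.

    Assumed mechanism: rename to a sidecar path. The deleted
    AgentFileToggler may have implemented this differently.
    """
    disabled_path = agent_file.with_suffix(agent_file.suffix + ".disabled")
    agent_file.rename(disabled_path)
    try:
        yield
    finally:
        # Restore even if the benchmark raises, matching the comment in
        # _run_benchmark_phase about the finally block.
        disabled_path.rename(agent_file)
```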
- - Returns: - Dictionary of report paths - """ - comparison_base = run_dir / f"comparison_{timestamp}" - paths = {} - - # Generate JSON report - paths["json"] = comparison_base.with_suffix(".json") - with open(paths["json"], "w") as f: - json.dump(comparison.to_dict(), f, indent=2) - click.echo(f" ✓ JSON: {paths['json']}") - - # Generate Markdown report - paths["markdown"] = comparison_base.with_suffix(".md") - generate_markdown_report(comparison, paths["markdown"]) - click.echo(f" ✓ Markdown: {paths['markdown']}") - - # Generate HTML dashboard - paths["html"] = comparison_base.with_suffix(".html") - generate_dashboard(comparison, paths["html"]) - click.echo(f" ✓ HTML: {paths['html']}") - - # Create 'latest' symlinks for easy access - _create_latest_symlinks(paths, output_dir) - - return paths - - -def _create_latest_symlinks(paths: dict, output_dir: Path) -> None: - """Create 'latest' symlinks to most recent comparison files.""" - try: - for format_name, source_path in paths.items(): - extension = source_path.suffix - latest_link = output_dir / f"comparison_latest{extension}" - - # Remove old symlink if exists - if latest_link.exists() or latest_link.is_symlink(): - latest_link.unlink() - - # Create new symlink - latest_link.symlink_to(source_path.relative_to(output_dir)) - - click.echo(f"\n ✓ Latest: {output_dir}/comparison_latest.*") - except Exception: - # Symlinks might fail on Windows, just skip - pass - - -@click.group(name="harbor") -def harbor_cli(): - """Harbor benchmark comparison commands. - - Compare Claude Code performance with/without the doubleagent.md agent file - using the Harbor benchmarking framework. - """ - pass - - -@harbor_cli.command(name="compare") -@click.option( - "-t", - "--task", - "tasks", - multiple=True, - help="Task name to benchmark (can be specified multiple times)", -) -@click.option( - "--model", - default="anthropic/claude-sonnet-4-5", - help="Model identifier (default: anthropic/claude-sonnet-4-5)", -) -@click.option( - "--agent-file", - type=click.Path(exists=True, path_type=Path), - default=".claude/agents/doubleagent.md", - help="Path to agent file (default: .claude/agents/doubleagent.md)", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=".agentready/harbor_comparisons", - help="Output directory for results (default: .agentready/harbor_comparisons)", -) -@click.option("--verbose", is_flag=True, help="Print detailed Harbor output") -@click.option( - "--open-dashboard", is_flag=True, help="Open HTML dashboard after comparison" -) -def compare( - tasks: tuple, - model: str, - agent_file: Path, - output_dir: Path, - verbose: bool, - open_dashboard: bool, -): - """Compare Harbor benchmarks with/without agent file. - - Runs Terminal-Bench tasks twice: - 1. Without doubleagent.md (agent file disabled) - 2. With doubleagent.md (agent file enabled) - - Generates comprehensive comparison reports (JSON, Markdown, HTML). 
- - Example: - agentready harbor compare -t adaptive-rejection-sampler -t async-http-client - """ - click.echo("=" * 60) - click.echo("Harbor Benchmark Comparison") - click.echo("=" * 60) - click.echo() - - # Validate agent file exists - if not agent_file.exists(): - click.echo(f"❌ Error: Agent file not found: {agent_file}", err=True) - click.echo( - " This comparison requires the doubleagent.md agent file.", err=True - ) - raise click.Abort() - - # Validate tasks specified - if not tasks: - click.echo( - "❌ Error: At least one task must be specified with -t/--task", err=True - ) - click.echo( - " Example: agentready harbor compare -t adaptive-rejection-sampler", - err=True, - ) - raise click.Abort() - - task_list = list(tasks) - click.echo(f"Tasks to benchmark: {', '.join(task_list)}") - click.echo(f"Model: {model}") - click.echo(f"Agent file: {agent_file}") - click.echo() - - try: - # Initialize Harbor runner - click.echo("Checking Harbor installation...") - runner = HarborRunner() - click.echo("✓ Harbor installed\n") - - except HarborNotInstalledError as e: - click.echo(f"❌ {e}", err=True) - raise click.Abort() - - # Create timestamped output directory - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - run_dir = output_dir / f"run_{timestamp}" - run_dir.mkdir(parents=True, exist_ok=True) - - # Initialize agent file toggler - toggler = AgentFileToggler(agent_file) - - # Run benchmarks with and without agent file - without_results_dir = _run_benchmark_phase( - runner=runner, - toggler=toggler, - phase_name="WITHOUT doubleagent.md", - run_number=1, - output_dir=run_dir / "without_agent", - task_list=task_list, - model=model, - verbose=verbose, - disable_agent=True, - ) - - with_results_dir = _run_benchmark_phase( - runner=runner, - toggler=toggler, - phase_name="WITH doubleagent.md", - run_number=2, - output_dir=run_dir / "with_agent", - task_list=task_list, - model=model, - verbose=verbose, - disable_agent=False, - ) - - # Parse results - click.echo("Parsing results...") - try: - without_tasks = parse_harbor_results(without_results_dir) - with_tasks = parse_harbor_results(with_results_dir) - - without_metrics = HarborRunMetrics.from_task_results( - run_id=f"without_{timestamp}", - agent_file_enabled=False, - task_results=without_tasks, - ) - - with_metrics = HarborRunMetrics.from_task_results( - run_id=f"with_{timestamp}", agent_file_enabled=True, task_results=with_tasks - ) - - except Exception as e: - click.echo(f"❌ Failed to parse results: {e}", err=True) - raise click.Abort() - - # Compare runs - click.echo("Calculating comparison...") - comparison = compare_runs(without_metrics, with_metrics) - - # Generate reports - click.echo("Generating reports...") - report_paths = _generate_reports(comparison, run_dir, output_dir, timestamp) - - # Print summary - click.echo() - click.echo("=" * 60) - click.echo("SUMMARY") - click.echo("=" * 60) - - generator = DashboardGenerator() - summary = generator.generate_summary_text(comparison) - click.echo(summary) - - # Open dashboard if requested - if open_dashboard: - import webbrowser - - html_path = report_paths.get("html") - if html_path: - click.echo(f"\nOpening dashboard: {html_path}") - webbrowser.open(html_path.as_uri()) - - -@harbor_cli.command(name="list") -@click.option( - "--output-dir", - type=click.Path(exists=True, path_type=Path), - default=".agentready/harbor_comparisons", - help="Output directory containing comparisons", -) -def list_comparisons(output_dir: Path): - """List all Harbor comparisons.""" - click.echo(f"Harbor 
comparisons in {output_dir}:") - click.echo() - - comparison_files = sorted(output_dir.glob("*/comparison_*.json"), reverse=True) - - if not comparison_files: - click.echo(" No comparisons found.") - return - - for comp_file in comparison_files: - # Parse comparison to get summary - with open(comp_file, "r") as f: - data = json.load(f) - comparison = HarborComparison.from_dict(data) - - created = comparison.created_at - delta_success = comparison.deltas["success_rate_delta"] - delta_duration = comparison.deltas["avg_duration_delta_pct"] - - click.echo(f" {comp_file.parent.name}/") - click.echo(f" Created: {created}") - click.echo(f" Success Δ: {delta_success:+.1f}%") - click.echo(f" Duration Δ: {delta_duration:+.1f}%") - click.echo() - - -@harbor_cli.command(name="view") -@click.argument("comparison_file", type=click.Path(exists=True, path_type=Path)) -@click.option("--format", type=click.Choice(["summary", "full"]), default="summary") -def view_comparison(comparison_file: Path, format: str): - """View a Harbor comparison. - - COMPARISON_FILE: Path to comparison JSON file - """ - with open(comparison_file, "r") as f: - data = json.load(f) - comparison = HarborComparison.from_dict(data) - - if format == "summary": - generator = DashboardGenerator() - summary = generator.generate_summary_text(comparison) - click.echo(summary) - else: - # Full JSON output - click.echo(json.dumps(data, indent=2)) - - -if __name__ == "__main__": - harbor_cli() diff --git a/src/agentready/cli/main.py b/src/agentready/cli/main.py index f65330f7..92b5ab28 100644 --- a/src/agentready/cli/main.py +++ b/src/agentready/cli/main.py @@ -29,7 +29,6 @@ # Lightweight commands - imported immediately from .align import align -from .benchmark import benchmark, validate_assessor from .bootstrap import bootstrap from .demo import demo from .repomix import repomix_generate @@ -97,7 +96,6 @@ def get_command(self, ctx, cmd_name): "assess-batch": ("assess_batch", "assess_batch"), "experiment": ("experiment", "experiment"), "extract-skills": ("extract_skills", "extract_skills"), - "harbor": ("harbor", "harbor_cli"), "learn": ("learn", "learn"), "submit": ("submit", "submit"), }, @@ -573,8 +571,6 @@ def generate_config(): # Register lightweight commands (heavy commands loaded lazily via LazyGroup) cli.add_command(align) -cli.add_command(benchmark) -cli.add_command(validate_assessor) cli.add_command(bootstrap) cli.add_command(demo) cli.add_command(migrate_report) diff --git a/src/agentready/models/harbor.py b/src/agentready/models/harbor.py deleted file mode 100644 index ff63f238..00000000 --- a/src/agentready/models/harbor.py +++ /dev/null @@ -1,300 +0,0 @@ -"""Data models for Harbor benchmark integration.""" - -from dataclasses import dataclass, field -from datetime import datetime -from typing import Any, Dict, List, Optional - - -@dataclass -class HarborTaskResult: - """Single task result from Harbor result.json.""" - - task_name: str - trial_name: str - success: bool - duration_sec: float - agent_result: Optional[Dict[str, Any]] - verifier_result: Optional[Dict[str, Any]] - exception_info: Optional[Dict[str, str]] - started_at: str - finished_at: str - - @classmethod - def from_result_json(cls, result_data: Dict[str, Any]) -> "HarborTaskResult": - """Create HarborTaskResult from parsed result.json data.""" - # Parse timestamps to calculate duration - started = datetime.fromisoformat(result_data["started_at"]) - finished = datetime.fromisoformat(result_data["finished_at"]) - duration_sec = (finished - started).total_seconds() - - 
# Determine success based on agent_result and verifier_result - agent_result = result_data.get("agent_result") - verifier_result = result_data.get("verifier_result") - exception_info = result_data.get("exception_info") - - # Success if no exception and both agent and verifier completed - success = ( - exception_info is None - and agent_result is not None - and verifier_result is not None - ) - - return cls( - task_name=result_data["task_name"], - trial_name=result_data["trial_name"], - success=success, - duration_sec=duration_sec, - agent_result=agent_result, - verifier_result=verifier_result, - exception_info=exception_info, - started_at=result_data["started_at"], - finished_at=result_data["finished_at"], - ) - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - return { - "task_name": self.task_name, - "trial_name": self.trial_name, - "success": self.success, - "duration_sec": self.duration_sec, - "agent_result": self.agent_result, - "verifier_result": self.verifier_result, - "exception_info": self.exception_info, - "started_at": self.started_at, - "finished_at": self.finished_at, - } - - -@dataclass -class HarborRunMetrics: - """Aggregated metrics for a Harbor run.""" - - run_id: str - agent_file_enabled: bool - task_results: List[HarborTaskResult] - success_rate: float - completion_rate: float - avg_duration_sec: float - total_tasks: int - successful_tasks: int - failed_tasks: int - timed_out_tasks: int - - @classmethod - def from_task_results( - cls, run_id: str, agent_file_enabled: bool, task_results: List[HarborTaskResult] - ) -> "HarborRunMetrics": - """Calculate aggregated metrics from task results.""" - total_tasks = len(task_results) - successful_tasks = sum(1 for r in task_results if r.success) - failed_tasks = sum( - 1 for r in task_results if not r.success and r.exception_info is None - ) - timed_out_tasks = sum( - 1 - for r in task_results - if not r.success - and r.exception_info - and "timeout" in r.exception_info.get("exception_type", "").lower() - ) - - success_rate = ( - (successful_tasks / total_tasks * 100) if total_tasks > 0 else 0.0 - ) - completion_rate = ( - (successful_tasks + failed_tasks) / total_tasks * 100 - if total_tasks > 0 - else 0.0 - ) - - # Calculate average duration (only for completed tasks) - completed_results = [r for r in task_results if r.agent_result is not None] - avg_duration_sec = ( - sum(r.duration_sec for r in completed_results) / len(completed_results) - if completed_results - else 0.0 - ) - - return cls( - run_id=run_id, - agent_file_enabled=agent_file_enabled, - task_results=task_results, - success_rate=success_rate, - completion_rate=completion_rate, - avg_duration_sec=avg_duration_sec, - total_tasks=total_tasks, - successful_tasks=successful_tasks, - failed_tasks=failed_tasks, - timed_out_tasks=timed_out_tasks, - ) - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - return { - "run_id": self.run_id, - "agent_file_enabled": self.agent_file_enabled, - "task_results": [r.to_dict() for r in self.task_results], - "success_rate": self.success_rate, - "completion_rate": self.completion_rate, - "avg_duration_sec": self.avg_duration_sec, - "total_tasks": self.total_tasks, - "successful_tasks": self.successful_tasks, - "failed_tasks": self.failed_tasks, - "timed_out_tasks": self.timed_out_tasks, - } - - -@dataclass -class HarborComparison: - """Complete comparison between two Harbor runs.""" - - without_agent: HarborRunMetrics - with_agent: HarborRunMetrics - 
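`from_result_json` derives the duration from the two ISO timestamps and counts a task as successful only when there is no exception and both the agent and verifier results are present; `from_task_results` then turns a list of such results into rates. A small worked sketch of that arithmetic, with illustrative values for the fields the parser reads:

```python
from datetime import datetime

# One record shaped like the fields from_result_json reads (values are illustrative).
record = {
    "task_name": "example-task",
    "trial_name": "trial_1",
    "started_at": "2025-01-01T10:00:00",
    "finished_at": "2025-01-01T10:12:30",
    "agent_result": {},       # present: the agent completed
    "verifier_result": {},    # present: the verifier completed
    "exception_info": None,   # no exception raised
}

duration_sec = (
    datetime.fromisoformat(record["finished_at"])
    - datetime.fromisoformat(record["started_at"])
).total_seconds()                                      # 750.0 seconds
success = (
    record["exception_info"] is None
    and record["agent_result"] is not None
    and record["verifier_result"] is not None
)                                                      # True

# Aggregation as in from_task_results: 10 tasks with 6 successes,
# 2 non-exception failures, and 2 timeouts.
total, successful, failed = 10, 6, 2
success_rate = successful / total * 100                # 60.0 %
completion_rate = (successful + failed) / total * 100  # 80.0 %
```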
deltas: Dict[str, float] = field(default_factory=dict) - statistical_significance: Dict[str, bool] = field(default_factory=dict) - per_task_comparison: List[Dict[str, Any]] = field(default_factory=list) - created_at: str = field(default_factory=lambda: datetime.now().isoformat()) - - def calculate_deltas(self) -> None: - """Calculate delta metrics between runs.""" - success_rate_delta = ( - self.with_agent.success_rate - self.without_agent.success_rate - ) - completion_rate_delta = ( - self.with_agent.completion_rate - self.without_agent.completion_rate - ) - duration_delta_sec = ( - self.with_agent.avg_duration_sec - self.without_agent.avg_duration_sec - ) - successful_tasks_delta = ( - self.with_agent.successful_tasks - self.without_agent.successful_tasks - ) - - # Calculate percentage change in duration - duration_delta_pct = None - if self.without_agent.avg_duration_sec > 0: - duration_delta_pct = ( - duration_delta_sec / self.without_agent.avg_duration_sec * 100 - ) - - self.deltas = { - "success_rate_delta": success_rate_delta, - "completion_rate_delta": completion_rate_delta, - "avg_duration_delta_sec": duration_delta_sec, - "avg_duration_delta_pct": duration_delta_pct, - "successful_tasks_delta": successful_tasks_delta, - } - - def generate_per_task_comparison(self) -> None: - """Generate per-task comparison details.""" - # Create lookup dictionary for tasks - without_tasks = {r.task_name: r for r in self.without_agent.task_results} - with_tasks = {r.task_name: r for r in self.with_agent.task_results} - - self.per_task_comparison = [] - all_task_names = set(without_tasks.keys()) | set(with_tasks.keys()) - - for task_name in all_task_names: - without_result = without_tasks.get(task_name) - with_result = with_tasks.get(task_name) - - comparison = {"task_name": task_name} - - # Add without_agent result if exists - if without_result: - comparison["without_agent"] = { - "success": without_result.success, - "duration_sec": without_result.duration_sec, - } - else: - comparison["without_agent"] = None - - # Add with_agent result if exists - if with_result: - comparison["with_agent"] = { - "success": with_result.success, - "duration_sec": with_result.duration_sec, - } - else: - comparison["with_agent"] = None - - # Calculate per-task delta if both results exist - if without_result and with_result: - comparison["delta"] = self._calculate_task_delta( - without_result, with_result - ) - - self.per_task_comparison.append(comparison) - - def _calculate_task_delta( - self, without_result: HarborTaskResult, with_result: HarborTaskResult - ) -> Dict[str, Any]: - """Calculate delta between two task results.""" - duration_delta_sec = with_result.duration_sec - without_result.duration_sec - duration_delta_pct = None - - if without_result.duration_sec > 0: - duration_delta_pct = duration_delta_sec / without_result.duration_sec * 100 - - return { - "success_improved": with_result.success and not without_result.success, - "duration_delta_sec": duration_delta_sec, - "duration_delta_pct": duration_delta_pct, - } - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - return { - "without_agent": self.without_agent.to_dict(), - "with_agent": self.with_agent.to_dict(), - "deltas": self.deltas, - "statistical_significance": self.statistical_significance, - "per_task_comparison": self.per_task_comparison, - "created_at": self.created_at, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "HarborComparison": - """Create HarborComparison from dictionary.""" 
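`calculate_deltas` subtracts the without-agent metrics from the with-agent metrics and expresses the duration change as a percentage of the without-agent average. Worked numbers, as a quick illustration (the values are made up):

```python
# Without the agent file: 50.0 % success rate, 300 s average duration.
# With the agent file:    75.0 % success rate, 240 s average duration.
success_rate_delta = 75.0 - 50.0               # +25.0 percentage points
avg_duration_delta_sec = 240.0 - 300.0         # -60.0 s
avg_duration_delta_pct = -60.0 / 300.0 * 100   # -20.0 % (faster with the agent)
```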
- without_agent_data = data["without_agent"] - with_agent_data = data["with_agent"] - - without_agent = HarborRunMetrics( - run_id=without_agent_data["run_id"], - agent_file_enabled=without_agent_data["agent_file_enabled"], - task_results=[ - HarborTaskResult(**r) for r in without_agent_data["task_results"] - ], - success_rate=without_agent_data["success_rate"], - completion_rate=without_agent_data["completion_rate"], - avg_duration_sec=without_agent_data["avg_duration_sec"], - total_tasks=without_agent_data["total_tasks"], - successful_tasks=without_agent_data["successful_tasks"], - failed_tasks=without_agent_data["failed_tasks"], - timed_out_tasks=without_agent_data["timed_out_tasks"], - ) - - with_agent = HarborRunMetrics( - run_id=with_agent_data["run_id"], - agent_file_enabled=with_agent_data["agent_file_enabled"], - task_results=[ - HarborTaskResult(**r) for r in with_agent_data["task_results"] - ], - success_rate=with_agent_data["success_rate"], - completion_rate=with_agent_data["completion_rate"], - avg_duration_sec=with_agent_data["avg_duration_sec"], - total_tasks=with_agent_data["total_tasks"], - successful_tasks=with_agent_data["successful_tasks"], - failed_tasks=with_agent_data["failed_tasks"], - timed_out_tasks=with_agent_data["timed_out_tasks"], - ) - - return cls( - without_agent=without_agent, - with_agent=with_agent, - deltas=data.get("deltas", {}), - statistical_significance=data.get("statistical_significance", {}), - per_task_comparison=data.get("per_task_comparison", []), - created_at=data.get("created_at", datetime.now().isoformat()), - ) diff --git a/src/agentready/reporters/harbor_markdown.py b/src/agentready/reporters/harbor_markdown.py deleted file mode 100644 index 8f4fa9e1..00000000 --- a/src/agentready/reporters/harbor_markdown.py +++ /dev/null @@ -1,260 +0,0 @@ -"""Markdown reporter for Harbor comparisons.""" - -from pathlib import Path - -from agentready.models.harbor import HarborComparison - - -class HarborMarkdownReporter: - """Generate GitHub-Flavored Markdown reports for Harbor comparisons.""" - - def generate(self, comparison: HarborComparison, output_path: Path) -> None: - """Generate Markdown report. - - Creates a GitHub-Flavored Markdown file with: - - Summary table - - Statistical significance indicators - - Per-task breakdown - - Recommendations - - Args: - comparison: HarborComparison with calculated deltas - output_path: Path to write Markdown file - """ - markdown = self._build_markdown(comparison) - - output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, "w") as f: - f.write(markdown) - - def _build_markdown(self, comparison: HarborComparison) -> str: - """Build complete Markdown content. - - Args: - comparison: HarborComparison object - - Returns: - Complete Markdown content as string - """ - sections = [ - self._header(comparison), - self._summary_table(comparison), - self._statistical_significance(comparison), - self._per_task_results(comparison), - self._conclusion(comparison), - ] - - return "\n\n".join(sections) - - def _header(self, comparison: HarborComparison) -> str: - """Generate header section.""" - task_names = [t["task_name"] for t in comparison.per_task_comparison] - return f"""# Harbor Benchmark Comparison - -**Created**: {comparison.created_at} -**Tasks**: {len(task_names)} ({', '.join(task_names[:3])}{'...' 
if len(task_names) > 3 else ''}) -**Agent File**: `.claude/agents/doubleagent.md`""" - - def _summary_table(self, comparison: HarborComparison) -> str: - """Generate summary metrics table.""" - without = comparison.without_agent - with_agent = comparison.with_agent - deltas = comparison.deltas - sig = comparison.statistical_significance - - # Format significance indicators - success_sig = self._format_significance( - sig.get("success_rate_significant"), sig.get("success_rate_p_value") - ) - duration_sig = self._format_significance( - sig.get("duration_significant"), sig.get("duration_p_value") - ) - - return f"""## Summary - -| Metric | Without Agent | With Agent | Delta | Significant? | -|--------|--------------|------------|-------|--------------| -| Success Rate | {without.success_rate:.1f}% | {with_agent.success_rate:.1f}% | {deltas['success_rate_delta']:+.1f}% | {success_sig} | -| Completion Rate | {without.completion_rate:.1f}% | {with_agent.completion_rate:.1f}% | {deltas['completion_rate_delta']:+.1f}% | - | -| Avg Duration | {without.avg_duration_sec / 60:.1f} min | {with_agent.avg_duration_sec / 60:.1f} min | {deltas['avg_duration_delta_pct']:+.1f}% | {duration_sig} | -| Successful Tasks | {without.successful_tasks}/{without.total_tasks} | {with_agent.successful_tasks}/{with_agent.total_tasks} | {deltas['successful_tasks_delta']:+d} | - |""" - - def _format_significance( - self, is_significant: bool = None, p_value: float = None - ) -> str: - """Format statistical significance indicator. - - Args: - is_significant: Whether difference is statistically significant - p_value: P-value from statistical test - - Returns: - Formatted string (e.g., "✓ (p=0.04)" or "✗ (p=0.23)") - """ - if is_significant is None or p_value is None: - return "-" - - symbol = "✓" if is_significant else "✗" - return f"{symbol} (p={p_value:.4f})" - - def _statistical_significance(self, comparison: HarborComparison) -> str: - """Generate statistical significance section.""" - sig = comparison.statistical_significance - - if not sig.get("success_rate_p_value") and not sig.get("duration_p_value"): - return "## Statistical Analysis\n\n*Statistical tests not available (scipy not installed)*" - - lines = ["## Statistical Analysis"] - - # Success rate analysis - if sig.get("success_rate_p_value") is not None: - is_sig = sig["success_rate_significant"] - p_val = sig["success_rate_p_value"] - - if is_sig: - lines.append( - f"- **Success Rate**: Statistically significant improvement " - f"(p={p_val:.4f}, p<0.05)" - ) - else: - lines.append( - f"- **Success Rate**: No statistically significant difference " - f"(p={p_val:.4f}, p≥0.05)" - ) - - # Duration analysis - if sig.get("duration_p_value") is not None: - is_sig = sig["duration_significant"] - p_val = sig["duration_p_value"] - cohens_d = sig.get("duration_cohens_d") - - if is_sig: - effect_text = "" - if cohens_d is not None: - from agentready.services.harbor.comparer import ( - interpret_effect_size, - ) - - effect = interpret_effect_size(cohens_d) - effect_text = f" with {effect} effect size (d={cohens_d:.2f})" - - lines.append( - f"- **Duration**: Statistically significant difference " - f"(p={p_val:.4f}, p<0.05){effect_text}" - ) - else: - lines.append( - f"- **Duration**: No statistically significant difference " - f"(p={p_val:.4f}, p≥0.05)" - ) - - return "\n".join(lines) - - def _per_task_results(self, comparison: HarborComparison) -> str: - """Generate per-task results section.""" - lines = ["## Per-Task Results"] - - for task_comp in 
comparison.per_task_comparison: - lines.append(f"\n### {task_comp['task_name']}") - lines.append( - self._format_task_result( - "Without Agent", task_comp.get("without_agent") - ) - ) - lines.append( - self._format_task_result("With Agent", task_comp.get("with_agent")) - ) - - # Add impact analysis if delta exists - if "delta" in task_comp: - lines.append(self._format_task_impact(task_comp["delta"])) - - return "\n".join(lines) - - def _format_task_result(self, label: str, result: dict) -> str: - """Format a single task result line.""" - if not result: - return f"- **{label}**: N/A" - - status = "✓ Success" if result.get("success") else "✗ Failed" - duration = result.get("duration_sec", 0) / 60 - return f"- **{label}**: {status} ({duration:.1f} min)" - - def _format_task_impact(self, delta: dict) -> str: - """Format task impact analysis.""" - if delta.get("success_improved"): - return "- **Impact**: +100% success (fixed failure)" - - duration_pct = delta.get("duration_delta_pct") - if duration_pct: - direction = "faster" if duration_pct < 0 else "slower" - return f"- **Impact**: {abs(duration_pct):.1f}% {direction}" - - return "- **Impact**: No change" - - def _conclusion(self, comparison: HarborComparison) -> str: - """Generate conclusion and recommendations.""" - deltas = comparison.deltas - sig = comparison.statistical_significance - - lines = ["## Conclusion"] - - # Determine overall recommendation - success_improved = deltas["success_rate_delta"] > 0 - duration_improved = deltas["avg_duration_delta_pct"] < 0 - statistically_significant = sig.get("success_rate_significant") or sig.get( - "duration_significant" - ) - - if success_improved and statistically_significant: - lines.append( - f"\nThe `doubleagent.md` agent file shows **statistically significant improvement** " - f"in success rate ({deltas['success_rate_delta']:+.1f}%)" - ) - - if duration_improved: - lines.append( - f"and execution speed ({deltas['avg_duration_delta_pct']:+.1f}%)." - ) - else: - lines.append(".") - - lines.append( - "\n**Recommendation**: ✅ **Include `doubleagent.md`** " - "in AgentReady development workflows." - ) - - elif success_improved or duration_improved: - lines.append("\nThe `doubleagent.md` agent file shows improvements:") - if success_improved: - lines.append(f"- Success rate: {deltas['success_rate_delta']:+.1f}%") - if duration_improved: - lines.append(f"- Duration: {deltas['avg_duration_delta_pct']:+.1f}%") - - lines.append( - "\nHowever, differences are not statistically significant (larger sample size recommended)." - ) - lines.append( - "\n**Recommendation**: ⚠️ **Consider including** `doubleagent.md` " - "but validate with larger benchmark." - ) - - else: - lines.append("\nNo significant improvement detected.") - lines.append( - "\n**Recommendation**: ❌ **Agent file may not provide measurable benefit** " - "for tested tasks." - ) - - return "\n".join(lines) - - -def generate_markdown_report(comparison: HarborComparison, output_path: Path) -> None: - """Convenience function to generate Markdown report. 
- - Args: - comparison: HarborComparison with calculated deltas - output_path: Path to write Markdown file - """ - reporter = HarborMarkdownReporter() - reporter.generate(comparison, output_path) diff --git a/src/agentready/services/eval_harness/__init__.py b/src/agentready/services/eval_harness/__init__.py deleted file mode 100644 index a69e84c3..00000000 --- a/src/agentready/services/eval_harness/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Terminal-Bench evaluation harness for assessor effectiveness testing.""" diff --git a/src/agentready/services/eval_harness/aggregator.py b/src/agentready/services/eval_harness/aggregator.py deleted file mode 100644 index 71426936..00000000 --- a/src/agentready/services/eval_harness/aggregator.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Benchmark results aggregation for assessor effectiveness analysis. - -This module provides functionality to aggregate Terminal-Bench results across -multiple repositories to identify high-impact vs low-impact assessors. -""" - -import pandas as pd - -# Significance threshold for mean delta (placeholder for statistical test) -SIGNIFICANCE_THRESHOLD = 0.05 - - -def aggregate_results(results: list[dict]) -> pd.DataFrame: - """ - Aggregate benchmark results by assessor. - - Generic interface for aggregating benchmark results across multiple - repositories. Follows the principle of "generic interfaces first, - then consumers" - this function is consumed by CLI commands, reporting - tools, and analysis scripts. - - Args: - results: List of dicts with keys: - - assessor_id: Identifier for the assessor - - delta_score: Score improvement (can be negative for regressions) - - Returns: - DataFrame indexed by assessor_id with columns: - - mean_delta: Average score improvement - - median_delta: Median score improvement - - std_delta: Standard deviation of improvements - - sample_size: Number of repositories tested - - significant: Boolean indicator (placeholder: abs(mean) > 0.05) - Sorted by mean_delta descending (highest impact first) - - Examples: - >>> results = [ - ... {"assessor_id": "claude_md", "delta_score": 0.12}, - ... {"assessor_id": "claude_md", "delta_score": 0.10}, - ... ] - >>> summary = aggregate_results(results) - >>> summary.loc["claude_md"]["mean_delta"] - 0.11 - """ - # Handle empty results - if not results: - return pd.DataFrame( - columns=[ - "mean_delta", - "median_delta", - "std_delta", - "sample_size", - "significant", - ] - ) - - # 1. Create DataFrame from results - df = pd.DataFrame(results) - - # 2. Aggregate with pandas groupby - summary = df.groupby("assessor_id").agg( - {"delta_score": ["mean", "median", "std", "count"]} - ) - - # 3. Rename aggregated columns - summary.columns = ["mean_delta", "median_delta", "std_delta", "sample_size"] - - # 4. Handle NaN in std (occurs with single value) - summary["std_delta"] = summary["std_delta"].fillna(0.0) - - # 5. Round to 2 decimal places for readability - summary = summary.round(2) - - # 5. Add statistical significance placeholder - # Placeholder: abs(mean_delta) > 0.05 - # Future: Replace with proper statistical test (t-test, etc.) - summary["significant"] = summary["mean_delta"].abs() > SIGNIFICANCE_THRESHOLD - - # 6. 
Sort by mean_delta descending (highest impact first) - summary = summary.sort_values("mean_delta", ascending=False) - - return summary diff --git a/src/agentready/services/eval_harness/assessor_tester.py b/src/agentready/services/eval_harness/assessor_tester.py deleted file mode 100644 index df6834c6..00000000 --- a/src/agentready/services/eval_harness/assessor_tester.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Service for testing individual assessor impact on Terminal-Bench.""" - -import shutil -import statistics -import tempfile -from pathlib import Path -from typing import List - -from scipy import stats - -from ...assessors import create_all_assessors -from ...models.eval_harness import AssessorImpact, BaselineMetrics, TbenchResult -from ...services.fixer_service import FixerService -from ...services.scanner import Scanner -from .tbench_runner import TbenchRunner - - -class AssessorTester: - """Test a single assessor's impact on Terminal-Bench performance. - - This is the core A/B testing logic that: - 1. Clones the repo to a temp directory (fresh copy) - 2. Runs assessment with ONLY the specified assessor - 3. Applies fixes using FixerService (align command) - 4. Runs tbench post-remediation - 5. Calculates delta, p-value, and effect size (Cohen's d) - 6. Returns AssessorImpact with statistical significance - """ - - def __init__(self, tbench_runner: TbenchRunner = None): - """Initialize with optional tbench runner. - - Args: - tbench_runner: TbenchRunner instance (defaults to mocked) - """ - self.tbench_runner = tbench_runner or TbenchRunner(mock=True) - self.fixer_service = FixerService() - - def test_assessor( - self, - assessor_id: str, - repo_path: Path, - baseline: BaselineMetrics, - iterations: int = 5, - output_dir: Path = None, - ) -> AssessorImpact: - """Test single assessor and measure impact against baseline. - - Args: - assessor_id: ID of assessor to test (e.g., "claude_md_file") - repo_path: Path to repository to test - baseline: Baseline metrics for comparison - iterations: Number of tbench runs post-remediation - output_dir: Directory to save results (optional) - - Returns: - AssessorImpact with delta score and statistical significance - - Raises: - ValueError: If assessor_id is not found - """ - # 1. Find the assessor - all_assessors = create_all_assessors() - assessor = next( - (a for a in all_assessors if a.attribute_id == assessor_id), None - ) - if not assessor: - valid_ids = [a.attribute_id for a in all_assessors] - raise ValueError( - f"Assessor '{assessor_id}' not found. Valid IDs: {', '.join(valid_ids)}" - ) - - # 2. Clone repo to temp directory - with tempfile.TemporaryDirectory() as temp_dir: - temp_repo = Path(temp_dir) / "repo" - shutil.copytree(repo_path, temp_repo, symlinks=True) - - # 3. Run assessment with single assessor - scanner = Scanner(temp_repo) - assessment = scanner.scan([assessor], verbose=False) - - # 4. Apply remediation using FixerService - fix_plan = self.fixer_service.generate_fix_plan( - assessment, assessment.repository, attribute_ids=[assessor_id] - ) - remediation_log = [] - if fix_plan.fixes: - results = self.fixer_service.apply_fixes(fix_plan.fixes, dry_run=False) - remediation_log = [f.description for f in fix_plan.fixes] - fixes_applied = results["succeeded"] - else: - fixes_applied = 0 - remediation_log = ["No fixes available for this assessor"] - - # 5. 
Run tbench post-remediation - post_results: List[TbenchResult] = [] - for i in range(iterations): - result = self.tbench_runner.run_benchmark(temp_repo) - post_results.append(result) - - # Save individual run if output_dir provided - if output_dir: - from ...services.eval_harness.baseline import save_to_json - - run_file = output_dir / f"run_{i+1:03d}.json" - save_to_json(result, run_file) - - # 6. Calculate statistics - post_scores = [r.score for r in post_results] - baseline_scores = [r.score for r in baseline.raw_results] - - # Mean scores - baseline_score = baseline.mean_score - post_score = statistics.mean(post_scores) - delta_score = post_score - baseline_score - - # Statistical significance (two-sample t-test) - if len(baseline_scores) > 1 and len(post_scores) > 1: - t_stat, p_value = stats.ttest_ind(baseline_scores, post_scores) - else: - # Not enough samples for t-test - p_value = 1.0 - - # Effect size (Cohen's d) - effect_size = self._calculate_cohens_d(baseline_scores, post_scores) - - # Significance: p < 0.05 AND effect size > 0.2 (small effect) - is_significant = p_value < 0.05 and abs(effect_size) > 0.2 - - # 7. Save impact results if output_dir provided - impact = AssessorImpact( - assessor_id=assessor_id, - assessor_name=assessor.attribute.name, - tier=assessor.attribute.tier, - baseline_score=baseline_score, - post_remediation_score=post_score, - delta_score=delta_score, - p_value=p_value, - effect_size=effect_size, - is_significant=is_significant, - iterations=iterations, - fixes_applied=fixes_applied, - remediation_log=remediation_log, - ) - - if output_dir: - from ...services.eval_harness.baseline import save_to_json - - impact_file = output_dir / "impact.json" - save_to_json(impact, impact_file) - - return impact - - @staticmethod - def _calculate_cohens_d(group1: List[float], group2: List[float]) -> float: - """Calculate Cohen's d effect size. - - Cohen's d measures the standardized difference between two means. - - Interpretation: - - |d| < 0.2: negligible - - 0.2 <= |d| < 0.5: small - - 0.5 <= |d| < 0.8: medium - - |d| >= 0.8: large - - Args: - group1: Baseline scores - group2: Post-remediation scores - - Returns: - Cohen's d effect size (positive = improvement, negative = regression) - """ - if len(group1) < 2 or len(group2) < 2: - return 0.0 - - mean1 = statistics.mean(group1) - mean2 = statistics.mean(group2) - std1 = statistics.stdev(group1) - std2 = statistics.stdev(group2) - - # Pooled standard deviation - n1 = len(group1) - n2 = len(group2) - pooled_std = ((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2) - pooled_std = pooled_std**0.5 - - if pooled_std == 0: - return 0.0 - - # Cohen's d = (mean2 - mean1) / pooled_std - return (mean2 - mean1) / pooled_std diff --git a/src/agentready/services/eval_harness/baseline.py b/src/agentready/services/eval_harness/baseline.py deleted file mode 100644 index 4e548aec..00000000 --- a/src/agentready/services/eval_harness/baseline.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Baseline establishment for Terminal-Bench eval harness. - -Establishes baseline performance by running Terminal-Bench multiple times -on an unmodified repository and calculating statistical metrics. -""" - -import json -from pathlib import Path - -from ...models.eval_harness import BaselineMetrics, TbenchResult, save_to_json -from .tbench_runner import TbenchRunner - - -class BaselineEstablisher: - """Establishes baseline Terminal-Bench performance. 
- - Runs tbench multiple times on an unmodified repository to establish - the starting point for measuring assessor impact. Calculates mean, - std dev, median, min, max for statistical comparisons. - """ - - def __init__(self, tbench_runner: TbenchRunner = None): - """Initialize establisher. - - Args: - tbench_runner: TbenchRunner instance (defaults to mocked) - """ - self.tbench_runner = tbench_runner or TbenchRunner(mock=True) - - def establish_baseline( - self, repo_path: Path, iterations: int = 5, output_dir: Path = None - ) -> BaselineMetrics: - """Run tbench multiple times and calculate baseline metrics. - - Args: - repo_path: Path to repository to benchmark - iterations: Number of tbench runs to perform (default: 5) - output_dir: Optional directory to save results - (default: repo_path/.agentready/eval_harness/baseline) - - Returns: - BaselineMetrics with calculated statistics - - Raises: - ValueError: If repo_path is invalid or iterations < 1 - """ - # Validate inputs - if not repo_path.exists(): - raise ValueError(f"Repository path does not exist: {repo_path}") - - if iterations < 1: - raise ValueError(f"Iterations must be >= 1, got {iterations}") - - # Set default output directory - if output_dir is None: - output_dir = repo_path / ".agentready" / "eval_harness" / "baseline" - - output_dir.mkdir(parents=True, exist_ok=True) - - # Run tbench multiple times - results: list[TbenchResult] = [] - for i in range(iterations): - result = self.tbench_runner.run_benchmark(repo_path) - results.append(result) - - # Save individual run - run_file = output_dir / f"run_{i+1:03d}.json" - save_to_json(result, run_file) - - # Calculate baseline metrics - baseline = BaselineMetrics.from_results(results) - - # Save baseline summary - summary_file = output_dir / "summary.json" - save_to_json(baseline, summary_file) - - return baseline - - def load_baseline(self, baseline_dir: Path) -> BaselineMetrics: - """Load previously established baseline from directory. - - Args: - baseline_dir: Directory containing baseline results - - Returns: - BaselineMetrics loaded from summary.json - - Raises: - FileNotFoundError: If baseline_dir or summary.json doesn't exist - """ - summary_file = baseline_dir / "summary.json" - - if not summary_file.exists(): - raise FileNotFoundError( - f"Baseline summary not found: {summary_file}\n" - f"Run 'agentready eval-harness baseline' first" - ) - - with open(summary_file, "r", encoding="utf-8") as f: - data = json.load(f) - - return BaselineMetrics.from_dict(data) diff --git a/src/agentready/services/eval_harness/batch_runner.py b/src/agentready/services/eval_harness/batch_runner.py deleted file mode 100644 index ea4f00ca..00000000 --- a/src/agentready/services/eval_harness/batch_runner.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Parallel benchmark execution for Terminal-Bench eval harness. - -This module provides resource-limited parallel execution using ProcessPoolExecutor -to handle large batches of benchmark jobs without exhausting system resources. -""" - -import logging -from concurrent.futures import ProcessPoolExecutor, as_completed -from pathlib import Path - -from agentready.services.eval_harness.tbench_runner import ( - TbenchResult, - _real_tbench_result, -) - -# Resource limits for parallel execution -MAX_WORKERS = 4 -JOB_TIMEOUT = 3600 # seconds - -logger = logging.getLogger(__name__) - - -def run_batch_benchmarks(repositories: list[Path]) -> list[TbenchResult]: - """ - Execute Terminal-Bench benchmarks in parallel with resource limits. 
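Taken together, the two `BaselineEstablisher` methods above gave a simple persist-then-reload workflow before this removal. A minimal sketch of how they were wired, using the defaults shown in this module and the mocked runner:

```python
from pathlib import Path

# Both imports refer to modules deleted by this patch.
from agentready.services.eval_harness.baseline import BaselineEstablisher
from agentready.services.eval_harness.tbench_runner import TbenchRunner

repo = Path(".").resolve()
establisher = BaselineEstablisher(tbench_runner=TbenchRunner(mock=True))

# Runs the (mocked) benchmark five times and writes run_001.json .. run_005.json
# plus summary.json under <repo>/.agentready/eval_harness/baseline/.
baseline = establisher.establish_baseline(repo, iterations=5)

# Later commands reload the saved summary instead of re-running the benchmark.
reloaded = establisher.load_baseline(repo / ".agentready" / "eval_harness" / "baseline")
print(f"{reloaded.mean_score:.2f} ± {reloaded.std_dev:.2f}")
```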
- - Runs real Harbor framework benchmarks concurrently using ProcessPoolExecutor - with a maximum of 4 workers to prevent system resource exhaustion. Each job - has a 3600-second timeout. Failures are logged but don't block other jobs. - - Args: - repositories: List of repository paths to benchmark - - Returns: - List of TbenchResult objects for successful benchmarks only. - Failed benchmarks are logged and excluded from results. - - Examples: - >>> repos = [Path("/path/to/repo1"), Path("/path/to/repo2")] - >>> results = run_batch_benchmarks(repos) - >>> len(results) # May be less than len(repos) if some failed - 2 - """ - results = [] - - # Initialize ProcessPoolExecutor with resource limit - with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor: - # Submit all benchmark jobs - future_to_repo = { - executor.submit(_real_tbench_result, repo): repo for repo in repositories - } - - # Process results as they complete - for future in as_completed(future_to_repo): - repo = future_to_repo[future] - try: - # Get result with timeout - result = future.result(timeout=JOB_TIMEOUT) - results.append(result) - logger.info(f"Benchmark completed for {repo}: score={result.score}") - except Exception as exc: - # Log failure but continue processing other jobs - logger.error(f"Benchmark failed for {repo}: {exc}") - continue - - return results diff --git a/src/agentready/services/eval_harness/dashboard_generator.py b/src/agentready/services/eval_harness/dashboard_generator.py deleted file mode 100644 index 62756180..00000000 --- a/src/agentready/services/eval_harness/dashboard_generator.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Service for generating GitHub Pages dashboard data from evaluation results.""" - -from pathlib import Path - -from ...models.eval_harness import EvalSummary, load_from_json, save_to_json - - -class DashboardGenerator: - """Generate Jekyll-compatible data files for GitHub Pages dashboard. - - Responsibilities: - - Load evaluation summary - - Generate docs/_data/tbench/ data files for Jekyll - - Format data for Chart.js consumption - - Create summary.json, ranked_assessors.json, tier_impacts.json - """ - - def generate( - self, - eval_harness_dir: Path, - docs_data_dir: Path = None, - ) -> dict: - """Generate dashboard data files for GitHub Pages. - - Args: - eval_harness_dir: Directory containing summary.json - (e.g., .agentready/eval_harness/) - docs_data_dir: Jekyll _data directory - (defaults to docs/_data/tbench/) - - Returns: - Dict with paths to generated files - - Raises: - FileNotFoundError: If summary.json not found - """ - # Load summary - summary_file = eval_harness_dir / "summary.json" - if not summary_file.exists(): - raise FileNotFoundError( - f"Summary file not found: {summary_file}. " - "Run 'agentready eval-harness run-tier' or 'summarize' first." - ) - - summary = load_from_json(EvalSummary, summary_file) - - # Set output directory - if docs_data_dir is None: - # Default to docs/_data/tbench/ in repository root - repo_root = self._find_repo_root(eval_harness_dir) - docs_data_dir = repo_root / "docs" / "_data" / "tbench" - - docs_data_dir.mkdir(parents=True, exist_ok=True) - - # Generate data files - generated_files = {} - - # 1. Complete summary (for main dashboard) - summary_data_file = docs_data_dir / "summary.json" - save_to_json(summary, summary_data_file) - generated_files["summary"] = summary_data_file - - # 2. 
Ranked assessors (for leaderboard table) - ranked = summary.get_ranked_assessors() - ranked_data = [impact.to_dict() for impact in ranked] - ranked_file = docs_data_dir / "ranked_assessors.json" - self._save_json_list(ranked_data, ranked_file) - generated_files["ranked_assessors"] = ranked_file - - # 3. Tier impacts (for bar chart) - tier_data = [ - {"tier": tier, "delta": delta, "tier_name": self._tier_name(tier)} - for tier, delta in sorted(summary.tier_impacts.items()) - ] - tier_file = docs_data_dir / "tier_impacts.json" - self._save_json_list(tier_data, tier_file) - generated_files["tier_impacts"] = tier_file - - # 4. Baseline data (for comparison chart) - baseline_data = { - "mean_score": summary.baseline.mean_score, - "std_dev": summary.baseline.std_dev, - "median_score": summary.baseline.median_score, - "min_score": summary.baseline.min_score, - "max_score": summary.baseline.max_score, - "iterations": summary.baseline.iterations, - } - baseline_file = docs_data_dir / "baseline.json" - self._save_json_dict(baseline_data, baseline_file) - generated_files["baseline"] = baseline_file - - # 5. Summary stats (for overview cards) - stats_data = { - "total_assessors_tested": summary.total_assessors_tested, - "significant_improvements": summary.significant_improvements, - "significance_rate": ( - summary.significant_improvements / summary.total_assessors_tested * 100 - if summary.total_assessors_tested > 0 - else 0 - ), - "timestamp": summary.timestamp.isoformat(), - } - stats_file = docs_data_dir / "stats.json" - self._save_json_dict(stats_data, stats_file) - generated_files["stats"] = stats_file - - return generated_files - - @staticmethod - def _find_repo_root(start_path: Path) -> Path: - """Find repository root by looking for .git directory. - - Args: - start_path: Starting directory - - Returns: - Repository root path - - Raises: - FileNotFoundError: If .git not found - """ - current = start_path.resolve() - - while current != current.parent: - if (current / ".git").exists(): - return current - current = current.parent - - raise FileNotFoundError( - f"Could not find repository root from {start_path}. " - "No .git directory found." - ) - - @staticmethod - def _tier_name(tier: int) -> str: - """Get human-readable tier name. - - Args: - tier: Tier number 1-4 - - Returns: - Tier name (Essential, Critical, Important, Advanced) - """ - tier_names = { - 1: "Essential", - 2: "Critical", - 3: "Important", - 4: "Advanced", - } - return tier_names.get(tier, f"Tier {tier}") - - @staticmethod - def _save_json_list(data: list, output_path: Path): - """Save list to JSON file. - - Args: - data: List to save - output_path: Path to output file - """ - import json - - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(data, indent=2, fp=f) - - @staticmethod - def _save_json_dict(data: dict, output_path: Path): - """Save dict to JSON file. - - Args: - data: Dict to save - output_path: Path to output file - """ - import json - - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(data, indent=2, fp=f) diff --git a/src/agentready/services/eval_harness/harbor_config.py b/src/agentready/services/eval_harness/harbor_config.py deleted file mode 100644 index 2753dd91..00000000 --- a/src/agentready/services/eval_harness/harbor_config.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Harbor framework configuration for Terminal-Bench integration. 
- -This module provides configuration and validation for Harbor framework subprocess execution. -""" - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - -# Allowed models (excludes opus due to cost) -# Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview -# Cursor models: https://cursor.com/docs/models -ALLOWED_MODELS = { - "anthropic/claude-haiku-4-5", - "anthropic/claude-sonnet-4-5", - "cursor/composer-1", - "cursor/gpt-5.2-codex", - "cursor/gpt-5.2-codex-fast", - "cursor/gemini-3-pro", - "cursor/opus-4.5", - "cursor/sonnet-4.5", - "cursor/sonnet-4.5-thinking", - "cursor/gpt-5.1-high", - "cursor/gemini-3-flash", -} - -# Allowed agents (excludes oracle as it's not relevant for real-world assessment) -# Harbor supported agents: https://github.com/laude-institute/harbor/blob/main/src/harbor/agents/factory.py -ALLOWED_AGENTS = { - "claude-code", - "cursor-cli", -} - - -@dataclass -class HarborConfig: - """ - Configuration for Harbor framework subprocess execution. - - Attributes: - model: LLM model identifier (must be in ALLOWED_MODELS) - agent: Agent identifier (must be in ALLOWED_AGENTS) - jobs_dir: Output directory for results (resolved to absolute path) - api_key: Anthropic API key (must not be empty) - timeout: Subprocess timeout in seconds (default: 3600, must be positive) - n_concurrent: Harbor's internal concurrency (default: 1, must be >= 1) - smoketest: Run fast validation with 1-2 tasks (default: False) - task_path: Optional path to specific task (for smoketest mode) - """ - - model: str - agent: str - jobs_dir: Path - api_key: str - timeout: int = 3600 - n_concurrent: int = 1 - smoketest: bool = False - task_path: Optional[Path] = None - - def __post_init__(self): - """Validate configuration parameters""" - # Validate model allowlist - if self.model not in ALLOWED_MODELS: - raise ValueError( - f"Invalid model: {self.model}. " - f"Allowed models: {sorted(ALLOWED_MODELS)}" - ) - - # Validate agent allowlist - if self.agent not in ALLOWED_AGENTS: - raise ValueError( - f"Invalid agent: {self.agent}. " - f"Allowed agents: {sorted(ALLOWED_AGENTS)}" - ) - - # Validate API key is not empty - if not self.api_key: - raise ValueError("API key cannot be empty") - - # Validate timeout is positive - if self.timeout <= 0: - raise ValueError(f"Timeout must be positive, got {self.timeout}") - - # Resolve jobs_dir to absolute path - self.jobs_dir = Path(self.jobs_dir).resolve() diff --git a/src/agentready/services/eval_harness/tbench_runner.py b/src/agentready/services/eval_harness/tbench_runner.py deleted file mode 100644 index 429368e8..00000000 --- a/src/agentready/services/eval_harness/tbench_runner.py +++ /dev/null @@ -1,285 +0,0 @@ -""" -Terminal-Bench runner with Harbor framework integration. - -This module provides functionality to execute real Terminal-Bench evaluations -via the Harbor framework subprocess interface. -""" - -import json -import logging -import os -import shlex -import subprocess -from dataclasses import dataclass -from pathlib import Path - -from agentready.services.eval_harness.harbor_config import HarborConfig - -logger = logging.getLogger(__name__) - -# Constants for Harbor subprocess configuration -DEFAULT_TIMEOUT = 3600 # 1 hour timeout per benchmark -DEFAULT_N_CONCURRENT = 1 # Sequential execution (parallelism managed externally) - - -@dataclass -class TbenchResult: - """ - Result from a Terminal-Bench evaluation. 
- - Attributes: - score: Benchmark accuracy score (0.0 to 1.0) - task_solved: Whether any tasks were successfully resolved - is_mocked: True for mocked results, False for real Harbor runs - resolved_trials: Number of successfully completed tasks - unresolved_trials: Number of failed tasks - pass_at_1: Single-attempt success rate - pass_at_3: Success rate within 3 attempts - trajectory_path: Path to agent trajectory.json file (if available) - """ - - score: float - task_solved: bool - is_mocked: bool - resolved_trials: int = 0 - unresolved_trials: int = 0 - pass_at_1: float = 0.0 - pass_at_3: float = 0.0 - trajectory_path: Path | None = None - - def __post_init__(self): - """Validate score ranges and trial counts""" - # Validate score range [0.0, 1.0] - if not (0.0 <= self.score <= 1.0): - raise ValueError(f"Score must be 0.0-1.0, got {self.score}") - - # Validate pass rates [0.0, 1.0] - if not (0.0 <= self.pass_at_1 <= 1.0): - raise ValueError(f"pass_at_1 must be 0.0-1.0, got {self.pass_at_1}") - if not (0.0 <= self.pass_at_3 <= 1.0): - raise ValueError(f"pass_at_3 must be 0.0-1.0, got {self.pass_at_3}") - - # Validate non-negative trial counts - if self.resolved_trials < 0 or self.unresolved_trials < 0: - raise ValueError("Trial counts cannot be negative") - - -def _real_tbench_result(repo_path: Path, config: HarborConfig) -> TbenchResult: - """ - Execute real Terminal-Bench evaluation via Harbor framework. - - Args: - repo_path: Path to repository being evaluated - config: HarborConfig with Harbor subprocess parameters - - Returns: - TbenchResult with real benchmark metrics - - Raises: - RuntimeError: If Harbor subprocess times out or fails - ValueError: If results path validation fails (path traversal) - """ - - # 2. Build harbor run command - if config.smoketest: - # SMOKETEST MODE: Use --path to point directly to downloaded task - # Task path is dynamically discovered by preflight check - if not config.task_path: - raise RuntimeError( - "Smoketest mode requires task_path to be set. " - "Ensure preflight checks are enabled." - ) - cmd = [ - "harbor", - "run", - "--path", - str(config.task_path), - "--agent", - config.agent, - "--model", - config.model, - "--jobs-dir", - str(config.jobs_dir), - "--n-concurrent", - str(config.n_concurrent), - "--quiet", # Reduce output noise - ] - else: - # Full benchmark: use dataset reference - cmd = [ - "harbor", - "run", - "--dataset", - "terminal-bench@2.0", - "--agent", - config.agent, - "--model", - config.model, - "--jobs-dir", - str(config.jobs_dir), - "--n-concurrent", - str(config.n_concurrent), - ] - - # 3. 
Prepare environment variables - # Pass through current environment but ensure API key is set - # Harbor's claude-code agent has MiniMax API hardcoded - override it - clean_env = os.environ.copy() - - # Define agent-specific environment variable configurations - # Structure: (Env Key, Env Value, Is Sensitive) - agent_env_configs = { - "claude-code": [ - ("ANTHROPIC_API_KEY", config.api_key, True), - ("ANTHROPIC_AUTH_TOKEN", config.api_key, True), - ("ANTHROPIC_BASE_URL", "https://api.anthropic.com", False), - ("ANTHROPIC_API_BASE", "https://api.anthropic.com", False), - ], - "cursor-cli": [ - ("CURSOR_API_KEY", config.api_key, True), - ], - } - - if config.agent not in agent_env_configs: - raise ValueError(f"Invalid agent: {config.agent}") - - # Set environment variables and build display/copyable lists - env_vars_display = [] - env_vars_copyable = [] - - for var_name, var_value, is_sensitive in agent_env_configs[config.agent]: - clean_env[var_name] = var_value - - # Build display string (truncate sensitive values) - if not is_sensitive: - display_value = var_value - env_vars_display.append(f"{var_name}={display_value}") - - # Build copyable string (use variable reference for sensitive values) - if is_sensitive: - copyable_value = f"${var_name}" - else: - copyable_value = var_value - env_vars_copyable.append(f"{var_name}={copyable_value}") - - # Clear MiniMax settings if present - clean_env.pop("MINIMAX_API_KEY", None) - - # Print Harbor command for debugging and manual execution - shell_cmd = " ".join(shlex.quote(arg) for arg in cmd) - full_cmd_copyable = " ".join(env_vars_copyable) + " " + shell_cmd - - print(f"\n{'=' * 70}") - print("Harbor Command (Copy/Paste Ready)") - print(f"{'=' * 70}") - print(f"\n{full_cmd_copyable}\n") - print(f"{'=' * 70}") - print("Command Breakdown:") - print(f"{'=' * 70}") - print(f"\nCommand: {shell_cmd}\n") - print("Environment Variables:") - for var in env_vars_display: - print(f" {var}") - print(f"\n{'=' * 70}\n") - - # Log full details - logger.info(f"Executing Harbor command: {shell_cmd}") - logger.info(f"Environment: {' '.join(env_vars_display)}") - - # 4. Execute subprocess with timeout - try: - subprocess.run( - cmd, - env=clean_env, - timeout=config.timeout, - check=True, - capture_output=True, - text=True, - ) - except subprocess.TimeoutExpired: - raise RuntimeError(f"Benchmark timed out after {config.timeout}s") - except subprocess.CalledProcessError as e: - # Include stderr in error message for debugging - error_msg = f"Harbor command failed: {e}" - if e.stderr: - error_msg += f"\nStderr: {e.stderr}" - raise RuntimeError(error_msg) - - # 5. 
Find timestamped results directory created by Harbor - # Harbor creates: jobs_dir/YYYY-MM-DD__HH-MM-SS/result.json - result_dirs = sorted(config.jobs_dir.glob("20*")) # Find timestamped dirs - if not result_dirs: - raise RuntimeError(f"No Harbor results directory found in {config.jobs_dir}") - - latest_dir = result_dirs[-1] # Use most recent - results_path = latest_dir / "result.json" # Note: singular "result.json" - - # SECURITY: Path validation (FR-005) - if not results_path.is_relative_to(config.jobs_dir): - raise ValueError(f"Invalid results path: {results_path}") - - if not results_path.exists(): - raise FileNotFoundError(f"Harbor results file not found: {results_path}") - - # Find trajectory file: jobs_dir/timestamp/task_name__hash/agent/trajectory.json - trajectory_path = None - task_dirs = list(latest_dir.glob("*")) - for task_dir in task_dirs: - if task_dir.is_dir() and task_dir.name != "verifier": - candidate = task_dir / "agent" / "trajectory.json" - if candidate.exists(): - trajectory_path = candidate - break - - return parse_harbor_results(results_path, trajectory_path) - - -def parse_harbor_results( - results_path: Path, trajectory_path: Path | None = None -) -> TbenchResult: - """ - Parse Harbor framework JSON output. - - Args: - results_path: Path to Harbor result.json file - trajectory_path: Optional path to agent trajectory.json file - - Returns: - TbenchResult with metrics from Harbor output - - Raises: - json.JSONDecodeError: If result.json is invalid JSON - KeyError: If required fields missing from results - """ - with open(results_path) as f: - data = json.load(f) - - # Harbor structure: stats.evals..{n_trials, n_errors, metrics} - stats = data["stats"] - evals = stats["evals"] - n_total_trials = data["n_total_trials"] - - # Get the first (and typically only) eval result - eval_key = list(evals.keys())[0] - eval_data = evals[eval_key] - - mean_score = eval_data["metrics"][0]["mean"] - - # In Terminal-Bench: mean_score represents fraction of tasks solved - # reward_stats shows which tasks got reward > 0 - # Count tasks with reward > 0 as resolved - reward_stats = eval_data.get("reward_stats", {}).get("reward", {}) - n_solved = sum( - len(tasks) for reward, tasks in reward_stats.items() if float(reward) > 0 - ) - - return TbenchResult( - score=mean_score, - task_solved=n_solved > 0, - is_mocked=False, - resolved_trials=n_solved, - unresolved_trials=n_total_trials - n_solved, - pass_at_1=mean_score, # Mean score is pass rate - pass_at_3=0.0, # Terminal-Bench doesn't provide pass@3 - trajectory_path=trajectory_path, - ) diff --git a/src/agentready/services/harbor/__init__.py b/src/agentready/services/harbor/__init__.py deleted file mode 100644 index 720dc51e..00000000 --- a/src/agentready/services/harbor/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Harbor benchmark integration services.""" - -from agentready.services.harbor.agent_toggler import AgentFileToggler -from agentready.services.harbor.result_parser import parse_harbor_results - -__all__ = ["AgentFileToggler", "parse_harbor_results"] diff --git a/src/agentready/services/harbor/agent_toggler.py b/src/agentready/services/harbor/agent_toggler.py deleted file mode 100644 index 70d00414..00000000 --- a/src/agentready/services/harbor/agent_toggler.py +++ /dev/null @@ -1,255 +0,0 @@ -"""Service for safely enabling/disabling agent files and manipulating repository state.""" - -import shutil -from contextlib import contextmanager -from pathlib import Path -from typing import Callable, Dict, Generator, Tuple - - -class 
AgentFileToggler: - """Safely enable/disable agent files via atomic rename operations.""" - - def __init__(self, agent_file: Path): - """Initialize toggler with agent file path. - - Args: - agent_file: Path to the agent file (e.g., .claude/agents/doubleagent.md) - """ - self.agent_file = agent_file - self.disabled_file = agent_file.with_suffix(agent_file.suffix + ".disabled") - - def disable(self) -> None: - """Rename agent file to .disabled extension.""" - if self.agent_file.exists(): - if self.disabled_file.exists(): - # Already disabled, nothing to do - return - shutil.move(str(self.agent_file), str(self.disabled_file)) - - def enable(self) -> None: - """Restore agent file from .disabled extension.""" - if self.disabled_file.exists(): - if self.agent_file.exists(): - # Already enabled, nothing to do - return - shutil.move(str(self.disabled_file), str(self.agent_file)) - - def is_enabled(self) -> bool: - """Check if agent file is currently enabled. - - Returns: - True if agent file exists and is not disabled - """ - return self.agent_file.exists() and not self.disabled_file.exists() - - def is_disabled(self) -> bool: - """Check if agent file is currently disabled. - - Returns: - True if disabled file exists - """ - return self.disabled_file.exists() - - @contextmanager - def temporarily_disabled(self) -> Generator[None, None, None]: - """Context manager for safe disable/enable. - - Ensures agent file is restored even if exception occurs. - - Example: - with toggler.temporarily_disabled(): - # Agent file is disabled here - run_benchmark() - # Agent file is automatically restored here - """ - was_enabled = self.is_enabled() - try: - self.disable() - yield - finally: - if was_enabled: - self.enable() - - @contextmanager - def temporarily_enabled(self) -> Generator[None, None, None]: - """Context manager for safe enable/disable. - - Ensures agent file state is restored even if exception occurs. - - Example: - with toggler.temporarily_enabled(): - # Agent file is enabled here - run_benchmark() - # Agent file state is automatically restored here - """ - was_disabled = self.is_disabled() - try: - self.enable() - yield - finally: - if was_disabled: - self.disable() - - -class AssessorStateToggler: - """Manipulate repository state to force assessor pass/fail for A/B testing. - - This class safely modifies repository files to simulate scenarios where - specific assessors would pass or fail, enabling empirical validation of - assessor impacts on agent performance. - - Example: - toggler = AssessorStateToggler() - - # Force CLAUDE.md assessor to fail - toggler.force_fail("claude_md_file") - # Run benchmark - toggler.restore("claude_md_file") - """ - - # Mapping of assessor IDs to (fail_action, restore_action) tuples - # Each action is a callable that takes the repository root Path - MANIPULATIONS: Dict[str, Tuple[Callable[[Path], None], Callable[[Path], None]]] = {} - - @classmethod - def register_manipulation( - cls, - assessor_id: str, - fail_action: Callable[[Path], None], - restore_action: Callable[[Path], None], - ) -> None: - """Register a manipulation strategy for an assessor. - - Args: - assessor_id: Unique assessor identifier (e.g., "claude_md_file") - fail_action: Function to force assessor to fail state - restore_action: Function to restore assessor to pass state - """ - cls.MANIPULATIONS[assessor_id] = (fail_action, restore_action) - - def __init__(self, repo_root: Path = None): - """Initialize toggler with repository root. 
- - Args: - repo_root: Root of the repository (default: current directory) - """ - self.repo_root = repo_root or Path.cwd() - self._backup_suffix = ".assessor_backup" - self._initialize_default_manipulations() - - def _initialize_default_manipulations(self) -> None: - """Register default manipulation strategies for Phase 1 assessors.""" - - # CLAUDE.md Assessor - Tier 1, 10% weight - def fail_claude_md(repo_root: Path) -> None: - claude_md = repo_root / ".claude" / "CLAUDE.md" - backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}" - if claude_md.exists() and not backup.exists(): - shutil.move(str(claude_md), str(backup)) - - def restore_claude_md(repo_root: Path) -> None: - claude_md = repo_root / ".claude" / "CLAUDE.md" - backup = repo_root / ".claude" / f"CLAUDE.md{self._backup_suffix}" - if backup.exists() and not claude_md.exists(): - shutil.move(str(backup), str(claude_md)) - - self.register_manipulation("claude_md_file", fail_claude_md, restore_claude_md) - - # README Assessor - Tier 1, 10% weight - def fail_readme(repo_root: Path) -> None: - readme = repo_root / "README.md" - backup = repo_root / f"README.md{self._backup_suffix}" - if readme.exists() and not backup.exists(): - shutil.move(str(readme), str(backup)) - - def restore_readme(repo_root: Path) -> None: - readme = repo_root / "README.md" - backup = repo_root / f"README.md{self._backup_suffix}" - if backup.exists() and not readme.exists(): - shutil.move(str(backup), str(readme)) - - self.register_manipulation("readme_structure", fail_readme, restore_readme) - - # Test Coverage Assessor - Tier 2, 3% weight - def fail_tests(repo_root: Path) -> None: - tests_dir = repo_root / "tests" - backup_dir = repo_root / f"tests{self._backup_suffix}" - if tests_dir.exists() and not backup_dir.exists(): - shutil.move(str(tests_dir), str(backup_dir)) - - def restore_tests(repo_root: Path) -> None: - tests_dir = repo_root / "tests" - backup_dir = repo_root / f"tests{self._backup_suffix}" - if backup_dir.exists() and not tests_dir.exists(): - shutil.move(str(backup_dir), str(tests_dir)) - - self.register_manipulation("test_execution", fail_tests, restore_tests) - - def force_fail(self, assessor_id: str) -> None: - """Force assessor to fail by manipulating repository state. - - Args: - assessor_id: Assessor identifier (e.g., "claude_md_file") - - Raises: - ValueError: If assessor_id is not recognized - """ - if assessor_id not in self.MANIPULATIONS: - raise ValueError( - f"Unknown assessor: {assessor_id}. " - f"Available: {', '.join(self.MANIPULATIONS.keys())}" - ) - - fail_action, _ = self.MANIPULATIONS[assessor_id] - fail_action(self.repo_root) - - def restore(self, assessor_id: str) -> None: - """Restore repository to state where assessor passes. - - Args: - assessor_id: Assessor identifier (e.g., "claude_md_file") - - Raises: - ValueError: If assessor_id is not recognized - """ - if assessor_id not in self.MANIPULATIONS: - raise ValueError( - f"Unknown assessor: {assessor_id}. " - f"Available: {', '.join(self.MANIPULATIONS.keys())}" - ) - - _, restore_action = self.MANIPULATIONS[assessor_id] - restore_action(self.repo_root) - - def list_supported_assessors(self) -> list[str]: - """Get list of assessor IDs with registered manipulations. - - Returns: - List of assessor IDs that can be toggled - """ - return list(self.MANIPULATIONS.keys()) - - @contextmanager - def temporarily_failed(self, assessor_id: str) -> Generator[None, None, None]: - """Context manager for safe fail/restore of assessor state. 
- - Ensures repository is restored even if exception occurs during testing. - - Args: - assessor_id: Assessor identifier - - Example: - toggler = AssessorStateToggler() - with toggler.temporarily_failed("claude_md_file"): - # CLAUDE.md is now missing (assessor fails) - run_benchmark() - # CLAUDE.md is automatically restored here - - Yields: - None - """ - try: - self.force_fail(assessor_id) - yield - finally: - self.restore(assessor_id) diff --git a/src/agentready/services/harbor/comparer.py b/src/agentready/services/harbor/comparer.py deleted file mode 100644 index df3b4461..00000000 --- a/src/agentready/services/harbor/comparer.py +++ /dev/null @@ -1,349 +0,0 @@ -"""Service for comparing Harbor benchmark runs and calculating statistical significance.""" - -from pathlib import Path -from typing import List, Optional - -from agentready.models.harbor import HarborComparison, HarborRunMetrics -from agentready.services.harbor.agent_toggler import AssessorStateToggler -from agentready.services.harbor.result_parser import parse_harbor_results -from agentready.services.harbor.runner import HarborRunner - - -def compare_runs( - without_agent: HarborRunMetrics, with_agent: HarborRunMetrics -) -> HarborComparison: - """Compare two Harbor runs and calculate deltas. - - Args: - without_agent: Metrics from run without agent file - with_agent: Metrics from run with agent file - - Returns: - HarborComparison with calculated deltas and significance - - Raises: - ValueError: If metrics are incompatible (different task sets) - """ - # Validate task sets match - without_tasks = {r.task_name for r in without_agent.task_results} - with_tasks = {r.task_name for r in with_agent.task_results} - - if without_tasks != with_tasks: - print( - f"Warning: Task sets differ. Without: {without_tasks}, With: {with_tasks}. " - "Comparison may be incomplete." - ) - - # Create comparison object - comparison = HarborComparison(without_agent=without_agent, with_agent=with_agent) - - # Calculate deltas - comparison.calculate_deltas() - - # Generate per-task comparison - comparison.generate_per_task_comparison() - - # Calculate statistical significance - comparison.statistical_significance = calculate_statistical_significance( - without_agent, with_agent - ) - - return comparison - - -def calculate_statistical_significance( - without_agent: HarborRunMetrics, with_agent: HarborRunMetrics, alpha: float = 0.05 -) -> dict: - """Calculate statistical significance of differences between runs. - - Uses two-sample t-test for continuous metrics and requires scipy. - - Args: - without_agent: Metrics from run without agent file - with_agent: Metrics from run with agent file - alpha: Significance level (default: 0.05 for 95% confidence) - - Returns: - Dictionary with significance flags and p-values: - { - 'success_rate_significant': bool, - 'duration_significant': bool, - 'success_rate_p_value': float, - 'duration_p_value': float, - } - """ - # Import scipy here to avoid hard dependency - try: - from scipy import stats - except ImportError: - print( - "Warning: scipy not installed. " - "Statistical significance tests unavailable. 
" - "Install with: uv pip install scipy" - ) - return { - "success_rate_significant": False, - "duration_significant": False, - "success_rate_p_value": None, - "duration_p_value": None, - } - - # Require minimum sample size for valid statistics - min_sample_size = 3 - if ( - len(without_agent.task_results) < min_sample_size - or len(with_agent.task_results) < min_sample_size - ): - print( - f"Warning: Sample size too small (n<{min_sample_size}). " - "Statistical tests may not be reliable." - ) - - # Extract success rates (binary: 1 for success, 0 for failure) - without_successes = [1 if r.success else 0 for r in without_agent.task_results] - with_successes = [1 if r.success else 0 for r in with_agent.task_results] - - # Extract durations (only for completed tasks) - without_durations = [ - r.duration_sec for r in without_agent.task_results if r.agent_result - ] - with_durations = [r.duration_sec for r in with_agent.task_results if r.agent_result] - - results = {} - - # T-test for success rate differences - if len(without_successes) > 0 and len(with_successes) > 0: - t_stat, p_value = stats.ttest_ind(without_successes, with_successes) - results["success_rate_significant"] = p_value < alpha - results["success_rate_p_value"] = p_value - else: - results["success_rate_significant"] = False - results["success_rate_p_value"] = None - - # T-test for duration differences - if len(without_durations) > 0 and len(with_durations) > 0: - t_stat, p_value = stats.ttest_ind(without_durations, with_durations) - results["duration_significant"] = p_value < alpha - results["duration_p_value"] = p_value - - # Calculate Cohen's d for effect size - results["duration_cohens_d"] = calculate_cohens_d( - without_durations, with_durations - ) - else: - results["duration_significant"] = False - results["duration_p_value"] = None - results["duration_cohens_d"] = None - - return results - - -def calculate_cohens_d(group1: List[float], group2: List[float]) -> Optional[float]: - """Calculate Cohen's d effect size. - - Cohen's d measures the standardized difference between two means: - - Small effect: 0.2 ≤ |d| < 0.5 - - Medium effect: 0.5 ≤ |d| < 0.8 - - Large effect: |d| ≥ 0.8 - - Args: - group1: First group of values - group2: Second group of values - - Returns: - Cohen's d value, or None if calculation not possible - """ - if not group1 or not group2: - return None - - # Calculate means - mean1 = sum(group1) / len(group1) - mean2 = sum(group2) / len(group2) - - # Calculate pooled standard deviation - var1 = sum((x - mean1) ** 2 for x in group1) / len(group1) - var2 = sum((x - mean2) ** 2 for x in group2) / len(group2) - pooled_std = ((var1 + var2) / 2) ** 0.5 - - if pooled_std == 0: - return None - - return (mean2 - mean1) / pooled_std - - -def interpret_effect_size(cohens_d: float) -> str: - """Interpret Cohen's d effect size. - - Args: - cohens_d: Cohen's d value - - Returns: - Human-readable interpretation - """ - abs_d = abs(cohens_d) - - if abs_d < 0.2: - return "negligible" - elif abs_d < 0.5: - return "small" - elif abs_d < 0.8: - return "medium" - else: - return "large" - - -def compare_assessor_impact( - assessor_id: str, - task_names: List[str], - repo_root: Path, - runs_per_task: int = 3, - output_dir: Path = None, - model: str = "anthropic/claude-sonnet-4-5", - n_concurrent: int = 1, - verbose: bool = True, -) -> HarborComparison: - """A/B test assessor impact: baseline (assessor fails) vs treatment (assessor passes). - - This function orchestrates the complete A/B testing workflow: - 1. 
Force assessor to fail (manipulate repository state) - 2. Run Harbor benchmark (baseline) - 3. Restore repository to normal state (assessor passes) - 4. Run Harbor benchmark again (treatment) - 5. Compare results with statistical significance - - Args: - assessor_id: Assessor identifier (e.g., "claude_md_file") - task_names: List of Terminal-Bench task names to test - repo_root: Root of the repository to test - runs_per_task: Number of runs per task (default: 3, recommended: 5+) - output_dir: Directory to store results (default: .agentready/validations/{assessor_id}/) - model: Claude model to use (default: sonnet-4-5) - n_concurrent: Number of concurrent tasks to run in parallel (default: 1) - verbose: Print progress messages (default: True) - - Returns: - HarborComparison with baseline vs treatment metrics, deltas, and significance - - Raises: - ValueError: If assessor_id is not supported - HarborNotInstalledError: If Harbor CLI is not available - subprocess.CalledProcessError: If Harbor benchmark fails - - Example: - comparison = compare_assessor_impact( - assessor_id="claude_md_file", - task_names=["adaptive-rejection-sampler", "async-http-client"], - repo_root=Path("."), - runs_per_task=3, - verbose=True - ) - - print(f"Success rate delta: {comparison.deltas['success_rate_delta']:.1f}%") - print(f"Significant: {comparison.statistical_significance['success_rate_significant']}") - """ - # Default output directory - if output_dir is None: - output_dir = Path(".agentready") / "validations" / assessor_id - output_dir.mkdir(parents=True, exist_ok=True) - - # Initialize components - toggler = AssessorStateToggler(repo_root=repo_root) - runner = HarborRunner() - - if verbose: - print(f"\nAssessor Impact Validation: {assessor_id}") - print(f"{'=' * 60}") - print(f"Tasks: {', '.join(task_names)}") - print(f"Runs per task: {runs_per_task}") - print( - f"Total trials: {len(task_names) * runs_per_task * 2} (baseline + treatment)" - ) - print(f"Output: {output_dir}\n") - - # Baseline: Assessor fails (remove CLAUDE.md, README, etc.) 
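    # The two arms below differ only in repository state: the baseline arm runs with the
    # assessor's target artifact removed (via toggler.temporarily_failed), while the treatment
    # arm runs after the artifact is restored. Tasks and model are identical across arms, so
    # the success-rate delta isolates the effect of that single artifact.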
- if verbose: - print("[1/2] Running baseline (assessor FAILS)...") - print(f" Manipulating repository state to force {assessor_id} to fail") - - baseline_output = output_dir / "baseline" - with toggler.temporarily_failed(assessor_id): - if verbose: - print( - f" Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials" - ) - - baseline_results_dir = runner.run_benchmark( - task_names=task_names, - output_dir=baseline_output, - model=model, - n_concurrent=n_concurrent, - verbose=verbose, - ) - - # Parse results - baseline_task_results = parse_harbor_results(baseline_results_dir) - baseline_metrics = HarborRunMetrics.from_task_results( - run_id=f"{assessor_id}_baseline", - agent_file_enabled=False, # Assessor fails - task_results=baseline_task_results, - ) - - if verbose: - print( - f" Baseline complete: {baseline_metrics.success_rate:.1f}% success rate\n" - ) - - # Treatment: Assessor passes (normal repository state) - if verbose: - print("[2/2] Running treatment (assessor PASSES)...") - print(" Repository restored to normal state") - - treatment_output = output_dir / "treatment" - if verbose: - print( - f" Running {len(task_names)} tasks × {runs_per_task} runs = {len(task_names) * runs_per_task} trials" - ) - - treatment_results_dir = runner.run_benchmark( - task_names=task_names, - output_dir=treatment_output, - model=model, - n_concurrent=n_concurrent, - verbose=verbose, - ) - - # Parse results - treatment_task_results = parse_harbor_results(treatment_results_dir) - treatment_metrics = HarborRunMetrics.from_task_results( - run_id=f"{assessor_id}_treatment", - agent_file_enabled=True, # Assessor passes - task_results=treatment_task_results, - ) - - if verbose: - print( - f" Treatment complete: {treatment_metrics.success_rate:.1f}% success rate\n" - ) - - # Compare results - comparison = compare_runs( - without_agent=baseline_metrics, with_agent=treatment_metrics - ) - - if verbose: - print(f"{'=' * 60}") - print("Results Summary") - print(f"{'=' * 60}") - delta = comparison.deltas["success_rate_delta"] - sign = "+" if delta >= 0 else "" - print(f"Success Rate Delta: {sign}{delta:.1f} percentage points") - print( - f"Statistical Significance: {'YES' if comparison.statistical_significance.get('success_rate_significant', False) else 'NO'}" - ) - if comparison.statistical_significance.get("success_rate_p_value"): - print( - f"P-value: {comparison.statistical_significance['success_rate_p_value']:.4f}" - ) - print(f"\nResults saved to: {output_dir}\n") - - return comparison diff --git a/src/agentready/services/harbor/dashboard_generator.py b/src/agentready/services/harbor/dashboard_generator.py deleted file mode 100644 index 35a26889..00000000 --- a/src/agentready/services/harbor/dashboard_generator.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Service for generating interactive HTML dashboards for Harbor comparisons.""" - -from pathlib import Path - -from jinja2 import Environment, FileSystemLoader, select_autoescape - -from agentready.models.harbor import HarborComparison - - -class DashboardGenerator: - """Generate interactive HTML dashboards with Chart.js visualizations.""" - - def __init__(self, template_dir: Path = None): - """Initialize dashboard generator. 
- - Args: - template_dir: Directory containing Jinja2 templates - (defaults to src/agentready/templates) - """ - if template_dir is None: - # Default to package templates directory - import agentready - - package_dir = Path(agentready.__file__).parent - template_dir = package_dir / "templates" - - self.env = Environment( - loader=FileSystemLoader(template_dir), - autoescape=select_autoescape(["html"]), - ) - - def generate(self, comparison: HarborComparison, output_path: Path) -> None: - """Generate interactive HTML dashboard. - - Creates a self-contained HTML file with: - - Side-by-side bar charts (success rates, durations) - - Per-task breakdown table - - Statistical significance indicators - - All CSS/JS inlined (no external dependencies) - - Args: - comparison: HarborComparison with calculated deltas - output_path: Path to write HTML dashboard - - Raises: - FileNotFoundError: If template file not found - jinja2.TemplateError: If template rendering fails - """ - # Load template - template = self.env.get_template("harbor_comparison.html.j2") - - # Prepare data for template - template_data = { - "comparison": comparison, - "without_agent": comparison.without_agent, - "with_agent": comparison.with_agent, - "deltas": comparison.deltas, - "significance": comparison.statistical_significance, - "per_task": comparison.per_task_comparison, - "created_at": comparison.created_at, - } - - # Render template - html_content = template.render(**template_data) - - # Write to file - output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, "w") as f: - f.write(html_content) - - def generate_summary_text(self, comparison: HarborComparison) -> str: - """Generate plain text summary of comparison. - - Args: - comparison: HarborComparison with calculated deltas - - Returns: - Plain text summary for console output - """ - lines = [] - lines.append("=" * 60) - lines.append("Harbor Benchmark Comparison Summary") - lines.append("=" * 60) - lines.append("") - - # Overall metrics - lines.append("Overall Metrics:") - lines.append( - f" Success Rate: {comparison.without_agent.success_rate:.1f}% → " - f"{comparison.with_agent.success_rate:.1f}% " - f"({comparison.deltas['success_rate_delta']:+.1f}%)" - ) - - lines.append( - f" Avg Duration: {comparison.without_agent.avg_duration_sec:.1f}s → " - f"{comparison.with_agent.avg_duration_sec:.1f}s " - f"({comparison.deltas['avg_duration_delta_pct']:+.1f}%)" - ) - - lines.append("") - - # Statistical significance - sig = comparison.statistical_significance - if sig.get("success_rate_p_value") is not None: - is_sig = ( - "✓ Significant" - if sig["success_rate_significant"] - else "✗ Not significant" - ) - lines.append( - f" Success Rate: {is_sig} (p={sig['success_rate_p_value']:.4f})" - ) - - if sig.get("duration_p_value") is not None: - is_sig = ( - "✓ Significant" if sig["duration_significant"] else "✗ Not significant" - ) - lines.append(f" Duration: {is_sig} (p={sig['duration_p_value']:.4f})") - - if sig.get("duration_cohens_d") is not None: - from agentready.services.harbor.comparer import interpret_effect_size - - effect = interpret_effect_size(sig["duration_cohens_d"]) - lines.append( - f" Effect size: {effect} (d={sig['duration_cohens_d']:.2f})" - ) - - lines.append("") - - # Per-task summary - lines.append("Per-Task Results:") - for task_comp in comparison.per_task_comparison: - task_name = task_comp["task_name"] - without = task_comp.get("without_agent", {}) - with_agent_result = task_comp.get("with_agent", {}) - - without_status = "✓" if without 
and without.get("success") else "✗" - with_status = ( - "✓" if with_agent_result and with_agent_result.get("success") else "✗" - ) - - lines.append(f" {task_name}:") - lines.append(f" Without agent: {without_status}") - lines.append(f" With agent: {with_status}") - - if "delta" in task_comp: - delta = task_comp["delta"] - if delta.get("success_improved"): - lines.append(" Impact: +100% success (fixed failure)") - elif delta.get("duration_delta_pct"): - lines.append( - f" Impact: {delta['duration_delta_pct']:+.1f}% duration" - ) - - lines.append("") - lines.append("=" * 60) - - return "\n".join(lines) - - -def generate_dashboard(comparison: HarborComparison, output_path: Path) -> None: - """Convenience function to generate dashboard. - - Args: - comparison: HarborComparison with calculated deltas - output_path: Path to write HTML dashboard - """ - generator = DashboardGenerator() - generator.generate(comparison, output_path) diff --git a/src/agentready/services/harbor/result_parser.py b/src/agentready/services/harbor/result_parser.py deleted file mode 100644 index c8b1c01c..00000000 --- a/src/agentready/services/harbor/result_parser.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Service for parsing Harbor result.json files.""" - -import json -from pathlib import Path -from typing import List - -from agentready.models.harbor import HarborTaskResult - - -def parse_harbor_results(results_dir: Path) -> List[HarborTaskResult]: - """Parse all result.json files in a Harbor run directory. - - Args: - results_dir: Path to Harbor run directory (e.g., jobs/2025-12-09__22-06-09/) - - Returns: - List of HarborTaskResult objects - - Raises: - ValueError: If results_dir doesn't exist or contains no result files - FileNotFoundError: If results_dir doesn't exist - """ - if not results_dir.exists(): - raise FileNotFoundError(f"Results directory not found: {results_dir}") - - # Find all result.json files in subdirectories (task directories only, not job-level result.json) - # Task directories are named like "task-name__hash/" while the job result.json is at the root - all_result_files = list(results_dir.glob("*/result.json")) - - # Filter to only task result files (exclude job-level result.json) - # Task directories contain "__" in their name (e.g., "build-pmars__abc123") - result_files = [ - f - for f in all_result_files - if "__" in f.parent.name # Task directories have "__" separator - ] - - if not result_files: - raise ValueError(f"No result.json files found in {results_dir}") - - task_results = [] - for result_file in result_files: - try: - with open(result_file, "r") as f: - result_data = json.load(f) - task_result = HarborTaskResult.from_result_json(result_data) - task_results.append(task_result) - except (json.JSONDecodeError, KeyError) as e: - # Log warning but continue processing other files - print(f"Warning: Failed to parse {result_file}: {e}") - continue - - if not task_results: - raise ValueError(f"No valid task results parsed from {results_dir}") - - return task_results - - -def parse_single_result(result_file: Path) -> HarborTaskResult: - """Parse a single result.json file. 
- - Args: - result_file: Path to result.json file - - Returns: - HarborTaskResult object - - Raises: - FileNotFoundError: If result_file doesn't exist - json.JSONDecodeError: If result_file is not valid JSON - KeyError: If required fields are missing - """ - if not result_file.exists(): - raise FileNotFoundError(f"Result file not found: {result_file}") - - with open(result_file, "r") as f: - result_data = json.load(f) - return HarborTaskResult.from_result_json(result_data) diff --git a/src/agentready/services/harbor/runner.py b/src/agentready/services/harbor/runner.py deleted file mode 100644 index d9dca95d..00000000 --- a/src/agentready/services/harbor/runner.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Service for executing Harbor benchmarks via CLI.""" - -import inspect -import subprocess -import warnings -from pathlib import Path -from typing import List - - -class HarborNotInstalledError(Exception): - """Raised when Harbor framework is not installed.""" - - pass - - -class HarborTaskFilteringBugWarning(UserWarning): - """Raised when Harbor has the task filtering bug.""" - - pass - - -class HarborRunner: - """Execute Harbor benchmarks via subprocess and capture results.""" - - def __init__(self): - """Initialize Harbor runner and verify installation.""" - self._verify_harbor_installed() - self._check_harbor_task_filtering() - - def _verify_harbor_installed(self) -> None: - """Verify Harbor CLI is installed and accessible. - - Raises: - HarborNotInstalledError: If Harbor is not installed or not in PATH - """ - try: - subprocess.run( - ["harbor", "--help"], - capture_output=True, - text=True, - check=True, - timeout=5, - ) - except FileNotFoundError: - raise HarborNotInstalledError( - "Harbor framework not installed.\n" - "Install with: uv tool install harbor\n" - "See: https://harborframework.com/docs/getting-started" - ) - except subprocess.CalledProcessError as e: - raise HarborNotInstalledError(f"Harbor CLI error: {e.stderr}") - - def _check_harbor_task_filtering(self) -> None: - """Check if Harbor has the task filtering bug fix. - - Warns if Harbor version has the known task filtering bug where -t flags are ignored. - - The bug was fixed in Harbor commit f9e6d2e (Dec 12, 2025) but not yet released. - PyPI version 0.1.23 (Dec 11, 2025) has the bug. 
- - See: https://github.com/laude-institute/harbor/commit/f9e6d2e10c72d33373012294c36fd4938c45c26c - """ - try: - from harbor.models.job.config import BaseDatasetConfig - - # Check if the fix is present by inspecting source code - source = inspect.getsource(BaseDatasetConfig._filter_task_ids) - - # The fix changed task_id.path.name to task_id.get_name() - # If we see path.name (the bug), warn the user - if "task_id.path.name" in source or ".path.name" in source: - warnings.warn( - "\n" - "⚠️ WARNING: Harbor has a task filtering bug!\n" - "\n" - "Your Harbor version has a bug where -t/--task-name flags are ignored.\n" - "This causes smoketests to run ALL tasks instead of the filtered subset.\n" - "\n" - "The bug was fixed in Harbor main (Dec 12, 2025) but not yet released.\n" - "Latest PyPI version 0.1.23 (Dec 11) still has the bug.\n" - "\n" - "FIX OPTIONS:\n" - "\n" - "Option 1 (recommended): Install Harbor from main\n" - " pip uninstall harbor\n" - " pip install git+https://github.com/laude-institute/harbor.git\n" - "\n" - "Option 2: Apply patch to your local Harbor installation\n" - " See: patches/harbor-task-filtering-fix.patch in AgentReady repo\n" - "\n" - "Commit: https://github.com/laude-institute/harbor/commit/f9e6d2e\n", - HarborTaskFilteringBugWarning, - stacklevel=2, - ) - # If we see get_name() (the fix), all good - no warning - - except ImportError: - # Harbor not importable as Python package (only CLI installed) - # Can't check for bug, but also can't use task filtering anyway - pass - except Exception: - # Don't fail if we can't check - just skip the warning - pass - - def run_benchmark( - self, - task_names: List[str], - output_dir: Path, - dataset: str = "terminal-bench", - dataset_version: str = "2.0", - model: str = "anthropic/claude-sonnet-4-5", - agent: str = "claude-code", - n_concurrent: int = 1, - verbose: bool = True, - ) -> Path: - """Run Harbor benchmark and return results directory. 
- - Args: - task_names: List of task names to run (e.g., ['adaptive-rejection-sampler']) - output_dir: Directory to store results - dataset: Dataset name (default: 'terminal-bench') - dataset_version: Dataset version (default: '2.0') - model: Model identifier (default: 'anthropic/claude-sonnet-4-5') - agent: Agent identifier (default: 'claude-code') - n_concurrent: Number of concurrent tasks (default: 1) - verbose: Print Harbor output to console (default: True) - - Returns: - Path to results directory containing result.json files - - Raises: - HarborNotInstalledError: If Harbor is not installed - subprocess.CalledProcessError: If Harbor command fails - ValueError: If no tasks completed successfully - """ - output_dir.mkdir(parents=True, exist_ok=True) - - # Build Harbor command - cmd = [ - "harbor", - "run", - "-d", - f"{dataset}@{dataset_version}", - "-m", - model, - "-a", - agent, - "-n", - str(n_concurrent), - ] - - # Add task name filters - # NOTE: Requires Harbor >= 0.1.24 (or install from main) - # Task filtering was fixed in commit f9e6d2e (Dec 12, 2025) - for task_name in task_names: - cmd.extend(["-t", task_name]) - - # Execute Harbor benchmark - if verbose: - print(f"Running Harbor benchmark: {' '.join(cmd)}") - print(f"Tasks: {', '.join(task_names) if task_names else 'all'}") - - try: - result = subprocess.run( - cmd, - cwd=str(output_dir), - capture_output=not verbose, - text=True, - check=True, - timeout=None, # No timeout for long-running benchmarks - ) - - if verbose and result.stdout: - print(result.stdout) - - except subprocess.CalledProcessError as e: - error_msg = f"Harbor benchmark failed: {e.stderr if e.stderr else str(e)}" - raise subprocess.CalledProcessError( - e.returncode, e.cmd, e.output, error_msg - ) - - # Find results directory (Harbor creates timestamped subdirectory) - results_dirs = sorted(output_dir.glob("*"), key=lambda p: p.stat().st_mtime) - if not results_dirs: - raise ValueError(f"No results found in {output_dir}") - - results_dir = results_dirs[-1] # Most recent run - - if verbose: - print(f"Results stored in: {results_dir}") - - return results_dir diff --git a/src/agentready/templates/harbor_comparison.html.j2 b/src/agentready/templates/harbor_comparison.html.j2 deleted file mode 100644 index c609d15b..00000000 --- a/src/agentready/templates/harbor_comparison.html.j2 +++ /dev/null @@ -1,510 +0,0 @@ - - - - - - Harbor Benchmark Comparison - AgentReady - - - - -
[harbor_comparison.html.j2 — the HTML/CSS/JS markup was lost in extraction; the recoverable template content is summarized below.]
- Header: "Harbor Benchmark Comparison" / "doubleagent.md Impact Analysis" / "Generated: {{ created_at }}"
- Summary cards:
  - Success Rate: {{ "%.1f"|format(with_agent.success_rate) }}% ({{ "%+.1f"|format(deltas.success_rate_delta) }}%); when significance.success_rate_p_value is set, a badge shows "✓ Significant" or "Not Significant" (p={{ "%.4f"|format(significance.success_rate_p_value) }})
  - Avg Duration: {{ "%.1f"|format(with_agent.avg_duration_sec) }}s ({{ "%+.1f"|format(deltas.avg_duration_delta_pct) }}%); when significance.duration_p_value is set, a badge shows "✓ Significant" or "Not Significant" (p={{ "%.4f"|format(significance.duration_p_value) }})
  - Successful Tasks: {{ with_agent.successful_tasks }}/{{ with_agent.total_tasks }} ({{ "%+d"|format(deltas.successful_tasks_delta|int) }} tasks)
  - Completion Rate: {{ "%.1f"|format(with_agent.completion_rate) }}% ({{ "%+.1f"|format(deltas.completion_rate_delta) }}%)
- Charts: "Success Rate Comparison" and "Average Duration Comparison" (Chart.js canvases)
- Per-Task Results table: Task Name | Without Agent | With Agent | Impact. Each arm cell shows "✓ Success" or "✗ Failed" plus the task duration ({{ "%.1f"|format(task.without_agent.duration_sec) }}s / {{ "%.1f"|format(task.with_agent.duration_sec) }}s), or "N/A" when missing. The Impact cell shows "+100% (Fixed)", "{{ "%+.1f"|format(task.delta.duration_delta_pct) }}% duration", "Unchanged", or "-".
- Footer: "Generated by AgentReady Harbor Comparison Tool · {{ created_at }}"
-
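
For orientation, the sketch below shows how the removed harbor services composed end to end, from parsed result.json files to the dashboard this template rendered. It is illustrative only: the directory paths and run IDs are hypothetical placeholders, and the modules it imports are exactly the ones deleted by this patch.

```python
# Minimal sketch, assuming two completed Harbor run directories; the paths and
# run IDs below are hypothetical placeholders, not files this repository ships.
from pathlib import Path

from agentready.models.harbor import HarborRunMetrics
from agentready.services.harbor.comparer import compare_runs
from agentready.services.harbor.dashboard_generator import DashboardGenerator
from agentready.services.harbor.result_parser import parse_harbor_results

# Parse each arm's result.json files into task-level results, then aggregate.
baseline = HarborRunMetrics.from_task_results(
    run_id="baseline",
    agent_file_enabled=False,
    task_results=parse_harbor_results(Path("jobs/baseline/2025-12-09__22-06-09")),
)
treatment = HarborRunMetrics.from_task_results(
    run_id="treatment",
    agent_file_enabled=True,
    task_results=parse_harbor_results(Path("jobs/treatment/2025-12-09__23-41-17")),
)

# compare_runs() fills in the deltas, per-task comparison, and t-test
# significance that the summary cards and table above displayed.
comparison = compare_runs(without_agent=baseline, with_agent=treatment)

generator = DashboardGenerator()
generator.generate(comparison, Path("harbor_dashboard.html"))
print(generator.generate_summary_text(comparison))
```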
- - - - diff --git a/src/agentready/utils/__init__.py b/src/agentready/utils/__init__.py index 2e80889a..4f8d0c74 100644 --- a/src/agentready/utils/__init__.py +++ b/src/agentready/utils/__init__.py @@ -1,6 +1,5 @@ """Utility modules for AgentReady.""" -from .preflight import PreflightError, check_harbor_cli, ensure_terminal_bench_dataset from .privacy import ( sanitize_command_args, sanitize_error_message, @@ -27,7 +26,4 @@ "sanitize_error_message", "sanitize_metadata", "shorten_commit_hash", - "PreflightError", - "check_harbor_cli", - "ensure_terminal_bench_dataset", ] diff --git a/src/agentready/utils/preflight.py b/src/agentready/utils/preflight.py deleted file mode 100644 index b90f48fd..00000000 --- a/src/agentready/utils/preflight.py +++ /dev/null @@ -1,132 +0,0 @@ -"""Preflight dependency checks for CLI tools.""" - -import shutil -import subprocess -from pathlib import Path - -import click - -from .subprocess_utils import safe_subprocess_run - - -class PreflightError(Exception): - """Raised when preflight check fails.""" - - pass - - -def check_harbor_cli(interactive: bool = True) -> bool: - """Check Harbor CLI availability and optionally install. - - Args: - interactive: If True, prompt user to install if missing - - Returns: - True if Harbor is available - - Raises: - PreflightError: If Harbor is missing and installation declined/failed - """ - # Check if harbor is installed - if shutil.which("harbor") is not None: - return True - - # Harbor not found - if not interactive: - raise PreflightError( - "harbor CLI not installed.\n" "Install with: uv tool install harbor" - ) - - # Prompt user for installation - click.echo("Harbor CLI not found.", err=True) - - # Detect available package manager (uv or pip) - if shutil.which("uv") is not None: - install_cmd = ["uv", "tool", "install", "harbor"] - install_msg = "uv tool install harbor" - elif shutil.which("pip") is not None: - install_cmd = ["pip", "install", "harbor"] - install_msg = "pip install harbor" - else: - raise PreflightError( - "Neither 'uv' nor 'pip' found on PATH.\n" - "Install uv (recommended): https://docs.astral.sh/uv/\n" - "Or install pip: https://pip.pypa.io/en/stable/installation/" - ) - - if not click.confirm(f"Install with '{install_msg}'?", default=True): - raise PreflightError( - f"Harbor CLI installation declined.\n" f"To install manually: {install_msg}" - ) - - # Install Harbor - try: - click.echo(f"Installing Harbor CLI using {install_cmd[0]}...") - safe_subprocess_run(install_cmd, check=True, timeout=300) # 5 minute timeout - except Exception as e: - raise PreflightError(f"Harbor installation failed: {e}") - - # Verify installation succeeded - if shutil.which("harbor") is None: - raise PreflightError( - "Harbor installation completed but 'harbor' not found on PATH.\n" - "You may need to restart your shell or add ~/.local/bin to PATH." - ) - - click.echo("✓ Harbor CLI installed successfully") - return True - - -def ensure_terminal_bench_dataset() -> Path: - """Ensure Terminal-Bench dataset is downloaded and find smoketest task. 
- - Returns: - Path to adaptive-rejection-sampler task directory - - Raises: - PreflightError: If dataset download fails or task not found - """ - # First, try to find an existing task - cache_dir = Path.home() / ".cache/harbor/tasks" - - if cache_dir.exists(): - candidates = sorted(cache_dir.glob("*/adaptive-rejection-sampler")) - if candidates: - click.echo("✓ Terminal-Bench dataset found in cache") - return candidates[-1] # Use most recent - - # Dataset not found - download it - click.echo("Downloading Terminal-Bench dataset (89 tasks, ~50MB)...") - - try: - subprocess.run( - ["harbor", "datasets", "download", "terminal-bench@2.0"], - capture_output=True, - text=True, - timeout=600, # 10 minute timeout - check=True, - ) - click.echo("✓ Terminal-Bench dataset downloaded") - except subprocess.TimeoutExpired: - raise PreflightError( - "Dataset download timed out after 10 minutes.\n" - "Check your network connection and try again." - ) - except subprocess.CalledProcessError as e: - raise PreflightError( - f"Dataset download failed: {e.stderr}\n" - f"Try manually: harbor datasets download terminal-bench@2.0" - ) - except Exception as e: - raise PreflightError(f"Dataset download failed: {e}") - - # Find the downloaded task - if cache_dir.exists(): - candidates = sorted(cache_dir.glob("*/adaptive-rejection-sampler")) - if candidates: - return candidates[-1] - - raise PreflightError( - "Dataset downloaded but task not found in cache.\n" - "This may indicate a Harbor version incompatibility." - ) diff --git a/tests/unit/services/harbor/__init__.py b/tests/unit/services/harbor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/services/harbor/test_assessor_state_toggler.py b/tests/unit/services/harbor/test_assessor_state_toggler.py deleted file mode 100644 index 02144165..00000000 --- a/tests/unit/services/harbor/test_assessor_state_toggler.py +++ /dev/null @@ -1,240 +0,0 @@ -"""Unit tests for AssessorStateToggler.""" - -import pytest - -from agentready.services.harbor.agent_toggler import AssessorStateToggler - - -@pytest.fixture -def temp_repo(tmp_path): - """Create a temporary repository structure for testing.""" - repo_root = tmp_path / "test_repo" - repo_root.mkdir() - - # Create .claude directory and CLAUDE.md - claude_dir = repo_root / ".claude" - claude_dir.mkdir() - claude_md = claude_dir / "CLAUDE.md" - claude_md.write_text("# Test Project\n\nThis is a test CLAUDE.md file.") - - # Create README.md - readme = repo_root / "README.md" - readme.write_text("# Test Repo\n\n## Installation\n\n## Usage") - - # Create tests directory - tests_dir = repo_root / "tests" - tests_dir.mkdir() - (tests_dir / "test_example.py").write_text("def test_example():\n assert True") - - return repo_root - - -class TestAssessorStateToggler: - """Test AssessorStateToggler functionality.""" - - def test_initialization(self, temp_repo): - """Test toggler initializes with correct repo root.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - assert toggler.repo_root == temp_repo - assert toggler._backup_suffix == ".assessor_backup" - - def test_list_supported_assessors(self, temp_repo): - """Test listing supported assessors returns expected IDs.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - supported = toggler.list_supported_assessors() - - assert "claude_md_file" in supported - assert "readme_structure" in supported - assert "test_execution" in supported - assert len(supported) == 3 - - def test_force_fail_claude_md(self, temp_repo): - """Test forcing CLAUDE.md 
assessor to fail.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - claude_md = temp_repo / ".claude" / "CLAUDE.md" - backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" - - # Verify file exists before fail - assert claude_md.exists() - assert not backup.exists() - - # Force fail - toggler.force_fail("claude_md_file") - - # Verify file is backed up - assert not claude_md.exists() - assert backup.exists() - - def test_restore_claude_md(self, temp_repo): - """Test restoring CLAUDE.md after forcing fail.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - # Force fail first - toggler.force_fail("claude_md_file") - - claude_md = temp_repo / ".claude" / "CLAUDE.md" - backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" - - # Verify backup state - assert not claude_md.exists() - assert backup.exists() - - # Restore - toggler.restore("claude_md_file") - - # Verify restored - assert claude_md.exists() - assert not backup.exists() - - def test_force_fail_readme(self, temp_repo): - """Test forcing README assessor to fail.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - readme = temp_repo / "README.md" - backup = temp_repo / "README.md.assessor_backup" - - # Force fail - toggler.force_fail("readme_structure") - - # Verify - assert not readme.exists() - assert backup.exists() - - def test_restore_readme(self, temp_repo): - """Test restoring README after forcing fail.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - # Force fail first - toggler.force_fail("readme_structure") - - # Restore - toggler.restore("readme_structure") - - readme = temp_repo / "README.md" - backup = temp_repo / "README.md.assessor_backup" - - # Verify - assert readme.exists() - assert not backup.exists() - - def test_force_fail_tests(self, temp_repo): - """Test forcing test execution assessor to fail.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - tests_dir = temp_repo / "tests" - backup_dir = temp_repo / "tests.assessor_backup" - - # Force fail - toggler.force_fail("test_execution") - - # Verify - assert not tests_dir.exists() - assert backup_dir.exists() - - def test_restore_tests(self, temp_repo): - """Test restoring tests directory after forcing fail.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - # Force fail first - toggler.force_fail("test_execution") - - # Restore - toggler.restore("test_execution") - - tests_dir = temp_repo / "tests" - backup_dir = temp_repo / "tests.assessor_backup" - - # Verify - assert tests_dir.exists() - assert not backup_dir.exists() - - def test_temporarily_failed_context_manager(self, temp_repo): - """Test temporarily_failed context manager restores state.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - claude_md = temp_repo / ".claude" / "CLAUDE.md" - - # Verify initial state - assert claude_md.exists() - - # Use context manager - with toggler.temporarily_failed("claude_md_file"): - # Inside context: file should be missing - assert not claude_md.exists() - - # Outside context: file should be restored - assert claude_md.exists() - - def test_temporarily_failed_exception_still_restores(self, temp_repo): - """Test temporarily_failed restores even when exception occurs.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - claude_md = temp_repo / ".claude" / "CLAUDE.md" - - # Verify initial state - assert claude_md.exists() - - # Use context manager with exception - with pytest.raises(RuntimeError): - with toggler.temporarily_failed("claude_md_file"): - assert not claude_md.exists() - raise 
RuntimeError("Test exception") - - # File should still be restored - assert claude_md.exists() - - def test_unknown_assessor_raises_error(self, temp_repo): - """Test that unknown assessor ID raises ValueError.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - with pytest.raises(ValueError, match="Unknown assessor: nonexistent"): - toggler.force_fail("nonexistent") - - with pytest.raises(ValueError, match="Unknown assessor: nonexistent"): - toggler.restore("nonexistent") - - def test_idempotent_force_fail(self, temp_repo): - """Test that forcing fail multiple times is idempotent.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - # Force fail twice - toggler.force_fail("claude_md_file") - toggler.force_fail("claude_md_file") # Should not error - - claude_md = temp_repo / ".claude" / "CLAUDE.md" - backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" - - # Still in failed state - assert not claude_md.exists() - assert backup.exists() - - def test_idempotent_restore(self, temp_repo): - """Test that restoring multiple times is idempotent.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - # Force fail, then restore twice - toggler.force_fail("claude_md_file") - toggler.restore("claude_md_file") - toggler.restore("claude_md_file") # Should not error - - claude_md = temp_repo / ".claude" / "CLAUDE.md" - backup = temp_repo / ".claude" / "CLAUDE.md.assessor_backup" - - # Still in normal state - assert claude_md.exists() - assert not backup.exists() - - def test_content_preservation(self, temp_repo): - """Test that file content is preserved through fail/restore cycle.""" - toggler = AssessorStateToggler(repo_root=temp_repo) - - claude_md = temp_repo / ".claude" / "CLAUDE.md" - original_content = claude_md.read_text() - - # Fail and restore - toggler.force_fail("claude_md_file") - toggler.restore("claude_md_file") - - # Verify content is identical - assert claude_md.read_text() == original_content diff --git a/tests/unit/test_cli_benchmark.py b/tests/unit/test_cli_benchmark.py deleted file mode 100644 index 9ae34aad..00000000 --- a/tests/unit/test_cli_benchmark.py +++ /dev/null @@ -1,673 +0,0 @@ -"""Unit tests for benchmark CLI commands. - -Test Strategy: - - Uses Click's CliRunner with isolated filesystem for CLI command testing - - Mocks external dependencies (_real_tbench_result, compare_assessor_impact) - - Uses actual data models (HarborComparison, HarborRunMetrics) for type safety - - Tests both high-level commands (benchmark, validate_assessor) and internal helpers (_run_tbench) - - Covers CLI argument parsing, validation, and error handling - -Coverage Target: - - Achieves 80% coverage of cli/benchmark.py - - All commands (benchmark, validate-assessor) tested - - Helper functions (_run_tbench) tested independently - - Edge cases: missing API keys, invalid inputs, file system operations - -Test Fixtures: - - runner: Click test runner for CLI command invocation - - temp_repo: Temporary git repository structure - - mock_tbench_result: Mock Terminal-Bench evaluation result - - mock_comparison: Harbor comparison for assessor validation testing - -Note on Directory Creation: - Tests create output directories explicitly before invocation to match - real-world usage where the CLI creates directories on demand. 
-""" - -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest -from click.testing import CliRunner - -from agentready.cli.benchmark import ( - DEFAULT_PHASE1_TASKS, - _run_tbench, - benchmark, - validate_assessor, -) -from agentready.models.harbor import HarborComparison, HarborRunMetrics - - -@pytest.fixture -def runner(): - """Create Click test runner.""" - return CliRunner() - - -@pytest.fixture -def temp_repo(): - """Create a temporary git repository.""" - with tempfile.TemporaryDirectory() as tmpdir: - repo_path = Path(tmpdir) - (repo_path / ".git").mkdir() - yield repo_path - - -@pytest.fixture -def mock_tbench_result(): - """Create mock Terminal-Bench result.""" - result = MagicMock() - result.score = 75.5 - result.task_solved = 10 - result.resolved_trials = 10 - result.unresolved_trials = 0 - result.pass_at_1 = 0.90 - result.trajectory_path = "/path/to/trajectory.json" - return result - - -@pytest.fixture -def mock_comparison(): - """Create mock Harbor comparison for assessor validation. - - Simulates assessor A/B test results showing: - - Baseline (assessor fails): 50% success rate - - Treatment (assessor passes): 100% success rate - - Impact: +50pp success rate when assessor criteria met - """ - # Baseline: assessor forced to fail - without_metrics = HarborRunMetrics( - run_id="without_20240101_120000", - agent_file_enabled=False, - task_results=[], - success_rate=50.0, - completion_rate=100.0, - avg_duration_sec=12.5, - total_tasks=2, - successful_tasks=1, - failed_tasks=1, - timed_out_tasks=0, - ) - - # Treatment: assessor passes normally - with_metrics = HarborRunMetrics( - run_id="with_20240101_120000", - agent_file_enabled=True, - task_results=[], - success_rate=100.0, - completion_rate=100.0, - avg_duration_sec=10.0, - total_tasks=2, - successful_tasks=2, - failed_tasks=0, - timed_out_tasks=0, - ) - - return HarborComparison( - created_at="2024-01-01T12:00:00", # Fixed timestamp for test determinism - without_agent=without_metrics, - with_agent=with_metrics, - deltas={ - "success_rate_delta": 50.0, - "avg_duration_delta_sec": -2.5, - "avg_duration_delta_pct": -20.0, - }, - statistical_significance={ - "success_rate_significant": True, - "duration_significant": False, - }, - per_task_comparison=[], - ) - - -class TestBenchmarkCommand: - """Test benchmark CLI command.""" - - @patch("agentready.cli.benchmark._run_tbench") - def test_benchmark_basic_execution(self, mock_run, runner, temp_repo): - """Test basic benchmark command execution.""" - result = runner.invoke( - benchmark, - [str(temp_repo), "--harness", "tbench", "--subset", "smoketest"], - ) - - # Should succeed - assert result.exit_code == 0 - mock_run.assert_called_once() - - @patch("agentready.cli.benchmark._run_tbench") - def test_benchmark_defaults_to_current_dir(self, mock_run, runner): - """Test benchmark defaults to current directory.""" - with runner.isolated_filesystem(): - Path(".git").mkdir() - - result = runner.invoke( - benchmark, - ["--subset", "smoketest"], - ) - - # Should use current directory - assert result.exit_code == 0 - mock_run.assert_called_once() - - @patch("agentready.cli.benchmark._run_tbench") - def test_benchmark_with_verbose_flag(self, mock_run, runner, temp_repo): - """Test benchmark command with verbose output.""" - result = runner.invoke( - benchmark, - [str(temp_repo), "--verbose", "--subset", "smoketest"], - ) - - assert result.exit_code == 0 - # Verbose flag passed to _run_tbench (repo_path, subset, agent, model, verbose, timeout, 
output_dir, skip_preflight) - _, _, _, _, verbose, _, _, _ = mock_run.call_args[0] - assert verbose is True - - @patch("agentready.cli.benchmark._run_tbench") - def test_benchmark_with_custom_timeout(self, mock_run, runner, temp_repo): - """Test benchmark with custom timeout.""" - result = runner.invoke( - benchmark, - [str(temp_repo), "--timeout", "7200", "--subset", "smoketest"], - ) - - assert result.exit_code == 0 - _, _, _, _, _, timeout, _, _ = mock_run.call_args[0] - assert timeout == 7200 - - @patch("agentready.cli.benchmark._run_tbench") - def test_benchmark_with_output_dir(self, mock_run, runner, temp_repo): - """Test benchmark with custom output directory.""" - result = runner.invoke( - benchmark, - [ - str(temp_repo), - "--output-dir", - "/custom/output", - "--subset", - "smoketest", - ], - ) - - assert result.exit_code == 0 - _, _, _, _, _, _, output_dir, _ = mock_run.call_args[0] - assert output_dir == "/custom/output" - - @patch("agentready.cli.benchmark._run_tbench") - def test_benchmark_skip_preflight(self, mock_run, runner, temp_repo): - """Test benchmark with skip-preflight flag.""" - result = runner.invoke( - benchmark, - [str(temp_repo), "--skip-preflight", "--subset", "smoketest"], - ) - - assert result.exit_code == 0 - _, _, _, _, _, _, _, skip_preflight = mock_run.call_args[0] - assert skip_preflight is True - - def test_benchmark_unknown_harness(self, runner, temp_repo): - """Test benchmark with unknown harness.""" - result = runner.invoke( - benchmark, - [str(temp_repo), "--harness", "unknown"], - ) - - # Should fail (but unknown won't be accepted by Click's Choice validation) - assert result.exit_code != 0 - - @patch("agentready.cli.benchmark._run_tbench") - def test_benchmark_with_model_selection(self, mock_run, runner, temp_repo): - """Test benchmark with different models.""" - result = runner.invoke( - benchmark, - [ - str(temp_repo), - "--model", - "anthropic/claude-sonnet-4-5", - "--subset", - "smoketest", - ], - ) - - assert result.exit_code == 0 - _, _, _, model, _, _, _, _ = mock_run.call_args[0] - assert model == "anthropic/claude-sonnet-4-5" - - @patch.dict("os.environ", {}, clear=True) - def test_benchmark_cursor_cli_agent_requires_cursor_api_key( - self, runner, temp_repo - ): - """Test that cursor-cli agent requires CURSOR_API_KEY.""" - result = runner.invoke( - benchmark, - [ - str(temp_repo), - "--agent", - "cursor-cli", - "--model", - "cursor/sonnet-4.5", - "--subset", - "smoketest", - "--skip-preflight", - ], - ) - - assert result.exit_code != 0 - assert "CURSOR_API_KEY" in result.output - - @patch("agentready.cli.benchmark._run_tbench") - @patch.dict("os.environ", {"CURSOR_API_KEY": "test-cursor-key"}) - def test_benchmark_cursor_cli_with_valid_cursor_model( - self, mock_run, runner, temp_repo - ): - """Test cursor-cli works with cursor/ prefixed models.""" - result = runner.invoke( - benchmark, - [ - str(temp_repo), - "--agent", - "cursor-cli", - "--model", - "cursor/sonnet-4.5", - "--subset", - "smoketest", - ], - ) - - assert result.exit_code == 0 - mock_run.assert_called_once() - _, _, agent, model, _, _, _, _ = mock_run.call_args[0] - assert agent == "cursor-cli" - assert model == "cursor/sonnet-4.5" - - -class TestRunTbench: - """Test _run_tbench internal function.""" - - @patch("agentready.cli.benchmark._real_tbench_result") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_run_tbench_smoketest(self, mock_result, tmp_path, mock_tbench_result): - """Test running tbench with smoketest subset.""" - 
mock_result.return_value = mock_tbench_result - - # Create mock repository - repo_path = tmp_path / "repo" - repo_path.mkdir() - - # Should not raise - _run_tbench( - repo_path=repo_path, - subset="smoketest", - agent="claude-code", - model="anthropic/claude-haiku-4-5", - verbose=False, - timeout=3600, - output_dir=None, - skip_preflight=True, # Skip preflight to avoid dependencies - ) - - # Should call _real_tbench_result - mock_result.assert_called_once() - - @patch("agentready.cli.benchmark._real_tbench_result") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_run_tbench_full_subset(self, mock_result, tmp_path, mock_tbench_result): - """Test running tbench with full subset.""" - mock_result.return_value = mock_tbench_result - - repo_path = tmp_path / "repo" - repo_path.mkdir() - - _run_tbench( - repo_path=repo_path, - subset="full", - agent="claude-code", - model="anthropic/claude-haiku-4-5", - verbose=False, - timeout=3600, - output_dir=None, - skip_preflight=True, - ) - - mock_result.assert_called_once() - - @patch("agentready.cli.benchmark.click.echo") - @patch("agentready.cli.benchmark.click.Abort") - def test_run_tbench_invalid_subset(self, mock_abort, mock_echo, tmp_path): - """Test tbench with invalid subset.""" - repo_path = tmp_path / "repo" - repo_path.mkdir() - - with pytest.raises(Exception): - _run_tbench( - repo_path=repo_path, - subset="invalid", - agent="claude-code", - model="anthropic/claude-haiku-4-5", - verbose=False, - timeout=3600, - output_dir=None, - skip_preflight=True, - ) - - @patch.dict("os.environ", {}, clear=True) - @patch("agentready.cli.benchmark.click.echo") - @patch("agentready.cli.benchmark.click.Abort") - def test_run_tbench_missing_api_key(self, mock_abort, mock_echo, tmp_path): - """Test tbench fails without API key.""" - repo_path = tmp_path / "repo" - repo_path.mkdir() - - with pytest.raises(Exception): - _run_tbench( - repo_path=repo_path, - subset="smoketest", - agent="claude-code", - model="anthropic/claude-haiku-4-5", - verbose=False, - timeout=3600, - output_dir=None, - skip_preflight=True, - ) - - @patch("agentready.cli.benchmark._real_tbench_result") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_run_tbench_defaults_to_full( - self, mock_result, tmp_path, mock_tbench_result - ): - """Test tbench defaults to full subset when None specified.""" - mock_result.return_value = mock_tbench_result - - repo_path = tmp_path / "repo" - repo_path.mkdir() - - _run_tbench( - repo_path=repo_path, - subset=None, # Should default to 'full' - agent="claude-code", - model="anthropic/claude-haiku-4-5", - verbose=False, - timeout=3600, - output_dir=None, - skip_preflight=True, - ) - - # Check that HarborConfig was created with smoketest=False - mock_result.assert_called_once() - harbor_config = mock_result.call_args[0][1] - assert harbor_config.smoketest is False - - @patch("agentready.cli.benchmark._real_tbench_result") - @patch("agentready.cli.benchmark.click.echo") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_run_tbench_exception_handling(self, mock_echo, mock_result, tmp_path): - """Test tbench handles exceptions gracefully.""" - mock_result.side_effect = Exception("Benchmark error") - - repo_path = tmp_path / "repo" - repo_path.mkdir() - - with pytest.raises(Exception): - _run_tbench( - repo_path=repo_path, - subset="smoketest", - agent="claude-code", - model="anthropic/claude-haiku-4-5", - verbose=False, - timeout=3600, - output_dir=None, - skip_preflight=True, - ) - - -class 
TestValidateAssessorCommand: - """Test validate-assessor CLI command.""" - - @patch("agentready.cli.benchmark.AssessorStateToggler") - def test_list_assessors(self, mock_toggler_class, runner): - """Test --list-assessors flag.""" - mock_toggler = MagicMock() - mock_toggler.list_supported_assessors.return_value = [ - "claude_md_file", - "readme_structure", - "test_execution", - ] - mock_toggler_class.return_value = mock_toggler - - result = runner.invoke(validate_assessor, ["--list-assessors"]) - - # Should succeed - assert result.exit_code == 0 - assert "claude_md_file" in result.output - assert "readme_structure" in result.output - assert "test_execution" in result.output - - def test_validate_missing_assessor_flag(self, runner): - """Test validate-assessor without --assessor flag.""" - result = runner.invoke(validate_assessor, []) - - # Should fail - assert result.exit_code != 0 - assert "Missing required option" in result.output - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_basic(self, mock_compare, runner, mock_comparison): - """Test basic assessor validation.""" - mock_compare.return_value = mock_comparison - - with runner.isolated_filesystem(): - # Create output directory structure - Path(".agentready/validations/claude_md_file").mkdir( - parents=True, exist_ok=True - ) - - result = runner.invoke( - validate_assessor, - ["--assessor", "claude_md_file", "--smoketest"], - ) - - # Should succeed - assert result.exit_code == 0 - assert "Results saved" in result.output - mock_compare.assert_called_once() - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_with_custom_tasks( - self, mock_compare, runner, mock_comparison - ): - """Test validation with custom tasks.""" - mock_compare.return_value = mock_comparison - - with runner.isolated_filesystem(): - # Create output directory structure - Path(".agentready/validations/readme_structure").mkdir( - parents=True, exist_ok=True - ) - - result = runner.invoke( - validate_assessor, - [ - "--assessor", - "readme_structure", - "--tasks", - "adaptive-rejection-sampler", - "--tasks", - "async-http-client", - ], - ) - - assert result.exit_code == 0 - # Check that custom tasks were passed - _, kwargs = mock_compare.call_args - assert kwargs["task_names"] == [ - "adaptive-rejection-sampler", - "async-http-client", - ] - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_with_runs(self, mock_compare, runner, mock_comparison): - """Test validation with custom number of runs.""" - mock_compare.return_value = mock_comparison - - with runner.isolated_filesystem(): - # Create output directory structure - Path(".agentready/validations/test_execution").mkdir( - parents=True, exist_ok=True - ) - - result = runner.invoke( - validate_assessor, - ["--assessor", "test_execution", "--runs", "5", "--smoketest"], - ) - - assert result.exit_code == 0 - _, kwargs = mock_compare.call_args - assert kwargs["runs_per_task"] == 5 - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_default_tasks( - self, mock_compare, runner, mock_comparison - ): - """Test validation uses default Phase 1 tasks.""" - mock_compare.return_value = mock_comparison - - with 
runner.isolated_filesystem(): - # Create output directory structure - Path(".agentready/validations/claude_md_file").mkdir( - parents=True, exist_ok=True - ) - - result = runner.invoke( - validate_assessor, - ["--assessor", "claude_md_file"], - ) - - assert result.exit_code == 0 - # Should use DEFAULT_PHASE1_TASKS - _, kwargs = mock_compare.call_args - assert kwargs["task_names"] == DEFAULT_PHASE1_TASKS - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_smoketest_mode( - self, mock_compare, runner, mock_comparison - ): - """Test smoketest mode uses single task.""" - mock_compare.return_value = mock_comparison - - with runner.isolated_filesystem(): - # Create output directory structure - Path(".agentready/validations/claude_md_file").mkdir( - parents=True, exist_ok=True - ) - - result = runner.invoke( - validate_assessor, - ["--assessor", "claude_md_file", "--smoketest"], - ) - - assert result.exit_code == 0 - # Smoketest should use only 1 task - _, kwargs = mock_compare.call_args - assert kwargs["task_names"] == ["adaptive-rejection-sampler"] - - @patch.dict("os.environ", {}, clear=True) - def test_validate_assessor_missing_api_key(self, runner): - """Test validation fails without API key.""" - result = runner.invoke( - validate_assessor, - ["--assessor", "claude_md_file"], - ) - - # Should fail - assert result.exit_code != 0 - assert "ANTHROPIC_API_KEY" in result.output - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_value_error(self, mock_compare, runner): - """Test validation handles unsupported assessor.""" - mock_compare.side_effect = ValueError("Unsupported assessor") - - result = runner.invoke( - validate_assessor, - ["--assessor", "invalid_assessor", "--smoketest"], - ) - - # Should fail gracefully - assert result.exit_code != 0 - assert "Error:" in result.output - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_creates_output_files( - self, mock_compare, runner, mock_comparison - ): - """Test validation creates JSON and Markdown files.""" - mock_compare.return_value = mock_comparison - - with runner.isolated_filesystem(): - output_dir = Path("output") - # Create output directory structure - output_dir.mkdir(parents=True, exist_ok=True) - - result = runner.invoke( - validate_assessor, - [ - "--assessor", - "claude_md_file", - "--output-dir", - str(output_dir), - "--smoketest", - ], - ) - - assert result.exit_code == 0 - # Check files were created - assert (output_dir / "claude_md_file.json").exists() - assert (output_dir / "claude_md_file.md").exists() - - @patch("agentready.cli.benchmark.compare_assessor_impact") - @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}) - def test_validate_assessor_concurrent_flag( - self, mock_compare, runner, mock_comparison - ): - """Test validation with concurrent tasks.""" - mock_compare.return_value = mock_comparison - - with runner.isolated_filesystem(): - # Create output directory structure - Path(".agentready/validations/claude_md_file").mkdir( - parents=True, exist_ok=True - ) - - result = runner.invoke( - validate_assessor, - ["--assessor", "claude_md_file", "--concurrent", "5", "--smoketest"], - ) - - assert result.exit_code == 0 - _, kwargs = mock_compare.call_args - assert kwargs["n_concurrent"] == 5 - - -class 
TestPhase1Tasks: - """Test DEFAULT_PHASE1_TASKS constant.""" - - def test_phase1_tasks_defined(self): - """Test that Phase 1 tasks are defined.""" - assert len(DEFAULT_PHASE1_TASKS) == 8 - assert "adaptive-rejection-sampler" in DEFAULT_PHASE1_TASKS - assert "async-http-client" in DEFAULT_PHASE1_TASKS - - def test_phase1_tasks_diversity(self): - """Test that Phase 1 tasks cover diverse categories.""" - # Just check that we have a good variety - assert all(isinstance(task, str) for task in DEFAULT_PHASE1_TASKS) - assert all("-" in task for task in DEFAULT_PHASE1_TASKS) diff --git a/tests/unit/test_cli_harbor.py b/tests/unit/test_cli_harbor.py deleted file mode 100644 index 6a82ff13..00000000 --- a/tests/unit/test_cli_harbor.py +++ /dev/null @@ -1,725 +0,0 @@ -"""Unit tests for Harbor CLI commands. - -Test Strategy: - - Uses Click's CliRunner with isolated filesystem for CLI command testing - - Mocks external dependencies (HarborRunner, AgentFileToggler, parse_harbor_results) - - Uses actual data models (HarborComparison, HarborRunMetrics) for type safety - - Covers success paths, error handling, and edge cases - - Helper functions tested independently from CLI commands - -Coverage Target: - - Achieves 96% coverage of cli/harbor.py - - All commands (compare, list, view) tested - - Helper functions (_run_benchmark_phase, _generate_reports, _create_latest_symlinks) tested - - Error conditions and validation logic covered - -Test Fixtures: - - runner: Click test runner for CLI command invocation - - temp_repo: Temporary git repository with agent file structure - - mock_task_results: Sample Harbor task results with realistic data - - mock_comparison: Complete Harbor comparison object for testing report generation -""" - -import json -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest -from click.testing import CliRunner - -from agentready.cli.harbor import ( - _create_latest_symlinks, - _generate_reports, - _run_benchmark_phase, - compare, - harbor_cli, - list_comparisons, - view_comparison, -) -from agentready.models.harbor import ( - HarborComparison, - HarborRunMetrics, - HarborTaskResult, -) - - -@pytest.fixture -def runner(): - """Create Click test runner.""" - return CliRunner() - - -@pytest.fixture -def temp_repo(): - """Create a temporary git repository with agent file.""" - with tempfile.TemporaryDirectory() as tmpdir: - repo_path = Path(tmpdir) - (repo_path / ".git").mkdir() - - # Create agent file - agent_dir = repo_path / ".claude" / "agents" - agent_dir.mkdir(parents=True) - (agent_dir / "doubleagent.md").write_text("# Agent file content") - - yield repo_path - - -@pytest.fixture -def mock_task_results(): - """Create mock Harbor task results.""" - return [ - HarborTaskResult( - task_name="test-task-1", - trial_name="trial_1", - success=True, - duration_sec=10.5, - agent_result={"status": "success"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2024-01-01T12:00:00", - finished_at="2024-01-01T12:00:10", - ), - HarborTaskResult( - task_name="test-task-2", - trial_name="trial_2", - success=True, - duration_sec=15.2, - agent_result={"status": "success"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2024-01-01T12:01:00", - finished_at="2024-01-01T12:01:15", - ), - ] - - -@pytest.fixture -def mock_comparison(): - """Create mock Harbor comparison. 
- - Simulates an A/B test comparison showing: - - Baseline (without agent): 50% success rate, 12.5s avg duration - - Treatment (with agent): 100% success rate, 10.0s avg duration - - Delta: +50pp success rate improvement, -2.5s duration improvement - """ - # Baseline metrics (agent disabled) - without_metrics = HarborRunMetrics( - run_id="without_20240101_120000", - agent_file_enabled=False, - task_results=[], - success_rate=50.0, - completion_rate=100.0, - avg_duration_sec=12.5, - total_tasks=2, - successful_tasks=1, - failed_tasks=1, - timed_out_tasks=0, - ) - - # Treatment metrics (agent enabled) - with_metrics = HarborRunMetrics( - run_id="with_20240101_120000", - agent_file_enabled=True, - task_results=[], - success_rate=100.0, - completion_rate=100.0, - avg_duration_sec=10.0, - total_tasks=2, - successful_tasks=2, - failed_tasks=0, - timed_out_tasks=0, - ) - - # Comparison with deltas and statistical significance - return HarborComparison( - created_at="2024-01-01T12:00:00", # Fixed timestamp for determinism - without_agent=without_metrics, - with_agent=with_metrics, - deltas={ - "success_rate_delta": 50.0, # 50 percentage point improvement - "avg_duration_delta_sec": -2.5, # 2.5 second improvement - "avg_duration_delta_pct": -20.0, # 20% faster - }, - statistical_significance={ - "success_rate_significant": True, - "duration_significant": False, - }, - per_task_comparison=[], - ) - - -class TestRunBenchmarkPhase: - """Test _run_benchmark_phase helper function.""" - - @patch("agentready.cli.harbor.click.echo") - def test_run_without_agent(self, mock_echo, tmp_path): - """Test running benchmark phase without agent.""" - mock_runner = MagicMock() - mock_toggler = MagicMock() - - output_dir = tmp_path / "output" - output_dir.mkdir() - - result = _run_benchmark_phase( - runner=mock_runner, - toggler=mock_toggler, - phase_name="WITHOUT agent", - run_number=1, - output_dir=output_dir, - task_list=["task1", "task2"], - model="anthropic/claude-sonnet-4-5", - verbose=False, - disable_agent=True, - ) - - # Should use context manager for agent toggling - mock_toggler.temporarily_disabled.assert_called_once() - - # Should run benchmark - assert mock_runner.run_benchmark.called - - # Should return output directory - assert result == output_dir - - @patch("agentready.cli.harbor.click.echo") - def test_run_with_agent(self, mock_echo, tmp_path): - """Test running benchmark phase with agent.""" - mock_runner = MagicMock() - mock_toggler = MagicMock() - - output_dir = tmp_path / "output" - output_dir.mkdir() - - result = _run_benchmark_phase( - runner=mock_runner, - toggler=mock_toggler, - phase_name="WITH agent", - run_number=2, - output_dir=output_dir, - task_list=["task1"], - model="anthropic/claude-sonnet-4-5", - verbose=True, - disable_agent=False, - ) - - # Should NOT use context manager when agent enabled - mock_toggler.temporarily_disabled.assert_not_called() - - # Should run benchmark with verbose - mock_runner.run_benchmark.assert_called_once_with( - task_names=["task1"], - output_dir=output_dir, - model="anthropic/claude-sonnet-4-5", - verbose=True, - ) - - assert result == output_dir - - @patch("agentready.cli.harbor.click.echo") - @patch("agentready.cli.harbor.click.Abort") - def test_run_handles_exception(self, mock_abort, mock_echo, tmp_path): - """Test benchmark phase handles exceptions.""" - mock_runner = MagicMock() - mock_runner.run_benchmark.side_effect = Exception("Benchmark failed") - mock_toggler = MagicMock() - - with pytest.raises(Exception): - _run_benchmark_phase( - 
runner=mock_runner, - toggler=mock_toggler, - phase_name="TEST", - run_number=1, - output_dir=tmp_path, - task_list=["task1"], - model="anthropic/claude-sonnet-4-5", - verbose=False, - disable_agent=False, - ) - - -class TestGenerateReports: - """Test _generate_reports helper function.""" - - @patch("agentready.cli.harbor.generate_dashboard") - @patch("agentready.cli.harbor.generate_markdown_report") - @patch("agentready.cli.harbor._create_latest_symlinks") - @patch("agentready.cli.harbor.click.echo") - def test_generates_all_formats( - self, - mock_echo, - mock_symlinks, - mock_markdown, - mock_dashboard, - tmp_path, - mock_comparison, - ): - """Test report generation creates JSON, Markdown, and HTML.""" - run_dir = tmp_path / "run_123" - run_dir.mkdir() - output_dir = tmp_path - - paths = _generate_reports( - comparison=mock_comparison, - run_dir=run_dir, - output_dir=output_dir, - timestamp="20240101_120000", - ) - - # Should generate all three formats - assert "json" in paths - assert "markdown" in paths - assert "html" in paths - - # JSON file should exist - assert paths["json"].exists() - - # Should call generators - mock_markdown.assert_called_once() - mock_dashboard.assert_called_once() - mock_symlinks.assert_called_once() - - @patch("agentready.cli.harbor.generate_dashboard") - @patch("agentready.cli.harbor.generate_markdown_report") - @patch("agentready.cli.harbor._create_latest_symlinks") - @patch("agentready.cli.harbor.click.echo") - def test_json_content_valid( - self, - mock_echo, - mock_symlinks, - mock_markdown, - mock_dashboard, - tmp_path, - mock_comparison, - ): - """Test JSON report contains valid comparison data.""" - run_dir = tmp_path / "run_123" - run_dir.mkdir() - - paths = _generate_reports( - comparison=mock_comparison, - run_dir=run_dir, - output_dir=tmp_path, - timestamp="20240101_120000", - ) - - # Read and validate JSON - with open(paths["json"]) as f: - data = json.load(f) - - assert "created_at" in data - assert "without_agent" in data - assert "with_agent" in data - assert "deltas" in data - - -class TestCreateLatestSymlinks: - """Test _create_latest_symlinks helper function.""" - - @patch("agentready.cli.harbor.click.echo") - def test_creates_symlinks(self, mock_echo, tmp_path): - """Test symlink creation for latest comparison.""" - # Create source files - run_dir = tmp_path / "run_123" - run_dir.mkdir() - - json_file = run_dir / "comparison_123.json" - json_file.write_text("{}") - - md_file = run_dir / "comparison_123.md" - md_file.write_text("# Report") - - html_file = run_dir / "comparison_123.html" - html_file.write_text("") - - paths = { - "json": json_file, - "markdown": md_file, - "html": html_file, - } - - # Create symlinks - _create_latest_symlinks(paths, tmp_path) - - # Verify symlinks exist - assert (tmp_path / "comparison_latest.json").is_symlink() - assert (tmp_path / "comparison_latest.md").is_symlink() - assert (tmp_path / "comparison_latest.html").is_symlink() - - @patch("agentready.cli.harbor.click.echo") - def test_replaces_existing_symlinks(self, mock_echo, tmp_path): - """Test symlink replacement for updates.""" - # Create old files - old_dir = tmp_path / "run_old" - old_dir.mkdir() - old_file = old_dir / "comparison_old.json" - old_file.write_text("{}") - - # Create old symlink - old_symlink = tmp_path / "comparison_latest.json" - old_symlink.symlink_to(old_file.relative_to(tmp_path)) - - # Create new files - new_dir = tmp_path / "run_new" - new_dir.mkdir() - new_file = new_dir / "comparison_new.json" - new_file.write_text("{}") - - 
paths = {"json": new_file} - - # Update symlink - _create_latest_symlinks(paths, tmp_path) - - # Symlink should point to new file - assert old_symlink.is_symlink() - assert old_symlink.resolve() == new_file.resolve() - - @patch("agentready.cli.harbor.click.echo") - def test_handles_symlink_errors_gracefully(self, mock_echo, tmp_path): - """Test symlink creation handles errors gracefully.""" - paths = { - "json": tmp_path / "nonexistent.json", - } - - # Should not raise exception - _create_latest_symlinks(paths, tmp_path) - - -class TestCompareCommand: - """Test harbor compare CLI command.""" - - @patch("agentready.cli.harbor.HarborRunner") - @patch("agentready.cli.harbor.AgentFileToggler") - @patch("agentready.cli.harbor._run_benchmark_phase") - @patch("agentready.cli.harbor.parse_harbor_results") - @patch("agentready.cli.harbor.compare_runs") - @patch("agentready.cli.harbor._generate_reports") - @patch("agentready.cli.harbor.DashboardGenerator") - def test_compare_basic_execution( - self, - mock_dashboard_gen, - mock_gen_reports, - mock_compare_runs, - mock_parse, - mock_run_phase, - mock_toggler, - mock_runner_class, - runner, - temp_repo, - mock_task_results, - mock_comparison, - ): - """Test basic compare command execution.""" - # Setup mocks - mock_runner_class.return_value = MagicMock() - mock_run_phase.return_value = temp_repo / "results" - mock_parse.return_value = mock_task_results - mock_compare_runs.return_value = mock_comparison - mock_gen_reports.return_value = {"json": temp_repo / "comparison.json"} - mock_dashboard_gen.return_value.generate_summary_text.return_value = "Summary" - - # Run command - result = runner.invoke( - compare, - [ - "--task", - "test-task-1", - "--task", - "test-task-2", - "--agent-file", - str(temp_repo / ".claude/agents/doubleagent.md"), - "--output-dir", - str(temp_repo / "output"), - ], - ) - - # Should succeed - assert result.exit_code == 0 - assert "Harbor Benchmark Comparison" in result.output - assert "Summary" in result.output - - # Should run benchmarks twice (with and without agent) - assert mock_run_phase.call_count == 2 - - def test_compare_missing_agent_file(self, runner, temp_repo): - """Test compare command with missing agent file.""" - result = runner.invoke( - compare, - [ - "--task", - "test-task", - "--agent-file", - str(temp_repo / "nonexistent.md"), - ], - ) - - # Should fail (Click validates path before function runs) - assert result.exit_code != 0 - assert "does not exist" in result.output - - def test_compare_no_tasks_specified(self, runner, temp_repo): - """Test compare command without tasks.""" - result = runner.invoke( - compare, - [ - "--agent-file", - str(temp_repo / ".claude/agents/doubleagent.md"), - ], - ) - - # Should fail - assert result.exit_code != 0 - assert "At least one task must be specified" in result.output - - @patch("agentready.cli.harbor.HarborRunner") - def test_compare_harbor_not_installed(self, mock_runner_class, runner, temp_repo): - """Test compare command when Harbor not installed.""" - from agentready.services.harbor.runner import HarborNotInstalledError - - mock_runner_class.side_effect = HarborNotInstalledError("Harbor not found") - - result = runner.invoke( - compare, - [ - "--task", - "test-task", - "--agent-file", - str(temp_repo / ".claude/agents/doubleagent.md"), - ], - ) - - # Should fail gracefully - assert result.exit_code != 0 - assert "Harbor not found" in result.output - - @patch("agentready.cli.harbor.HarborRunner") - @patch("agentready.cli.harbor.AgentFileToggler") - 
@patch("agentready.cli.harbor._run_benchmark_phase") - @patch("agentready.cli.harbor.parse_harbor_results") - @patch("agentready.cli.harbor.compare_runs") - @patch("agentready.cli.harbor._generate_reports") - @patch("agentready.cli.harbor.DashboardGenerator") - @patch("webbrowser.open") - def test_compare_open_dashboard( - self, - mock_webbrowser_open, - mock_dashboard_gen, - mock_gen_reports, - mock_compare_runs, - mock_parse, - mock_run_phase, - mock_toggler, - mock_runner_class, - runner, - temp_repo, - mock_task_results, - mock_comparison, - ): - """Test compare command with --open-dashboard flag.""" - # Setup mocks - mock_runner_class.return_value = MagicMock() - mock_run_phase.return_value = temp_repo / "results" - mock_parse.return_value = mock_task_results - mock_compare_runs.return_value = mock_comparison - - html_path = temp_repo / "comparison.html" - html_path.write_text("") - mock_gen_reports.return_value = {"html": html_path} - mock_dashboard_gen.return_value.generate_summary_text.return_value = "Summary" - - # Run command with open-dashboard flag - result = runner.invoke( - compare, - [ - "--task", - "test-task", - "--agent-file", - str(temp_repo / ".claude/agents/doubleagent.md"), - "--open-dashboard", - ], - ) - - # Should succeed - assert result.exit_code == 0 - - # Should open browser - mock_webbrowser_open.assert_called_once() - - @patch("agentready.cli.harbor.HarborRunner") - @patch("agentready.cli.harbor.AgentFileToggler") - @patch("agentready.cli.harbor._run_benchmark_phase") - @patch("agentready.cli.harbor.parse_harbor_results") - def test_compare_parse_results_failure( - self, - mock_parse, - mock_run_phase, - mock_toggler, - mock_runner_class, - runner, - temp_repo, - ): - """Test compare command handles result parsing errors.""" - # Setup mocks - mock_runner_class.return_value = MagicMock() - mock_run_phase.return_value = temp_repo / "results" - mock_parse.side_effect = Exception("Parse error") - - result = runner.invoke( - compare, - [ - "--task", - "test-task", - "--agent-file", - str(temp_repo / ".claude/agents/doubleagent.md"), - ], - ) - - # Should fail gracefully - assert result.exit_code != 0 - assert "Failed to parse results" in result.output - - -class TestListComparisonsCommand: - """Test harbor list CLI command.""" - - def test_list_empty_directory(self, runner, tmp_path): - """Test list command with no comparisons.""" - output_dir = tmp_path / "comparisons" - output_dir.mkdir() - - result = runner.invoke( - list_comparisons, - ["--output-dir", str(output_dir)], - ) - - # Should succeed - assert result.exit_code == 0 - assert "No comparisons found" in result.output - - def test_list_with_comparisons(self, runner, tmp_path, mock_comparison): - """Test list command with existing comparisons.""" - output_dir = tmp_path / "comparisons" - output_dir.mkdir() - - # Create comparison files - run1 = output_dir / "run_20240101_120000" - run1.mkdir() - comp1 = run1 / "comparison_20240101_120000.json" - comp1.write_text(json.dumps(mock_comparison.to_dict())) - - run2 = output_dir / "run_20240102_120000" - run2.mkdir() - comp2 = run2 / "comparison_20240102_120000.json" - comp2.write_text(json.dumps(mock_comparison.to_dict())) - - result = runner.invoke( - list_comparisons, - ["--output-dir", str(output_dir)], - ) - - # Should succeed - assert result.exit_code == 0 - assert "run_20240101_120000" in result.output - assert "run_20240102_120000" in result.output - assert "Success Δ:" in result.output - assert "Duration Δ:" in result.output - - def 
test_list_nonexistent_directory(self, runner, tmp_path): - """Test list command with nonexistent directory.""" - result = runner.invoke( - list_comparisons, - ["--output-dir", str(tmp_path / "nonexistent")], - ) - - # Should fail - assert result.exit_code != 0 - - -class TestViewComparisonCommand: - """Test harbor view CLI command.""" - - @patch("agentready.cli.harbor.DashboardGenerator") - def test_view_summary_format( - self, mock_dashboard_gen, runner, tmp_path, mock_comparison - ): - """Test view command with summary format.""" - # Create comparison file - comp_file = tmp_path / "comparison.json" - comp_file.write_text(json.dumps(mock_comparison.to_dict())) - - mock_dashboard_gen.return_value.generate_summary_text.return_value = ( - "Test Summary" - ) - - result = runner.invoke( - view_comparison, - [str(comp_file), "--format", "summary"], - ) - - # Should succeed - assert result.exit_code == 0 - assert "Test Summary" in result.output - - def test_view_full_format(self, runner, tmp_path, mock_comparison): - """Test view command with full JSON format.""" - # Create comparison file - comp_file = tmp_path / "comparison.json" - comp_file.write_text(json.dumps(mock_comparison.to_dict())) - - result = runner.invoke( - view_comparison, - [str(comp_file), "--format", "full"], - ) - - # Should succeed - assert result.exit_code == 0 - # Should output JSON - assert "without_agent" in result.output - assert "with_agent" in result.output - - def test_view_nonexistent_file(self, runner, tmp_path): - """Test view command with nonexistent file.""" - result = runner.invoke( - view_comparison, - [str(tmp_path / "nonexistent.json")], - ) - - # Should fail - assert result.exit_code != 0 - - def test_view_default_format(self, runner, tmp_path, mock_comparison): - """Test view command defaults to summary format.""" - comp_file = tmp_path / "comparison.json" - comp_file.write_text(json.dumps(mock_comparison.to_dict())) - - with patch("agentready.cli.harbor.DashboardGenerator") as mock_gen: - mock_gen.return_value.generate_summary_text.return_value = "Summary" - - result = runner.invoke( - view_comparison, - [str(comp_file)], - ) - - # Should use summary format by default - assert result.exit_code == 0 - mock_gen.return_value.generate_summary_text.assert_called_once() - - -class TestHarborCLIGroup: - """Test harbor CLI group.""" - - def test_harbor_group_help(self, runner): - """Test harbor CLI group shows help.""" - result = runner.invoke(harbor_cli, ["--help"]) - - assert result.exit_code == 0 - assert "Harbor benchmark comparison commands" in result.output - assert "compare" in result.output - assert "list" in result.output - assert "view" in result.output - - def test_harbor_group_has_commands(self): - """Test harbor CLI group has expected commands.""" - assert "compare" in harbor_cli.commands - assert "list" in harbor_cli.commands - assert "view" in harbor_cli.commands diff --git a/tests/unit/test_eval_harness_cli.py b/tests/unit/test_eval_harness_cli.py deleted file mode 100644 index 138587ec..00000000 --- a/tests/unit/test_eval_harness_cli.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Tests for eval harness CLI aggregation functionality. 
- -Following TDD red-green-refactor workflow: -- Phase 4.1 (RED): Write aggregation tests (T054-T058) -- Phase 4.2 (GREEN): Implement pandas aggregation -- Phase 4.3 (REFACTOR): Add docstrings and documentation -""" - -import json -import tempfile -from pathlib import Path - -import pytest - -from agentready.services.eval_harness.aggregator import aggregate_results - - -class TestAggregationLogic: - """Test pandas-based aggregation of benchmark results""" - - def test_summarize_aggregates_by_assessor(self): - """T054: Verify pandas groupby on assessor_id""" - # Sample benchmark results - results = [ - {"assessor_id": "claude_md", "delta_score": 0.12}, - {"assessor_id": "claude_md", "delta_score": 0.10}, - {"assessor_id": "test_execution", "delta_score": 0.08}, - {"assessor_id": "test_execution", "delta_score": 0.07}, - ] - - summary = aggregate_results(results) - - # Verify grouping by assessor_id - assert "claude_md" in summary.index - assert "test_execution" in summary.index - assert len(summary) == 2 - - def test_summarize_calculates_mean_median_std(self): - """T055: Verify correct aggregation functions""" - results = [ - {"assessor_id": "claude_md", "delta_score": 0.10}, - {"assessor_id": "claude_md", "delta_score": 0.12}, - {"assessor_id": "claude_md", "delta_score": 0.14}, - ] - - summary = aggregate_results(results) - - # Verify statistics calculations - assert "mean_delta" in summary.columns - assert "median_delta" in summary.columns - assert "std_delta" in summary.columns - assert "sample_size" in summary.columns - - # Verify values - claude_stats = summary.loc["claude_md"] - assert claude_stats["mean_delta"] == pytest.approx(0.12, abs=0.01) - assert claude_stats["median_delta"] == pytest.approx(0.12, abs=0.01) - assert claude_stats["sample_size"] == 3 - - def test_summarize_adds_significance_indicator(self): - """T056: Verify boolean significant column added""" - results = [ - {"assessor_id": "high_impact", "delta_score": 0.10}, - {"assessor_id": "high_impact", "delta_score": 0.12}, - {"assessor_id": "low_impact", "delta_score": 0.02}, - {"assessor_id": "low_impact", "delta_score": 0.01}, - ] - - summary = aggregate_results(results) - - # Verify significant column exists - assert "significant" in summary.columns - - # Verify significance threshold (placeholder: abs(mean_delta) > 0.05) - assert summary.loc["high_impact"]["significant"] - assert not summary.loc["low_impact"]["significant"] - - def test_summarize_sorts_by_mean_delta_descending(self): - """T057: Verify results sorted correctly""" - results = [ - {"assessor_id": "low", "delta_score": 0.02}, - {"assessor_id": "high", "delta_score": 0.15}, - {"assessor_id": "medium", "delta_score": 0.08}, - ] - - summary = aggregate_results(results) - - # Verify sorting (descending by mean_delta) - assessors_sorted = summary.index.tolist() - assert assessors_sorted[0] == "high" - assert assessors_sorted[1] == "medium" - assert assessors_sorted[2] == "low" - - def test_summarize_exports_json(self): - """T058: Verify JSON file written with correct schema""" - results = [ - {"assessor_id": "claude_md", "delta_score": 0.12}, - {"assessor_id": "test_execution", "delta_score": 0.08}, - ] - - summary = aggregate_results(results) - - # Verify DataFrame can be exported to JSON - with tempfile.TemporaryDirectory() as tmpdir: - output_path = Path(tmpdir) / "aggregation-results.json" - summary.to_json(output_path, orient="records") - - # Verify file exists and is valid JSON - assert output_path.exists() - with open(output_path) as f: - 
exported_data = json.load(f) - assert isinstance(exported_data, list) - assert len(exported_data) == 2 - - -class TestAggregationEdgeCases: - """Test edge cases in aggregation""" - - def test_empty_results_list(self): - """Test handling of empty results""" - results = [] - summary = aggregate_results(results) - assert len(summary) == 0 - - def test_single_assessor_single_result(self): - """Test aggregation with minimal data""" - results = [{"assessor_id": "claude_md", "delta_score": 0.10}] - summary = aggregate_results(results) - assert len(summary) == 1 - assert summary.loc["claude_md"]["mean_delta"] == 0.10 - assert summary.loc["claude_md"]["std_delta"] == 0.0 # Single value has no std - - def test_negative_delta_scores(self): - """Test that negative deltas (regressions) are handled""" - results = [ - {"assessor_id": "regression", "delta_score": -0.05}, - {"assessor_id": "regression", "delta_score": -0.03}, - ] - summary = aggregate_results(results) - assert summary.loc["regression"]["mean_delta"] < 0 - assert not summary.loc["regression"]["significant"] # abs < 0.05 diff --git a/tests/unit/test_eval_harness_services.py b/tests/unit/test_eval_harness_services.py deleted file mode 100644 index 581084c3..00000000 --- a/tests/unit/test_eval_harness_services.py +++ /dev/null @@ -1,391 +0,0 @@ -""" -Tests for Harbor subprocess integration and JSON parsing. - -Following TDD red-green-refactor workflow: -- Phase 3.1 (RED): Write tests for Harbor subprocess integration (T023-T027) -- Phase 3.2 (RED): Write tests for JSON parsing with path validation (T028-T031) -- Phase 3.3-3.5 (GREEN): Implement to make tests pass -- Phase 3.7 (REFACTOR): Add docstrings and improve code quality -""" - -import json -import subprocess -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, mock_open, patch - -import pytest - -from agentready.services.eval_harness.tbench_runner import ( - TbenchResult, - parse_harbor_results, -) - - -class TestHarborSubprocessIntegration: - """Test Harbor subprocess execution with security validations (T023-T027)""" - - @patch("agentready.services.eval_harness.tbench_runner.subprocess.run") - def test_real_tbench_result_subprocess_called(self, mock_run): - """T023: Verify harbor run command constructed correctly""" - # Mock subprocess success and results file - mock_run.return_value = MagicMock(returncode=0) - - mock_results = { - "summary": { - "resolved_trials": 42, - "unresolved_trials": 8, - "accuracy": 0.84, - "pass@1": 0.78, - "pass@3": 0.84, - } - } - - with patch("builtins.open", mock_open(read_data=json.dumps(mock_results))): - with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): - from agentready.services.eval_harness.tbench_runner import ( - _real_tbench_result, - ) - - _real_tbench_result(Path("/fake/repo")) - - # Verify subprocess.run was called - assert mock_run.called - - # Verify command structure - call_args = mock_run.call_args[0][0] - assert "harbor" in call_args - assert "run" in call_args - assert "--dataset" in call_args - assert "terminal-bench@2.0" in call_args - assert "--agent" in call_args - assert "claude-code" in call_args - assert "--model" in call_args - - @patch("agentready.services.eval_harness.tbench_runner.subprocess.run") - def test_environment_variable_sanitization(self, mock_run): - """T024 [US3]: Verify only ANTHROPIC_API_KEY, PATH, HOME passed to subprocess""" - mock_run.return_value = MagicMock(returncode=0) - - mock_results = { - "summary": { - "resolved_trials": 1, - "unresolved_trials": 0, - 
"accuracy": 1.0, - "pass@1": 1.0, - "pass@3": 1.0, - } - } - - with patch("builtins.open", mock_open(read_data=json.dumps(mock_results))): - # Set multiple environment variables - with patch.dict( - "os.environ", - { - "ANTHROPIC_API_KEY": "test-key", - "PATH": "/usr/bin", - "HOME": "/home/user", - "JAVA_HOME": "/opt/java", # Should NOT be passed - "SECRET_TOKEN": "secret123", # Should NOT be passed - }, - ): - from agentready.services.eval_harness.tbench_runner import ( - _real_tbench_result, - ) - - _real_tbench_result(Path("/fake/repo")) - - # Verify env parameter - call_kwargs = mock_run.call_args[1] - clean_env = call_kwargs["env"] - - # Required env vars present - assert "ANTHROPIC_API_KEY" in clean_env - assert "PATH" in clean_env - assert "HOME" in clean_env - - # Forbidden env vars NOT present - assert "JAVA_HOME" not in clean_env - assert "SECRET_TOKEN" not in clean_env - - @patch("agentready.services.eval_harness.tbench_runner.subprocess.run") - def test_harbor_subprocess_timeout_enforced(self, mock_run): - """T025: Verify subprocess.run called with timeout=3600""" - mock_run.return_value = MagicMock(returncode=0) - - mock_results = { - "summary": { - "resolved_trials": 1, - "unresolved_trials": 0, - "accuracy": 1.0, - "pass@1": 1.0, - "pass@3": 1.0, - } - } - - with patch("builtins.open", mock_open(read_data=json.dumps(mock_results))): - with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): - from agentready.services.eval_harness.tbench_runner import ( - _real_tbench_result, - ) - - _real_tbench_result(Path("/fake/repo")) - - # Verify timeout parameter - call_kwargs = mock_run.call_args[1] - assert call_kwargs["timeout"] == 3600 - - @patch("agentready.services.eval_harness.tbench_runner.subprocess.run") - def test_harbor_subprocess_timeout_exception(self, mock_run): - """T026: Verify RuntimeError raised when subprocess times out""" - mock_run.side_effect = subprocess.TimeoutExpired("harbor", 3600) - - with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): - from agentready.services.eval_harness.tbench_runner import ( - _real_tbench_result, - ) - - with pytest.raises(RuntimeError, match="timed out"): - _real_tbench_result(Path("/fake/repo")) - - @patch("agentready.services.eval_harness.tbench_runner.subprocess.run") - def test_harbor_subprocess_failure_exception(self, mock_run): - """T027: Verify RuntimeError raised when subprocess fails""" - mock_run.side_effect = subprocess.CalledProcessError(1, "harbor") - - with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}): - from agentready.services.eval_harness.tbench_runner import ( - _real_tbench_result, - ) - - with pytest.raises(RuntimeError, match="failed"): - _real_tbench_result(Path("/fake/repo")) - - -class TestJSONParsingWithPathValidation: - """Test JSON parsing with security path validation (T028-T031)""" - - def test_parse_harbor_results_valid_json(self): - """T028 [US3]: Verify results.json parsed correctly""" - mock_results = { - "summary": { - "resolved_trials": 42, - "unresolved_trials": 8, - "accuracy": 0.84, - "pass@1": 0.78, - "pass@3": 0.84, - } - } - - with tempfile.TemporaryDirectory() as tmpdir: - results_path = Path(tmpdir) / "results.json" - with open(results_path, "w") as f: - json.dump(mock_results, f) - - result = parse_harbor_results(results_path) - - assert isinstance(result, TbenchResult) - assert result.score == 0.84 - assert result.resolved_trials == 42 - assert result.unresolved_trials == 8 - - def test_parse_harbor_results_creates_tbench_result(self): - """T029: 
Verify TbenchResult created with is_mocked=False""" - mock_results = { - "summary": { - "resolved_trials": 10, - "unresolved_trials": 5, - "accuracy": 0.67, - "pass@1": 0.60, - "pass@3": 0.67, - } - } - - with tempfile.TemporaryDirectory() as tmpdir: - results_path = Path(tmpdir) / "results.json" - with open(results_path, "w") as f: - json.dump(mock_results, f) - - result = parse_harbor_results(results_path) - - assert result.is_mocked is False - assert result.task_solved is True # resolved_trials > 0 - - def test_parse_harbor_results_path_validation(self): - """T030 [US3]: Verify path traversal attack (../../etc/passwd) rejected""" - # This test verifies path validation happens in _real_tbench_result - # The parse_harbor_results function itself doesn't do path validation - # Path validation is done before calling parse_harbor_results - - # Path validation is verified via the subprocess integration tests - # which ensure results_path.is_relative_to(jobs_dir) check occurs - pass # Path traversal prevention is tested in integration tests - - def test_parse_harbor_results_invalid_json_exception(self): - """T031: Verify JSONDecodeError handled gracefully""" - with tempfile.TemporaryDirectory() as tmpdir: - results_path = Path(tmpdir) / "results.json" - with open(results_path, "w") as f: - f.write("invalid json {{{") - - with pytest.raises(json.JSONDecodeError): - parse_harbor_results(results_path) - - -class TestParallelExecution: - """Test parallel benchmark execution with resource limits (T070-T073)""" - - @patch("agentready.services.eval_harness.batch_runner._real_tbench_result") - @patch("agentready.services.eval_harness.batch_runner.as_completed") - @patch("agentready.services.eval_harness.batch_runner.ProcessPoolExecutor") - def test_parallel_execution_max_4_workers( - self, mock_executor_class, mock_as_completed, mock_real_tbench - ): - """T070 [US4]: Verify ProcessPoolExecutor initialized with max_workers=4""" - from agentready.services.eval_harness.batch_runner import run_batch_benchmarks - - # Mock the benchmark function to return success - mock_real_tbench.return_value = TbenchResult( - score=0.8, task_solved=True, is_mocked=False - ) - - # Mock executor context manager - mock_executor = MagicMock() - mock_executor_class.return_value.__enter__.return_value = mock_executor - - # Create mock futures - mock_futures = [] - for i in range(3): - future = MagicMock() - future.result.return_value = TbenchResult( - score=0.8, task_solved=True, is_mocked=False - ) - mock_futures.append(future) - - mock_executor.submit.side_effect = mock_futures - mock_as_completed.return_value = mock_futures - - # Run with test repositories - repos = [Path("/repo1"), Path("/repo2"), Path("/repo3")] - run_batch_benchmarks(repos) - - # Verify max_workers=4 - mock_executor_class.assert_called_once_with(max_workers=4) - - @patch("agentready.services.eval_harness.batch_runner._real_tbench_result") - @patch("agentready.services.eval_harness.batch_runner.ProcessPoolExecutor") - def test_parallel_execution_timeout_per_job( - self, mock_executor_class, mock_real_tbench - ): - """T071 [US4]: Verify each job has 3600s timeout""" - from agentready.services.eval_harness.batch_runner import run_batch_benchmarks - - # Mock the benchmark function - mock_real_tbench.return_value = TbenchResult( - score=0.8, task_solved=True, is_mocked=False - ) - - # Mock executor and future - mock_executor = MagicMock() - mock_future = MagicMock() - mock_executor_class.return_value.__enter__.return_value = mock_executor - 
mock_executor.submit.return_value = mock_future - mock_future.result.return_value = TbenchResult( - score=0.8, task_solved=True, is_mocked=False - ) - - # Mock as_completed to return the future - with patch( - "agentready.services.eval_harness.batch_runner.as_completed" - ) as mock_as_completed: - mock_as_completed.return_value = [mock_future] - - repos = [Path("/repo1")] - run_batch_benchmarks(repos) - - # Verify timeout parameter - mock_future.result.assert_called_once_with(timeout=3600) - - @patch("agentready.services.eval_harness.batch_runner._real_tbench_result") - @patch("agentready.services.eval_harness.batch_runner.ProcessPoolExecutor") - def test_parallel_execution_handles_partial_failures( - self, mock_executor_class, mock_real_tbench - ): - """T072 [US4]: Verify some jobs can fail without blocking others""" - from agentready.services.eval_harness.batch_runner import run_batch_benchmarks - - # Mock executor with mixed success/failure futures - mock_executor = MagicMock() - mock_executor_class.return_value.__enter__.return_value = mock_executor - - # Create 3 futures: success, failure, success - future_success_1 = MagicMock() - future_success_1.result.return_value = TbenchResult( - score=0.8, task_solved=True, is_mocked=False - ) - - future_failure = MagicMock() - future_failure.result.side_effect = RuntimeError("Harbor subprocess failed") - - future_success_2 = MagicMock() - future_success_2.result.return_value = TbenchResult( - score=0.7, task_solved=True, is_mocked=False - ) - - mock_executor.submit.side_effect = [ - future_success_1, - future_failure, - future_success_2, - ] - - with patch( - "agentready.services.eval_harness.batch_runner.as_completed" - ) as mock_as_completed: - mock_as_completed.return_value = [ - future_success_1, - future_failure, - future_success_2, - ] - - repos = [Path("/repo1"), Path("/repo2"), Path("/repo3")] - results = run_batch_benchmarks(repos) - - # Should return 2 successful results, ignore 1 failure - assert len(results) == 2 - assert all(isinstance(r, TbenchResult) for r in results) - - @patch("agentready.services.eval_harness.batch_runner._real_tbench_result") - @patch("agentready.services.eval_harness.batch_runner.ProcessPoolExecutor") - def test_parallel_execution_aggregates_successful_results( - self, mock_executor_class, mock_real_tbench - ): - """T073 [US4]: Verify only successful results aggregated""" - from agentready.services.eval_harness.batch_runner import run_batch_benchmarks - - # Mock executor with multiple successful futures - mock_executor = MagicMock() - mock_executor_class.return_value.__enter__.return_value = mock_executor - - # Create successful futures with different scores - futures = [] - for score in [0.9, 0.8, 0.7, 0.6]: - future = MagicMock() - future.result.return_value = TbenchResult( - score=score, task_solved=True, is_mocked=False - ) - futures.append(future) - - mock_executor.submit.side_effect = futures - - with patch( - "agentready.services.eval_harness.batch_runner.as_completed" - ) as mock_as_completed: - mock_as_completed.return_value = futures - - repos = [Path(f"/repo{i}") for i in range(1, 5)] - results = run_batch_benchmarks(repos) - - # Verify all successful results returned - assert len(results) == 4 - scores = [r.score for r in results] - assert scores == [0.9, 0.8, 0.7, 0.6] diff --git a/tests/unit/test_harbor_config.py b/tests/unit/test_harbor_config.py deleted file mode 100644 index 7f54d1c2..00000000 --- a/tests/unit/test_harbor_config.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -Tests for Harbor 
framework configuration validation. - -Following TDD red-green-refactor workflow: -- Phase 1 (RED): Write tests, verify they FAIL -- Phase 2 (GREEN): Implement HarborConfig to make tests PASS -- Phase 3 (REFACTOR): Add docstrings and improve code quality -""" - -from pathlib import Path - -import pytest - -from agentready.services.eval_harness.harbor_config import ( - ALLOWED_AGENTS, - ALLOWED_MODELS, - HarborConfig, -) - - -class TestHarborConfigValidModels: - """Test valid model acceptance""" - - def test_harbor_config_valid_model_haiku(self): - """Test that haiku-4-5 model is accepted""" - config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - assert config.model == "anthropic/claude-haiku-4-5" - - def test_harbor_config_valid_model_sonnet(self): - """Test that sonnet-4-5 model is accepted""" - config = HarborConfig( - model="anthropic/claude-sonnet-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - assert config.model == "anthropic/claude-sonnet-4-5" - - -class TestHarborConfigInvalidModels: - """Test invalid model rejection""" - - def test_harbor_config_invalid_model_rejected(self): - """Test that invalid model raises ValueError""" - with pytest.raises(ValueError, match="Invalid model"): - HarborConfig( - model="invalid/model", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - - def test_harbor_config_invalid_model_opus_rejected(self): - """Test that opus (expensive, not in allowlist) is rejected""" - with pytest.raises(ValueError, match="Invalid model"): - HarborConfig( - model="anthropic/claude-opus-4-1", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - - -class TestHarborConfigInvalidAgents: - """Test invalid agent rejection""" - - def test_harbor_config_invalid_agent_rejected(self): - """Test that invalid agent raises ValueError""" - with pytest.raises(ValueError, match="Invalid agent"): - HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="invalid-agent", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - - def test_harbor_config_oracle_agent_rejected(self): - """Test that oracle agent (reference baseline, not relevant) is rejected""" - with pytest.raises(ValueError, match="Invalid agent"): - HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="oracle", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - - -class TestHarborConfigAPIKey: - """Test API key validation""" - - def test_harbor_config_empty_api_key_rejected(self): - """Test that empty API key raises ValueError""" - with pytest.raises(ValueError, match="API key"): - HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="", - ) - - def test_harbor_config_none_api_key_rejected(self): - """Test that None API key raises ValueError""" - with pytest.raises(ValueError, match="API key"): - HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key=None, - ) - - -class TestHarborConfigTimeout: - """Test timeout validation""" - - def test_harbor_config_negative_timeout_rejected(self): - """Test that negative timeout raises ValueError""" - with pytest.raises(ValueError, match="Timeout"): - HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - timeout=-1, - ) - - def test_harbor_config_zero_timeout_rejected(self): - """Test that 
zero timeout raises ValueError""" - with pytest.raises(ValueError, match="Timeout"): - HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - timeout=0, - ) - - def test_harbor_config_positive_timeout_accepted(self): - """Test that positive timeout is accepted""" - config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - timeout=3600, - ) - assert config.timeout == 3600 - - -class TestHarborConfigPathResolution: - """Test jobs_dir path resolution""" - - def test_harbor_config_path_resolution(self): - """Test that jobs_dir is resolved to absolute path""" - config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("relative/path"), - api_key="test-key", - ) - assert config.jobs_dir.is_absolute() - - def test_harbor_config_absolute_path_unchanged(self): - """Test that absolute path remains unchanged""" - abs_path = Path("/tmp/test").resolve() - config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=abs_path, - api_key="test-key", - ) - assert config.jobs_dir == abs_path - - -class TestHarborConfigDefaults: - """Test default values""" - - def test_harbor_config_default_timeout(self): - """Test that default timeout is 3600 seconds""" - config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - assert config.timeout == 3600 - - def test_harbor_config_default_n_concurrent(self): - """Test that default n_concurrent is 1""" - config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="claude-code", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - assert config.n_concurrent == 1 - - -class TestAllowlists: - """Test allowlist constants""" - - def test_allowed_models_contains_haiku(self): - """Test that ALLOWED_MODELS contains haiku-4-5""" - assert "anthropic/claude-haiku-4-5" in ALLOWED_MODELS - - def test_allowed_models_contains_sonnet(self): - """Test that ALLOWED_MODELS contains sonnet-4-5""" - assert "anthropic/claude-sonnet-4-5" in ALLOWED_MODELS - - def test_allowed_agents_contains_claude_code(self): - """Test that ALLOWED_AGENTS contains claude-code""" - assert "claude-code" in ALLOWED_AGENTS - - def test_allowed_models_is_set(self): - """Test that ALLOWED_MODELS is a set (not list)""" - assert isinstance(ALLOWED_MODELS, set) - - def test_allowed_agents_is_set(self): - """Test that ALLOWED_AGENTS is a set (not list)""" - assert isinstance(ALLOWED_AGENTS, set) - - -class TestHarborConfigCursorModels: - """Test cursor/* model acceptance""" - - def test_harbor_config_cursor_models_accepted(self): - """Test that cursor/* models are accepted""" - config = HarborConfig( - model="cursor/sonnet-4.5", - agent="cursor-cli", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - assert config.model == "cursor/sonnet-4.5" - - config_gemini = HarborConfig( - model="cursor/gemini-3-pro", - agent="cursor-cli", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - assert config_gemini.model == "cursor/gemini-3-pro" - - -class TestHarborConfigCursorAgent: - """Test cursor-cli agent acceptance""" - - def test_harbor_config_cursor_agent_accepted(self): - """Test that cursor-cli agent is accepted""" - config = HarborConfig( - model="anthropic/claude-haiku-4-5", - agent="cursor-cli", - jobs_dir=Path("/tmp/test"), - api_key="test-key", - ) - assert config.agent == 
"cursor-cli" diff --git a/tests/unit/test_harbor_models.py b/tests/unit/test_harbor_models.py deleted file mode 100644 index a6bb0009..00000000 --- a/tests/unit/test_harbor_models.py +++ /dev/null @@ -1,297 +0,0 @@ -"""Unit tests for Harbor data models.""" - -import pytest - -from agentready.models.harbor import ( - HarborComparison, - HarborRunMetrics, - HarborTaskResult, -) - - -class TestHarborTaskResult: - """Tests for HarborTaskResult model.""" - - def test_from_result_json_success(self): - """Test creating HarborTaskResult from successful result.json data.""" - result_data = { - "task_name": "adaptive-rejection-sampler", - "trial_name": "adaptive-rejection-sampler__ABC123", - "agent_result": {"status": "completed"}, - "verifier_result": {"passed": True}, - "exception_info": None, - "started_at": "2025-12-09T10:00:00", - "finished_at": "2025-12-09T10:05:00", - } - - result = HarborTaskResult.from_result_json(result_data) - - assert result.task_name == "adaptive-rejection-sampler" - assert result.trial_name == "adaptive-rejection-sampler__ABC123" - assert result.success is True - assert result.duration_sec == 300.0 # 5 minutes - assert result.agent_result == {"status": "completed"} - assert result.verifier_result == {"passed": True} - assert result.exception_info is None - - def test_from_result_json_failure(self): - """Test creating HarborTaskResult from failed result.json data.""" - result_data = { - "task_name": "async-http-client", - "trial_name": "async-http-client__DEF456", - "agent_result": None, - "verifier_result": None, - "exception_info": { - "exception_type": "TimeoutError", - "exception_message": "Task timed out", - }, - "started_at": "2025-12-09T10:00:00", - "finished_at": "2025-12-09T10:30:00", - } - - result = HarborTaskResult.from_result_json(result_data) - - assert result.task_name == "async-http-client" - assert result.success is False - assert result.duration_sec == 1800.0 # 30 minutes - assert result.exception_info["exception_type"] == "TimeoutError" - - def test_to_dict(self): - """Test converting HarborTaskResult to dictionary.""" - result = HarborTaskResult( - task_name="test-task", - trial_name="test-task__123", - success=True, - duration_sec=120.0, - agent_result={"status": "ok"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:02:00", - ) - - result_dict = result.to_dict() - - assert result_dict["task_name"] == "test-task" - assert result_dict["success"] is True - assert result_dict["duration_sec"] == 120.0 - - -class TestHarborRunMetrics: - """Tests for HarborRunMetrics model.""" - - def test_from_task_results_all_successful(self): - """Test calculating metrics from all successful task results.""" - task_results = [ - HarborTaskResult( - task_name=f"task{i}", - trial_name=f"task{i}__ABC", - success=True, - duration_sec=60.0 * i, - agent_result={"status": "ok"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at=f"2025-12-09T10:0{i}:00", - ) - for i in range(1, 4) - ] - - metrics = HarborRunMetrics.from_task_results("run1", True, task_results) - - assert metrics.run_id == "run1" - assert metrics.agent_file_enabled is True - assert metrics.total_tasks == 3 - assert metrics.successful_tasks == 3 - assert metrics.failed_tasks == 0 - assert metrics.timed_out_tasks == 0 - assert metrics.success_rate == 100.0 - assert metrics.completion_rate == 100.0 - assert metrics.avg_duration_sec == 120.0 # (60 + 120 + 180) / 3 - - def 
test_from_task_results_mixed(self): - """Test calculating metrics from mixed success/failure results.""" - task_results = [ - HarborTaskResult( - task_name="task1", - trial_name="task1__ABC", - success=True, - duration_sec=60.0, - agent_result={"status": "ok"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:01:00", - ), - HarborTaskResult( - task_name="task2", - trial_name="task2__DEF", - success=False, - duration_sec=120.0, - agent_result=None, - verifier_result=None, - exception_info={"exception_type": "TimeoutError"}, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:02:00", - ), - HarborTaskResult( - task_name="task3", - trial_name="task3__GHI", - success=False, - duration_sec=90.0, - agent_result={"status": "error"}, - verifier_result=None, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:01:30", - ), - ] - - metrics = HarborRunMetrics.from_task_results("run2", False, task_results) - - assert metrics.total_tasks == 3 - assert metrics.successful_tasks == 1 - assert metrics.failed_tasks == 1 # task3 (no timeout exception) - assert metrics.timed_out_tasks == 1 # task2 - assert metrics.success_rate == pytest.approx(33.33, rel=0.01) - assert metrics.completion_rate == pytest.approx(66.67, rel=0.01) - - def test_to_dict(self): - """Test converting HarborRunMetrics to dictionary.""" - task_results = [ - HarborTaskResult( - task_name="task1", - trial_name="task1__ABC", - success=True, - duration_sec=60.0, - agent_result={"status": "ok"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:01:00", - ) - ] - - metrics = HarborRunMetrics.from_task_results("run1", True, task_results) - metrics_dict = metrics.to_dict() - - assert metrics_dict["run_id"] == "run1" - assert metrics_dict["agent_file_enabled"] is True - assert metrics_dict["total_tasks"] == 1 - assert len(metrics_dict["task_results"]) == 1 - - -class TestHarborComparison: - """Tests for HarborComparison model.""" - - @pytest.fixture - def sample_metrics(self): - """Create sample metrics for testing.""" - without_results = [ - HarborTaskResult( - task_name="task1", - trial_name="task1__ABC", - success=False, - duration_sec=120.0, - agent_result=None, - verifier_result=None, - exception_info={"exception_type": "TimeoutError"}, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:02:00", - ), - HarborTaskResult( - task_name="task2", - trial_name="task2__DEF", - success=True, - duration_sec=180.0, - agent_result={"status": "ok"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:03:00", - ), - ] - - with_results = [ - HarborTaskResult( - task_name="task1", - trial_name="task1__GHI", - success=True, - duration_sec=90.0, - agent_result={"status": "ok"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:01:30", - ), - HarborTaskResult( - task_name="task2", - trial_name="task2__JKL", - success=True, - duration_sec=150.0, - agent_result={"status": "ok"}, - verifier_result={"passed": True}, - exception_info=None, - started_at="2025-12-09T10:00:00", - finished_at="2025-12-09T10:02:30", - ), - ] - - without_agent = HarborRunMetrics.from_task_results( - "run1", False, without_results - ) - with_agent = HarborRunMetrics.from_task_results("run2", True, with_results) - - return without_agent, 
with_agent - - def test_calculate_deltas(self, sample_metrics): - """Test calculating delta metrics.""" - without_agent, with_agent = sample_metrics - comparison = HarborComparison( - without_agent=without_agent, with_agent=with_agent - ) - - comparison.calculate_deltas() - - assert "success_rate_delta" in comparison.deltas - assert comparison.deltas["success_rate_delta"] == 50.0 # 50% -> 100% - assert "avg_duration_delta_sec" in comparison.deltas - assert "avg_duration_delta_pct" in comparison.deltas - assert comparison.deltas["successful_tasks_delta"] == 1 # 1 -> 2 - - def test_generate_per_task_comparison(self, sample_metrics): - """Test generating per-task comparison.""" - without_agent, with_agent = sample_metrics - comparison = HarborComparison( - without_agent=without_agent, with_agent=with_agent - ) - - comparison.generate_per_task_comparison() - - assert len(comparison.per_task_comparison) == 2 - task1_comparison = next( - c for c in comparison.per_task_comparison if c["task_name"] == "task1" - ) - - assert task1_comparison["without_agent"]["success"] is False - assert task1_comparison["with_agent"]["success"] is True - assert task1_comparison["delta"]["success_improved"] is True - - def test_to_dict_and_from_dict(self, sample_metrics): - """Test serialization and deserialization.""" - without_agent, with_agent = sample_metrics - comparison = HarborComparison( - without_agent=without_agent, with_agent=with_agent - ) - comparison.calculate_deltas() - comparison.generate_per_task_comparison() - - comparison_dict = comparison.to_dict() - restored_comparison = HarborComparison.from_dict(comparison_dict) - - assert restored_comparison.without_agent.run_id == without_agent.run_id - assert restored_comparison.with_agent.run_id == with_agent.run_id - assert restored_comparison.deltas == comparison.deltas - assert len(restored_comparison.per_task_comparison) == len( - comparison.per_task_comparison - ) diff --git a/tests/unit/test_harbor_services.py b/tests/unit/test_harbor_services.py deleted file mode 100644 index d2d000c0..00000000 --- a/tests/unit/test_harbor_services.py +++ /dev/null @@ -1,255 +0,0 @@ -"""Unit tests for Harbor services.""" - -import json - -import pytest - -from agentready.models.harbor import HarborTaskResult -from agentready.services.harbor.agent_toggler import AgentFileToggler -from agentready.services.harbor.result_parser import ( - parse_harbor_results, - parse_single_result, -) - - -class TestAgentFileToggler: - """Tests for AgentFileToggler service.""" - - @pytest.fixture - def sample_agent_file(self, tmp_path): - """Create a sample agent file for testing.""" - agent_file = tmp_path / ".claude" / "agents" / "doubleagent.md" - agent_file.parent.mkdir(parents=True, exist_ok=True) - agent_file.write_text("# Agent Content\n\nThis is the agent file.") - return agent_file - - def test_disable_enable(self, sample_agent_file): - """Test basic disable/enable functionality.""" - toggler = AgentFileToggler(sample_agent_file) - - # Initially enabled - assert toggler.is_enabled() - assert not toggler.is_disabled() - - # Disable - toggler.disable() - assert not toggler.is_enabled() - assert toggler.is_disabled() - assert not sample_agent_file.exists() - assert toggler.disabled_file.exists() - - # Enable - toggler.enable() - assert toggler.is_enabled() - assert not toggler.is_disabled() - assert sample_agent_file.exists() - assert not toggler.disabled_file.exists() - - def test_disable_idempotent(self, sample_agent_file): - """Test that disable is idempotent.""" - toggler = 
AgentFileToggler(sample_agent_file) - - toggler.disable() - assert toggler.is_disabled() - - # Disable again (should be no-op) - toggler.disable() - assert toggler.is_disabled() - - def test_enable_idempotent(self, sample_agent_file): - """Test that enable is idempotent.""" - toggler = AgentFileToggler(sample_agent_file) - - toggler.disable() - toggler.enable() - assert toggler.is_enabled() - - # Enable again (should be no-op) - toggler.enable() - assert toggler.is_enabled() - - def test_temporarily_disabled_context_manager(self, sample_agent_file): - """Test temporarily_disabled context manager.""" - toggler = AgentFileToggler(sample_agent_file) - - assert toggler.is_enabled() - - with toggler.temporarily_disabled(): - assert toggler.is_disabled() - assert not sample_agent_file.exists() - - # Restored after context exit - assert toggler.is_enabled() - assert sample_agent_file.exists() - - def test_temporarily_disabled_with_exception(self, sample_agent_file): - """Test that temporarily_disabled restores even on exception.""" - toggler = AgentFileToggler(sample_agent_file) - - assert toggler.is_enabled() - - with pytest.raises(ValueError): - with toggler.temporarily_disabled(): - assert toggler.is_disabled() - raise ValueError("Test exception") - - # Restored even after exception - assert toggler.is_enabled() - assert sample_agent_file.exists() - - def test_temporarily_enabled_context_manager(self, sample_agent_file): - """Test temporarily_enabled context manager.""" - toggler = AgentFileToggler(sample_agent_file) - toggler.disable() - - assert toggler.is_disabled() - - with toggler.temporarily_enabled(): - assert toggler.is_enabled() - assert sample_agent_file.exists() - - # Restored to disabled after context exit - assert toggler.is_disabled() - assert not sample_agent_file.exists() - - def test_file_content_preserved(self, sample_agent_file): - """Test that file content is preserved through disable/enable.""" - original_content = sample_agent_file.read_text() - toggler = AgentFileToggler(sample_agent_file) - - toggler.disable() - toggler.enable() - - assert sample_agent_file.read_text() == original_content - - -class TestResultParser: - """Tests for result parser functions.""" - - @pytest.fixture - def sample_result_data(self): - """Sample result.json data.""" - return { - "task_name": "adaptive-rejection-sampler", - "trial_name": "adaptive-rejection-sampler__ABC123", - "agent_result": {"status": "completed"}, - "verifier_result": {"passed": True}, - "exception_info": None, - "started_at": "2025-12-09T10:00:00", - "finished_at": "2025-12-09T10:05:00", - } - - @pytest.fixture - def sample_results_dir(self, tmp_path, sample_result_data): - """Create a sample Harbor results directory with result.json files.""" - results_dir = tmp_path / "harbor_run" - results_dir.mkdir() - - # Create multiple task result directories - for i in range(1, 4): - task_dir = results_dir / f"task{i}__trial{i}" - task_dir.mkdir() - - result_file = task_dir / "result.json" - result_data = sample_result_data.copy() - result_data["task_name"] = f"task{i}" - result_data["trial_name"] = f"task{i}__trial{i}" - - with open(result_file, "w") as f: - json.dump(result_data, f) - - return results_dir - - def test_parse_single_result(self, tmp_path, sample_result_data): - """Test parsing a single result.json file.""" - result_file = tmp_path / "result.json" - with open(result_file, "w") as f: - json.dump(sample_result_data, f) - - result = parse_single_result(result_file) - - assert isinstance(result, HarborTaskResult) - 
assert result.task_name == "adaptive-rejection-sampler" - assert result.success is True - assert result.duration_sec == 300.0 - - def test_parse_single_result_file_not_found(self, tmp_path): - """Test parsing non-existent file raises error.""" - result_file = tmp_path / "nonexistent.json" - - with pytest.raises(FileNotFoundError): - parse_single_result(result_file) - - def test_parse_single_result_invalid_json(self, tmp_path): - """Test parsing invalid JSON raises error.""" - result_file = tmp_path / "invalid.json" - result_file.write_text("invalid json content") - - with pytest.raises(json.JSONDecodeError): - parse_single_result(result_file) - - def test_parse_harbor_results(self, sample_results_dir): - """Test parsing multiple result.json files from a directory.""" - results = parse_harbor_results(sample_results_dir) - - assert len(results) == 3 - assert all(isinstance(r, HarborTaskResult) for r in results) - assert {r.task_name for r in results} == {"task1", "task2", "task3"} - - def test_parse_harbor_results_dir_not_found(self, tmp_path): - """Test parsing non-existent directory raises error.""" - nonexistent_dir = tmp_path / "nonexistent" - - with pytest.raises(FileNotFoundError): - parse_harbor_results(nonexistent_dir) - - def test_parse_harbor_results_no_result_files(self, tmp_path): - """Test parsing directory with no result.json files raises error.""" - empty_dir = tmp_path / "empty" - empty_dir.mkdir() - - with pytest.raises(ValueError, match="No result.json files found"): - parse_harbor_results(empty_dir) - - def test_parse_harbor_results_skips_invalid_files(self, sample_results_dir): - """Test that parser skips invalid result files and continues.""" - # Add an invalid result file - invalid_dir = sample_results_dir / "invalid__task" - invalid_dir.mkdir() - invalid_file = invalid_dir / "result.json" - invalid_file.write_text("invalid json") - - # Should still parse valid files and skip invalid one - results = parse_harbor_results(sample_results_dir) - - # Should have 3 valid results (skipped the invalid one) - assert len(results) == 3 - - def test_parse_harbor_results_partial_data(self, tmp_path): - """Test parsing result with missing optional fields.""" - results_dir = tmp_path / "harbor_run" - results_dir.mkdir() - - task_dir = results_dir / "task1__trial1" - task_dir.mkdir() - - # Minimal valid result data - result_data = { - "task_name": "task1", - "trial_name": "task1__trial1", - "agent_result": None, - "verifier_result": None, - "exception_info": {"exception_type": "Error"}, - "started_at": "2025-12-09T10:00:00", - "finished_at": "2025-12-09T10:05:00", - } - - result_file = task_dir / "result.json" - with open(result_file, "w") as f: - json.dump(result_data, f) - - results = parse_harbor_results(results_dir) - - assert len(results) == 1 - assert results[0].task_name == "task1" - assert results[0].success is False # No agent/verifier results diff --git a/tests/unit/utils/test_preflight.py b/tests/unit/utils/test_preflight.py deleted file mode 100644 index 4ea77326..00000000 --- a/tests/unit/utils/test_preflight.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Tests for preflight dependency checks.""" - -from unittest.mock import patch - -import pytest - -from agentready.utils.preflight import PreflightError, check_harbor_cli - - -class TestCheckHarborCLI: - """Tests for check_harbor_cli().""" - - def test_harbor_already_installed(self): - """Harbor found on PATH - no prompts, returns True.""" - with patch("shutil.which", return_value="/usr/local/bin/harbor"): - result = 
check_harbor_cli(interactive=True) - assert result is True - - def test_harbor_missing_user_confirms_uv(self): - """Harbor missing, user confirms with uv available - succeeds.""" - # First call (harbor check) returns None, second call (uv check) returns path, - # third call (harbor verify) returns harbor path - with patch( - "shutil.which", - side_effect=[None, "/usr/bin/uv", "/usr/local/bin/harbor"], - ): - with patch("click.confirm", return_value=True): - with patch("click.echo"): - with patch( - "agentready.utils.preflight.safe_subprocess_run" - ) as mock_run: - result = check_harbor_cli(interactive=True) - assert result is True - mock_run.assert_called_once_with( - ["uv", "tool", "install", "harbor"], - check=True, - timeout=300, - ) - - def test_harbor_missing_user_confirms_pip_fallback(self): - """Harbor missing, uv not available, falls back to pip - succeeds.""" - # First: harbor=None, uv=None, pip=/usr/bin/pip, final harbor=/usr/local/bin/harbor - with patch( - "shutil.which", - side_effect=[None, None, "/usr/bin/pip", "/usr/local/bin/harbor"], - ): - with patch("click.confirm", return_value=True): - with patch("click.echo"): - with patch( - "agentready.utils.preflight.safe_subprocess_run" - ) as mock_run: - result = check_harbor_cli(interactive=True) - assert result is True - mock_run.assert_called_once_with( - ["pip", "install", "harbor"], check=True, timeout=300 - ) - - def test_harbor_missing_neither_uv_nor_pip(self): - """Harbor missing, neither uv nor pip available - raises error.""" - with patch("shutil.which", return_value=None): - with patch("click.echo"): - with pytest.raises(PreflightError, match="Neither 'uv' nor 'pip'"): - check_harbor_cli(interactive=True) - - def test_harbor_missing_user_declines(self): - """Harbor missing, user declines install - raises error.""" - with patch("shutil.which", side_effect=[None, "/usr/bin/uv"]): - with patch("click.confirm", return_value=False): - with patch("click.echo"): - with pytest.raises( - PreflightError, match="Harbor CLI installation declined" - ): - check_harbor_cli(interactive=True) - - def test_installation_subprocess_fails(self): - """Installation subprocess fails - raises PreflightError.""" - with patch("shutil.which", side_effect=[None, "/usr/bin/uv"]): - with patch("click.confirm", return_value=True): - with patch("click.echo"): - with patch( - "agentready.utils.preflight.safe_subprocess_run", - side_effect=Exception("Subprocess failed"), - ): - with pytest.raises( - PreflightError, match="Harbor installation failed" - ): - check_harbor_cli(interactive=True) - - def test_installation_succeeds_but_not_on_path(self): - """Installation completes but harbor not found on PATH - raises error.""" - # harbor check=None, uv=/usr/bin/uv, harbor verify=None (still not on PATH) - with patch("shutil.which", side_effect=[None, "/usr/bin/uv", None]): - with patch("click.confirm", return_value=True): - with patch("click.echo"): - with patch("agentready.utils.preflight.safe_subprocess_run"): - with pytest.raises(PreflightError, match="not found on PATH"): - check_harbor_cli(interactive=True) - - def test_non_interactive_with_harbor_missing(self): - """Non-interactive mode with missing Harbor - raises PreflightError immediately.""" - with patch("shutil.which", return_value=None): - with pytest.raises(PreflightError, match="harbor CLI not installed"): - check_harbor_cli(interactive=False) - - def test_non_interactive_with_harbor_installed(self): - """Non-interactive mode with Harbor installed - returns True.""" - with 
patch("shutil.which", return_value="/usr/local/bin/harbor"): - result = check_harbor_cli(interactive=False) - assert result is True