diff --git a/benchmark/Dockerfile b/benchmark/Dockerfile
index a5926dab744..a210915e29e 100644
--- a/benchmark/Dockerfile
+++ b/benchmark/Dockerfile
@@ -57,8 +57,8 @@ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     core-js@3.37.1 \
     eslint@8.49.0
 
-COPY . /aider
 RUN pip3 install --no-cache-dir --upgrade pip uv
-RUN uv pip install --system --no-cache-dir -e /aider[dev]
-RUN git config --global --add safe.directory /aider
-WORKDIR /aider
+COPY . /cecli
+RUN uv pip install --system --no-cache-dir -e /cecli[dev]
+RUN git config --global --add safe.directory /cecli
+WORKDIR /cecli
diff --git a/benchmark/README.md b/benchmark/README.md
index 988406de687..c35bcd61a95 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -1,94 +1,124 @@
-
 # Aider benchmark harness
 
-Aider uses benchmarks to quantitatively measure how well it works
-with various LLMs.
+Before `cecli` was born, the old `aider` used benchmarks to quantitatively
+measure how well it worked with various LLMs.
+
 This directory holds the harness and tools needed to run the benchmarking
 suite.
 
+If you're familiar with `aider` benchmarking, see the "What's new..."
+section below.
+
 ## Background
 
-The benchmark is based on the [Exercism](https://github.com/exercism/python) coding exercises.
-This
-benchmark evaluates how effectively aider and LLMs can translate a
-natural language coding request into executable code saved into
-files that pass unit tests.
-It provides an end-to-end evaluation of not just
-the LLM's coding ability, but also its capacity to *edit existing code*
-and *format those code edits* so that aider can save the
-edits to the local source files.
-
-See [this writeup for a longer discussion about the benchmark](https://aider.chat/2024/12/21/polyglot.html).
-
-The benchmark is intended to be run *inside a docker container*.
-This is because the benchmarking harness will be
-taking code written by an LLM
-and executing it without any human review or supervision!
-The LLM could generate dangerous python that harms your system, like this: `import os; os.system("sudo rm -rf /")`.
+The benchmark was based on the [Exercism](https://github.com/exercism/python)
+coding exercises. This benchmark evaluates how effectively aider and LLMs can
+translate a natural language coding request into executable code saved into
+files that pass unit tests. It provides an end-to-end evaluation of not just the
+LLM's coding ability, but also its capacity to _edit existing code_ and _format
+those code edits_ so that aider can save the edits to the local source files.
+
+See
+[this writeup for a longer discussion about the benchmark](https://aider.chat/2024/12/21/polyglot.html).
+
+The benchmark is intended to be run _inside a docker container_. This is because
+the benchmarking harness will be taking code written by an LLM and executing it
+without any human review or supervision! The LLM could generate dangerous python
+that harms your system, like this: `import os; os.system("sudo rm -rf /")`.
 Running inside a docker container helps limit the damage that could be done.
 
 ## Usage
 
-There are 3 main tasks involved in benchmarking aider:
+There are 3 main tasks involved in benchmarking:
 
-1. Install and setup for benchmarking.
+1. Install and set up.
 
-2. Run the benchmark to measure performance across all the exercises.
+2. Run the benchmark.
 
-3. Generate a summary report of how many of the exercises succeeded or failed.
+3. Analyze the results.
-### Setup for benchmarking
+### Setup
 
 First, prepare all the groundwork for running the benchmarks.
 These steps only need to be done once.
 
 ```
-# Clone the aider repo
-git clone https://github.com/Aider-AI/aider.git
+ORG=Aider-AI
+REPO=aider
+# Clone the main repo
+git clone https://github.com/$ORG/$REPO.git
 
-# Create the scratch dir to hold benchmarking results inside the main aider dir:
-cd aider
+# Create the scratch dir to hold benchmarking results inside the main repo:
+cd $REPO
 mkdir tmp.benchmarks
 
 # Clone the repo with the exercises
-git clone https://github.com/Aider-AI/polyglot-benchmark tmp.benchmarks/polyglot-benchmark
+git clone https://github.com/$ORG/polyglot-benchmark tmp.benchmarks/polyglot-benchmark
 
 # Build the docker container
 ./benchmark/docker_build.sh
 ```
 
-### Running the benchmark
+### Running the benchmarks
 
 Launch the docker container and run the benchmark inside it:
 
 ```
 # Launch the docker container
+# You probably want to tweak this script to import your service keys.
+# It's currently configured to import GEMINI_API_KEY only.
+# PRs welcome to grab the keys more effectively without causing anxiety.
./benchmark/docker.sh
 
 # Inside the container, install aider as a development build.
 # This way you're running the code that you cloned above, including any local changes.
+# TODO: this step should be included in the Dockerfile
 pip install -e .[dev]
 
 # Run the benchmark:
 ./benchmark/benchmark.py a-helpful-name-for-this-run --model gpt-3.5-turbo --edit-format whole --threads 10 --exercises-dir polyglot-benchmark
 ```
 
-The above will create a folder `tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run` with benchmarking results.
-Run like this, the script will run all the exercises in a random order.
-
-You can run `./benchmark/benchmark.py --help` for a list of all the arguments, but here are the most useful to keep in mind:
-
-- `--model` is the name of the model, same as you would pass directly to `aider`.
-- `--edit-format` is the name of the edit format, same as you would pass directly to `aider`. When working with an experimental LLM, I recommend starting with `whole`
-- `--threads` specifies how many exercises to benchmark in parallel. Start with a single thread if you are working out the kinks on your benchmarking setup or working with a new model, etc. Once you are getting reliable results, you can speed up the process by running with more threads. 10 works well against the OpenAI APIs.
-- `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
-- `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
-- `--read-model-settings=` specify model settings, see here: https://aider.chat/docs/config/adv-model-settings.html#model-settings
-- `--map-tokens` sets a token budget for the repo map sent with each request. Set `0` to disable the repo map. This lets you enable repo map usage for any model (e.g., `--map-tokens 1024`).
+The above will create a folder
+`tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run` with
+benchmarking results. Run like this, the script will run all the exercises in a
+random order.
+
+You can run `./benchmark/benchmark.py --help` for a list of all the arguments,
+but here are the most useful to keep in mind:
+
+- `--model` is the name of the model, same as you would pass directly to
+  `aider`.
+- `--edit-format` is the name of the edit format, same as you would pass
+  directly to `aider`. When working with an experimental LLM, I recommend
+  starting with `whole`.
+- `--sets` runs specific groups of tests using the `sets` defined in each
+  `cat.yaml`. (Hopefully the sets will grow with time, but currently there is
+  just a bookmark for the classic "polyglot" test battery.)
+- `--hash-re` allows for deterministic slicing of the exercise set based on the
+  exercise hash. This is useful for quickly grabbing a consistent subset, or
+  for k-fold cross-validation. For example:
+  - `^0`: 1/16 of the set.
+  - `^[01]`: 1/8 of the set.
+  - `^[0-3]`: 1/4 of the set.
+  - `^.{2}[4-7]`: 1/4 of the set, using the 3rd character of the hash.
+- `--threads` specifies how many exercises to benchmark in parallel. Start with
+  a single thread if you are working out the kinks on your benchmarking setup or
+  working with a new model, etc. Once you are getting reliable results, you can
+  speed up the process by running with more threads. 10 works well against the
+  OpenAI APIs.
+- `--num-tests` specifies how many of the tests to run before stopping. This is
+  another way to start gently as you debug your benchmarking setup.
+- `--keywords` filters the tests to run to only the ones whose names match the
+  supplied argument (similar to `pytest -k xxxx`).
+- `--read-model-settings=` specifies model settings, see here:
+  https://aider.chat/docs/config/adv-model-settings.html#model-settings
+- `--map-tokens` sets a token budget for the repo map sent with each request.
+  Set `0` to disable the repo map. This lets you enable repo map usage for any
+  model (e.g., `--map-tokens 1024`).
 
 ### Benchmark report
 
-You can generate stats about any benchmark, including ones which are still running.
-You don't need to run this inside the docker container, as it is just
+You can generate stats about any benchmark, including ones which are still
+running. You don't need to run this inside the docker container, as it is just
 collecting stats not executing unsafe python.
 
 ```
@@ -96,52 +126,55 @@ collecting stats not executing unsafe python.
 ./benchmark/benchmark.py --stats tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run
 ```
 
-The benchmark report is a yaml record with statistics about the run:
-
-```yaml
-- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue
-  test_cases: 225
-  model: claude-3.5-sonnet
-  edit_format: diff
-  commit_hash: 35f21b5
-  pass_rate_1: 57.1
-  pass_rate_2: 77.4
-  percent_cases_well_formed: 99.2
-  error_outputs: 23
-  num_malformed_responses: 4
-  num_with_malformed_responses: 1
-  user_asks: 2
-  lazy_comments: 0
-  syntax_errors: 1
-  indentation_errors: 0
-  exhausted_context_windows: 0
-  test_timeouts: 1
-  command: aider --sonnet
-  date: 2024-07-04
-  versions: 0.42.1-dev
-  seconds_per_case: 17.6
-  total_cost: 3.6346
-```
-
-The key statistics are the `pass_rate_#` entries, which report the
-percent of the tasks which had all tests passing.
-There will be multiple of these pass rate stats,
-depending on the value of the `--tries` parameter.
-
-The yaml also includes all the settings which were in effect for the benchmark run.
-It also reports the git hash of the repo at the time that the benchmark was
-run, with `(dirty)` if there were uncommitted changes.
-It's good practice to commit the repo before starting a benchmark run.
-This way the `model`, `edit_format` and `commit_hash`
-should be enough to reliably reproduce any benchmark run.
-
-You can see examples of the benchmark report yaml in the
-[aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
-
-
-## Limitations, notes
-
-- Contributions of benchmark results are welcome! Submit results by opening a PR with edits to the
-[aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
-- These scripts are not intended for use by typical aider end users.
-- Some of these tools are written as `bash` scripts, so it will be hard to use them on Windows.
+The benchmark report is a yaml record with statistics about the run.
+
+The key statistics are the `pass_rate_#` entries, which report the percent of
+the tasks which had all tests passing. There will be multiple of these pass rate
+stats, depending on the value of the `--tries` parameter.
+
+The yaml also includes all the settings which were in effect for the benchmark
+run. It also reports the git hash of the repo at the time that the benchmark was
+run, with `(dirty)` if there were uncommitted changes. It's good practice to
+commit the repo before starting a benchmark run. This way the `model`,
+`edit_format` and `commit_hash` should be enough to reliably reproduce any
+benchmark run.
+
+## Contributing
+
+Contributions of benchmark results and tests are welcome! Submit results by opening a PR.
+
+Note the roadmap priorities:
+
+1. Complete 'set up records' to support smart caching.
+2. Atomic data collection. Most of the data is already saved, but protocols for
+   sharing are still needed.
+3. **Dimensional Parameter Walking**, allowing for n-dimensional parameter tuning
+   and facilitating a "gradient descent" approach to optimisation across multiple
+   parameters. The test runner should accept n lists of options, e.g.,
+   ["thinking: 100", "thinking: 200", "thinking: 400"], ["optionA: B", "optionD: C"].
+4. Smart Caching so the runner can optionally skip any tests for which "similar"
+   result data is already available, based on fuzzy metadata matching. This aids
+   iterative testing: when adding a new option to a list of permutations, only the
+   new permutations need to be run. Also, when new Cats join the collection it is
+   easy to incrementally collect the data.
+5. Data aggregation and analysis. These will be separate specialised tools.
+
+## Limitations
+
+- These scripts are not intended for use by typical `cecli` end users.
+- Some of the old (possibly deprecated) tools are written as `bash` scripts, so
+  it will be hard to use them on Windows.
+- Currently the JS and C++ tests appear broken.
+
+## What's new with Cecli Cats?
+
+The benchmark has evolved into a collection of **Cecli Atomic Tests (Cats)**.
+
+- **YAML Metadata**: Every Cat has its own `cat.yaml` file containing metadata,
+  including a unique UUID that may or may not be useful later.
+- **Evolving Collection**: The directory structure of the Cats is laid out to
+  facilitate the growth and evolution of the collection. As the benchmark
+  matures, Cats will come and go.
+- **Simplified Runner**: The test runner is being simplified to focus on its
+  core job: executing tests and recording results. Downstream aggregation and
+  analysis of results will be shifted to other tools and projects.
+- **Subset Filtering**: see `--sets` above.
+- **K-fold Evaluation Slicing**: The `--hash-re` option allows for deterministic
+  slicing of the exercises (now `cats`) based on the exercise hash, as sketched
+  below.
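+
+As a minimal sketch of the idea (not the actual runner; it assumes the default
+`tmp.benchmarks/cecli-cat` layout and a hex `hash` field in each `cat.yaml`):
+
+```
+import re
+from pathlib import Path
+
+import yaml
+
+def select_cats(base_dir, hash_re):
+    """Yield cat directories whose metadata hash matches the regex."""
+    for cat_file in Path(base_dir).rglob("cat.yaml"):
+        metadata = yaml.safe_load(cat_file.read_text()) or {}
+        if re.search(hash_re, metadata.get("hash", "")):
+            yield cat_file.parent
+
+# '^[0-3]' deterministically selects roughly 1/4 of the cats.
+for cat_dir in select_cats("tmp.benchmarks/cecli-cat", "^[0-3]"):
+    print(cat_dir)
+```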
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 02117242742..660aa50d57c 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 +import asyncio import datetime +import importlib_resources import json import os import random @@ -9,17 +11,18 @@ import sys import time import traceback +import yaml from collections import defaultdict from json.decoder import JSONDecodeError from pathlib import Path from types import SimpleNamespace from typing import List, Optional +import logging """ Performance-oriented refactors: - Avoid heavy imports unless needed for a given code path. - Fast path for `--stats` to skip GitPython and benchmarking deps. -- Build DataFrame / import plotting only when `--graphs` is true. - Use json.load for result file parsing to reduce memory churn. - Cache git version lookups across a single invocation. """ @@ -31,12 +34,14 @@ from aider.dump import dump # noqa: F401 +logger = logging.getLogger("aider.benchmark") + # Cache for commit-hash -> version lookup _VERSION_CACHE = {} BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks")) - -EXERCISES_DIR_DEFAULT = "polyglot-benchmark" +EXERCISES_DIR_DEFAULT = "cecli-cat" +RESULTS_DIR_DEFAULT = "cat-results" app = typer.Typer(add_completion=False, pretty_exceptions_enable=False) @@ -44,167 +49,108 @@ load_dotenv(override=True) -def find_latest_benchmark_dir(): - benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()] - if not benchmark_dirs: - print("Error: No benchmark directories found under tmp.benchmarks.") - sys.exit(1) - - # Get current time and 24 hours ago - now = datetime.datetime.now() - day_ago = now - datetime.timedelta(days=1) - - # Filter directories by name pattern YYYY-MM-DD-HH-MM-SS-- - recent_dirs = [] - for d in benchmark_dirs: - try: - # Extract datetime from directory name - date_str = d.name[:19] # Takes YYYY-MM-DD-HH-MM-SS - dir_date = datetime.datetime.strptime(date_str, "%Y-%m-%d-%H-%M-%S") - if dir_date >= day_ago: - recent_dirs.append(d) - except ValueError: - # Skip directories that don't match the expected format - continue - - if not recent_dirs: - print("Error: No benchmark directories found from the last 24 hours.") - sys.exit(1) - - # Find directory with most recently modified .md file - latest_dir = None - latest_time = 0 - - for d in recent_dirs: - # Look for .md files in subdirectories - for md_file in d.glob("*/exercises/practice/*/.*.md"): - if md_file.is_file(): - mtime = md_file.stat().st_mtime - if mtime > latest_time: - latest_time = mtime - latest_dir = d - - if not latest_dir: - print("Error: No .md files found in recent benchmark directories.") - sys.exit(1) - - print(f"Using the most recently updated benchmark directory: {latest_dir.name}") - return latest_dir - - -def show_stats(dirnames, graphs, verbose, stats_languages=None): - raw_rows = [] - for dirname in dirnames: - row = summarize_results(dirname, verbose, stats_languages) - raw_rows.append(row) - - # return - - seen = dict() - rows = [] - for row in raw_rows: - if not row: - continue - - if row.completed_tests != row.total_tests: - print( - f"Warning: {row.dir_name} is incomplete: {row.completed_tests} of {row.total_tests}" - ) - - try: - kind = (row.model, row.edit_format) - except AttributeError: - return - - if kind in seen: - dump(row.dir_name) - dump(seen[kind]) - return - - seen[kind] = row.dir_name - rows.append(vars(row)) - - repeat_hi = repeat_lo = repeat_avg = None # noqa: F841 - - # Only build a DataFrame and 
import plotting libs when graphs are requested - if graphs: - import pandas as pd # Lazy import - from plots import plot_refactoring # Lazy import +def resolve_dirname(results_dir, use_single_prior, make_new): + """ + Determines the actual directory path used for storing benchmark results. - df = pd.DataFrame.from_records(rows) - # plot_timing(df) - # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg) - # plot_outcomes_claude(df) - plot_refactoring(df) + 1. Resuming a previous run: If the --cont flag is used and exactly one matching previous run exists, it selects that existing directory. + 2. Safety check: If previous runs exist but the user didn't specify --new or --cont, it warns the user and aborts to prevent accidental overwrites or confusion. + 3. Creating a new run: If no prior run exists (or --new is used), it prepends the current timestamp to the directory name to ensure a unique workspace. + """ + logger.debug(f"initial results_dir: {results_dir}") + results_dir = Path(results_dir) + logger.debug(f"dirname1: {results_dir}") + if len(results_dir.parts) > 1: + return results_dir + priors = list(BENCHMARK_DNAME.glob(f"*--{results_dir}")) + # BUG20251223 + logger.debug(f"Found priors: {priors}") + logger.debug(f"use_single_prior: {use_single_prior}, make_new: {make_new}") -def resolve_dirname(dirname, use_single_prior, make_new): - if len(dirname.parts) > 1: - return dirname - - priors = list(BENCHMARK_DNAME.glob(f"*--{dirname}")) if len(priors) == 1 and use_single_prior: - dirname = priors[0].name - print(f"Using pre-existing {dirname}") + results_dir = priors[0].name + logger.info(f"Using pre-existing {results_dir}") elif len(priors): if not make_new: - print(f"Prior runs of {dirname} exist, use --new or name one explicitly") - print() + logger.warning( + f"Prior runs of {results_dir} exist, use --new or name one explicitly" + ) for prior in priors: - print(prior) - return + logger.warning(prior) + sys.exit(1) - if not re.match(r"\d\d\d\d-\d\d-\d\d-", str(dirname)): + if not re.match(r"\d\d\d\d-\d\d-\d\d-", str(results_dir)): now = datetime.datetime.now() now = now.strftime("%Y-%m-%d-%H-%M-%S--") - dirname = now + dirname.name + results_dir = now + results_dir.name - dirname = BENCHMARK_DNAME / dirname - return dirname + logger.debug(f"resolved {results_dir}") + results_dir = BENCHMARK_DNAME / results_dir + logger.info(f"updated results_dir: {results_dir}") + return results_dir @app.command() def main( - dirnames: Optional[List[str]] = typer.Argument(None, help="Directory names"), - graphs: bool = typer.Option(False, "--graphs", help="Generate graphs"), - model: str = typer.Option("gpt-3.5-turbo", "--model", "-m", help="Model name"), + results_dir: Optional[str] = typer.Argument( + "unnamed", help="Results directory slug" + ), + model: str = typer.Option( + "gemini/gemini-3-flash-preview", "--model", "-m", help="Model name" + ), sleep: float = typer.Option( 0, "--sleep", help="Sleep seconds between tests when single threaded" ), languages: str = typer.Option( - None, "--languages", "-l", help="Only run tests for specific languages (comma separated)" + None, + "--languages", + "-l", + help="Only run tests for specific languages (comma separated)", ), edit_format: str = typer.Option(None, "--edit-format", "-e", help="Edit format"), editor_model: str = typer.Option(None, "--editor-model", help="Editor model name"), - editor_edit_format: str = typer.Option(None, "--editor-edit-format", help="Editor edit format"), + editor_edit_format: str = typer.Option( + None, 
"--editor-edit-format", help="Editor edit format" + ), replay: str = typer.Option( None, "--replay", help="Replay previous .aider.chat.history.md responses from previous benchmark run", ), keywords: str = typer.Option( - None, "--keywords", "-k", help="Only run tests that contain keywords (comma sep)" + None, + "--keywords", + "-k", + help="Only run tests that contain keywords (comma sep)", ), clean: bool = typer.Option( - False, "--clean", "-c", help="Discard the existing testdir and make a clean copy" + False, + "--clean", + "-c", + help="Discard the existing testdir and make a clean copy", + ), + cont: bool = typer.Option( + False, "--cont", help="Continue the (single) matching testdir" ), - cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"), make_new: bool = typer.Option(False, "--new", help="Make a new dated testdir"), - no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"), + no_unit_tests: bool = typer.Option( + False, "--no-unit-tests", help="Do not run unit tests" + ), no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), - stats_only: bool = typer.Option( - False, "--stats", "-s", help="Do not run tests, just collect stats on completed tests" + verbose: int = typer.Option( + 0, "--verbose", "-v", count=True, help="Verbose output" ), - stats_languages: str = typer.Option( - None, - "--stats-languages", - help="Only include stats for specific languages (comma separated)", + quiet: bool = typer.Option(False, "--quiet", "-q", help="Quiet output"), + tries: int = typer.Option( + 2, "--tries", "-r", help="Number of tries for running tests" + ), + threads: int = typer.Option( + 1, "--threads", "-t", help="Number of threads to run in parallel" + ), + num_tests: int = typer.Option( + -1, "--num-tests", "-n", help="Number of tests to run" ), - diffs_only: bool = typer.Option(False, "--diffs", help="Just diff the provided stats dirs"), - tries: int = typer.Option(2, "--tries", "-r", help="Number of tries for running tests"), - threads: int = typer.Option(1, "--threads", "-t", help="Number of threads to run in parallel"), - num_tests: int = typer.Option(-1, "--num-tests", "-n", help="Number of tests to run"), num_ctx: Optional[int] = typer.Option( None, "--num-ctx", help="Override model context window size" ), @@ -212,7 +158,9 @@ def main( None, "--read-model-settings", help="Load aider model settings from YAML file" ), reasoning_effort: Optional[str] = typer.Option( - None, "--reasoning-effort", help="Set reasoning effort for models that support it" + None, + "--reasoning-effort", + help="Set reasoning effort for models that support it", ), thinking_tokens: Optional[int] = typer.Option( None, "--thinking-tokens", help="Set thinking tokens for models that support it" @@ -225,57 +173,85 @@ def main( exercises_dir: str = typer.Option( EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files" ), + legacy: bool = typer.Option( + False, "--legacy", help="Use legacy exercise directory structure" + ), + sets: Optional[str] = typer.Option( + None, "--sets", help="Only run tests for specific sets (comma separated)" + ), + hash_re: Optional[str] = typer.Option( + None, + "--hash-re", + help=( + "Regex to filter exercise hashes. Useful for dividing the set into fractions using" + " hex chars: '^0' for 1/16, '^[01]' for 1/8, '^[0-3]' for 1/4. 
Use '^.{n}x' to"
+            " match the nth character (e.g., '^.{2}[4-7]' for the 3rd char in range 4-7)."
+        ),
+    ),
+    dry: bool = typer.Option(
+        False, "--dry", help="Run in dry mode (no aider, no tests)"
+    ),
 ):
-    if stats_only and not dirnames:
-        latest_dir = find_latest_benchmark_dir()
-        dirnames = [str(latest_dir)]
-
-    if dirnames is None:
-        dirnames = []
-
-    if len(dirnames) > 1 and not (stats_only or diffs_only):
-        print("Only provide 1 dirname unless running with --stats or --diffs")
-        return 1
-
-    updated_dirnames = []
-    for dirname in dirnames:
-        dirname = Path(dirname)
-        dirname = resolve_dirname(dirname, stats_only or cont, make_new)
-        if not dirname:
-            return 1
-        updated_dirnames.append(dirname)
-
-    if stats_only:
-        return show_stats(updated_dirnames, graphs, verbose, stats_languages)
-
-    if diffs_only:
-        return show_diffs(updated_dirnames)
-
-    assert len(updated_dirnames) == 1, updated_dirnames
-    dirname = updated_dirnames[0]
+    # setup logging and verbosity
+    if quiet:
+        log_level = logging.WARNING
+    elif verbose > 0:
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.INFO
 
-    # Lazy imports for the actual benchmark run
-    import git  # Heavy; avoid for --stats/--diffs
-    import importlib_resources  # Used for model metadata registration
-    import lox  # Only needed for threaded runs
+    logging.basicConfig(level=log_level, format="%(message)s")
 
-    from aider import models, sendchat
-    from aider.coders import base_coder
+    from aider import models
 
-    repo = git.Repo(search_parent_directories=True)
-    commit_hash = repo.head.object.hexsha[:7]
-    if repo.is_dirty():
-        commit_hash += "-dirty"
+    if dry:
+        no_aider = True
+        no_unit_tests = True
+        commit_hash = "???????"
+    else:
+        # Lazy imports for the actual benchmark run
+        import git  # Heavy
+        import lox  # Only needed for threaded runs
+        from aider import sendchat
+        from aider.coders import base_coder
+
+        repo = git.Repo(search_parent_directories=True)
+        commit_hash = repo.head.object.hexsha[:7]
+        if repo.is_dirty():
+            commit_hash += "-dirty"
+
+    resolved_results_dir = resolve_dirname(results_dir, cont, make_new)
+
+    if not resolved_results_dir:
+        logger.error(f"Could not resolve results directory from slug: {results_dir}")
+        logger.error(f"Checked in {BENCHMARK_DNAME}")
+        return 1
+    results_dir = resolved_results_dir
 
-    if "AIDER_DOCKER" not in os.environ:
-        print("Warning: benchmarking runs unvetted code from GPT, run in a docker container")
+    if not dry and "AIDER_DOCKER" not in os.environ:
+        logger.warning(
+            "Warning: Benchmarking runs unvetted code. Run in a docker container."
+        )
+        logger.warning(
+            "Set AIDER_DOCKER in the environment to bypass this check at your own risk."
+        )
         return
 
-    assert BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir(), BENCHMARK_DNAME
+    # Check dirs exist
+    if not (BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir()):
+        logger.error(f"Benchmark directory not found: {BENCHMARK_DNAME}")
+        sys.exit(1)
+    original_dname = BENCHMARK_DNAME / exercises_dir
+    if not (original_dname.exists() and original_dname.is_dir()):
+        logger.error(f"Exercises directory not found: {original_dname}")
+        sys.exit(1)
 
-    def get_exercise_dirs(base_dir, languages=None):
-        """Get all exercise directories for specified languages (or all if none specified)"""
+    def legacy_get_exercise_dirs(base_dir, languages=None):
+        """Get all exercise directories for specified languages (or all if none specified).
+        Uses the legacy `exercises/practice` pattern.
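+        (Kept for the old `polyglot-benchmark` layout; selected via `--legacy`.)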
+ """ base_dir = Path(base_dir) + logger.info(f"Looking for exercises in {base_dir}") # Get available language dirs lang_dirs = [d for d in base_dir.iterdir() if d.is_dir()] @@ -286,7 +262,9 @@ def get_exercise_dirs(base_dir, languages=None): lang_dirs = [d for d in lang_dirs if d.name.lower() in requested] dump(lang_dirs) if not lang_dirs: - print(f"No matching language directories found for: {languages}") + logger.warning( + f"No matching language directories found for: {languages}" + ) return [] # Get all exercise dirs under exercises/practice for each language @@ -298,205 +276,220 @@ def get_exercise_dirs(base_dir, languages=None): return exercise_dirs - original_dname = BENCHMARK_DNAME / exercises_dir - assert original_dname.exists() and original_dname.is_dir(), original_dname + def get_exercise_dirs( + base_dir, languages=None, sets=None, hash_re=None, legacy=False + ): + if legacy: + return legacy_get_exercise_dirs(base_dir, languages) - exercise_dirs = get_exercise_dirs(original_dname, languages) + base_dir = Path(base_dir) + logger.info(f"Scanning for cat.yaml in {base_dir}") + + lang_filter = ( + set(l.strip().lower() for l in languages.split(",")) if languages else None + ) + set_filter = set(s.strip().lower() for s in sets.split(",")) if sets else None + + exercise_dirs = [] + for cat_file in base_dir.rglob("cat.yaml"): + try: + with open(cat_file, "r") as f: + metadata = yaml.safe_load(f) + if verbose > 1: + logger.debug( + f"found {metadata['name']} ({metadata['language']})" + ) + except Exception as e: + logger.warning(f"Failed to parse {cat_file}: {e}") + continue + + if lang_filter and metadata.get("language", "").lower() not in lang_filter: + continue + + if set_filter: + cat_sets = set(s.lower() for s in metadata.get("sets", [])) + if not (set_filter & cat_sets): + continue + + if hash_re and not re.search(hash_re, metadata.get("hash", "")): + continue + + exercise_dirs.append(cat_file.parent) + + logger.info(f"Found {len(exercise_dirs)} cats") + return exercise_dirs + + exercise_dirs = get_exercise_dirs( + original_dname, languages, sets, hash_re, legacy=legacy + ) if not exercise_dirs: - print("No exercise directories found") + logger.error("No exercise directories found") return 1 - if clean and dirname.exists(): - print("Cleaning up and replacing", dirname) - dir_files = set(fn.name for fn in dirname.glob("*")) + if clean and results_dir.exists() and not dry: + logger.info(f"Cleaning up and replacing {results_dir}") + dir_files = set(fn.name for fn in results_dir.glob("*")) original_files = set(fn.name for fn in original_dname.glob("*")) if dir_files != original_files: - print("ERROR: will not delete dir that does not look like original tests", dirname) + logger.error( + f"ERROR: will not delete dir that does not look like original tests {results_dir}" + ) return - dest = dirname.parent / "OLD" / dirname.name + dest = results_dir.parent / "OLD" / results_dir.name if dest.exists(): old_now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - dest = dirname.parent / "OLD" / (old_now + dirname.name) - - dirname.rename(dest) - - if not dirname.exists(): - print(f"Copying {original_dname} -> {dirname} ...") - # Only copy the practice subdirs with exercises - os.makedirs(dirname, exist_ok=True) - for lang_dir in original_dname.iterdir(): - if not lang_dir.is_dir(): - continue - practice_dir = lang_dir / "exercises" / "practice" - if practice_dir.exists(): - dest_lang_dir = dirname / lang_dir.name / "exercises" / "practice" - os.makedirs(dest_lang_dir.parent, 
exist_ok=True) - shutil.copytree(practice_dir, dest_lang_dir) - print("...done") - - test_dnames = sorted(str(d.relative_to(original_dname)) for d in exercise_dirs) - - resource_metadata = importlib_resources.files("aider.resources").joinpath("model-metadata.json") + dest = results_dir.parent / "OLD" / (old_now + results_dir.name) + + results_dir.rename(dest) + + if not dry: + if not results_dir.exists(): + logger.info(f"Copying {original_dname} -> {results_dir} ...") + os.makedirs(results_dir, exist_ok=True) + + copied = False + for exercise_dir in exercise_dirs: + dest_dir = results_dir / exercise_dir.name + if not dest_dir.exists(): + if not copied: + logger.info(f"Adding missing exercises to {results_dir} ...") + shutil.copytree(exercise_dir, dest_dir) + copied = True + if copied: + logger.info("...done") + + test_dnames = sorted(d.name for d in exercise_dirs) + + resource_metadata = importlib_resources.files("aider.resources").joinpath( + "model-metadata.json" + ) model_metadata_files_loaded = models.register_litellm_models([resource_metadata]) dump(model_metadata_files_loaded) if read_model_settings: try: files_loaded = models.register_models([read_model_settings]) - if verbose: - if files_loaded: - print(f"Loaded model settings from: {files_loaded[0]}") - else: - print(f"No model settings loaded from: {read_model_settings}") + if files_loaded: + logger.debug(f"Loaded model settings from: {files_loaded[0]}") + else: + logger.debug(f"No model settings loaded from: {read_model_settings}") except Exception as e: - print(f"Error loading model settings: {e}") + logger.error(f"Error loading model settings: {e}") return 1 if keywords: keywords = keywords.split(",") - test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn] + test_dnames = [ + dn for dn in test_dnames for keyword in keywords if keyword in dn + ] random.shuffle(test_dnames) if num_tests > 0: test_dnames = test_dnames[:num_tests] - # Don't give up when benchmarking - LONG_TIMEOUT = 24 * 60 * 60 - sendchat.RETRY_TIMEOUT = LONG_TIMEOUT - base_coder.RETRY_TIMEOUT = LONG_TIMEOUT - models.RETRY_TIMEOUT = LONG_TIMEOUT + if not no_aider: + # Don't give up when benchmarking + LONG_TIMEOUT = 24 * 60 * 60 + sendchat.RETRY_TIMEOUT = LONG_TIMEOUT + base_coder.RETRY_TIMEOUT = LONG_TIMEOUT + models.RETRY_TIMEOUT = LONG_TIMEOUT # Enable in-memory RepoMap cache when running multiple threads to avoid SQLite contention repomap_in_memory = threads > 1 - if threads == 1: - all_results = [] - for test_path in test_dnames: - results = run_test( - original_dname, - dirname / test_path, - model, - edit_format, - tries, - no_unit_tests, - no_aider, - verbose, - commit_hash, - replay, - editor_model, - editor_edit_format, - num_ctx, - sleep, - reasoning_effort, - thinking_tokens, - map_tokens, - repomap_in_memory, - ) + test_args = dict( + model_name=model, + edit_format=edit_format, + tries=tries, + no_unit_tests=no_unit_tests, + no_aider=no_aider, + verbose=verbose, + commit_hash=commit_hash, + replay=replay, + editor_model=editor_model, + editor_edit_format=editor_edit_format, + num_ctx=num_ctx, + sleep=sleep, + reasoning_effort=reasoning_effort, + thinking_tokens=thinking_tokens, + map_tokens=map_tokens, + repomap_in_memory=repomap_in_memory, + dry=dry, + results_dir=results_dir, + ) - all_results.append(results) - summarize_results(dirname, verbose) - if sleep: - time.sleep(sleep) - else: + if threads > 1: run_test_threaded = lox.thread(threads)(run_test) for test_path in test_dnames: run_test_threaded.scatter( - 
original_dname, - dirname / test_path, - model, - edit_format, - tries, - no_unit_tests, - no_aider, - verbose, - commit_hash, - replay, - editor_model, - editor_edit_format, - num_ctx, - sleep, - reasoning_effort, - thinking_tokens, - map_tokens, - repomap_in_memory, + original_dname, results_dir / test_path, **test_args ) all_results = run_test_threaded.gather(tqdm=True) + else: + all_results = [] + for test_path in test_dnames: + results = run_test(original_dname, results_dir / test_path, **test_args) + all_results.append(results) + summarize_results(results_dir, verbose) + if sleep: + time.sleep(sleep) print() print() print() - summarize_results(dirname, verbose) + summarize_results(results_dir, verbose) return 0 -def show_diffs(dirnames): - dirnames = sorted(dirnames) - - all_results = dict((dirname, load_results(dirname)) for dirname in dirnames) - testcases = set() - for results in all_results.values(): - testcases.update(result["testcase"] for result in results) - - testcases = sorted(testcases) - - unchanged = set() - - for testcase in testcases: - all_outcomes = [] - for dirname in dirnames: - results = all_results[dirname] - result = [r for r in results if r["testcase"] == testcase][0] - - outcomes = tuple(result["tests_outcomes"]) - all_outcomes.append(True in outcomes) - - if len(set(all_outcomes)) == 1: - unchanged.add(testcase) - continue - - print() - print(testcase) - for outcome, dirname in zip(all_outcomes, dirnames): - print(outcome, f"{dirname}/{testcase}/.aider.chat.history.md") - - changed = set(testcases) - unchanged - print() - print("changed:", len(changed), ",".join(sorted(changed))) - print() - print("unchanged:", len(unchanged), ",".join(sorted(unchanged))) - - -def load_results(dirname, stats_languages=None): - dirname = Path(dirname) +def load_results(results_dir, stats_languages=None): + results_dir = Path(results_dir) lang_to_results = {} - if stats_languages: - languages = [lang.strip().lower() for lang in stats_languages.split(",")] - glob_patterns = [f"{lang}/exercises/practice/*/.aider.results.json" for lang in languages] - else: - glob_patterns = ["*/exercises/practice/*/.aider.results.json"] + # BUG20251223 + logger.debug(f"Globbing {results_dir} for results") + files = list(results_dir.glob("*/.aider.results.json")) + logger.debug(f"Found {len(files)} files") - for pattern in glob_patterns: - for fname in dirname.glob(pattern): - try: - results = json.loads(fname.read_text()) - # json / test / prac / exer / lang - lang = fname.parent.parent.parent.parent.name - lang_to_results.setdefault(lang, []).append(results) - except json.JSONDecodeError: - print("json.JSONDecodeError", fname) - continue + for fname in files: + try: + results = json.loads(fname.read_text()) + # BUG20251223 + logger.debug(f"Processing result file: {fname}") + + # Try to get language from cat.yaml if it exists in the same dir + lang = "unknown" + cat_yaml = fname.parent / "cat.yaml" + if cat_yaml.exists(): + try: + with open(cat_yaml, "r") as f: + metadata = yaml.safe_load(f) + lang = metadata.get("language", "unknown") + except Exception: + pass + + if stats_languages: + languages = [ + lang.strip().lower() for lang in stats_languages.split(",") + ] + if lang.lower() not in languages: + continue + + logger.debug(f"Derived lang: {lang}") + lang_to_results.setdefault(lang, []).append(results) + except json.JSONDecodeError: + logger.warning(f"json.JSONDecodeError {fname}") + continue return lang_to_results -def summarize_results(dirname, verbose, stats_languages=None): - 
lang_to_results = load_results(dirname, stats_languages) +def summarize_results(results_dir, verbose, stats_languages=None): + lang_to_results = load_results(results_dir, stats_languages) res = SimpleNamespace() - res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*"))) + res.total_tests = len(list(Path(results_dir).glob("*/.aider.results.json"))) try: tries = max( @@ -508,7 +501,7 @@ def summarize_results(dirname, verbose, stats_languages=None): except ValueError: tries = 0 - res.dir_name = str(dirname) + res.dir_name = str(results_dir) passed_tests = [0] * tries @@ -600,16 +593,30 @@ def add(attr_name, increment, global_stats, lang_stats): add("lazy_comments", results.get("lazy_comments", 0), res, lang_stats) add("syntax_errors", results.get("syntax_errors", 0), res, lang_stats) - add("indentation_errors", results.get("indentation_errors", 0), res, lang_stats) + add( + "indentation_errors", + results.get("indentation_errors", 0), + res, + lang_stats, + ) add("prompt_tokens", results.get("prompt_tokens", 0), res, lang_stats) - add("completion_tokens", results.get("completion_tokens", 0), res, lang_stats) + add( + "completion_tokens", + results.get("completion_tokens", 0), + res, + lang_stats, + ) res.reasoning_effort = results.get("reasoning_effort") res.thinking_tokens = results.get("thinking_tokens") res.map_tokens = results.get("map_tokens") - for key in "model edit_format commit_hash editor_model editor_edit_format".split(): + for ( + key + ) in ( + "model edit_format commit_hash editor_model editor_edit_format".split() + ): val = results.get(key) if val: variants[key].add(val) @@ -621,11 +628,11 @@ def add(attr_name, increment, global_stats, lang_stats): # return console = Console(highlight=False) - console.rule(title=str(dirname)) + console.rule(title=str(results_dir)) commit_hashes = variants["commit_hash"] versions = get_versions(commit_hashes) - date = dirname.name[:10] + date = results_dir.name[:10] def show(stat, red="red"): val = getattr(res, stat) @@ -640,7 +647,7 @@ def show(stat, red="red"): setattr(res, f"pass_rate_{i + 1}", f"{pass_rate:.1f}") setattr(res, f"pass_num_{i + 1}", passed_tests[i]) - print(f"- dirname: {dirname.name}") + print(f"- results_dir: {results_dir.name}") style = None if res.completed_tests == res.total_tests else "red" console.print(f" test_cases: {res.completed_tests}", style=style) for key, val in variants.items(): @@ -732,7 +739,9 @@ def format_lang_stats(lang, lang_stats): def compute_lang_to_col_widths(lang_to_stats): lang_to_col_widths = {} for lang, lang_stats in lang_to_stats.items(): - lang_stat_attrs = [getattr(lang_stats, attr) for attr in lang_stats.__dict__] + lang_stat_attrs = [ + getattr(lang_stats, attr) for attr in lang_stats.__dict__ + ] lang_col_width = max(len(lang), len(max(lang_stat_attrs, key=len))) lang_to_col_widths[lang] = lang_col_width @@ -742,7 +751,10 @@ def compute_lang_to_col_widths(lang_to_stats): print("======== Stats by language ========") print() - [format_lang_stats(lang, lang_stats) for lang, lang_stats in lang_to_stats.items()] + [ + format_lang_stats(lang, lang_stats) + for lang, lang_stats in lang_to_stats.items() + ] lang_to_col_widths = compute_lang_to_col_widths(lang_to_stats) any_stats = list(lang_to_stats.values())[0] @@ -829,24 +841,28 @@ def get_replayed_content(replay_dname, test_dname): return res res = res.splitlines(keepends=True) - res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")] + res = [ + line + for line in res + if not 
line.startswith("> ") and not line.startswith("#### ") + ] return "".join(res) def run_test(original_dname, testdir, *args, **kwargs): try: - return run_test_real(original_dname, testdir, *args, **kwargs) + return asyncio.run(run_test_real(original_dname, testdir, *args, **kwargs)) except Exception: - print("=" * 40) - print("Test failed") - traceback.print_exc() + logger.error("=" * 40) + logger.error("Test failed") + logger.error(traceback.format_exc()) testdir = Path(testdir) results_fname = testdir / ".aider.results.json" results_fname.write_text(json.dumps(dict(exception=traceback.format_exc()))) -def run_test_real( +async def run_test_real( original_dname, testdir, model_name, @@ -866,6 +882,8 @@ def run_test_real( map_tokens: Optional[int] = None, read_model_settings=None, repomap_in_memory: bool = False, + dry: bool = False, + results_dir=None, ): # Lazy imports: only needed in the actual benchmark execution path import git @@ -876,7 +894,9 @@ def run_test_real( from aider.io import InputOutput if not os.path.isdir(testdir): - print("Not a dir:", testdir) + if dry: + return + logger.error(f"Not a dir: {testdir}") return testdir = Path(testdir) @@ -892,7 +912,7 @@ def run_test_real( # else: return res except JSONDecodeError: - print(f"{results_fname} failed to parse, redoing...") + logger.warning(f"{results_fname} failed to parse, redoing...") # Read solution and test files from config fnames = [] @@ -927,6 +947,25 @@ def run_test_real( # Remove any ignore files from the solution set that LLM will edit solution_files.difference_update(ignore_files) + # Try to find original relative path from cat.yaml + original_rel_path = None + cat_yaml = testdir / "cat.yaml" + if cat_yaml.exists(): + try: + with open(cat_yaml, "r") as f: + metadata = yaml.safe_load(f) + # We need to find where this exercise was in original_dname. + # Since we don't store the full relative path in cat.yaml, + # we have to search for it or rely on the fact that we know + # it was copied from original_dname. + # A better way is to look for the directory with the same name (hash) + # in original_dname. 
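+            # (Assumption: cat directory names are their unique hashes, so the
+            # first rglob match below is taken as the original location.)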
+ matches = list(original_dname.rglob(testdir.name)) + if matches: + original_rel_path = matches[0].relative_to(original_dname) + except Exception: + pass + # Copy all solution files for file_path in solution_files: src = testdir / Path(file_path) @@ -934,20 +973,13 @@ def run_test_real( fnames.append(src) # restore the original file, in case we interrupted a prev run # Find the original file in the language-specific practice dir - lang_part = str(testdir).split("/exercises/practice/")[0] - original_fname = ( - original_dname - / Path(lang_part).name - / "exercises" - / "practice" - / testdir.name - / file_path - ) - if original_fname.exists(): - os.makedirs(src.parent, exist_ok=True) - shutil.copy(original_fname, src) + if not dry and original_rel_path: + original_fname = original_dname / original_rel_path / file_path + if original_fname.exists(): + os.makedirs(src.parent, exist_ok=True) + shutil.copy(original_fname, src) else: - print(f"Warning: Solution file not found: {src}") + logger.warning(f"Warning: Solution file not found: {src}") file_list = " ".join(fname.name for fname in fnames) @@ -997,22 +1029,28 @@ def run_test_real( dump(main_model) dump(edit_format) show_fnames = ",".join(map(str, fnames)) - print("fnames:", show_fnames) + logger.info(f"fnames: {show_fnames}") # Ensure this test directory is a standalone git repo so RepoMap can be used - try: - git_dir = testdir / ".git" - if not git_dir.exists(): - r = git.Repo.init(testdir) - # Set a local identity to avoid commit failures in clean containers - with r.config_writer() as cw: - cw.set_value("user", "name", "aider-benchmark") - cw.set_value("user", "email", "aider-benchmark@example.com") - # Add existing files (solution set and any current files) - r.index.add([str(p.relative_to(testdir)) for p in testdir.rglob("*") if p.is_file()]) - r.index.commit("Initial commit for aider benchmark") - except Exception as e: - if verbose: - print(f"Warning: failed to initialize git repo in {testdir}: {e}") + if not dry: + try: + git_dir = testdir / ".git" + if not git_dir.exists(): + r = git.Repo.init(testdir) + # Set a local identity to avoid commit failures in clean containers + with r.config_writer() as cw: + cw.set_value("user", "name", "aider-benchmark") + cw.set_value("user", "email", "aider-benchmark@example.com") + # Add existing files (solution set and any current files) + r.index.add( + [ + str(p.relative_to(testdir)) + for p in testdir.rglob("*") + if p.is_file() + ] + ) + r.index.commit("Initial commit for aider benchmark") + except Exception as e: + logger.debug(f"Warning: failed to initialize git repo in {testdir}: {e}") coder_kwargs = dict( main_model=main_model, @@ -1036,7 +1074,7 @@ def run_test_real( if map_tokens is not None: coder_kwargs["map_tokens"] = map_tokens - coder = Coder.create(**coder_kwargs) + coder = await Coder.create(**coder_kwargs) dump(coder.ignore_mentions) coder.show_announcements() @@ -1063,9 +1101,9 @@ def run_test_real( show = [">> " + line for line in show] io.append_chat_history("".join(show)) - coder.apply_updates() + await coder.apply_updates() else: - response = coder.run(with_message=instructions, preproc=False) + response = await coder.run(with_message=instructions, preproc=False) dur += time.time() - start @@ -1103,46 +1141,45 @@ def run_test_real( errors = errors.splitlines() syntax_errors += sum(1 for line in errors if line.startswith("SyntaxError")) - indentation_errors += sum(1 for line in errors if line.startswith("IndentationError")) + indentation_errors += sum( + 1 for line in 
errors if line.startswith("IndentationError") + ) - print(errors[-1]) + logger.info(errors[-1]) errors = "\n".join(errors) instructions = errors instructions += prompts.test_failures.format(file_list=file_list) - # Clean up build directories after all attempts - # Rust target/debug - target_dir = testdir / "target" / "debug" - if target_dir.exists(): - try: - shutil.rmtree(target_dir) - if verbose: - print(f"Cleaned up Rust target/debug directory: {target_dir}") - except (OSError, shutil.Error, PermissionError) as e: - if verbose: - print(f"Failed to clean up Rust target/debug directory: {e}") - - # Java build directories - java_build_dir = testdir / "build" - if java_build_dir.exists(): - try: - shutil.rmtree(java_build_dir) - if verbose: - print(f"Cleaned up Java build directory: {java_build_dir}") - except (OSError, shutil.Error, PermissionError) as e: - if verbose: - print(f"Failed to clean up Java build directory: {e}") - - # Node.js node_modules directories - node_modules_dir = testdir / "node_modules" - if node_modules_dir.exists(): - try: - shutil.rmtree(node_modules_dir) - if verbose: - print(f"Cleaned up Node.js node_modules directory: {node_modules_dir}") - except (OSError, shutil.Error, PermissionError) as e: - if verbose: - print(f"Failed to clean up Node.js node_modules directory: {e}") + if not dry: + # Clean up build directories after all attempts + # Rust target/debug + target_dir = testdir / "target" / "debug" + if target_dir.exists(): + try: + shutil.rmtree(target_dir) + logger.debug(f"Cleaned up Rust target/debug directory: {target_dir}") + except (OSError, shutil.Error, PermissionError) as e: + logger.debug(f"Failed to clean up Rust target/debug directory: {e}") + + # Java build directories + java_build_dir = testdir / "build" + if java_build_dir.exists(): + try: + shutil.rmtree(java_build_dir) + logger.debug(f"Cleaned up Java build directory: {java_build_dir}") + except (OSError, shutil.Error, PermissionError) as e: + logger.debug(f"Failed to clean up Java build directory: {e}") + + # Node.js node_modules directories + node_modules_dir = testdir / "node_modules" + if node_modules_dir.exists(): + try: + shutil.rmtree(node_modules_dir) + logger.debug( + f"Cleaned up Node.js node_modules directory: {node_modules_dir}" + ) + except (OSError, shutil.Error, PermissionError) as e: + logger.debug(f"Failed to clean up Node.js node_modules directory: {e}") results = dict( testdir=str(testdir), @@ -1175,7 +1212,9 @@ def run_test_real( ) if edit_format == "architect": - results["editor_model"] = main_model.editor_model.name if main_model.editor_model else None + results["editor_model"] = ( + main_model.editor_model.name if main_model.editor_model else None + ) results["editor_edit_format"] = main_model.editor_edit_format dump(results) @@ -1187,6 +1226,12 @@ def run_test_real( def run_unit_tests(original_dname, testdir, history_fname, test_files): timeout = 60 * 3 + # Find original relative path + original_rel_path = None + matches = list(original_dname.rglob(testdir.name)) + if matches: + original_rel_path = matches[0].relative_to(original_dname) + # Map of file extensions to test commands TEST_COMMANDS = { ".py": ["pytest"], @@ -1208,14 +1253,18 @@ def run_unit_tests(original_dname, testdir, history_fname, test_files): break if not command: - raise ValueError(f"No test command found for files with extensions: {extensions}") + raise ValueError( + f"No test command found for files with extensions: {extensions}" + ) # Copy test files from original directory for file_path in 
test_files: - src = original_dname / Path(*testdir.parts[-4:]) / file_path + if not original_rel_path: + break + src = original_dname / original_rel_path / file_path dst = testdir / file_path if src.exists(): - print("copying", src, dst) + logger.info(f"copying {src} {dst}") os.makedirs(dst.parent, exist_ok=True) shutil.copy(src, dst) @@ -1228,7 +1277,7 @@ def run_unit_tests(original_dname, testdir, history_fname, test_files): content = re.sub(r"@Disabled\([^)]*\)\s*\n", "", content) test_file.write_text(content) - print(" ".join(command)) + logger.info(" ".join(command)) result = subprocess.run( command, @@ -1250,7 +1299,7 @@ def run_unit_tests(original_dname, testdir, history_fname, test_files): fh.write(f"```\n{res}\n```") if not success: - print(f"Tests failed: {testdir}") + logger.info(f"Tests failed: {testdir}") return res diff --git a/benchmark/benchmark_classic.py b/benchmark/benchmark_classic.py new file mode 100755 index 00000000000..02117242742 --- /dev/null +++ b/benchmark/benchmark_classic.py @@ -0,0 +1,1265 @@ +#!/usr/bin/env python3 +import datetime +import json +import os +import random +import re +import shutil +import subprocess +import sys +import time +import traceback +from collections import defaultdict +from json.decoder import JSONDecodeError +from pathlib import Path +from types import SimpleNamespace +from typing import List, Optional + +""" +Performance-oriented refactors: +- Avoid heavy imports unless needed for a given code path. +- Fast path for `--stats` to skip GitPython and benchmarking deps. +- Build DataFrame / import plotting only when `--graphs` is true. +- Use json.load for result file parsing to reduce memory churn. +- Cache git version lookups across a single invocation. +""" + +# Heavy modules are lazily imported within the code paths that need them. 
+import typer +from dotenv import load_dotenv +from rich.console import Console + +from aider.dump import dump # noqa: F401 + +# Cache for commit-hash -> version lookup +_VERSION_CACHE = {} + +BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks")) + +EXERCISES_DIR_DEFAULT = "polyglot-benchmark" + +app = typer.Typer(add_completion=False, pretty_exceptions_enable=False) + + +load_dotenv(override=True) + + +def find_latest_benchmark_dir(): + benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()] + if not benchmark_dirs: + print("Error: No benchmark directories found under tmp.benchmarks.") + sys.exit(1) + + # Get current time and 24 hours ago + now = datetime.datetime.now() + day_ago = now - datetime.timedelta(days=1) + + # Filter directories by name pattern YYYY-MM-DD-HH-MM-SS-- + recent_dirs = [] + for d in benchmark_dirs: + try: + # Extract datetime from directory name + date_str = d.name[:19] # Takes YYYY-MM-DD-HH-MM-SS + dir_date = datetime.datetime.strptime(date_str, "%Y-%m-%d-%H-%M-%S") + if dir_date >= day_ago: + recent_dirs.append(d) + except ValueError: + # Skip directories that don't match the expected format + continue + + if not recent_dirs: + print("Error: No benchmark directories found from the last 24 hours.") + sys.exit(1) + + # Find directory with most recently modified .md file + latest_dir = None + latest_time = 0 + + for d in recent_dirs: + # Look for .md files in subdirectories + for md_file in d.glob("*/exercises/practice/*/.*.md"): + if md_file.is_file(): + mtime = md_file.stat().st_mtime + if mtime > latest_time: + latest_time = mtime + latest_dir = d + + if not latest_dir: + print("Error: No .md files found in recent benchmark directories.") + sys.exit(1) + + print(f"Using the most recently updated benchmark directory: {latest_dir.name}") + return latest_dir + + +def show_stats(dirnames, graphs, verbose, stats_languages=None): + raw_rows = [] + for dirname in dirnames: + row = summarize_results(dirname, verbose, stats_languages) + raw_rows.append(row) + + # return + + seen = dict() + rows = [] + for row in raw_rows: + if not row: + continue + + if row.completed_tests != row.total_tests: + print( + f"Warning: {row.dir_name} is incomplete: {row.completed_tests} of {row.total_tests}" + ) + + try: + kind = (row.model, row.edit_format) + except AttributeError: + return + + if kind in seen: + dump(row.dir_name) + dump(seen[kind]) + return + + seen[kind] = row.dir_name + rows.append(vars(row)) + + repeat_hi = repeat_lo = repeat_avg = None # noqa: F841 + + # Only build a DataFrame and import plotting libs when graphs are requested + if graphs: + import pandas as pd # Lazy import + from plots import plot_refactoring # Lazy import + + df = pd.DataFrame.from_records(rows) + # plot_timing(df) + # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg) + # plot_outcomes_claude(df) + plot_refactoring(df) + + +def resolve_dirname(dirname, use_single_prior, make_new): + if len(dirname.parts) > 1: + return dirname + + priors = list(BENCHMARK_DNAME.glob(f"*--{dirname}")) + if len(priors) == 1 and use_single_prior: + dirname = priors[0].name + print(f"Using pre-existing {dirname}") + elif len(priors): + if not make_new: + print(f"Prior runs of {dirname} exist, use --new or name one explicitly") + print() + for prior in priors: + print(prior) + return + + if not re.match(r"\d\d\d\d-\d\d-\d\d-", str(dirname)): + now = datetime.datetime.now() + now = now.strftime("%Y-%m-%d-%H-%M-%S--") + dirname = now + dirname.name + + dirname = 
BENCHMARK_DNAME / dirname + return dirname + + +@app.command() +def main( + dirnames: Optional[List[str]] = typer.Argument(None, help="Directory names"), + graphs: bool = typer.Option(False, "--graphs", help="Generate graphs"), + model: str = typer.Option("gpt-3.5-turbo", "--model", "-m", help="Model name"), + sleep: float = typer.Option( + 0, "--sleep", help="Sleep seconds between tests when single threaded" + ), + languages: str = typer.Option( + None, "--languages", "-l", help="Only run tests for specific languages (comma separated)" + ), + edit_format: str = typer.Option(None, "--edit-format", "-e", help="Edit format"), + editor_model: str = typer.Option(None, "--editor-model", help="Editor model name"), + editor_edit_format: str = typer.Option(None, "--editor-edit-format", help="Editor edit format"), + replay: str = typer.Option( + None, + "--replay", + help="Replay previous .aider.chat.history.md responses from previous benchmark run", + ), + keywords: str = typer.Option( + None, "--keywords", "-k", help="Only run tests that contain keywords (comma sep)" + ), + clean: bool = typer.Option( + False, "--clean", "-c", help="Discard the existing testdir and make a clean copy" + ), + cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"), + make_new: bool = typer.Option(False, "--new", help="Make a new dated testdir"), + no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"), + no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), + stats_only: bool = typer.Option( + False, "--stats", "-s", help="Do not run tests, just collect stats on completed tests" + ), + stats_languages: str = typer.Option( + None, + "--stats-languages", + help="Only include stats for specific languages (comma separated)", + ), + diffs_only: bool = typer.Option(False, "--diffs", help="Just diff the provided stats dirs"), + tries: int = typer.Option(2, "--tries", "-r", help="Number of tries for running tests"), + threads: int = typer.Option(1, "--threads", "-t", help="Number of threads to run in parallel"), + num_tests: int = typer.Option(-1, "--num-tests", "-n", help="Number of tests to run"), + num_ctx: Optional[int] = typer.Option( + None, "--num-ctx", help="Override model context window size" + ), + read_model_settings: str = typer.Option( + None, "--read-model-settings", help="Load aider model settings from YAML file" + ), + reasoning_effort: Optional[str] = typer.Option( + None, "--reasoning-effort", help="Set reasoning effort for models that support it" + ), + thinking_tokens: Optional[int] = typer.Option( + None, "--thinking-tokens", help="Set thinking tokens for models that support it" + ), + map_tokens: Optional[int] = typer.Option( + None, + "--map-tokens", + help="Suggested number of tokens for repo map (0 to disable)", + ), + exercises_dir: str = typer.Option( + EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files" + ), +): + if stats_only and not dirnames: + latest_dir = find_latest_benchmark_dir() + dirnames = [str(latest_dir)] + + if dirnames is None: + dirnames = [] + + if len(dirnames) > 1 and not (stats_only or diffs_only): + print("Only provide 1 dirname unless running with --stats or --diffs") + return 1 + + updated_dirnames = [] + for dirname in dirnames: + dirname = Path(dirname) + dirname = resolve_dirname(dirname, stats_only or cont, make_new) + if not dirname: + return 1 + 
updated_dirnames.append(dirname) + + if stats_only: + return show_stats(updated_dirnames, graphs, verbose, stats_languages) + + if diffs_only: + return show_diffs(updated_dirnames) + + assert len(updated_dirnames) == 1, updated_dirnames + dirname = updated_dirnames[0] + + # Lazy imports for the actual benchmark run + import git # Heavy; avoid for --stats/--diffs + import importlib_resources # Used for model metadata registration + import lox # Only needed for threaded runs + + from aider import models, sendchat + from aider.coders import base_coder + + repo = git.Repo(search_parent_directories=True) + commit_hash = repo.head.object.hexsha[:7] + if repo.is_dirty(): + commit_hash += "-dirty" + + if "AIDER_DOCKER" not in os.environ: + print("Warning: benchmarking runs unvetted code from GPT, run in a docker container") + return + + assert BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir(), BENCHMARK_DNAME + + def get_exercise_dirs(base_dir, languages=None): + """Get all exercise directories for specified languages (or all if none specified)""" + base_dir = Path(base_dir) + + # Get available language dirs + lang_dirs = [d for d in base_dir.iterdir() if d.is_dir()] + + # Filter to requested languages if specified + if languages: + requested = set(lang.strip().lower() for lang in languages.split(",")) + lang_dirs = [d for d in lang_dirs if d.name.lower() in requested] + dump(lang_dirs) + if not lang_dirs: + print(f"No matching language directories found for: {languages}") + return [] + + # Get all exercise dirs under exercises/practice for each language + exercise_dirs = [] + for lang_dir in lang_dirs: + practice_dir = lang_dir / "exercises" / "practice" + if practice_dir.exists(): + exercise_dirs.extend(d for d in practice_dir.iterdir() if d.is_dir()) + + return exercise_dirs + + original_dname = BENCHMARK_DNAME / exercises_dir + assert original_dname.exists() and original_dname.is_dir(), original_dname + + exercise_dirs = get_exercise_dirs(original_dname, languages) + + if not exercise_dirs: + print("No exercise directories found") + return 1 + + if clean and dirname.exists(): + print("Cleaning up and replacing", dirname) + dir_files = set(fn.name for fn in dirname.glob("*")) + original_files = set(fn.name for fn in original_dname.glob("*")) + if dir_files != original_files: + print("ERROR: will not delete dir that does not look like original tests", dirname) + return + + dest = dirname.parent / "OLD" / dirname.name + if dest.exists(): + old_now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + dest = dirname.parent / "OLD" / (old_now + dirname.name) + + dirname.rename(dest) + + if not dirname.exists(): + print(f"Copying {original_dname} -> {dirname} ...") + # Only copy the practice subdirs with exercises + os.makedirs(dirname, exist_ok=True) + for lang_dir in original_dname.iterdir(): + if not lang_dir.is_dir(): + continue + practice_dir = lang_dir / "exercises" / "practice" + if practice_dir.exists(): + dest_lang_dir = dirname / lang_dir.name / "exercises" / "practice" + os.makedirs(dest_lang_dir.parent, exist_ok=True) + shutil.copytree(practice_dir, dest_lang_dir) + print("...done") + + test_dnames = sorted(str(d.relative_to(original_dname)) for d in exercise_dirs) + + resource_metadata = importlib_resources.files("aider.resources").joinpath("model-metadata.json") + model_metadata_files_loaded = models.register_litellm_models([resource_metadata]) + dump(model_metadata_files_loaded) + + if read_model_settings: + try: + files_loaded = models.register_models([read_model_settings]) + 
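+            # the code below treats files_loaded as the list of YAML files
+            # actually read; a falsy value means nothing was loaded
+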
if verbose: + if files_loaded: + print(f"Loaded model settings from: {files_loaded[0]}") + else: + print(f"No model settings loaded from: {read_model_settings}") + except Exception as e: + print(f"Error loading model settings: {e}") + return 1 + + if keywords: + keywords = keywords.split(",") + test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn] + + random.shuffle(test_dnames) + if num_tests > 0: + test_dnames = test_dnames[:num_tests] + + # Don't give up when benchmarking + LONG_TIMEOUT = 24 * 60 * 60 + sendchat.RETRY_TIMEOUT = LONG_TIMEOUT + base_coder.RETRY_TIMEOUT = LONG_TIMEOUT + models.RETRY_TIMEOUT = LONG_TIMEOUT + + # Enable in-memory RepoMap cache when running multiple threads to avoid SQLite contention + repomap_in_memory = threads > 1 + + if threads == 1: + all_results = [] + for test_path in test_dnames: + results = run_test( + original_dname, + dirname / test_path, + model, + edit_format, + tries, + no_unit_tests, + no_aider, + verbose, + commit_hash, + replay, + editor_model, + editor_edit_format, + num_ctx, + sleep, + reasoning_effort, + thinking_tokens, + map_tokens, + repomap_in_memory, + ) + + all_results.append(results) + summarize_results(dirname, verbose) + if sleep: + time.sleep(sleep) + else: + run_test_threaded = lox.thread(threads)(run_test) + for test_path in test_dnames: + run_test_threaded.scatter( + original_dname, + dirname / test_path, + model, + edit_format, + tries, + no_unit_tests, + no_aider, + verbose, + commit_hash, + replay, + editor_model, + editor_edit_format, + num_ctx, + sleep, + reasoning_effort, + thinking_tokens, + map_tokens, + repomap_in_memory, + ) + all_results = run_test_threaded.gather(tqdm=True) + + print() + print() + print() + summarize_results(dirname, verbose) + + return 0 + + +def show_diffs(dirnames): + dirnames = sorted(dirnames) + + all_results = dict((dirname, load_results(dirname)) for dirname in dirnames) + testcases = set() + for results in all_results.values(): + testcases.update(result["testcase"] for result in results) + + testcases = sorted(testcases) + + unchanged = set() + + for testcase in testcases: + all_outcomes = [] + for dirname in dirnames: + results = all_results[dirname] + result = [r for r in results if r["testcase"] == testcase][0] + + outcomes = tuple(result["tests_outcomes"]) + all_outcomes.append(True in outcomes) + + if len(set(all_outcomes)) == 1: + unchanged.add(testcase) + continue + + print() + print(testcase) + for outcome, dirname in zip(all_outcomes, dirnames): + print(outcome, f"{dirname}/{testcase}/.aider.chat.history.md") + + changed = set(testcases) - unchanged + print() + print("changed:", len(changed), ",".join(sorted(changed))) + print() + print("unchanged:", len(unchanged), ",".join(sorted(unchanged))) + + +def load_results(dirname, stats_languages=None): + dirname = Path(dirname) + lang_to_results = {} + + if stats_languages: + languages = [lang.strip().lower() for lang in stats_languages.split(",")] + glob_patterns = [f"{lang}/exercises/practice/*/.aider.results.json" for lang in languages] + else: + glob_patterns = ["*/exercises/practice/*/.aider.results.json"] + + for pattern in glob_patterns: + for fname in dirname.glob(pattern): + try: + results = json.loads(fname.read_text()) + # json / test / prac / exer / lang + lang = fname.parent.parent.parent.parent.name + lang_to_results.setdefault(lang, []).append(results) + except json.JSONDecodeError: + print("json.JSONDecodeError", fname) + continue + return lang_to_results + + +def summarize_results(dirname, 
verbose, stats_languages=None): + lang_to_results = load_results(dirname, stats_languages) + + res = SimpleNamespace() + res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*"))) + + try: + tries = max( + len(results.get("tests_outcomes", [])) + for results_list in lang_to_results.values() + for results in results_list + if results + ) + except ValueError: + tries = 0 + + res.dir_name = str(dirname) + + passed_tests = [0] * tries + + res.completed_tests = 0 + res.duration = 0 + res.cost = 0 + res.error_outputs = 0 + res.user_asks = 0 + res.test_timeouts = 0 + res.exhausted_context_windows = 0 + res.num_malformed_responses = 0 + res.num_with_malformed_responses = 0 + res.syntax_errors = 0 + res.indentation_errors = 0 + res.lazy_comments = 0 + res.prompt_tokens = 0 + res.completion_tokens = 0 + + res.reasoning_effort = None + res.thinking_tokens = None + res.map_tokens = None + variants = defaultdict(set) + + def add(attr_name, increment, global_stats, lang_stats): + global_prev = getattr(global_stats, attr_name) + setattr(global_stats, attr_name, global_prev + increment) + + lang_prev = getattr(lang_stats, attr_name) + setattr(lang_stats, attr_name, lang_prev + increment) + + lang_to_stats = {} + lang_to_passed_tests = {} + for lang, results_list in lang_to_results.items(): + lang_stats = SimpleNamespace() + lang_stats.completed_tests = 0 + lang_stats.duration = 0 + lang_stats.avg_duration_per_test = 0 + lang_stats.cost = 0 + for i in range(tries): + setattr(lang_stats, f"pass_rate_{i + 1}", 0) + for i in range(tries): + setattr(lang_stats, f"pass_num_{i + 1}", 0) + lang_stats.error_outputs = 0 + lang_stats.user_asks = 0 + lang_stats.test_timeouts = 0 + lang_stats.exhausted_context_windows = 0 + lang_stats.num_malformed_responses = 0 + lang_stats.num_with_malformed_responses = 0 + lang_stats.syntax_errors = 0 + lang_stats.indentation_errors = 0 + lang_stats.lazy_comments = 0 + lang_stats.prompt_tokens = 0 + lang_stats.completion_tokens = 0 + lang_to_stats[lang] = lang_stats + lang_to_passed_tests[lang] = [0] * tries + + for results in results_list: + if not results: + continue + + add("completed_tests", 1, res, lang_stats) + tests_outcomes = results.get("tests_outcomes", []) + passed = tests_outcomes and tests_outcomes[-1] + if passed: + for i in range(len(tests_outcomes) - 1, tries): + passed_tests[i] += 1 + lang_to_passed_tests[lang][i] += 1 + + add("cost", results.get("cost", 0), res, lang_stats) + add("duration", results.get("duration", 0), res, lang_stats) + add("test_timeouts", results.get("test_timeouts", 0), res, lang_stats) + + add("error_outputs", results.get("num_error_outputs", 0), res, lang_stats) + add("user_asks", results.get("num_user_asks", 0), res, lang_stats) + add( + "exhausted_context_windows", + results.get("num_exhausted_context_windows", 0), + res, + lang_stats, + ) + add( + "num_malformed_responses", + results.get("num_malformed_responses", 0), + res, + lang_stats, + ) + if results.get("num_malformed_responses"): + add("num_with_malformed_responses", 1, res, lang_stats) + add("lazy_comments", results.get("lazy_comments", 0), res, lang_stats) + + add("syntax_errors", results.get("syntax_errors", 0), res, lang_stats) + add("indentation_errors", results.get("indentation_errors", 0), res, lang_stats) + + add("prompt_tokens", results.get("prompt_tokens", 0), res, lang_stats) + add("completion_tokens", results.get("completion_tokens", 0), res, lang_stats) + + res.reasoning_effort = results.get("reasoning_effort") + res.thinking_tokens = 
results.get("thinking_tokens") + res.map_tokens = results.get("map_tokens") + + for key in "model edit_format commit_hash editor_model editor_edit_format".split(): + val = results.get(key) + if val: + variants[key].add(val) + + if not res.completed_tests: + return + + # if res.completed_tests < 133: + # return + + console = Console(highlight=False) + console.rule(title=str(dirname)) + + commit_hashes = variants["commit_hash"] + versions = get_versions(commit_hashes) + date = dirname.name[:10] + + def show(stat, red="red"): + val = getattr(res, stat) + style = red if val else None + console.print(f" {stat}: {val}", style=style) + + percents = dict() + for i in range(tries): + pass_rate = 100 * passed_tests[i] / res.completed_tests + percents[i] = pass_rate + # console.print(f"{pass_rate:.1f}% correct after try {i + 1}") + setattr(res, f"pass_rate_{i + 1}", f"{pass_rate:.1f}") + setattr(res, f"pass_num_{i + 1}", passed_tests[i]) + + print(f"- dirname: {dirname.name}") + style = None if res.completed_tests == res.total_tests else "red" + console.print(f" test_cases: {res.completed_tests}", style=style) + for key, val in variants.items(): + if len(val) > 1: + style = "red" + else: + style = None + val = ", ".join(map(str, val)) + setattr(res, key, val) + console.print(f" {key}: {val}", style=style) + + if res.reasoning_effort is not None: + print(f" reasoning_effort: {res.reasoning_effort}") + if res.thinking_tokens is not None: + print(f" thinking_tokens: {res.thinking_tokens}") + if res.map_tokens is not None: + print(f" map_tokens: {res.map_tokens}") + + for i in range(tries): + print(f" pass_rate_{i + 1}: {percents[i]:.1f}") + for i in range(tries): + print(f" pass_num_{i + 1}: {passed_tests[i]}") + + pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests + print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}") + + show("error_outputs") + show("num_malformed_responses") + show("num_with_malformed_responses") + show("user_asks") + show("lazy_comments") + show("syntax_errors") + show("indentation_errors") + show("exhausted_context_windows") + show("prompt_tokens", red=None) + show("completion_tokens", red=None) + show("test_timeouts") + print(f" total_tests: {res.total_tests}") + + if variants["model"]: + a_model = set(variants["model"]).pop() + command = f"aider-ce --model {a_model}" + print(f" command: {command}") + + print(f" date: {date}") + print(" versions:", ",".join(versions)) + + res.avg_duration = res.duration / res.completed_tests + print(f" seconds_per_case: {res.avg_duration:.1f}") + + print(f" total_cost: {res.cost:.4f}") + + res.avg_cost = res.cost / res.completed_tests + + projected_cost = res.avg_cost * res.total_tests + + print() + print( + f"costs: ${res.avg_cost:.4f}/test-case, ${res.cost:.2f} total," + f" ${projected_cost:.2f} projected" + ) + + if verbose and len(lang_to_stats) > 0: + + def format_lang_stats(lang, lang_stats): + # First, postprocess attributes for easier printing + if lang_stats.completed_tests > 0: + lang_stats.avg_duration_per_test = lang_stats.duration / float( + lang_stats.completed_tests + ) + for i in range(tries): + num_passed = lang_to_passed_tests[lang][i] + setattr(lang_stats, f"pass_num_{i + 1}", num_passed) + pass_rate = 100 * num_passed / float(lang_stats.completed_tests) + setattr(lang_stats, f"pass_rate_{i + 1}", pass_rate) + + # Then format attributes into ready-to-print strings + for attr in lang_stats.__dict__: + val = getattr(lang_stats, attr) + if val == 0: + val = "-" + elif isinstance(val, float): 
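+                        # e.g. 1234.5 renders as "1,234.50" (thousands
+                        # separator, two decimals) for compact table cells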
+                        val = f"{val:,.2f}"
+                    else:
+                        val = f"{val:,}"
+
+                    setattr(lang_stats, attr, val)
+
+            def compute_lang_to_col_widths(lang_to_stats):
+                lang_to_col_widths = {}
+                for lang, lang_stats in lang_to_stats.items():
+                    lang_stat_attrs = [getattr(lang_stats, attr) for attr in lang_stats.__dict__]
+                    lang_col_width = max(len(lang), len(max(lang_stat_attrs, key=len)))
+                    lang_to_col_widths[lang] = lang_col_width
+
+                return lang_to_col_widths
+
+            print()
+            print("======== Stats by language ========")
+            print()
+
+            # format every language's stats in place
+            for lang, lang_stats in lang_to_stats.items():
+                format_lang_stats(lang, lang_stats)
+            lang_to_col_widths = compute_lang_to_col_widths(lang_to_stats)
+
+            any_stats = list(lang_to_stats.values())[0]
+            attrs = list(any_stats.__dict__)
+            attr_col_width = len(max(["language"] + attrs, key=len))
+            langs = list(lang_to_stats.keys())
+
+            print("| " + ("-" * attr_col_width), end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(" | " + ("-" * col_width), end="")
+            print(" |")
+
+            print(f"| {' '.center(attr_col_width)}", end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(f" | {lang.center(col_width)}", end="")
+            print(" |")
+
+            print("| " + ("-" * attr_col_width), end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(" | " + ("-" * col_width), end="")
+            print(" |")
+
+            for attr in attrs:
+                print(f"| {attr:<{attr_col_width}}", end="")
+                for lang in langs:
+                    lang_stats = lang_to_stats[lang]
+                    col_width = lang_to_col_widths[lang]
+                    print(f" | {getattr(lang_stats, attr):>{col_width}}", end="")
+                print(" |")
+
+            print("| " + ("-" * attr_col_width), end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(" | " + ("-" * col_width), end="")
+            print(" |")
+            print()
+
+    console.rule()
+
+    # print(json.dumps(vars(res), indent=4, sort_keys=True))
+    return res
+
+
+def get_versions(commit_hashes):
+    versions = set()
+    for hsh in commit_hashes:
+        if not hsh:
+            continue
+        short = hsh.split("-")[0]
+        if short in _VERSION_CACHE:
+            ver = _VERSION_CACHE.get(short)
+            if ver:
+                versions.add(ver)
+            continue
+
+        try:
+            version_src = subprocess.check_output(
+                ["git", "show", f"{short}:aider/__init__.py"], universal_newlines=True
+            )
+            match = re.search(r'__version__ = "(.*)"', version_src)
+            ver = match.group(1) if match else None
+            _VERSION_CACHE[short] = ver
+            if ver:
+                versions.add(ver)
+        except subprocess.CalledProcessError:
+            _VERSION_CACHE[short] = None
+    return versions
+
+
+def get_replayed_content(replay_dname, test_dname):
+    replay_dname = Path(replay_dname)
+    test_dname = Path(test_dname)
+    dump(replay_dname, test_dname)
+
+    test_name = test_dname.name
+    replay_fname = replay_dname / test_name / ".aider.chat.history.md"
+    dump(replay_fname)
+
+    res = replay_fname.read_text()
+    return res
+
+    # The filtering below was unreachable dead code after the return above;
+    # preserved as a comment in case filtered replays are wanted again:
+    # res = res.splitlines(keepends=True)
+    # res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
+    # return "".join(res)
+
+
+def run_test(original_dname, testdir, *args, **kwargs):
+    try:
+        return run_test_real(original_dname, testdir, *args, **kwargs)
+    except Exception:
+        print("=" * 40)
+        print("Test failed")
+        traceback.print_exc()
+
+        testdir = Path(testdir)
+        results_fname = testdir / ".aider.results.json"
+        results_fname.write_text(json.dumps(dict(exception=traceback.format_exc())))
+
+
+def run_test_real(
+    original_dname,
+    testdir,
+    model_name,
+    edit_format,
+    tries,
+    no_unit_tests,
+    no_aider,
+    verbose,
+    commit_hash,
+    replay,
+    editor_model,
+    editor_edit_format,
+    num_ctx=None,
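+    # num_ctx (when set) is forwarded to the model via extra_params below;
+    # the remaining optional knobs mirror main()'s CLI flags, except
+    # read_model_settings, which appears unused in this function
+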
sleep=0, + reasoning_effort: Optional[str] = None, + thinking_tokens: Optional[int] = None, + map_tokens: Optional[int] = None, + read_model_settings=None, + repomap_in_memory: bool = False, +): + # Lazy imports: only needed in the actual benchmark execution path + import git + import prompts + + from aider import models + from aider.coders import Coder + from aider.io import InputOutput + + if not os.path.isdir(testdir): + print("Not a dir:", testdir) + return + + testdir = Path(testdir) + + history_fname = testdir / ".aider.chat.history.md" + + results_fname = testdir / ".aider.results.json" + if results_fname.exists(): + try: + res = json.loads(results_fname.read_text()) + # if res.get("test_timeouts", 0) > 0: + # print(f"{results_fname} test timeouts, redoing...") + # else: + return res + except JSONDecodeError: + print(f"{results_fname} failed to parse, redoing...") + + # Read solution and test files from config + fnames = [] + config_file = testdir / ".meta/config.json" + if not config_file.exists(): + raise ValueError(f"No config file found: {config_file}") + + with open(config_file) as f: + config = json.loads(f.read()) + + # Get file sets from config + test_files = config.get("files", {}).get("test", []) + example_files = config.get("files", {}).get("example", []) + solution_files = set(config.get("files", {}).get("solution", [])) + + # Forcibly ignore certain files not covered by test_files and example_files + ignore_files = set( + [ + "CMakeLists.txt", + "Cargo.toml", + ] + ) + + # Add all files under .meta and .docs directories + ignore_files.update(str(p.relative_to(testdir)) for p in testdir.glob(".meta/**/*")) + ignore_files.update(str(p.relative_to(testdir)) for p in testdir.glob(".docs/**/*")) + + # Also ignore test & example files + ignore_files.update(test_files) + ignore_files.update(example_files) + + # Remove any ignore files from the solution set that LLM will edit + solution_files.difference_update(ignore_files) + + # Copy all solution files + for file_path in solution_files: + src = testdir / Path(file_path) + if src.exists(): + fnames.append(src) + # restore the original file, in case we interrupted a prev run + # Find the original file in the language-specific practice dir + lang_part = str(testdir).split("/exercises/practice/")[0] + original_fname = ( + original_dname + / Path(lang_part).name + / "exercises" + / "practice" + / testdir.name + / file_path + ) + if original_fname.exists(): + os.makedirs(src.parent, exist_ok=True) + shutil.copy(original_fname, src) + else: + print(f"Warning: Solution file not found: {src}") + + file_list = " ".join(fname.name for fname in fnames) + + instructions = "" + + introduction = testdir / ".docs/introduction.md" + if introduction.exists(): + instructions += introduction.read_text() + instructions += (testdir / ".docs/instructions.md").read_text() + instructions_append = testdir / ".docs/instructions.append.md" + if instructions_append.exists(): + instructions += instructions_append.read_text() + + instructions += prompts.instructions_addendum.format(file_list=file_list) + + io = InputOutput( + pretty=False, + yes=True, + chat_history_file=history_fname, + ) + + # weak_model_name = model_name + weak_model_name = None + + main_model = models.Model( + model_name, + weak_model=weak_model_name, + editor_model=editor_model, + editor_edit_format=editor_edit_format, + verbose=verbose, + ) + + if reasoning_effort is not None: + main_model.set_reasoning_effort(reasoning_effort) + + if thinking_tokens is not None: + 
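+        # only touch the thinking budget when one was explicitly requested,
+        # so models without extended thinking keep their defaults
+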
main_model.set_thinking_tokens(thinking_tokens) + + dump(main_model.max_chat_history_tokens) + + if num_ctx: + if not main_model.extra_params: + main_model.extra_params = {} + main_model.extra_params["num_ctx"] = num_ctx + edit_format = edit_format or main_model.edit_format + + dump(main_model) + dump(edit_format) + show_fnames = ",".join(map(str, fnames)) + print("fnames:", show_fnames) + # Ensure this test directory is a standalone git repo so RepoMap can be used + try: + git_dir = testdir / ".git" + if not git_dir.exists(): + r = git.Repo.init(testdir) + # Set a local identity to avoid commit failures in clean containers + with r.config_writer() as cw: + cw.set_value("user", "name", "aider-benchmark") + cw.set_value("user", "email", "aider-benchmark@example.com") + # Add existing files (solution set and any current files) + r.index.add([str(p.relative_to(testdir)) for p in testdir.rglob("*") if p.is_file()]) + r.index.commit("Initial commit for aider benchmark") + except Exception as e: + if verbose: + print(f"Warning: failed to initialize git repo in {testdir}: {e}") + + coder_kwargs = dict( + main_model=main_model, + edit_format=edit_format, + io=io, + fnames=fnames, + use_git=True, + auto_commits=False, + dirty_commits=False, + stream=False, + verbose=verbose, + # auto_lint=False, # disabled for code-in-json experiments + cache_prompts=True, + suggest_shell_commands=False, + ignore_mentions=ignore_files, + # Reduce repo map contention and size for benchmarks + map_cache_dir=str(testdir), + repomap_in_memory=repomap_in_memory, + map_mul_no_files=4, + ) + if map_tokens is not None: + coder_kwargs["map_tokens"] = map_tokens + + coder = Coder.create(**coder_kwargs) + dump(coder.ignore_mentions) + + coder.show_announcements() + coder.get_file_mentions = lambda x: set() # No loading of any other files + + timeouts = 0 + + syntax_errors = 0 + indentation_errors = 0 + lazy_comments = 0 + + dur = 0 + test_outcomes = [] + for i in range(tries): + start = time.time() + + if no_aider: + pass + elif replay: + response = get_replayed_content(replay, testdir) + coder.partial_response_content = response + + show = response.splitlines(keepends=True) + show = [">> " + line for line in show] + io.append_chat_history("".join(show)) + + coder.apply_updates() + else: + response = coder.run(with_message=instructions, preproc=False) + + dur += time.time() - start + + if not no_aider: + pat = r"^[+]? *[#].* [.][.][.] " + # Count the number of lines that match pat in response + dump(response) + lazy_comments += len(re.findall(pat, response, re.MULTILINE)) + dump(lazy_comments) + + if coder.last_keyboard_interrupt: + raise KeyboardInterrupt + + if no_unit_tests: + break + + try: + errors = run_unit_tests(original_dname, testdir, history_fname, test_files) + except subprocess.TimeoutExpired: + # try: + # errors = run_unit_tests(original_dname, testdir, history_fname, test_files) + # except subprocess.TimeoutExpired: + errors = "Tests timed out!" 
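+            # a timeout counts as a failed attempt; the sentinel string is
+            # handed back to the model as the "test output" for the next try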
+            timeouts += 1
+
+        if errors:
+            test_outcomes.append(False)
+        else:
+            test_outcomes.append(True)
+            break
+
+        if replay:
+            io.append_chat_history(errors)
+
+        errors = errors.splitlines()
+
+        syntax_errors += sum(1 for line in errors if line.startswith("SyntaxError"))
+        indentation_errors += sum(1 for line in errors if line.startswith("IndentationError"))
+
+        print(errors[-1])
+        errors = "\n".join(errors)
+        instructions = errors
+        instructions += prompts.test_failures.format(file_list=file_list)
+
+    # Clean up build directories after all attempts
+    # Rust target/debug
+    target_dir = testdir / "target" / "debug"
+    if target_dir.exists():
+        try:
+            shutil.rmtree(target_dir)
+            if verbose:
+                print(f"Cleaned up Rust target/debug directory: {target_dir}")
+        except (OSError, shutil.Error, PermissionError) as e:
+            if verbose:
+                print(f"Failed to clean up Rust target/debug directory: {e}")
+
+    # Java build directories
+    java_build_dir = testdir / "build"
+    if java_build_dir.exists():
+        try:
+            shutil.rmtree(java_build_dir)
+            if verbose:
+                print(f"Cleaned up Java build directory: {java_build_dir}")
+        except (OSError, shutil.Error, PermissionError) as e:
+            if verbose:
+                print(f"Failed to clean up Java build directory: {e}")
+
+    # Node.js node_modules directories
+    node_modules_dir = testdir / "node_modules"
+    if node_modules_dir.exists():
+        try:
+            shutil.rmtree(node_modules_dir)
+            if verbose:
+                print(f"Cleaned up Node.js node_modules directory: {node_modules_dir}")
+        except (OSError, shutil.Error, PermissionError) as e:
+            if verbose:
+                print(f"Failed to clean up Node.js node_modules directory: {e}")
+
+    results = dict(
+        testdir=str(testdir),
+        testcase=testdir.name,
+        model=main_model.name,
+        edit_format=edit_format,
+        tests_outcomes=test_outcomes,
+        cost=coder.total_cost,
+        duration=dur,
+        test_timeouts=timeouts,
+        commit_hash=commit_hash,
+        num_error_outputs=io.num_error_outputs,
+        num_user_asks=io.num_user_asks,
+        num_exhausted_context_windows=coder.num_exhausted_context_windows,
+        num_malformed_responses=coder.num_malformed_responses,
+        syntax_errors=syntax_errors,
+        indentation_errors=indentation_errors,
+        lazy_comments=lazy_comments,  # count of lazy "# ... " comment matches
+        reasoning_effort=reasoning_effort,
+        prompt_tokens=coder.total_tokens_sent,
+        completion_tokens=coder.total_tokens_received,
+        thinking_tokens=thinking_tokens,
+        map_tokens=map_tokens,
+        chat_hashes=list(
+            zip(
+                coder.chat_completion_call_hashes,
+                coder.chat_completion_response_hashes,
+            )
+        ),
+    )
+
+    if edit_format == "architect":
+        results["editor_model"] = main_model.editor_model.name if main_model.editor_model else None
+        results["editor_edit_format"] = main_model.editor_edit_format
+    dump(results)
+
+    results_fname.write_text(json.dumps(results, indent=4))
+
+    return results
+
+
+def run_unit_tests(original_dname, testdir, history_fname, test_files):
+    timeout = 60 * 3
+
+    # Map of file extensions to test commands
+    # (the repo is mounted at /cecli inside the benchmark container)
+    TEST_COMMANDS = {
+        ".py": ["pytest"],
+        ".rs": ["cargo", "test", "--", "--include-ignored"],
+        ".go": ["go", "test", "./..."],
+        ".js": ["/cecli/benchmark/npm-test.sh"],
+        ".cpp": ["/cecli/benchmark/cpp-test.sh"],
+        ".java": ["./gradlew", "test"],
+    }
+
+    # Get unique file extensions from test files
+    extensions = {Path(f).suffix for f in test_files}
+
+    # Find matching test command
+    command = None
+    for ext in extensions:
+        if ext in TEST_COMMANDS:
+            command = TEST_COMMANDS[ext]
+            break
+
+    if not command:
+        raise ValueError(f"No test command found for files with extensions: {extensions}")
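+    # Scoring must run against pristine tests: the model may have edited
+    # files under testdir, so the canonical test files are re-copied below.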
+
+    # Copy test files from original directory
+    for file_path in test_files:
+        src = original_dname / Path(*testdir.parts[-4:]) / file_path
+        dst = testdir / file_path
+        if src.exists():
+            print("copying", src, dst)
+            os.makedirs(dst.parent, exist_ok=True)
+            shutil.copy(src, dst)
+
+    # Remove @Disabled annotations from Java test files
+    for file_path in test_files:
+        if file_path.endswith(".java"):
+            test_file = testdir / file_path
+            if test_file.exists():
+                content = test_file.read_text()
+                content = re.sub(r"@Disabled\([^)]*\)\s*\n", "", content)
+                test_file.write_text(content)
+
+    print(" ".join(command))
+
+    result = subprocess.run(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        timeout=timeout,
+        cwd=testdir,
+        encoding="utf-8",
+        errors="replace",
+    )
+
+    success = result.returncode == 0
+    res = result.stdout
+    res = cleanup_test_output(res, testdir)
+    dump(res)
+
+    with history_fname.open("a") as fh:
+        fh.write(f"```\n{res}\n```")
+
+    if not success:
+        print(f"Tests failed: {testdir}")
+        return res
+
+
+def cleanup_test_output(output, testdir):
+    # remove timing info, to avoid randomizing the response to GPT
+    res = re.sub(r"\bin \d+\.\d+s\b", "", output)
+    res = res.replace(str(testdir), str(testdir.name))
+    return res
+
+
+if __name__ == "__main__":
+    app()
diff --git a/benchmark/docker.sh b/benchmark/docker.sh
index 6f97b865e19..b4265a69401 100755
--- a/benchmark/docker.sh
+++ b/benchmark/docker.sh
@@ -1,19 +1,20 @@
 #!/bin/bash
+# FIXME - should be able to choose which keys to pass into the container
+#
 docker run \
-        -it --rm \
-        --memory=12g \
-        --memory-swap=12g \
-        --add-host=host.docker.internal:host-gateway \
-        -v `pwd`:/aider \
-        -v `pwd`/tmp.benchmarks/.:/benchmarks \
-        -e OPENAI_API_KEY=$OPENAI_API_KEY \
-        -e HISTFILE=/aider/.bash_history \
-        -e PROMPT_COMMAND='history -a' \
-        -e HISTCONTROL=ignoredups \
-        -e HISTSIZE=10000 \
-        -e HISTFILESIZE=20000 \
-        -e AIDER_DOCKER=1 \
-        -e AIDER_BENCHMARK_DIR=/benchmarks \
-        aider-benchmark \
-        bash
+    -it --rm \
+    --memory=12g \
+    --memory-swap=12g \
+    --add-host=host.docker.internal:host-gateway \
+    -v $(pwd):/cecli \
+    -v $(pwd)/tmp.benchmarks/.:/benchmarks \
+    -e GEMINI_API_KEY=$GEMINI_API_KEY \
+    -e PROMPT_COMMAND='history -a' \
+    -e HISTCONTROL=ignoredups \
+    -e HISTSIZE=10000 \
+    -e HISTFILESIZE=20000 \
+    -e AIDER_DOCKER=1 \
+    -e AIDER_BENCHMARK_DIR=/benchmarks \
+    cecli-cat \
+    bash
diff --git a/benchmark/docker_build.sh b/benchmark/docker_build.sh
index a6619bb5ce1..a132463ef17 100755
--- a/benchmark/docker_build.sh
+++ b/benchmark/docker_build.sh
@@ -3,6 +3,6 @@ set -e
 
 docker build \
-        --file benchmark/Dockerfile \
-        -t aider-benchmark \
-        .
+    --file benchmark/Dockerfile \
+    -t cecli-cat \
+    .