diff --git a/benchmark/Dockerfile b/benchmark/Dockerfile
index a5926dab744..a210915e29e 100644
--- a/benchmark/Dockerfile
+++ b/benchmark/Dockerfile
@@ -57,8 +57,8 @@ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     core-js@3.37.1 \
     eslint@8.49.0
 
-COPY . /aider
 RUN pip3 install --no-cache-dir --upgrade pip uv
-RUN uv pip install --system --no-cache-dir -e /aider[dev]
-RUN git config --global --add safe.directory /aider
-WORKDIR /aider
+COPY . /cecli
+RUN uv pip install --system --no-cache-dir -e /cecli[dev]
+RUN git config --global --add safe.directory /cecli
+WORKDIR /cecli
diff --git a/benchmark/README.md b/benchmark/README.md
index 988406de687..c35bcd61a95 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -1,94 +1,124 @@
-
 # Aider benchmark harness
 
-Aider uses benchmarks to quantitatively measure how well it works
-with various LLMs.
+Before `cecli` was born, the old `aider` used benchmarks to quantitatively
+measure how well it worked with various LLMs.
+
 This directory holds the harness and tools needed to run the benchmarking
 suite.
 
+If you're familiar with `aider` benchmarking, see the "What's new..."
+section below.
+
 ## Background
 
-The benchmark is based on the [Exercism](https://github.com/exercism/python) coding exercises.
-This
-benchmark evaluates how effectively aider and LLMs can translate a
-natural language coding request into executable code saved into
-files that pass unit tests.
-It provides an end-to-end evaluation of not just
-the LLM's coding ability, but also its capacity to *edit existing code*
-and *format those code edits* so that aider can save the
-edits to the local source files.
-
-See [this writeup for a longer discussion about the benchmark](https://aider.chat/2024/12/21/polyglot.html).
-
-The benchmark is intended to be run *inside a docker container*.
-This is because the benchmarking harness will be
-taking code written by an LLM
-and executing it without any human review or supervision!
-The LLM could generate dangerous python that harms your system, like this: `import os; os.system("sudo rm -rf /")`.
+The benchmark was based on the [Exercism](https://github.com/exercism/python)
+coding exercises. This benchmark evaluates how effectively aider and LLMs can
+translate a natural language coding request into executable code saved into
+files that pass unit tests. It provides an end-to-end evaluation of not just the
+LLM's coding ability, but also its capacity to _edit existing code_ and _format
+those code edits_ so that aider can save the edits to the local source files.
+
+See
+[this writeup for a longer discussion about the benchmark](https://aider.chat/2024/12/21/polyglot.html).
+
+The benchmark is intended to be run _inside a docker container_. This is because
+the benchmarking harness will be taking code written by an LLM and executing it
+without any human review or supervision! The LLM could generate dangerous python
+that harms your system, like this: `import os; os.system("sudo rm -rf /")`.
 Running inside a docker container helps limit the damage that could be done.
 
 ## Usage
 
-There are 3 main tasks involved in benchmarking aider:
+There are 3 main tasks involved in benchmarking:
 
-1. Install and setup for benchmarking.
+1. Install and set up.
 
-2. Run the benchmark to measure performance across all the exercises.
+2. Run the benchmark.
 
-3. Generate a summary report of how many of the exercises succeeded or failed.
+3. Analyze the results.
-### Setup for benchmarking
+### Setup
 
 First, prepare all the groundwork for running the benchmarks.
 These steps only need to be done once.
 
 ```
-# Clone the aider repo
-git clone https://github.com/Aider-AI/aider.git
+ORG=Aider-AI
+REPO=aider
+# Clone the main repo
+git clone https://github.com/$ORG/$REPO.git
 
-# Create the scratch dir to hold benchmarking results inside the main aider dir:
-cd aider
+# Create the scratch dir to hold benchmarking results inside the main repo:
+cd $REPO
 mkdir tmp.benchmarks
 
 # Clone the repo with the exercises
-git clone https://github.com/Aider-AI/polyglot-benchmark tmp.benchmarks/polyglot-benchmark
+git clone https://github.com/$ORG/polyglot-benchmark tmp.benchmarks/polyglot-benchmark
 
 # Build the docker container
 ./benchmark/docker_build.sh
 ```
 
-### Running the benchmark
+### Running the benchmarks
 
 Launch the docker container and run the benchmark inside it:
 
 ```
 # Launch the docker container
+# You probably want to tweak this script to import your service keys.
+# It's currently configured to import GEMINI_API_KEY only.
+# PRs welcome to grab the keys more effectively without causing anxiety.
./benchmark/docker.sh
 
 # Inside the container, install aider as a development build.
 # This way you're running the code that you cloned above, including any local changes.
+# TODO: this step should be included in the Dockerfile
 pip install -e .[dev]
 
 # Run the benchmark:
 ./benchmark/benchmark.py a-helpful-name-for-this-run --model gpt-3.5-turbo --edit-format whole --threads 10 --exercises-dir polyglot-benchmark
 ```
 
-The above will create a folder `tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run` with benchmarking results.
-Run like this, the script will run all the exercises in a random order.
-
-You can run `./benchmark/benchmark.py --help` for a list of all the arguments, but here are the most useful to keep in mind:
-
-- `--model` is the name of the model, same as you would pass directly to `aider`.
-- `--edit-format` is the name of the edit format, same as you would pass directly to `aider`. When working with an experimental LLM, I recommend starting with `whole`
-- `--threads` specifies how many exercises to benchmark in parallel. Start with a single thread if you are working out the kinks on your benchmarking setup or working with a new model, etc. Once you are getting reliable results, you can speed up the process by running with more threads. 10 works well against the OpenAI APIs.
-- `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
-- `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
-- `--read-model-settings=` specify model settings, see here: https://aider.chat/docs/config/adv-model-settings.html#model-settings
-- `--map-tokens` sets a token budget for the repo map sent with each request. Set `0` to disable the repo map. This lets you enable repo map usage for any model (e.g., `--map-tokens 1024`).
+The above will create a folder
+`tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run` with
+benchmarking results. Run like this, the script will run all the exercises in a
+random order.
+
+You can run `./benchmark/benchmark.py --help` for a list of all the arguments,
+but here are the most useful to keep in mind:
+
+- `--model` is the name of the model, same as you would pass directly to
+  `aider`.
+- `--edit-format` is the name of the edit format, same as you would pass
+  directly to `aider`. When working with an experimental LLM, I recommend
+  starting with `whole`.
+- `--sets` runs specific groups of tests using the `sets` defined in each
+  `cat.yaml`. (Hopefully the sets will grow with time, but currently there is
+  just a bookmark for the classic "polyglot" test battery.)
+- `--hash-re` allows for deterministic slicing of the exercise set based on the
+  exercise hash. This is useful for quickly grabbing a consistent subset, or
+  for k-fold cross-validation. For example:
+  - `^0`: 1/16 of the set.
+  - `^[01]`: 1/8 of the set.
+  - `^[0-3]`: 1/4 of the set.
+  - `^.{2}[4-7]`: 1/4 of the set, using the 3rd character of the hash.
+- `--threads` specifies how many exercises to benchmark in parallel. Start with
+  a single thread if you are working out the kinks on your benchmarking setup or
+  working with a new model, etc. Once you are getting reliable results, you can
+  speed up the process by running with more threads. 10 works well against the
+  OpenAI APIs.
+- `--num-tests` specifies how many of the tests to run before stopping. This is
+  another way to start gently as you debug your benchmarking setup.
+- `--keywords` filters the tests to run to only the ones whose names match the
+  supplied argument (similar to `pytest -k xxxx`).
+- `--read-model-settings=` specifies model settings, see here:
+  https://aider.chat/docs/config/adv-model-settings.html#model-settings
+- `--map-tokens` sets a token budget for the repo map sent with each request.
+  Set `0` to disable the repo map. This lets you enable repo map usage for any
+  model (e.g., `--map-tokens 1024`).
 
 ### Benchmark report
 
-You can generate stats about any benchmark, including ones which are still running.
-You don't need to run this inside the docker container, as it is just
+You can generate stats about any benchmark, including ones which are still
+running. You don't need to run this inside the docker container, as it is just
 collecting stats not executing unsafe python.
 
 ```
@@ -96,52 +126,55 @@ collecting stats not executing unsafe python.
 ./benchmark/benchmark.py --stats tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run
 ```
 
-The benchmark report is a yaml record with statistics about the run:
-
-```yaml
-- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue
-  test_cases: 225
-  model: claude-3.5-sonnet
-  edit_format: diff
-  commit_hash: 35f21b5
-  pass_rate_1: 57.1
-  pass_rate_2: 77.4
-  percent_cases_well_formed: 99.2
-  error_outputs: 23
-  num_malformed_responses: 4
-  num_with_malformed_responses: 1
-  user_asks: 2
-  lazy_comments: 0
-  syntax_errors: 1
-  indentation_errors: 0
-  exhausted_context_windows: 0
-  test_timeouts: 1
-  command: aider --sonnet
-  date: 2024-07-04
-  versions: 0.42.1-dev
-  seconds_per_case: 17.6
-  total_cost: 3.6346
-```
-
-The key statistics are the `pass_rate_#` entries, which report the
-percent of the tasks which had all tests passing.
-There will be multiple of these pass rate stats,
-depending on the value of the `--tries` parameter.
-
-The yaml also includes all the settings which were in effect for the benchmark run.
-It also reports the git hash of the repo at the time that the benchmark was
-run, with `(dirty)` if there were uncommitted changes.
-It's good practice to commit the repo before starting a benchmark run.
-This way the `model`, `edit_format` and `commit_hash`
-should be enough to reliably reproduce any benchmark run.
-
-You can see examples of the benchmark report yaml in the
-[aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
-
-
-## Limitations, notes
-
-- Contributions of benchmark results are welcome! Submit results by opening a PR with edits to the
-[aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
-- These scripts are not intended for use by typical aider end users.
-- Some of these tools are written as `bash` scripts, so it will be hard to use them on Windows.
+The benchmark report is a yaml record with statistics about the run.
+
+The key statistics are the `pass_rate_#` entries, which report the percent of
+the tasks which had all tests passing. There will be multiple of these pass rate
+stats, depending on the value of the `--tries` parameter.
+
+The yaml also includes all the settings which were in effect for the benchmark
+run. It also reports the git hash of the repo at the time that the benchmark was
+run, with `(dirty)` if there were uncommitted changes. It's good practice to
+commit the repo before starting a benchmark run. This way the `model`,
+`edit_format` and `commit_hash` should be enough to reliably reproduce any
+benchmark run.
+
+## Contributing
+
+Contributions of benchmark results and tests are welcome! Submit results by opening a PR.
+
+Note the roadmap priorities:
+
+1. Complete 'set up records' to support smart caching.
+2. Atomic data collection. Most of the data is already saved, but protocols for
+   sharing are still needed.
+3. **Dimensional Parameter Walking**, allowing for n-dimensional parameter tuning
+   and facilitating a "gradient descent" approach to optimisation across multiple
+   parameters. The test runner should accept n lists of options, e.g.,
+   ["thinking: 100", "thinking: 200", "thinking: 400"], ["optionA: B", "optionD: C"].
+4. Smart Caching so the runner can optionally skip any tests for which "similar"
+   result data is already available, based on fuzzy metadata matching. This aids
+   iterative testing: when adding a new option to a list of permutations, only the
+   new permutations need to be run. Also, when new Cats join the collection it is
+   easy to incrementally collect the data.
+5. Data aggregation and analysis. These will be separate specialised tools.
+
+## Limitations
+
+- These scripts are not intended for use by typical `cecli` end users.
+- Some of the old (possibly deprecated) tools are written as `bash` scripts, so
+  it will be hard to use them on Windows.
+- Currently the JS and C++ tests appear broken.
+
+## What's new with Cecli Cats?
+
+The benchmark has evolved into a collection of **Cecli Atomic Tests (Cats)**.
+
+- **YAML Metadata**: Every Cat has its own `cat.yaml` file containing metadata,
+  including a unique UUID that may or may not be useful later.
+- **Evolving Collection**: The directory structure of the Cats is laid out to
+  facilitate the growth and evolution of the collection. As the benchmark
+  matures, Cats will come and go.
+- **Simplified Runner**: The test runner is being simplified to focus on its
+  core job: executing tests and recording results. Downstream aggregation and
+  analysis of results will be shifted to other tools and projects.
+- **Subset Filtering**: see `--sets` above.
+- **K-fold Evaluation Slicing**: The `--hash-re` option allows for deterministic
+  slicing of the exercises (now `cats`) based on the exercise hash, as sketched
+  below.
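+
+As a minimal sketch of the idea (not the actual runner; it assumes the default
+`tmp.benchmarks/cecli-cat` layout and a hex `hash` field in each `cat.yaml`):
+
+```
+import re
+from pathlib import Path
+
+import yaml
+
+def select_cats(base_dir, hash_re):
+    """Yield cat directories whose metadata hash matches the regex."""
+    for cat_file in Path(base_dir).rglob("cat.yaml"):
+        metadata = yaml.safe_load(cat_file.read_text()) or {}
+        if re.search(hash_re, metadata.get("hash", "")):
+            yield cat_file.parent
+
+# '^[0-3]' deterministically selects roughly 1/4 of the cats.
+for cat_dir in select_cats("tmp.benchmarks/cecli-cat", "^[0-3]"):
+    print(cat_dir)
+```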
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 02117242742..660aa50d57c 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 +import asyncio import datetime +import importlib_resources import json import os import random @@ -9,17 +11,18 @@ import sys import time import traceback +import yaml from collections import defaultdict from json.decoder import JSONDecodeError from pathlib import Path from types import SimpleNamespace from typing import List, Optional +import logging """ Performance-oriented refactors: - Avoid heavy imports unless needed for a given code path. - Fast path for `--stats` to skip GitPython and benchmarking deps. -- Build DataFrame / import plotting only when `--graphs` is true. - Use json.load for result file parsing to reduce memory churn. - Cache git version lookups across a single invocation. """ @@ -31,12 +34,14 @@ from aider.dump import dump # noqa: F401 +logger = logging.getLogger("aider.benchmark") + # Cache for commit-hash -> version lookup _VERSION_CACHE = {} BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks")) - -EXERCISES_DIR_DEFAULT = "polyglot-benchmark" +EXERCISES_DIR_DEFAULT = "cecli-cat" +RESULTS_DIR_DEFAULT = "cat-results" app = typer.Typer(add_completion=False, pretty_exceptions_enable=False) @@ -44,167 +49,108 @@ load_dotenv(override=True) -def find_latest_benchmark_dir(): - benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()] - if not benchmark_dirs: - print("Error: No benchmark directories found under tmp.benchmarks.") - sys.exit(1) - - # Get current time and 24 hours ago - now = datetime.datetime.now() - day_ago = now - datetime.timedelta(days=1) - - # Filter directories by name pattern YYYY-MM-DD-HH-MM-SS-- - recent_dirs = [] - for d in benchmark_dirs: - try: - # Extract datetime from directory name - date_str = d.name[:19] # Takes YYYY-MM-DD-HH-MM-SS - dir_date = datetime.datetime.strptime(date_str, "%Y-%m-%d-%H-%M-%S") - if dir_date >= day_ago: - recent_dirs.append(d) - except ValueError: - # Skip directories that don't match the expected format - continue - - if not recent_dirs: - print("Error: No benchmark directories found from the last 24 hours.") - sys.exit(1) - - # Find directory with most recently modified .md file - latest_dir = None - latest_time = 0 - - for d in recent_dirs: - # Look for .md files in subdirectories - for md_file in d.glob("*/exercises/practice/*/.*.md"): - if md_file.is_file(): - mtime = md_file.stat().st_mtime - if mtime > latest_time: - latest_time = mtime - latest_dir = d - - if not latest_dir: - print("Error: No .md files found in recent benchmark directories.") - sys.exit(1) - - print(f"Using the most recently updated benchmark directory: {latest_dir.name}") - return latest_dir - - -def show_stats(dirnames, graphs, verbose, stats_languages=None): - raw_rows = [] - for dirname in dirnames: - row = summarize_results(dirname, verbose, stats_languages) - raw_rows.append(row) - - # return - - seen = dict() - rows = [] - for row in raw_rows: - if not row: - continue - - if row.completed_tests != row.total_tests: - print( - f"Warning: {row.dir_name} is incomplete: {row.completed_tests} of {row.total_tests}" - ) - - try: - kind = (row.model, row.edit_format) - except AttributeError: - return - - if kind in seen: - dump(row.dir_name) - dump(seen[kind]) - return - - seen[kind] = row.dir_name - rows.append(vars(row)) - - repeat_hi = repeat_lo = repeat_avg = None # noqa: F841 - - # Only build a DataFrame and 
import plotting libs when graphs are requested - if graphs: - import pandas as pd # Lazy import - from plots import plot_refactoring # Lazy import +def resolve_dirname(results_dir, use_single_prior, make_new): + """ + Determines the actual directory path used for storing benchmark results. - df = pd.DataFrame.from_records(rows) - # plot_timing(df) - # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg) - # plot_outcomes_claude(df) - plot_refactoring(df) + 1. Resuming a previous run: If the --cont flag is used and exactly one matching previous run exists, it selects that existing directory. + 2. Safety check: If previous runs exist but the user didn't specify --new or --cont, it warns the user and aborts to prevent accidental overwrites or confusion. + 3. Creating a new run: If no prior run exists (or --new is used), it prepends the current timestamp to the directory name to ensure a unique workspace. + """ + logger.debug(f"initial results_dir: {results_dir}") + results_dir = Path(results_dir) + logger.debug(f"dirname1: {results_dir}") + if len(results_dir.parts) > 1: + return results_dir + priors = list(BENCHMARK_DNAME.glob(f"*--{results_dir}")) + # BUG20251223 + logger.debug(f"Found priors: {priors}") + logger.debug(f"use_single_prior: {use_single_prior}, make_new: {make_new}") -def resolve_dirname(dirname, use_single_prior, make_new): - if len(dirname.parts) > 1: - return dirname - - priors = list(BENCHMARK_DNAME.glob(f"*--{dirname}")) if len(priors) == 1 and use_single_prior: - dirname = priors[0].name - print(f"Using pre-existing {dirname}") + results_dir = priors[0].name + logger.info(f"Using pre-existing {results_dir}") elif len(priors): if not make_new: - print(f"Prior runs of {dirname} exist, use --new or name one explicitly") - print() + logger.warning( + f"Prior runs of {results_dir} exist, use --new or name one explicitly" + ) for prior in priors: - print(prior) - return + logger.warning(prior) + sys.exit(1) - if not re.match(r"\d\d\d\d-\d\d-\d\d-", str(dirname)): + if not re.match(r"\d\d\d\d-\d\d-\d\d-", str(results_dir)): now = datetime.datetime.now() now = now.strftime("%Y-%m-%d-%H-%M-%S--") - dirname = now + dirname.name + results_dir = now + results_dir.name - dirname = BENCHMARK_DNAME / dirname - return dirname + logger.debug(f"resolved {results_dir}") + results_dir = BENCHMARK_DNAME / results_dir + logger.info(f"updated results_dir: {results_dir}") + return results_dir @app.command() def main( - dirnames: Optional[List[str]] = typer.Argument(None, help="Directory names"), - graphs: bool = typer.Option(False, "--graphs", help="Generate graphs"), - model: str = typer.Option("gpt-3.5-turbo", "--model", "-m", help="Model name"), + results_dir: Optional[str] = typer.Argument( + "unnamed", help="Results directory slug" + ), + model: str = typer.Option( + "gemini/gemini-3-flash-preview", "--model", "-m", help="Model name" + ), sleep: float = typer.Option( 0, "--sleep", help="Sleep seconds between tests when single threaded" ), languages: str = typer.Option( - None, "--languages", "-l", help="Only run tests for specific languages (comma separated)" + None, + "--languages", + "-l", + help="Only run tests for specific languages (comma separated)", ), edit_format: str = typer.Option(None, "--edit-format", "-e", help="Edit format"), editor_model: str = typer.Option(None, "--editor-model", help="Editor model name"), - editor_edit_format: str = typer.Option(None, "--editor-edit-format", help="Editor edit format"), + editor_edit_format: str = typer.Option( + None, 
"--editor-edit-format", help="Editor edit format" + ), replay: str = typer.Option( None, "--replay", help="Replay previous .aider.chat.history.md responses from previous benchmark run", ), keywords: str = typer.Option( - None, "--keywords", "-k", help="Only run tests that contain keywords (comma sep)" + None, + "--keywords", + "-k", + help="Only run tests that contain keywords (comma sep)", ), clean: bool = typer.Option( - False, "--clean", "-c", help="Discard the existing testdir and make a clean copy" + False, + "--clean", + "-c", + help="Discard the existing testdir and make a clean copy", + ), + cont: bool = typer.Option( + False, "--cont", help="Continue the (single) matching testdir" ), - cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"), make_new: bool = typer.Option(False, "--new", help="Make a new dated testdir"), - no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"), + no_unit_tests: bool = typer.Option( + False, "--no-unit-tests", help="Do not run unit tests" + ), no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), - stats_only: bool = typer.Option( - False, "--stats", "-s", help="Do not run tests, just collect stats on completed tests" + verbose: int = typer.Option( + 0, "--verbose", "-v", count=True, help="Verbose output" ), - stats_languages: str = typer.Option( - None, - "--stats-languages", - help="Only include stats for specific languages (comma separated)", + quiet: bool = typer.Option(False, "--quiet", "-q", help="Quiet output"), + tries: int = typer.Option( + 2, "--tries", "-r", help="Number of tries for running tests" + ), + threads: int = typer.Option( + 1, "--threads", "-t", help="Number of threads to run in parallel" + ), + num_tests: int = typer.Option( + -1, "--num-tests", "-n", help="Number of tests to run" ), - diffs_only: bool = typer.Option(False, "--diffs", help="Just diff the provided stats dirs"), - tries: int = typer.Option(2, "--tries", "-r", help="Number of tries for running tests"), - threads: int = typer.Option(1, "--threads", "-t", help="Number of threads to run in parallel"), - num_tests: int = typer.Option(-1, "--num-tests", "-n", help="Number of tests to run"), num_ctx: Optional[int] = typer.Option( None, "--num-ctx", help="Override model context window size" ), @@ -212,7 +158,9 @@ def main( None, "--read-model-settings", help="Load aider model settings from YAML file" ), reasoning_effort: Optional[str] = typer.Option( - None, "--reasoning-effort", help="Set reasoning effort for models that support it" + None, + "--reasoning-effort", + help="Set reasoning effort for models that support it", ), thinking_tokens: Optional[int] = typer.Option( None, "--thinking-tokens", help="Set thinking tokens for models that support it" @@ -225,57 +173,85 @@ def main( exercises_dir: str = typer.Option( EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files" ), + legacy: bool = typer.Option( + False, "--legacy", help="Use legacy exercise directory structure" + ), + sets: Optional[str] = typer.Option( + None, "--sets", help="Only run tests for specific sets (comma separated)" + ), + hash_re: Optional[str] = typer.Option( + None, + "--hash-re", + help=( + "Regex to filter exercise hashes. Useful for dividing the set into fractions using" + " hex chars: '^0' for 1/16, '^[01]' for 1/8, '^[0-3]' for 1/4. 
Use '^.{n}x' to"
+            " match the nth character (e.g., '^.{2}[4-7]' for the 3rd char in range 4-7)."
+        ),
+    ),
+    dry: bool = typer.Option(
+        False, "--dry", help="Run in dry mode (no aider, no tests)"
+    ),
 ):
-    if stats_only and not dirnames:
-        latest_dir = find_latest_benchmark_dir()
-        dirnames = [str(latest_dir)]
-
-    if dirnames is None:
-        dirnames = []
-
-    if len(dirnames) > 1 and not (stats_only or diffs_only):
-        print("Only provide 1 dirname unless running with --stats or --diffs")
-        return 1
-
-    updated_dirnames = []
-    for dirname in dirnames:
-        dirname = Path(dirname)
-        dirname = resolve_dirname(dirname, stats_only or cont, make_new)
-        if not dirname:
-            return 1
-        updated_dirnames.append(dirname)
-
-    if stats_only:
-        return show_stats(updated_dirnames, graphs, verbose, stats_languages)
-
-    if diffs_only:
-        return show_diffs(updated_dirnames)
-
-    assert len(updated_dirnames) == 1, updated_dirnames
-    dirname = updated_dirnames[0]
+    # setup logging and verbosity
+    if quiet:
+        log_level = logging.WARNING
+    elif verbose > 0:
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.INFO
 
-    # Lazy imports for the actual benchmark run
-    import git  # Heavy; avoid for --stats/--diffs
-    import importlib_resources  # Used for model metadata registration
-    import lox  # Only needed for threaded runs
+    logging.basicConfig(level=log_level, format="%(message)s")
 
-    from aider import models, sendchat
-    from aider.coders import base_coder
+    from aider import models
 
-    repo = git.Repo(search_parent_directories=True)
-    commit_hash = repo.head.object.hexsha[:7]
-    if repo.is_dirty():
-        commit_hash += "-dirty"
+    if dry:
+        no_aider = True
+        no_unit_tests = True
+        commit_hash = "???????"
+    else:
+        # Lazy imports for the actual benchmark run
+        import git  # Heavy
+        import lox  # Only needed for threaded runs
+        from aider import sendchat
+        from aider.coders import base_coder
+
+        repo = git.Repo(search_parent_directories=True)
+        commit_hash = repo.head.object.hexsha[:7]
+        if repo.is_dirty():
+            commit_hash += "-dirty"
+
+    resolved_results_dir = resolve_dirname(results_dir, cont, make_new)
+
+    if not resolved_results_dir:
+        logger.error(f"Could not resolve results directory from slug: {results_dir}")
+        logger.error(f"Checked in {BENCHMARK_DNAME}")
+        return 1
+    results_dir = resolved_results_dir
 
-    if "AIDER_DOCKER" not in os.environ:
-        print("Warning: benchmarking runs unvetted code from GPT, run in a docker container")
+    if not dry and "AIDER_DOCKER" not in os.environ:
+        logger.warning(
+            "Warning: Benchmarking runs unvetted code. Run in a docker container."
+        )
+        logger.warning(
+            "Set AIDER_DOCKER in the environment to bypass this check at your own risk."
+        )
         return
 
-    assert BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir(), BENCHMARK_DNAME
+    # Check dirs exist
+    if not (BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir()):
+        logger.error(f"Benchmark directory not found: {BENCHMARK_DNAME}")
+        sys.exit(1)
+    original_dname = BENCHMARK_DNAME / exercises_dir
+    if not (original_dname.exists() and original_dname.is_dir()):
+        logger.error(f"Exercises directory not found: {original_dname}")
+        sys.exit(1)
 
-    def get_exercise_dirs(base_dir, languages=None):
-        """Get all exercise directories for specified languages (or all if none specified)"""
+    def legacy_get_exercise_dirs(base_dir, languages=None):
+        """Get all exercise directories for specified languages (or all if none specified).
+        Uses the legacy `exercises/practice` pattern.
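+        (Kept for the old `polyglot-benchmark` layout; selected via `--legacy`.)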
+ """ base_dir = Path(base_dir) + logger.info(f"Looking for exercises in {base_dir}") # Get available language dirs lang_dirs = [d for d in base_dir.iterdir() if d.is_dir()] @@ -286,7 +262,9 @@ def get_exercise_dirs(base_dir, languages=None): lang_dirs = [d for d in lang_dirs if d.name.lower() in requested] dump(lang_dirs) if not lang_dirs: - print(f"No matching language directories found for: {languages}") + logger.warning( + f"No matching language directories found for: {languages}" + ) return [] # Get all exercise dirs under exercises/practice for each language @@ -298,205 +276,220 @@ def get_exercise_dirs(base_dir, languages=None): return exercise_dirs - original_dname = BENCHMARK_DNAME / exercises_dir - assert original_dname.exists() and original_dname.is_dir(), original_dname + def get_exercise_dirs( + base_dir, languages=None, sets=None, hash_re=None, legacy=False + ): + if legacy: + return legacy_get_exercise_dirs(base_dir, languages) - exercise_dirs = get_exercise_dirs(original_dname, languages) + base_dir = Path(base_dir) + logger.info(f"Scanning for cat.yaml in {base_dir}") + + lang_filter = ( + set(l.strip().lower() for l in languages.split(",")) if languages else None + ) + set_filter = set(s.strip().lower() for s in sets.split(",")) if sets else None + + exercise_dirs = [] + for cat_file in base_dir.rglob("cat.yaml"): + try: + with open(cat_file, "r") as f: + metadata = yaml.safe_load(f) + if verbose > 1: + logger.debug( + f"found {metadata['name']} ({metadata['language']})" + ) + except Exception as e: + logger.warning(f"Failed to parse {cat_file}: {e}") + continue + + if lang_filter and metadata.get("language", "").lower() not in lang_filter: + continue + + if set_filter: + cat_sets = set(s.lower() for s in metadata.get("sets", [])) + if not (set_filter & cat_sets): + continue + + if hash_re and not re.search(hash_re, metadata.get("hash", "")): + continue + + exercise_dirs.append(cat_file.parent) + + logger.info(f"Found {len(exercise_dirs)} cats") + return exercise_dirs + + exercise_dirs = get_exercise_dirs( + original_dname, languages, sets, hash_re, legacy=legacy + ) if not exercise_dirs: - print("No exercise directories found") + logger.error("No exercise directories found") return 1 - if clean and dirname.exists(): - print("Cleaning up and replacing", dirname) - dir_files = set(fn.name for fn in dirname.glob("*")) + if clean and results_dir.exists() and not dry: + logger.info(f"Cleaning up and replacing {results_dir}") + dir_files = set(fn.name for fn in results_dir.glob("*")) original_files = set(fn.name for fn in original_dname.glob("*")) if dir_files != original_files: - print("ERROR: will not delete dir that does not look like original tests", dirname) + logger.error( + f"ERROR: will not delete dir that does not look like original tests {results_dir}" + ) return - dest = dirname.parent / "OLD" / dirname.name + dest = results_dir.parent / "OLD" / results_dir.name if dest.exists(): old_now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - dest = dirname.parent / "OLD" / (old_now + dirname.name) - - dirname.rename(dest) - - if not dirname.exists(): - print(f"Copying {original_dname} -> {dirname} ...") - # Only copy the practice subdirs with exercises - os.makedirs(dirname, exist_ok=True) - for lang_dir in original_dname.iterdir(): - if not lang_dir.is_dir(): - continue - practice_dir = lang_dir / "exercises" / "practice" - if practice_dir.exists(): - dest_lang_dir = dirname / lang_dir.name / "exercises" / "practice" - os.makedirs(dest_lang_dir.parent, 
exist_ok=True) - shutil.copytree(practice_dir, dest_lang_dir) - print("...done") - - test_dnames = sorted(str(d.relative_to(original_dname)) for d in exercise_dirs) - - resource_metadata = importlib_resources.files("aider.resources").joinpath("model-metadata.json") + dest = results_dir.parent / "OLD" / (old_now + results_dir.name) + + results_dir.rename(dest) + + if not dry: + if not results_dir.exists(): + logger.info(f"Copying {original_dname} -> {results_dir} ...") + os.makedirs(results_dir, exist_ok=True) + + copied = False + for exercise_dir in exercise_dirs: + dest_dir = results_dir / exercise_dir.name + if not dest_dir.exists(): + if not copied: + logger.info(f"Adding missing exercises to {results_dir} ...") + shutil.copytree(exercise_dir, dest_dir) + copied = True + if copied: + logger.info("...done") + + test_dnames = sorted(d.name for d in exercise_dirs) + + resource_metadata = importlib_resources.files("aider.resources").joinpath( + "model-metadata.json" + ) model_metadata_files_loaded = models.register_litellm_models([resource_metadata]) dump(model_metadata_files_loaded) if read_model_settings: try: files_loaded = models.register_models([read_model_settings]) - if verbose: - if files_loaded: - print(f"Loaded model settings from: {files_loaded[0]}") - else: - print(f"No model settings loaded from: {read_model_settings}") + if files_loaded: + logger.debug(f"Loaded model settings from: {files_loaded[0]}") + else: + logger.debug(f"No model settings loaded from: {read_model_settings}") except Exception as e: - print(f"Error loading model settings: {e}") + logger.error(f"Error loading model settings: {e}") return 1 if keywords: keywords = keywords.split(",") - test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn] + test_dnames = [ + dn for dn in test_dnames for keyword in keywords if keyword in dn + ] random.shuffle(test_dnames) if num_tests > 0: test_dnames = test_dnames[:num_tests] - # Don't give up when benchmarking - LONG_TIMEOUT = 24 * 60 * 60 - sendchat.RETRY_TIMEOUT = LONG_TIMEOUT - base_coder.RETRY_TIMEOUT = LONG_TIMEOUT - models.RETRY_TIMEOUT = LONG_TIMEOUT + if not no_aider: + # Don't give up when benchmarking + LONG_TIMEOUT = 24 * 60 * 60 + sendchat.RETRY_TIMEOUT = LONG_TIMEOUT + base_coder.RETRY_TIMEOUT = LONG_TIMEOUT + models.RETRY_TIMEOUT = LONG_TIMEOUT # Enable in-memory RepoMap cache when running multiple threads to avoid SQLite contention repomap_in_memory = threads > 1 - if threads == 1: - all_results = [] - for test_path in test_dnames: - results = run_test( - original_dname, - dirname / test_path, - model, - edit_format, - tries, - no_unit_tests, - no_aider, - verbose, - commit_hash, - replay, - editor_model, - editor_edit_format, - num_ctx, - sleep, - reasoning_effort, - thinking_tokens, - map_tokens, - repomap_in_memory, - ) + test_args = dict( + model_name=model, + edit_format=edit_format, + tries=tries, + no_unit_tests=no_unit_tests, + no_aider=no_aider, + verbose=verbose, + commit_hash=commit_hash, + replay=replay, + editor_model=editor_model, + editor_edit_format=editor_edit_format, + num_ctx=num_ctx, + sleep=sleep, + reasoning_effort=reasoning_effort, + thinking_tokens=thinking_tokens, + map_tokens=map_tokens, + repomap_in_memory=repomap_in_memory, + dry=dry, + results_dir=results_dir, + ) - all_results.append(results) - summarize_results(dirname, verbose) - if sleep: - time.sleep(sleep) - else: + if threads > 1: run_test_threaded = lox.thread(threads)(run_test) for test_path in test_dnames: run_test_threaded.scatter( - 
original_dname, - dirname / test_path, - model, - edit_format, - tries, - no_unit_tests, - no_aider, - verbose, - commit_hash, - replay, - editor_model, - editor_edit_format, - num_ctx, - sleep, - reasoning_effort, - thinking_tokens, - map_tokens, - repomap_in_memory, + original_dname, results_dir / test_path, **test_args ) all_results = run_test_threaded.gather(tqdm=True) + else: + all_results = [] + for test_path in test_dnames: + results = run_test(original_dname, results_dir / test_path, **test_args) + all_results.append(results) + summarize_results(results_dir, verbose) + if sleep: + time.sleep(sleep) print() print() print() - summarize_results(dirname, verbose) + summarize_results(results_dir, verbose) return 0 -def show_diffs(dirnames): - dirnames = sorted(dirnames) - - all_results = dict((dirname, load_results(dirname)) for dirname in dirnames) - testcases = set() - for results in all_results.values(): - testcases.update(result["testcase"] for result in results) - - testcases = sorted(testcases) - - unchanged = set() - - for testcase in testcases: - all_outcomes = [] - for dirname in dirnames: - results = all_results[dirname] - result = [r for r in results if r["testcase"] == testcase][0] - - outcomes = tuple(result["tests_outcomes"]) - all_outcomes.append(True in outcomes) - - if len(set(all_outcomes)) == 1: - unchanged.add(testcase) - continue - - print() - print(testcase) - for outcome, dirname in zip(all_outcomes, dirnames): - print(outcome, f"{dirname}/{testcase}/.aider.chat.history.md") - - changed = set(testcases) - unchanged - print() - print("changed:", len(changed), ",".join(sorted(changed))) - print() - print("unchanged:", len(unchanged), ",".join(sorted(unchanged))) - - -def load_results(dirname, stats_languages=None): - dirname = Path(dirname) +def load_results(results_dir, stats_languages=None): + results_dir = Path(results_dir) lang_to_results = {} - if stats_languages: - languages = [lang.strip().lower() for lang in stats_languages.split(",")] - glob_patterns = [f"{lang}/exercises/practice/*/.aider.results.json" for lang in languages] - else: - glob_patterns = ["*/exercises/practice/*/.aider.results.json"] + # BUG20251223 + logger.debug(f"Globbing {results_dir} for results") + files = list(results_dir.glob("*/.aider.results.json")) + logger.debug(f"Found {len(files)} files") - for pattern in glob_patterns: - for fname in dirname.glob(pattern): - try: - results = json.loads(fname.read_text()) - # json / test / prac / exer / lang - lang = fname.parent.parent.parent.parent.name - lang_to_results.setdefault(lang, []).append(results) - except json.JSONDecodeError: - print("json.JSONDecodeError", fname) - continue + for fname in files: + try: + results = json.loads(fname.read_text()) + # BUG20251223 + logger.debug(f"Processing result file: {fname}") + + # Try to get language from cat.yaml if it exists in the same dir + lang = "unknown" + cat_yaml = fname.parent / "cat.yaml" + if cat_yaml.exists(): + try: + with open(cat_yaml, "r") as f: + metadata = yaml.safe_load(f) + lang = metadata.get("language", "unknown") + except Exception: + pass + + if stats_languages: + languages = [ + lang.strip().lower() for lang in stats_languages.split(",") + ] + if lang.lower() not in languages: + continue + + logger.debug(f"Derived lang: {lang}") + lang_to_results.setdefault(lang, []).append(results) + except json.JSONDecodeError: + logger.warning(f"json.JSONDecodeError {fname}") + continue return lang_to_results -def summarize_results(dirname, verbose, stats_languages=None): - 
lang_to_results = load_results(dirname, stats_languages) +def summarize_results(results_dir, verbose, stats_languages=None): + lang_to_results = load_results(results_dir, stats_languages) res = SimpleNamespace() - res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*"))) + res.total_tests = len(list(Path(results_dir).glob("*/.aider.results.json"))) try: tries = max( @@ -508,7 +501,7 @@ def summarize_results(dirname, verbose, stats_languages=None): except ValueError: tries = 0 - res.dir_name = str(dirname) + res.dir_name = str(results_dir) passed_tests = [0] * tries @@ -600,16 +593,30 @@ def add(attr_name, increment, global_stats, lang_stats): add("lazy_comments", results.get("lazy_comments", 0), res, lang_stats) add("syntax_errors", results.get("syntax_errors", 0), res, lang_stats) - add("indentation_errors", results.get("indentation_errors", 0), res, lang_stats) + add( + "indentation_errors", + results.get("indentation_errors", 0), + res, + lang_stats, + ) add("prompt_tokens", results.get("prompt_tokens", 0), res, lang_stats) - add("completion_tokens", results.get("completion_tokens", 0), res, lang_stats) + add( + "completion_tokens", + results.get("completion_tokens", 0), + res, + lang_stats, + ) res.reasoning_effort = results.get("reasoning_effort") res.thinking_tokens = results.get("thinking_tokens") res.map_tokens = results.get("map_tokens") - for key in "model edit_format commit_hash editor_model editor_edit_format".split(): + for ( + key + ) in ( + "model edit_format commit_hash editor_model editor_edit_format".split() + ): val = results.get(key) if val: variants[key].add(val) @@ -621,11 +628,11 @@ def add(attr_name, increment, global_stats, lang_stats): # return console = Console(highlight=False) - console.rule(title=str(dirname)) + console.rule(title=str(results_dir)) commit_hashes = variants["commit_hash"] versions = get_versions(commit_hashes) - date = dirname.name[:10] + date = results_dir.name[:10] def show(stat, red="red"): val = getattr(res, stat) @@ -640,7 +647,7 @@ def show(stat, red="red"): setattr(res, f"pass_rate_{i + 1}", f"{pass_rate:.1f}") setattr(res, f"pass_num_{i + 1}", passed_tests[i]) - print(f"- dirname: {dirname.name}") + print(f"- results_dir: {results_dir.name}") style = None if res.completed_tests == res.total_tests else "red" console.print(f" test_cases: {res.completed_tests}", style=style) for key, val in variants.items(): @@ -732,7 +739,9 @@ def format_lang_stats(lang, lang_stats): def compute_lang_to_col_widths(lang_to_stats): lang_to_col_widths = {} for lang, lang_stats in lang_to_stats.items(): - lang_stat_attrs = [getattr(lang_stats, attr) for attr in lang_stats.__dict__] + lang_stat_attrs = [ + getattr(lang_stats, attr) for attr in lang_stats.__dict__ + ] lang_col_width = max(len(lang), len(max(lang_stat_attrs, key=len))) lang_to_col_widths[lang] = lang_col_width @@ -742,7 +751,10 @@ def compute_lang_to_col_widths(lang_to_stats): print("======== Stats by language ========") print() - [format_lang_stats(lang, lang_stats) for lang, lang_stats in lang_to_stats.items()] + [ + format_lang_stats(lang, lang_stats) + for lang, lang_stats in lang_to_stats.items() + ] lang_to_col_widths = compute_lang_to_col_widths(lang_to_stats) any_stats = list(lang_to_stats.values())[0] @@ -829,24 +841,28 @@ def get_replayed_content(replay_dname, test_dname): return res res = res.splitlines(keepends=True) - res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")] + res = [ + line + for line in res + if not 
line.startswith("> ") and not line.startswith("#### ") + ] return "".join(res) def run_test(original_dname, testdir, *args, **kwargs): try: - return run_test_real(original_dname, testdir, *args, **kwargs) + return asyncio.run(run_test_real(original_dname, testdir, *args, **kwargs)) except Exception: - print("=" * 40) - print("Test failed") - traceback.print_exc() + logger.error("=" * 40) + logger.error("Test failed") + logger.error(traceback.format_exc()) testdir = Path(testdir) results_fname = testdir / ".aider.results.json" results_fname.write_text(json.dumps(dict(exception=traceback.format_exc()))) -def run_test_real( +async def run_test_real( original_dname, testdir, model_name, @@ -866,6 +882,8 @@ def run_test_real( map_tokens: Optional[int] = None, read_model_settings=None, repomap_in_memory: bool = False, + dry: bool = False, + results_dir=None, ): # Lazy imports: only needed in the actual benchmark execution path import git @@ -876,7 +894,9 @@ def run_test_real( from aider.io import InputOutput if not os.path.isdir(testdir): - print("Not a dir:", testdir) + if dry: + return + logger.error(f"Not a dir: {testdir}") return testdir = Path(testdir) @@ -892,7 +912,7 @@ def run_test_real( # else: return res except JSONDecodeError: - print(f"{results_fname} failed to parse, redoing...") + logger.warning(f"{results_fname} failed to parse, redoing...") # Read solution and test files from config fnames = [] @@ -927,6 +947,25 @@ def run_test_real( # Remove any ignore files from the solution set that LLM will edit solution_files.difference_update(ignore_files) + # Try to find original relative path from cat.yaml + original_rel_path = None + cat_yaml = testdir / "cat.yaml" + if cat_yaml.exists(): + try: + with open(cat_yaml, "r") as f: + metadata = yaml.safe_load(f) + # We need to find where this exercise was in original_dname. + # Since we don't store the full relative path in cat.yaml, + # we have to search for it or rely on the fact that we know + # it was copied from original_dname. + # A better way is to look for the directory with the same name (hash) + # in original_dname. 
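+            # (Assumption: cat directory names are their unique hashes, so the
+            # first rglob match below is taken as the original location.)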
+ matches = list(original_dname.rglob(testdir.name)) + if matches: + original_rel_path = matches[0].relative_to(original_dname) + except Exception: + pass + # Copy all solution files for file_path in solution_files: src = testdir / Path(file_path) @@ -934,20 +973,13 @@ def run_test_real( fnames.append(src) # restore the original file, in case we interrupted a prev run # Find the original file in the language-specific practice dir - lang_part = str(testdir).split("/exercises/practice/")[0] - original_fname = ( - original_dname - / Path(lang_part).name - / "exercises" - / "practice" - / testdir.name - / file_path - ) - if original_fname.exists(): - os.makedirs(src.parent, exist_ok=True) - shutil.copy(original_fname, src) + if not dry and original_rel_path: + original_fname = original_dname / original_rel_path / file_path + if original_fname.exists(): + os.makedirs(src.parent, exist_ok=True) + shutil.copy(original_fname, src) else: - print(f"Warning: Solution file not found: {src}") + logger.warning(f"Warning: Solution file not found: {src}") file_list = " ".join(fname.name for fname in fnames) @@ -997,22 +1029,28 @@ def run_test_real( dump(main_model) dump(edit_format) show_fnames = ",".join(map(str, fnames)) - print("fnames:", show_fnames) + logger.info(f"fnames: {show_fnames}") # Ensure this test directory is a standalone git repo so RepoMap can be used - try: - git_dir = testdir / ".git" - if not git_dir.exists(): - r = git.Repo.init(testdir) - # Set a local identity to avoid commit failures in clean containers - with r.config_writer() as cw: - cw.set_value("user", "name", "aider-benchmark") - cw.set_value("user", "email", "aider-benchmark@example.com") - # Add existing files (solution set and any current files) - r.index.add([str(p.relative_to(testdir)) for p in testdir.rglob("*") if p.is_file()]) - r.index.commit("Initial commit for aider benchmark") - except Exception as e: - if verbose: - print(f"Warning: failed to initialize git repo in {testdir}: {e}") + if not dry: + try: + git_dir = testdir / ".git" + if not git_dir.exists(): + r = git.Repo.init(testdir) + # Set a local identity to avoid commit failures in clean containers + with r.config_writer() as cw: + cw.set_value("user", "name", "aider-benchmark") + cw.set_value("user", "email", "aider-benchmark@example.com") + # Add existing files (solution set and any current files) + r.index.add( + [ + str(p.relative_to(testdir)) + for p in testdir.rglob("*") + if p.is_file() + ] + ) + r.index.commit("Initial commit for aider benchmark") + except Exception as e: + logger.debug(f"Warning: failed to initialize git repo in {testdir}: {e}") coder_kwargs = dict( main_model=main_model, @@ -1036,7 +1074,7 @@ def run_test_real( if map_tokens is not None: coder_kwargs["map_tokens"] = map_tokens - coder = Coder.create(**coder_kwargs) + coder = await Coder.create(**coder_kwargs) dump(coder.ignore_mentions) coder.show_announcements() @@ -1063,9 +1101,9 @@ def run_test_real( show = [">> " + line for line in show] io.append_chat_history("".join(show)) - coder.apply_updates() + await coder.apply_updates() else: - response = coder.run(with_message=instructions, preproc=False) + response = await coder.run(with_message=instructions, preproc=False) dur += time.time() - start @@ -1103,46 +1141,45 @@ def run_test_real( errors = errors.splitlines() syntax_errors += sum(1 for line in errors if line.startswith("SyntaxError")) - indentation_errors += sum(1 for line in errors if line.startswith("IndentationError")) + indentation_errors += sum( + 1 for line in 
errors if line.startswith("IndentationError") + ) - print(errors[-1]) + logger.info(errors[-1]) errors = "\n".join(errors) instructions = errors instructions += prompts.test_failures.format(file_list=file_list) - # Clean up build directories after all attempts - # Rust target/debug - target_dir = testdir / "target" / "debug" - if target_dir.exists(): - try: - shutil.rmtree(target_dir) - if verbose: - print(f"Cleaned up Rust target/debug directory: {target_dir}") - except (OSError, shutil.Error, PermissionError) as e: - if verbose: - print(f"Failed to clean up Rust target/debug directory: {e}") - - # Java build directories - java_build_dir = testdir / "build" - if java_build_dir.exists(): - try: - shutil.rmtree(java_build_dir) - if verbose: - print(f"Cleaned up Java build directory: {java_build_dir}") - except (OSError, shutil.Error, PermissionError) as e: - if verbose: - print(f"Failed to clean up Java build directory: {e}") - - # Node.js node_modules directories - node_modules_dir = testdir / "node_modules" - if node_modules_dir.exists(): - try: - shutil.rmtree(node_modules_dir) - if verbose: - print(f"Cleaned up Node.js node_modules directory: {node_modules_dir}") - except (OSError, shutil.Error, PermissionError) as e: - if verbose: - print(f"Failed to clean up Node.js node_modules directory: {e}") + if not dry: + # Clean up build directories after all attempts + # Rust target/debug + target_dir = testdir / "target" / "debug" + if target_dir.exists(): + try: + shutil.rmtree(target_dir) + logger.debug(f"Cleaned up Rust target/debug directory: {target_dir}") + except (OSError, shutil.Error, PermissionError) as e: + logger.debug(f"Failed to clean up Rust target/debug directory: {e}") + + # Java build directories + java_build_dir = testdir / "build" + if java_build_dir.exists(): + try: + shutil.rmtree(java_build_dir) + logger.debug(f"Cleaned up Java build directory: {java_build_dir}") + except (OSError, shutil.Error, PermissionError) as e: + logger.debug(f"Failed to clean up Java build directory: {e}") + + # Node.js node_modules directories + node_modules_dir = testdir / "node_modules" + if node_modules_dir.exists(): + try: + shutil.rmtree(node_modules_dir) + logger.debug( + f"Cleaned up Node.js node_modules directory: {node_modules_dir}" + ) + except (OSError, shutil.Error, PermissionError) as e: + logger.debug(f"Failed to clean up Node.js node_modules directory: {e}") results = dict( testdir=str(testdir), @@ -1175,7 +1212,9 @@ def run_test_real( ) if edit_format == "architect": - results["editor_model"] = main_model.editor_model.name if main_model.editor_model else None + results["editor_model"] = ( + main_model.editor_model.name if main_model.editor_model else None + ) results["editor_edit_format"] = main_model.editor_edit_format dump(results) @@ -1187,6 +1226,12 @@ def run_test_real( def run_unit_tests(original_dname, testdir, history_fname, test_files): timeout = 60 * 3 + # Find original relative path + original_rel_path = None + matches = list(original_dname.rglob(testdir.name)) + if matches: + original_rel_path = matches[0].relative_to(original_dname) + # Map of file extensions to test commands TEST_COMMANDS = { ".py": ["pytest"], @@ -1208,14 +1253,18 @@ def run_unit_tests(original_dname, testdir, history_fname, test_files): break if not command: - raise ValueError(f"No test command found for files with extensions: {extensions}") + raise ValueError( + f"No test command found for files with extensions: {extensions}" + ) # Copy test files from original directory for file_path in 
test_files: - src = original_dname / Path(*testdir.parts[-4:]) / file_path + if not original_rel_path: + break + src = original_dname / original_rel_path / file_path dst = testdir / file_path if src.exists(): - print("copying", src, dst) + logger.info(f"copying {src} {dst}") os.makedirs(dst.parent, exist_ok=True) shutil.copy(src, dst) @@ -1228,7 +1277,7 @@ def run_unit_tests(original_dname, testdir, history_fname, test_files): content = re.sub(r"@Disabled\([^)]*\)\s*\n", "", content) test_file.write_text(content) - print(" ".join(command)) + logger.info(" ".join(command)) result = subprocess.run( command, @@ -1250,7 +1299,7 @@ def run_unit_tests(original_dname, testdir, history_fname, test_files): fh.write(f"```\n{res}\n```") if not success: - print(f"Tests failed: {testdir}") + logger.info(f"Tests failed: {testdir}") return res diff --git a/benchmark/benchmark_classic.py b/benchmark/benchmark_classic.py new file mode 100755 index 00000000000..02117242742 --- /dev/null +++ b/benchmark/benchmark_classic.py @@ -0,0 +1,1265 @@ +#!/usr/bin/env python3 +import datetime +import json +import os +import random +import re +import shutil +import subprocess +import sys +import time +import traceback +from collections import defaultdict +from json.decoder import JSONDecodeError +from pathlib import Path +from types import SimpleNamespace +from typing import List, Optional + +""" +Performance-oriented refactors: +- Avoid heavy imports unless needed for a given code path. +- Fast path for `--stats` to skip GitPython and benchmarking deps. +- Build DataFrame / import plotting only when `--graphs` is true. +- Use json.load for result file parsing to reduce memory churn. +- Cache git version lookups across a single invocation. +""" + +# Heavy modules are lazily imported within the code paths that need them. 
+import typer +from dotenv import load_dotenv +from rich.console import Console + +from aider.dump import dump # noqa: F401 + +# Cache for commit-hash -> version lookup +_VERSION_CACHE = {} + +BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks")) + +EXERCISES_DIR_DEFAULT = "polyglot-benchmark" + +app = typer.Typer(add_completion=False, pretty_exceptions_enable=False) + + +load_dotenv(override=True) + + +def find_latest_benchmark_dir(): + benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()] + if not benchmark_dirs: + print("Error: No benchmark directories found under tmp.benchmarks.") + sys.exit(1) + + # Get current time and 24 hours ago + now = datetime.datetime.now() + day_ago = now - datetime.timedelta(days=1) + + # Filter directories by name pattern YYYY-MM-DD-HH-MM-SS-- + recent_dirs = [] + for d in benchmark_dirs: + try: + # Extract datetime from directory name + date_str = d.name[:19] # Takes YYYY-MM-DD-HH-MM-SS + dir_date = datetime.datetime.strptime(date_str, "%Y-%m-%d-%H-%M-%S") + if dir_date >= day_ago: + recent_dirs.append(d) + except ValueError: + # Skip directories that don't match the expected format + continue + + if not recent_dirs: + print("Error: No benchmark directories found from the last 24 hours.") + sys.exit(1) + + # Find directory with most recently modified .md file + latest_dir = None + latest_time = 0 + + for d in recent_dirs: + # Look for .md files in subdirectories + for md_file in d.glob("*/exercises/practice/*/.*.md"): + if md_file.is_file(): + mtime = md_file.stat().st_mtime + if mtime > latest_time: + latest_time = mtime + latest_dir = d + + if not latest_dir: + print("Error: No .md files found in recent benchmark directories.") + sys.exit(1) + + print(f"Using the most recently updated benchmark directory: {latest_dir.name}") + return latest_dir + + +def show_stats(dirnames, graphs, verbose, stats_languages=None): + raw_rows = [] + for dirname in dirnames: + row = summarize_results(dirname, verbose, stats_languages) + raw_rows.append(row) + + # return + + seen = dict() + rows = [] + for row in raw_rows: + if not row: + continue + + if row.completed_tests != row.total_tests: + print( + f"Warning: {row.dir_name} is incomplete: {row.completed_tests} of {row.total_tests}" + ) + + try: + kind = (row.model, row.edit_format) + except AttributeError: + return + + if kind in seen: + dump(row.dir_name) + dump(seen[kind]) + return + + seen[kind] = row.dir_name + rows.append(vars(row)) + + repeat_hi = repeat_lo = repeat_avg = None # noqa: F841 + + # Only build a DataFrame and import plotting libs when graphs are requested + if graphs: + import pandas as pd # Lazy import + from plots import plot_refactoring # Lazy import + + df = pd.DataFrame.from_records(rows) + # plot_timing(df) + # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg) + # plot_outcomes_claude(df) + plot_refactoring(df) + + +def resolve_dirname(dirname, use_single_prior, make_new): + if len(dirname.parts) > 1: + return dirname + + priors = list(BENCHMARK_DNAME.glob(f"*--{dirname}")) + if len(priors) == 1 and use_single_prior: + dirname = priors[0].name + print(f"Using pre-existing {dirname}") + elif len(priors): + if not make_new: + print(f"Prior runs of {dirname} exist, use --new or name one explicitly") + print() + for prior in priors: + print(prior) + return + + if not re.match(r"\d\d\d\d-\d\d-\d\d-", str(dirname)): + now = datetime.datetime.now() + now = now.strftime("%Y-%m-%d-%H-%M-%S--") + dirname = now + dirname.name + + dirname = 
BENCHMARK_DNAME / dirname + return dirname + + +@app.command() +def main( + dirnames: Optional[List[str]] = typer.Argument(None, help="Directory names"), + graphs: bool = typer.Option(False, "--graphs", help="Generate graphs"), + model: str = typer.Option("gpt-3.5-turbo", "--model", "-m", help="Model name"), + sleep: float = typer.Option( + 0, "--sleep", help="Sleep seconds between tests when single threaded" + ), + languages: str = typer.Option( + None, "--languages", "-l", help="Only run tests for specific languages (comma separated)" + ), + edit_format: str = typer.Option(None, "--edit-format", "-e", help="Edit format"), + editor_model: str = typer.Option(None, "--editor-model", help="Editor model name"), + editor_edit_format: str = typer.Option(None, "--editor-edit-format", help="Editor edit format"), + replay: str = typer.Option( + None, + "--replay", + help="Replay previous .aider.chat.history.md responses from previous benchmark run", + ), + keywords: str = typer.Option( + None, "--keywords", "-k", help="Only run tests that contain keywords (comma sep)" + ), + clean: bool = typer.Option( + False, "--clean", "-c", help="Discard the existing testdir and make a clean copy" + ), + cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"), + make_new: bool = typer.Option(False, "--new", help="Make a new dated testdir"), + no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"), + no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), + stats_only: bool = typer.Option( + False, "--stats", "-s", help="Do not run tests, just collect stats on completed tests" + ), + stats_languages: str = typer.Option( + None, + "--stats-languages", + help="Only include stats for specific languages (comma separated)", + ), + diffs_only: bool = typer.Option(False, "--diffs", help="Just diff the provided stats dirs"), + tries: int = typer.Option(2, "--tries", "-r", help="Number of tries for running tests"), + threads: int = typer.Option(1, "--threads", "-t", help="Number of threads to run in parallel"), + num_tests: int = typer.Option(-1, "--num-tests", "-n", help="Number of tests to run"), + num_ctx: Optional[int] = typer.Option( + None, "--num-ctx", help="Override model context window size" + ), + read_model_settings: str = typer.Option( + None, "--read-model-settings", help="Load aider model settings from YAML file" + ), + reasoning_effort: Optional[str] = typer.Option( + None, "--reasoning-effort", help="Set reasoning effort for models that support it" + ), + thinking_tokens: Optional[int] = typer.Option( + None, "--thinking-tokens", help="Set thinking tokens for models that support it" + ), + map_tokens: Optional[int] = typer.Option( + None, + "--map-tokens", + help="Suggested number of tokens for repo map (0 to disable)", + ), + exercises_dir: str = typer.Option( + EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files" + ), +): + if stats_only and not dirnames: + latest_dir = find_latest_benchmark_dir() + dirnames = [str(latest_dir)] + + if dirnames is None: + dirnames = [] + + if len(dirnames) > 1 and not (stats_only or diffs_only): + print("Only provide 1 dirname unless running with --stats or --diffs") + return 1 + + updated_dirnames = [] + for dirname in dirnames: + dirname = Path(dirname) + dirname = resolve_dirname(dirname, stats_only or cont, make_new) + if not dirname: + return 1 + 
updated_dirnames.append(dirname) + + if stats_only: + return show_stats(updated_dirnames, graphs, verbose, stats_languages) + + if diffs_only: + return show_diffs(updated_dirnames) + + assert len(updated_dirnames) == 1, updated_dirnames + dirname = updated_dirnames[0] + + # Lazy imports for the actual benchmark run + import git # Heavy; avoid for --stats/--diffs + import importlib_resources # Used for model metadata registration + import lox # Only needed for threaded runs + + from aider import models, sendchat + from aider.coders import base_coder + + repo = git.Repo(search_parent_directories=True) + commit_hash = repo.head.object.hexsha[:7] + if repo.is_dirty(): + commit_hash += "-dirty" + + if "AIDER_DOCKER" not in os.environ: + print("Warning: benchmarking runs unvetted code from GPT, run in a docker container") + return + + assert BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir(), BENCHMARK_DNAME + + def get_exercise_dirs(base_dir, languages=None): + """Get all exercise directories for specified languages (or all if none specified)""" + base_dir = Path(base_dir) + + # Get available language dirs + lang_dirs = [d for d in base_dir.iterdir() if d.is_dir()] + + # Filter to requested languages if specified + if languages: + requested = set(lang.strip().lower() for lang in languages.split(",")) + lang_dirs = [d for d in lang_dirs if d.name.lower() in requested] + dump(lang_dirs) + if not lang_dirs: + print(f"No matching language directories found for: {languages}") + return [] + + # Get all exercise dirs under exercises/practice for each language + exercise_dirs = [] + for lang_dir in lang_dirs: + practice_dir = lang_dir / "exercises" / "practice" + if practice_dir.exists(): + exercise_dirs.extend(d for d in practice_dir.iterdir() if d.is_dir()) + + return exercise_dirs + + original_dname = BENCHMARK_DNAME / exercises_dir + assert original_dname.exists() and original_dname.is_dir(), original_dname + + exercise_dirs = get_exercise_dirs(original_dname, languages) + + if not exercise_dirs: + print("No exercise directories found") + return 1 + + if clean and dirname.exists(): + print("Cleaning up and replacing", dirname) + dir_files = set(fn.name for fn in dirname.glob("*")) + original_files = set(fn.name for fn in original_dname.glob("*")) + if dir_files != original_files: + print("ERROR: will not delete dir that does not look like original tests", dirname) + return + + dest = dirname.parent / "OLD" / dirname.name + if dest.exists(): + old_now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + dest = dirname.parent / "OLD" / (old_now + dirname.name) + + dirname.rename(dest) + + if not dirname.exists(): + print(f"Copying {original_dname} -> {dirname} ...") + # Only copy the practice subdirs with exercises + os.makedirs(dirname, exist_ok=True) + for lang_dir in original_dname.iterdir(): + if not lang_dir.is_dir(): + continue + practice_dir = lang_dir / "exercises" / "practice" + if practice_dir.exists(): + dest_lang_dir = dirname / lang_dir.name / "exercises" / "practice" + os.makedirs(dest_lang_dir.parent, exist_ok=True) + shutil.copytree(practice_dir, dest_lang_dir) + print("...done") + + test_dnames = sorted(str(d.relative_to(original_dname)) for d in exercise_dirs) + + resource_metadata = importlib_resources.files("aider.resources").joinpath("model-metadata.json") + model_metadata_files_loaded = models.register_litellm_models([resource_metadata]) + dump(model_metadata_files_loaded) + + if read_model_settings: + try: + files_loaded = models.register_models([read_model_settings]) + 
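+            # the code below treats files_loaded as the list of YAML files
+            # actually read; a falsy value means nothing was loaded
+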
if verbose: + if files_loaded: + print(f"Loaded model settings from: {files_loaded[0]}") + else: + print(f"No model settings loaded from: {read_model_settings}") + except Exception as e: + print(f"Error loading model settings: {e}") + return 1 + + if keywords: + keywords = keywords.split(",") + test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn] + + random.shuffle(test_dnames) + if num_tests > 0: + test_dnames = test_dnames[:num_tests] + + # Don't give up when benchmarking + LONG_TIMEOUT = 24 * 60 * 60 + sendchat.RETRY_TIMEOUT = LONG_TIMEOUT + base_coder.RETRY_TIMEOUT = LONG_TIMEOUT + models.RETRY_TIMEOUT = LONG_TIMEOUT + + # Enable in-memory RepoMap cache when running multiple threads to avoid SQLite contention + repomap_in_memory = threads > 1 + + if threads == 1: + all_results = [] + for test_path in test_dnames: + results = run_test( + original_dname, + dirname / test_path, + model, + edit_format, + tries, + no_unit_tests, + no_aider, + verbose, + commit_hash, + replay, + editor_model, + editor_edit_format, + num_ctx, + sleep, + reasoning_effort, + thinking_tokens, + map_tokens, + repomap_in_memory, + ) + + all_results.append(results) + summarize_results(dirname, verbose) + if sleep: + time.sleep(sleep) + else: + run_test_threaded = lox.thread(threads)(run_test) + for test_path in test_dnames: + run_test_threaded.scatter( + original_dname, + dirname / test_path, + model, + edit_format, + tries, + no_unit_tests, + no_aider, + verbose, + commit_hash, + replay, + editor_model, + editor_edit_format, + num_ctx, + sleep, + reasoning_effort, + thinking_tokens, + map_tokens, + repomap_in_memory, + ) + all_results = run_test_threaded.gather(tqdm=True) + + print() + print() + print() + summarize_results(dirname, verbose) + + return 0 + + +def show_diffs(dirnames): + dirnames = sorted(dirnames) + + all_results = dict((dirname, load_results(dirname)) for dirname in dirnames) + testcases = set() + for results in all_results.values(): + testcases.update(result["testcase"] for result in results) + + testcases = sorted(testcases) + + unchanged = set() + + for testcase in testcases: + all_outcomes = [] + for dirname in dirnames: + results = all_results[dirname] + result = [r for r in results if r["testcase"] == testcase][0] + + outcomes = tuple(result["tests_outcomes"]) + all_outcomes.append(True in outcomes) + + if len(set(all_outcomes)) == 1: + unchanged.add(testcase) + continue + + print() + print(testcase) + for outcome, dirname in zip(all_outcomes, dirnames): + print(outcome, f"{dirname}/{testcase}/.aider.chat.history.md") + + changed = set(testcases) - unchanged + print() + print("changed:", len(changed), ",".join(sorted(changed))) + print() + print("unchanged:", len(unchanged), ",".join(sorted(unchanged))) + + +def load_results(dirname, stats_languages=None): + dirname = Path(dirname) + lang_to_results = {} + + if stats_languages: + languages = [lang.strip().lower() for lang in stats_languages.split(",")] + glob_patterns = [f"{lang}/exercises/practice/*/.aider.results.json" for lang in languages] + else: + glob_patterns = ["*/exercises/practice/*/.aider.results.json"] + + for pattern in glob_patterns: + for fname in dirname.glob(pattern): + try: + results = json.loads(fname.read_text()) + # json / test / prac / exer / lang + lang = fname.parent.parent.parent.parent.name + lang_to_results.setdefault(lang, []).append(results) + except json.JSONDecodeError: + print("json.JSONDecodeError", fname) + continue + return lang_to_results + + +def summarize_results(dirname, 
verbose, stats_languages=None): + lang_to_results = load_results(dirname, stats_languages) + + res = SimpleNamespace() + res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*"))) + + try: + tries = max( + len(results.get("tests_outcomes", [])) + for results_list in lang_to_results.values() + for results in results_list + if results + ) + except ValueError: + tries = 0 + + res.dir_name = str(dirname) + + passed_tests = [0] * tries + + res.completed_tests = 0 + res.duration = 0 + res.cost = 0 + res.error_outputs = 0 + res.user_asks = 0 + res.test_timeouts = 0 + res.exhausted_context_windows = 0 + res.num_malformed_responses = 0 + res.num_with_malformed_responses = 0 + res.syntax_errors = 0 + res.indentation_errors = 0 + res.lazy_comments = 0 + res.prompt_tokens = 0 + res.completion_tokens = 0 + + res.reasoning_effort = None + res.thinking_tokens = None + res.map_tokens = None + variants = defaultdict(set) + + def add(attr_name, increment, global_stats, lang_stats): + global_prev = getattr(global_stats, attr_name) + setattr(global_stats, attr_name, global_prev + increment) + + lang_prev = getattr(lang_stats, attr_name) + setattr(lang_stats, attr_name, lang_prev + increment) + + lang_to_stats = {} + lang_to_passed_tests = {} + for lang, results_list in lang_to_results.items(): + lang_stats = SimpleNamespace() + lang_stats.completed_tests = 0 + lang_stats.duration = 0 + lang_stats.avg_duration_per_test = 0 + lang_stats.cost = 0 + for i in range(tries): + setattr(lang_stats, f"pass_rate_{i + 1}", 0) + for i in range(tries): + setattr(lang_stats, f"pass_num_{i + 1}", 0) + lang_stats.error_outputs = 0 + lang_stats.user_asks = 0 + lang_stats.test_timeouts = 0 + lang_stats.exhausted_context_windows = 0 + lang_stats.num_malformed_responses = 0 + lang_stats.num_with_malformed_responses = 0 + lang_stats.syntax_errors = 0 + lang_stats.indentation_errors = 0 + lang_stats.lazy_comments = 0 + lang_stats.prompt_tokens = 0 + lang_stats.completion_tokens = 0 + lang_to_stats[lang] = lang_stats + lang_to_passed_tests[lang] = [0] * tries + + for results in results_list: + if not results: + continue + + add("completed_tests", 1, res, lang_stats) + tests_outcomes = results.get("tests_outcomes", []) + passed = tests_outcomes and tests_outcomes[-1] + if passed: + for i in range(len(tests_outcomes) - 1, tries): + passed_tests[i] += 1 + lang_to_passed_tests[lang][i] += 1 + + add("cost", results.get("cost", 0), res, lang_stats) + add("duration", results.get("duration", 0), res, lang_stats) + add("test_timeouts", results.get("test_timeouts", 0), res, lang_stats) + + add("error_outputs", results.get("num_error_outputs", 0), res, lang_stats) + add("user_asks", results.get("num_user_asks", 0), res, lang_stats) + add( + "exhausted_context_windows", + results.get("num_exhausted_context_windows", 0), + res, + lang_stats, + ) + add( + "num_malformed_responses", + results.get("num_malformed_responses", 0), + res, + lang_stats, + ) + if results.get("num_malformed_responses"): + add("num_with_malformed_responses", 1, res, lang_stats) + add("lazy_comments", results.get("lazy_comments", 0), res, lang_stats) + + add("syntax_errors", results.get("syntax_errors", 0), res, lang_stats) + add("indentation_errors", results.get("indentation_errors", 0), res, lang_stats) + + add("prompt_tokens", results.get("prompt_tokens", 0), res, lang_stats) + add("completion_tokens", results.get("completion_tokens", 0), res, lang_stats) + + res.reasoning_effort = results.get("reasoning_effort") + res.thinking_tokens = 
results.get("thinking_tokens") + res.map_tokens = results.get("map_tokens") + + for key in "model edit_format commit_hash editor_model editor_edit_format".split(): + val = results.get(key) + if val: + variants[key].add(val) + + if not res.completed_tests: + return + + # if res.completed_tests < 133: + # return + + console = Console(highlight=False) + console.rule(title=str(dirname)) + + commit_hashes = variants["commit_hash"] + versions = get_versions(commit_hashes) + date = dirname.name[:10] + + def show(stat, red="red"): + val = getattr(res, stat) + style = red if val else None + console.print(f" {stat}: {val}", style=style) + + percents = dict() + for i in range(tries): + pass_rate = 100 * passed_tests[i] / res.completed_tests + percents[i] = pass_rate + # console.print(f"{pass_rate:.1f}% correct after try {i + 1}") + setattr(res, f"pass_rate_{i + 1}", f"{pass_rate:.1f}") + setattr(res, f"pass_num_{i + 1}", passed_tests[i]) + + print(f"- dirname: {dirname.name}") + style = None if res.completed_tests == res.total_tests else "red" + console.print(f" test_cases: {res.completed_tests}", style=style) + for key, val in variants.items(): + if len(val) > 1: + style = "red" + else: + style = None + val = ", ".join(map(str, val)) + setattr(res, key, val) + console.print(f" {key}: {val}", style=style) + + if res.reasoning_effort is not None: + print(f" reasoning_effort: {res.reasoning_effort}") + if res.thinking_tokens is not None: + print(f" thinking_tokens: {res.thinking_tokens}") + if res.map_tokens is not None: + print(f" map_tokens: {res.map_tokens}") + + for i in range(tries): + print(f" pass_rate_{i + 1}: {percents[i]:.1f}") + for i in range(tries): + print(f" pass_num_{i + 1}: {passed_tests[i]}") + + pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests + print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}") + + show("error_outputs") + show("num_malformed_responses") + show("num_with_malformed_responses") + show("user_asks") + show("lazy_comments") + show("syntax_errors") + show("indentation_errors") + show("exhausted_context_windows") + show("prompt_tokens", red=None) + show("completion_tokens", red=None) + show("test_timeouts") + print(f" total_tests: {res.total_tests}") + + if variants["model"]: + a_model = set(variants["model"]).pop() + command = f"aider-ce --model {a_model}" + print(f" command: {command}") + + print(f" date: {date}") + print(" versions:", ",".join(versions)) + + res.avg_duration = res.duration / res.completed_tests + print(f" seconds_per_case: {res.avg_duration:.1f}") + + print(f" total_cost: {res.cost:.4f}") + + res.avg_cost = res.cost / res.completed_tests + + projected_cost = res.avg_cost * res.total_tests + + print() + print( + f"costs: ${res.avg_cost:.4f}/test-case, ${res.cost:.2f} total," + f" ${projected_cost:.2f} projected" + ) + + if verbose and len(lang_to_stats) > 0: + + def format_lang_stats(lang, lang_stats): + # First, postprocess attributes for easier printing + if lang_stats.completed_tests > 0: + lang_stats.avg_duration_per_test = lang_stats.duration / float( + lang_stats.completed_tests + ) + for i in range(tries): + num_passed = lang_to_passed_tests[lang][i] + setattr(lang_stats, f"pass_num_{i + 1}", num_passed) + pass_rate = 100 * num_passed / float(lang_stats.completed_tests) + setattr(lang_stats, f"pass_rate_{i + 1}", pass_rate) + + # Then format attributes into ready-to-print strings + for attr in lang_stats.__dict__: + val = getattr(lang_stats, attr) + if val == 0: + val = "-" + elif isinstance(val, float): 
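+                        # e.g. 1234.5 renders as "1,234.50" (thousands
+                        # separator, two decimals) for compact table cells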
+                        val = f"{val:,.2f}"
+                    else:
+                        val = f"{val:,}"
+
+                    setattr(lang_stats, attr, val)
+
+            def compute_lang_to_col_widths(lang_to_stats):
+                lang_to_col_widths = {}
+                for lang, lang_stats in lang_to_stats.items():
+                    lang_stat_attrs = [getattr(lang_stats, attr) for attr in lang_stats.__dict__]
+                    lang_col_width = max(len(lang), len(max(lang_stat_attrs, key=len)))
+                    lang_to_col_widths[lang] = lang_col_width
+
+                return lang_to_col_widths
+
+            print()
+            print("======== Stats by language ========")
+            print()
+
+            # format every language's stats in place
+            for lang, lang_stats in lang_to_stats.items():
+                format_lang_stats(lang, lang_stats)
+            lang_to_col_widths = compute_lang_to_col_widths(lang_to_stats)
+
+            any_stats = list(lang_to_stats.values())[0]
+            attrs = list(any_stats.__dict__)
+            attr_col_width = len(max(["language"] + attrs, key=len))
+            langs = list(lang_to_stats.keys())
+
+            print("| " + ("-" * attr_col_width), end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(" | " + ("-" * col_width), end="")
+            print(" |")
+
+            print(f"| {' '.center(attr_col_width)}", end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(f" | {lang.center(col_width)}", end="")
+            print(" |")
+
+            print("| " + ("-" * attr_col_width), end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(" | " + ("-" * col_width), end="")
+            print(" |")
+
+            for attr in attrs:
+                print(f"| {attr:<{attr_col_width}}", end="")
+                for lang in langs:
+                    lang_stats = lang_to_stats[lang]
+                    col_width = lang_to_col_widths[lang]
+                    print(f" | {getattr(lang_stats, attr):>{col_width}}", end="")
+                print(" |")
+
+            print("| " + ("-" * attr_col_width), end="")
+            for lang in langs:
+                col_width = lang_to_col_widths[lang]
+                print(" | " + ("-" * col_width), end="")
+            print(" |")
+            print()
+
+    console.rule()
+
+    # print(json.dumps(vars(res), indent=4, sort_keys=True))
+    return res
+
+
+def get_versions(commit_hashes):
+    versions = set()
+    for hsh in commit_hashes:
+        if not hsh:
+            continue
+        short = hsh.split("-")[0]
+        if short in _VERSION_CACHE:
+            ver = _VERSION_CACHE.get(short)
+            if ver:
+                versions.add(ver)
+            continue
+
+        try:
+            version_src = subprocess.check_output(
+                ["git", "show", f"{short}:aider/__init__.py"], universal_newlines=True
+            )
+            match = re.search(r'__version__ = "(.*)"', version_src)
+            ver = match.group(1) if match else None
+            _VERSION_CACHE[short] = ver
+            if ver:
+                versions.add(ver)
+        except subprocess.CalledProcessError:
+            _VERSION_CACHE[short] = None
+    return versions
+
+
+def get_replayed_content(replay_dname, test_dname):
+    replay_dname = Path(replay_dname)
+    test_dname = Path(test_dname)
+    dump(replay_dname, test_dname)
+
+    test_name = test_dname.name
+    replay_fname = replay_dname / test_name / ".aider.chat.history.md"
+    dump(replay_fname)
+
+    res = replay_fname.read_text()
+    return res
+
+    # The filtering below was unreachable dead code after the return above;
+    # preserved as a comment in case filtered replays are wanted again:
+    # res = res.splitlines(keepends=True)
+    # res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
+    # return "".join(res)
+
+
+def run_test(original_dname, testdir, *args, **kwargs):
+    try:
+        return run_test_real(original_dname, testdir, *args, **kwargs)
+    except Exception:
+        print("=" * 40)
+        print("Test failed")
+        traceback.print_exc()
+
+        testdir = Path(testdir)
+        results_fname = testdir / ".aider.results.json"
+        results_fname.write_text(json.dumps(dict(exception=traceback.format_exc())))
+
+
+def run_test_real(
+    original_dname,
+    testdir,
+    model_name,
+    edit_format,
+    tries,
+    no_unit_tests,
+    no_aider,
+    verbose,
+    commit_hash,
+    replay,
+    editor_model,
+    editor_edit_format,
+    num_ctx=None,
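+    # num_ctx (when set) is forwarded to the model via extra_params below;
+    # the remaining optional knobs mirror main()'s CLI flags, except
+    # read_model_settings, which appears unused in this function
+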
sleep=0, + reasoning_effort: Optional[str] = None, + thinking_tokens: Optional[int] = None, + map_tokens: Optional[int] = None, + read_model_settings=None, + repomap_in_memory: bool = False, +): + # Lazy imports: only needed in the actual benchmark execution path + import git + import prompts + + from aider import models + from aider.coders import Coder + from aider.io import InputOutput + + if not os.path.isdir(testdir): + print("Not a dir:", testdir) + return + + testdir = Path(testdir) + + history_fname = testdir / ".aider.chat.history.md" + + results_fname = testdir / ".aider.results.json" + if results_fname.exists(): + try: + res = json.loads(results_fname.read_text()) + # if res.get("test_timeouts", 0) > 0: + # print(f"{results_fname} test timeouts, redoing...") + # else: + return res + except JSONDecodeError: + print(f"{results_fname} failed to parse, redoing...") + + # Read solution and test files from config + fnames = [] + config_file = testdir / ".meta/config.json" + if not config_file.exists(): + raise ValueError(f"No config file found: {config_file}") + + with open(config_file) as f: + config = json.loads(f.read()) + + # Get file sets from config + test_files = config.get("files", {}).get("test", []) + example_files = config.get("files", {}).get("example", []) + solution_files = set(config.get("files", {}).get("solution", [])) + + # Forcibly ignore certain files not covered by test_files and example_files + ignore_files = set( + [ + "CMakeLists.txt", + "Cargo.toml", + ] + ) + + # Add all files under .meta and .docs directories + ignore_files.update(str(p.relative_to(testdir)) for p in testdir.glob(".meta/**/*")) + ignore_files.update(str(p.relative_to(testdir)) for p in testdir.glob(".docs/**/*")) + + # Also ignore test & example files + ignore_files.update(test_files) + ignore_files.update(example_files) + + # Remove any ignore files from the solution set that LLM will edit + solution_files.difference_update(ignore_files) + + # Copy all solution files + for file_path in solution_files: + src = testdir / Path(file_path) + if src.exists(): + fnames.append(src) + # restore the original file, in case we interrupted a prev run + # Find the original file in the language-specific practice dir + lang_part = str(testdir).split("/exercises/practice/")[0] + original_fname = ( + original_dname + / Path(lang_part).name + / "exercises" + / "practice" + / testdir.name + / file_path + ) + if original_fname.exists(): + os.makedirs(src.parent, exist_ok=True) + shutil.copy(original_fname, src) + else: + print(f"Warning: Solution file not found: {src}") + + file_list = " ".join(fname.name for fname in fnames) + + instructions = "" + + introduction = testdir / ".docs/introduction.md" + if introduction.exists(): + instructions += introduction.read_text() + instructions += (testdir / ".docs/instructions.md").read_text() + instructions_append = testdir / ".docs/instructions.append.md" + if instructions_append.exists(): + instructions += instructions_append.read_text() + + instructions += prompts.instructions_addendum.format(file_list=file_list) + + io = InputOutput( + pretty=False, + yes=True, + chat_history_file=history_fname, + ) + + # weak_model_name = model_name + weak_model_name = None + + main_model = models.Model( + model_name, + weak_model=weak_model_name, + editor_model=editor_model, + editor_edit_format=editor_edit_format, + verbose=verbose, + ) + + if reasoning_effort is not None: + main_model.set_reasoning_effort(reasoning_effort) + + if thinking_tokens is not None: + 
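+        # only touch the thinking budget when one was explicitly requested,
+        # so models without extended thinking keep their defaults
+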
main_model.set_thinking_tokens(thinking_tokens) + + dump(main_model.max_chat_history_tokens) + + if num_ctx: + if not main_model.extra_params: + main_model.extra_params = {} + main_model.extra_params["num_ctx"] = num_ctx + edit_format = edit_format or main_model.edit_format + + dump(main_model) + dump(edit_format) + show_fnames = ",".join(map(str, fnames)) + print("fnames:", show_fnames) + # Ensure this test directory is a standalone git repo so RepoMap can be used + try: + git_dir = testdir / ".git" + if not git_dir.exists(): + r = git.Repo.init(testdir) + # Set a local identity to avoid commit failures in clean containers + with r.config_writer() as cw: + cw.set_value("user", "name", "aider-benchmark") + cw.set_value("user", "email", "aider-benchmark@example.com") + # Add existing files (solution set and any current files) + r.index.add([str(p.relative_to(testdir)) for p in testdir.rglob("*") if p.is_file()]) + r.index.commit("Initial commit for aider benchmark") + except Exception as e: + if verbose: + print(f"Warning: failed to initialize git repo in {testdir}: {e}") + + coder_kwargs = dict( + main_model=main_model, + edit_format=edit_format, + io=io, + fnames=fnames, + use_git=True, + auto_commits=False, + dirty_commits=False, + stream=False, + verbose=verbose, + # auto_lint=False, # disabled for code-in-json experiments + cache_prompts=True, + suggest_shell_commands=False, + ignore_mentions=ignore_files, + # Reduce repo map contention and size for benchmarks + map_cache_dir=str(testdir), + repomap_in_memory=repomap_in_memory, + map_mul_no_files=4, + ) + if map_tokens is not None: + coder_kwargs["map_tokens"] = map_tokens + + coder = Coder.create(**coder_kwargs) + dump(coder.ignore_mentions) + + coder.show_announcements() + coder.get_file_mentions = lambda x: set() # No loading of any other files + + timeouts = 0 + + syntax_errors = 0 + indentation_errors = 0 + lazy_comments = 0 + + dur = 0 + test_outcomes = [] + for i in range(tries): + start = time.time() + + if no_aider: + pass + elif replay: + response = get_replayed_content(replay, testdir) + coder.partial_response_content = response + + show = response.splitlines(keepends=True) + show = [">> " + line for line in show] + io.append_chat_history("".join(show)) + + coder.apply_updates() + else: + response = coder.run(with_message=instructions, preproc=False) + + dur += time.time() - start + + if not no_aider: + pat = r"^[+]? *[#].* [.][.][.] " + # Count the number of lines that match pat in response + dump(response) + lazy_comments += len(re.findall(pat, response, re.MULTILINE)) + dump(lazy_comments) + + if coder.last_keyboard_interrupt: + raise KeyboardInterrupt + + if no_unit_tests: + break + + try: + errors = run_unit_tests(original_dname, testdir, history_fname, test_files) + except subprocess.TimeoutExpired: + # try: + # errors = run_unit_tests(original_dname, testdir, history_fname, test_files) + # except subprocess.TimeoutExpired: + errors = "Tests timed out!" 
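+            # a timeout counts as a failed attempt; the sentinel string is
+            # handed back to the model as the "test output" for the next try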
+            timeouts += 1
+
+        if errors:
+            test_outcomes.append(False)
+        else:
+            test_outcomes.append(True)
+            break
+
+        if replay:
+            io.append_chat_history(errors)
+
+        errors = errors.splitlines()
+
+        syntax_errors += sum(1 for line in errors if line.startswith("SyntaxError"))
+        indentation_errors += sum(1 for line in errors if line.startswith("IndentationError"))
+
+        print(errors[-1])
+        errors = "\n".join(errors)
+        instructions = errors
+        instructions += prompts.test_failures.format(file_list=file_list)
+
+    # Clean up build directories after all attempts
+    # Rust target/debug
+    target_dir = testdir / "target" / "debug"
+    if target_dir.exists():
+        try:
+            shutil.rmtree(target_dir)
+            if verbose:
+                print(f"Cleaned up Rust target/debug directory: {target_dir}")
+        except (OSError, shutil.Error, PermissionError) as e:
+            if verbose:
+                print(f"Failed to clean up Rust target/debug directory: {e}")
+
+    # Java build directories
+    java_build_dir = testdir / "build"
+    if java_build_dir.exists():
+        try:
+            shutil.rmtree(java_build_dir)
+            if verbose:
+                print(f"Cleaned up Java build directory: {java_build_dir}")
+        except (OSError, shutil.Error, PermissionError) as e:
+            if verbose:
+                print(f"Failed to clean up Java build directory: {e}")
+
+    # Node.js node_modules directories
+    node_modules_dir = testdir / "node_modules"
+    if node_modules_dir.exists():
+        try:
+            shutil.rmtree(node_modules_dir)
+            if verbose:
+                print(f"Cleaned up Node.js node_modules directory: {node_modules_dir}")
+        except (OSError, shutil.Error, PermissionError) as e:
+            if verbose:
+                print(f"Failed to clean up Node.js node_modules directory: {e}")
+
+    results = dict(
+        testdir=str(testdir),
+        testcase=testdir.name,
+        model=main_model.name,
+        edit_format=edit_format,
+        tests_outcomes=test_outcomes,
+        cost=coder.total_cost,
+        duration=dur,
+        test_timeouts=timeouts,
+        commit_hash=commit_hash,
+        num_error_outputs=io.num_error_outputs,
+        num_user_asks=io.num_user_asks,
+        num_exhausted_context_windows=coder.num_exhausted_context_windows,
+        num_malformed_responses=coder.num_malformed_responses,
+        syntax_errors=syntax_errors,
+        indentation_errors=indentation_errors,
+        lazy_comments=lazy_comments,  # count of lazy "# ... " comment matches
+        reasoning_effort=reasoning_effort,
+        prompt_tokens=coder.total_tokens_sent,
+        completion_tokens=coder.total_tokens_received,
+        thinking_tokens=thinking_tokens,
+        map_tokens=map_tokens,
+        chat_hashes=list(
+            zip(
+                coder.chat_completion_call_hashes,
+                coder.chat_completion_response_hashes,
+            )
+        ),
+    )
+
+    if edit_format == "architect":
+        results["editor_model"] = main_model.editor_model.name if main_model.editor_model else None
+        results["editor_edit_format"] = main_model.editor_edit_format
+    dump(results)
+
+    results_fname.write_text(json.dumps(results, indent=4))
+
+    return results
+
+
+def run_unit_tests(original_dname, testdir, history_fname, test_files):
+    timeout = 60 * 3
+
+    # Map of file extensions to test commands
+    # (the repo is mounted at /cecli inside the benchmark container)
+    TEST_COMMANDS = {
+        ".py": ["pytest"],
+        ".rs": ["cargo", "test", "--", "--include-ignored"],
+        ".go": ["go", "test", "./..."],
+        ".js": ["/cecli/benchmark/npm-test.sh"],
+        ".cpp": ["/cecli/benchmark/cpp-test.sh"],
+        ".java": ["./gradlew", "test"],
+    }
+
+    # Get unique file extensions from test files
+    extensions = {Path(f).suffix for f in test_files}
+
+    # Find matching test command
+    command = None
+    for ext in extensions:
+        if ext in TEST_COMMANDS:
+            command = TEST_COMMANDS[ext]
+            break
+
+    if not command:
+        raise ValueError(f"No test command found for files with extensions: {extensions}")
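+    # Scoring must run against pristine tests: the model may have edited
+    # files under testdir, so the canonical test files are re-copied below.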
+
+    # Copy test files from original directory
+    for file_path in test_files:
+        src = original_dname / Path(*testdir.parts[-4:]) / file_path
+        dst = testdir / file_path
+        if src.exists():
+            print("copying", src, dst)
+            os.makedirs(dst.parent, exist_ok=True)
+            shutil.copy(src, dst)
+
+    # Remove @Disabled annotations from Java test files
+    for file_path in test_files:
+        if file_path.endswith(".java"):
+            test_file = testdir / file_path
+            if test_file.exists():
+                content = test_file.read_text()
+                content = re.sub(r"@Disabled\([^)]*\)\s*\n", "", content)
+                test_file.write_text(content)
+
+    print(" ".join(command))
+
+    result = subprocess.run(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        timeout=timeout,
+        cwd=testdir,
+        encoding="utf-8",
+        errors="replace",
+    )
+
+    success = result.returncode == 0
+    res = result.stdout
+    res = cleanup_test_output(res, testdir)
+    dump(res)
+
+    with history_fname.open("a") as fh:
+        fh.write(f"```\n{res}\n```")
+
+    if not success:
+        print(f"Tests failed: {testdir}")
+        return res
+
+
+def cleanup_test_output(output, testdir):
+    # remove timing info, to avoid randomizing the response to GPT
+    res = re.sub(r"\bin \d+\.\d+s\b", "", output)
+    res = res.replace(str(testdir), str(testdir.name))
+    return res
+
+
+if __name__ == "__main__":
+    app()
diff --git a/benchmark/docker.sh b/benchmark/docker.sh
index 6f97b865e19..b4265a69401 100755
--- a/benchmark/docker.sh
+++ b/benchmark/docker.sh
@@ -1,19 +1,20 @@
 #!/bin/bash
+# FIXME - should be able to choose which keys to pass into the container
+#
 docker run \
-        -it --rm \
-        --memory=12g \
-        --memory-swap=12g \
-        --add-host=host.docker.internal:host-gateway \
-        -v `pwd`:/aider \
-        -v `pwd`/tmp.benchmarks/.:/benchmarks \
-        -e OPENAI_API_KEY=$OPENAI_API_KEY \
-        -e HISTFILE=/aider/.bash_history \
-        -e PROMPT_COMMAND='history -a' \
-        -e HISTCONTROL=ignoredups \
-        -e HISTSIZE=10000 \
-        -e HISTFILESIZE=20000 \
-        -e AIDER_DOCKER=1 \
-        -e AIDER_BENCHMARK_DIR=/benchmarks \
-        aider-benchmark \
-        bash
+    -it --rm \
+    --memory=12g \
+    --memory-swap=12g \
+    --add-host=host.docker.internal:host-gateway \
+    -v $(pwd):/cecli \
+    -v $(pwd)/tmp.benchmarks/.:/benchmarks \
+    -e GEMINI_API_KEY=$GEMINI_API_KEY \
+    -e PROMPT_COMMAND='history -a' \
+    -e HISTCONTROL=ignoredups \
+    -e HISTSIZE=10000 \
+    -e HISTFILESIZE=20000 \
+    -e AIDER_DOCKER=1 \
+    -e AIDER_BENCHMARK_DIR=/benchmarks \
+    cecli-cat \
+    bash
diff --git a/benchmark/docker_build.sh b/benchmark/docker_build.sh
index a6619bb5ce1..a132463ef17 100755
--- a/benchmark/docker_build.sh
+++ b/benchmark/docker_build.sh
@@ -3,6 +3,6 @@ set -e
 
 docker build \
-        --file benchmark/Dockerfile \
-        -t aider-benchmark \
-        .
+    --file benchmark/Dockerfile \
+    -t cecli-cat \
+    .