From 925cbb72be101aa10e612f80b07c1cad8e298434 Mon Sep 17 00:00:00 2001 From: Adam Getchell Date: Tue, 9 Jun 2026 01:48:37 -0700 Subject: [PATCH] chore(release): release v0.4.3 - Bump crate, script package, lockfile, README, and citation metadata to v0.4.3 - Promote the v0.4.3 performance report and archive the v0.4.2 comparison - Refresh README nalgebra/faer benchmark data and plot assets - Reorganize benchmark documentation around release, local, and artifact workflows - Update release guidance and roadmap follow-up items for v0.4.4 performance work --- CHANGELOG.md | 42 +- CITATION.cff | 2 +- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 22 +- docs/BENCHMARKING.md | 419 ++++++++++-------- docs/PERFORMANCE.md | 144 +++--- docs/RELEASING.md | 28 +- docs/archive/performance/README.md | 1 + docs/archive/performance/v0.4.2-vs-v0.4.1.md | 119 +++++ .../bench/vs_linalg_lu_solve_median.csv | 16 +- .../bench/vs_linalg_lu_solve_median.svg | 146 +++--- docs/roadmap.md | 29 ++ pyproject.toml | 2 +- scripts/archive_performance.py | 2 + scripts/bench_compare.py | 2 + scripts/tests/test_archive_performance.py | 5 + scripts/tests/test_bench_compare.py | 1 + uv.lock | 2 +- 19 files changed, 609 insertions(+), 377 deletions(-) create mode 100644 docs/archive/performance/v0.4.2-vs-v0.4.1.md diff --git a/CHANGELOG.md b/CHANGELOG.md index eed0080..8d09cb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.4.3] - 2026-06-09 ### ⚠️ Breaking Changes @@ -49,6 +49,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Return typed Unrepresentable reasons when strict exact-to-f64 conversion would round or become non-finite. 
- Specialize D4 exact determinants and keep determinant/error-bound zero coefficients from evaluating overflowing absent terms. - Update exact benchmark comparison reporting to compare strict and rounded APIs against legacy v0.4.2 rows. +- Archive release performance reports [`2817d01`](https://github.com/acgetchell/la-stack/commit/2817d01374ad0aeab98d6f48a3dae9b30f878a8a) + - Add an archive-performance utility that promotes curated benchmark reports into docs/PERFORMANCE.md while archiving prior release comparisons + - Generate release comparisons in isolated temporary worktrees, including legacy command fallback for published tags + - Wire release and historical archive recipes into just, Python packaging, and release documentation +- Automate published performance report archiving [`d31e26a`](https://github.com/acgetchell/la-stack/commit/d31e26a9d7a47a6c3089028630640bcff5afe7c0) + - Track the latest curated release comparison in docs/PERFORMANCE.md and archive older comparisons under docs/archive/performance/ + - Let performance-archive-published discover the latest stable GitHub release and previous stable baseline automatically + - Generate release comparisons in isolated temporary worktrees, with release-asset restore and local baseline fallback paths + - Update benchmark and release docs to use the scripted workflow instead of manual checkout steps +- Split local and release performance comparisons [`7258525`](https://github.com/acgetchell/la-stack/commit/7258525590f2ed68d41879e71c833010e408e7f7) + - Add default performance-local and performance-release workflows that infer the relevant release tags and run in temporary worktrees. + - Add a performance-github-assets workflow for comparing stored GitHub Actions release benchmark assets without local cargo runs. + - Normalize release tags before fetching, downloading assets, or checking out detached worktrees. 
+ - Update performance docs, release guidance, and generated report instructions to use the new benchmark workflows. +- Add vs_linalg-only performance checks [`d7c1487`](https://github.com/acgetchell/la-stack/commit/d7c1487115e1a8e5bb1ec4fcc7592786e300e2ce) + - Add local workflows for comparing current non-exact la-stack kernels against a release baseline without rerunning current nalgebra/faer or exact benchmarks. + - Route archive-performance baseline and current benchmark commands by suite, with legacy fallback support for older release worktrees. + - Document the faster release-signal workflow and expand Semgrep fixtures for benchmark, example, doctest, and public panic-path rules. ### Changed @@ -61,6 +79,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Replace mypy with strict Ty checking in the Python workflow. - Parse TOML, JSON, argparse, and Semgrep inputs into typed boundary objects before downstream use. - Reject malformed Criterion estimates, non-finite timings, invalid confidence intervals, and malformed Semgrep result shapes. +- Harden Rust release hygiene [`8e12c93`](https://github.com/acgetchell/la-stack/commit/8e12c935fe54e265e8ceb640702267ec0e71b7b1) + - Promote missing documentation and dead code lints to deny-level checks. + - Forbid unsafe code explicitly across Rust modules and benchmark targets. + - Document the LU/LDLT empty-matrix convention for D=0. + - Move exact benchmark input generation into typed helpers and consolidate exact benchmark operation dispatch. ### Documentation @@ -82,6 +105,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Return matrix-cell metadata when inf-norm row sums or symmetry tolerance scaling overflow. - Avoid reparsing finite-by-construction RHS vectors in LU and LDLT solves. 
+- Re-raise unexpected archive failures [`7938386`](https://github.com/acgetchell/la-stack/commit/7938386166f1f3f5cf594c5def67458d48e19a98) + - Limit archive-performance CLI error handling to expected validation, filesystem, subprocess, and runtime failures. + - Let unexpected exceptions propagate so benchmark archiving bugs surface during development. + +### Performance + +- Improve factorization kernel [`8837df1`](https://github.com/acgetchell/la-stack/commit/8837df1f54a9fa2c20abc1487cfce4de8c8e09c5) + + - Preserve the tiny-dimension update shape for D2-D5 to avoid regressing the core fixed-size path + - Fuse multiplier computation with trailing updates for larger dimensions to reduce extra column walks + - Rely on the LDLT factorization proof instead of a redundant final finite-storage scan +- Optimize exact and factorized solve kernels [`1690355`](https://github.com/acgetchell/la-stack/commit/1690355bf27c2cbba685ba0cd70486275c7620b8) + - Split LU and LDLT solve paths so tiny matrices keep the direct kernels while larger fixed dimensions avoid extra substitution work. + - Convert dyadic exact solve results directly to finite f64 and preserve UnrepresentableReason recovery semantics on strict conversion failures. + - Modernize release branch commands and keep just recipes sorted. 
## [0.4.2] - 2026-06-04 @@ -634,7 +672,7 @@ Older releases are archived by minor series: - [0.2.x](docs/archive/changelog/0.2.md) - [0.1.x](docs/archive/changelog/0.1.md) -[Unreleased]: https://github.com/acgetchell/la-stack/compare/v0.4.2...HEAD +[0.4.3]: https://github.com/acgetchell/la-stack/compare/v0.4.2...v0.4.3 [0.4.2]: https://github.com/acgetchell/la-stack/compare/v0.4.1...v0.4.2 [0.4.1]: https://github.com/acgetchell/la-stack/compare/v0.4.0...v0.4.1 [0.4.0]: https://github.com/acgetchell/la-stack/compare/v0.3.0...v0.4.0 diff --git a/CITATION.cff b/CITATION.cff index cde70f6..4f7b4db 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,7 +2,7 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." type: software title: "la-stack: Fast, stack-allocated linear algebra for fixed dimensions in Rust" -version: 0.4.2 +version: 0.4.3 date-released: 2026-06-04 url: "https://github.com/acgetchell/la-stack" repository-code: "https://github.com/acgetchell/la-stack" diff --git a/Cargo.lock b/Cargo.lock index ae84265..b0dcf0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -603,7 +603,7 @@ dependencies = [ [[package]] name = "la-stack" -version = "0.4.2" +version = "0.4.3" dependencies = [ "approx", "criterion", diff --git a/Cargo.toml b/Cargo.toml index 083635a..b64550f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "la-stack" -version = "0.4.2" +version = "0.4.3" edition = "2024" rust-version = "1.96" license = "BSD-3-Clause" diff --git a/README.md b/README.md index 7c92b06..9a2aee0 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -la-stack = "0.4.2" +la-stack = "0.4.3" ``` ### Feature flags @@ -207,7 +207,7 @@ rationals (this pulls in `num-bigint`, `num-rational`, and `num-traits` for ```toml [dependencies] -la-stack = { version = "0.4.2", features = ["exact"] } +la-stack = { version = "0.4.3", features = ["exact"] } ``` **Determinants:** @@ -383,19 +383,21 
@@ operations. For the full per-kernel comparison methodology, input construction, and release-comparison workflow details, see [docs/BENCHMARKING.md](docs/BENCHMARKING.md). +For the current release-to-release performance snapshot, see +[docs/PERFORMANCE.md](docs/PERFORMANCE.md). | D | la-stack median (ns) | nalgebra median (ns) | faer median (ns) | la-stack vs nalgebra | la-stack vs faer | |---:|--------------------:|--------------------:|----------------:|---------------------:|----------------:| -| 2 | 2.585 | 4.486 | 137.653 | +42.4% | +98.1% | -| 3 | 12.204 | 22.990 | 182.618 | +46.9% | +93.3% | -| 4 | 27.228 | 51.660 | 208.181 | +47.3% | +86.9% | -| 5 | 53.141 | 68.714 | 272.117 | +22.7% | +80.5% | -| 8 | 141.279 | 162.225 | 348.216 | +12.9% | +59.4% | -| 16 | 626.561 | 574.115 | 854.941 | -9.1% | +26.7% | -| 32 | 2,862.795 | 2,709.532 | 2,806.698 | -5.7% | -2.0% | -| 64 | 19,703.239 | 14,388.285 | 12,085.453 | -36.9% | -63.0% | +| 2 | 2.044 | 4.542 | 143.958 | +55.0% | +98.6% | +| 3 | 9.596 | 23.599 | 185.466 | +59.3% | +94.8% | +| 4 | 23.338 | 50.717 | 210.976 | +54.0% | +88.9% | +| 5 | 45.368 | 69.065 | 277.564 | +34.3% | +83.7% | +| 8 | 127.861 | 164.412 | 364.864 | +22.2% | +65.0% | +| 16 | 631.997 | 663.822 | 882.674 | +4.8% | +28.4% | +| 32 | 2,745.604 | 2,424.540 | 2,867.431 | -13.2% | +4.2% | +| 64 | 17,543.034 | 14,747.731 | 12,266.271 | -19.0% | -43.0% | diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md index 4eb3c66..15e9738 100644 --- a/docs/BENCHMARKING.md +++ b/docs/BENCHMARKING.md @@ -1,233 +1,278 @@ # Benchmarking -This guide covers how to run, compare, and track performance for la-stack. - -## Benchmark suites - -la-stack has two Criterion benchmark suites: - -- **`vs_linalg`** (`benches/vs_linalg.rs`) — compares la-stack against - nalgebra and faer across D=2–64 for LU, solve, det, dot, norm, etc. - Use this to answer "why choose la-stack over other crates?" 
- The suite also includes SPD factorization rows for la-stack LDLT, faer - LDLT, and nalgebra Cholesky. The nalgebra rows are labelled Cholesky - because nalgebra does not expose a dense LDLT factorization in the - dependency version used here. - -- **`exact`** (`benches/exact.rs`) — measures exact-arithmetic methods - (`det_exact`, `solve_exact`, `det_sign_exact`, strict `*_result` - conversions, and lossy `*_rounded_f64` conversions) alongside f64 - baselines (`det`, `det_direct`) across D=2–5. Use this to understand - the cost of exact arithmetic and track optimization progress. - In addition to the fixed per-dimension groups (`exact_d{2..5}`), the - suite includes random percentile and adversarial-input groups designed - to capture variance and stress specific corners of the pipeline: - - - `exact_random_percentile_d{2..5}` — fixed-seed corpora of 50 - strictly diagonally-dominant random matrices per dimension. Each - operation is pre-timed across the corpus to select representative - p50/p95/p99 inputs, then Criterion measures those inputs normally. - - `exact_near_singular_3x3` — a 2^-50 perturbation of a singular base - matrix; forces the Bareiss fallback in `det_sign_exact` and - exercises the largest intermediate `BigInt` values in `solve_exact`. - - `exact_large_entries_3x3` — diagonal entries near `f64::MAX / 2` - stress `BigInt` growth during Bareiss forward elimination. - - `exact_hilbert_4x4` / `exact_hilbert_5x5` — classically - ill-conditioned matrices whose non-terminating-in-binary entries - stress the `f64_decompose → BigInt` scaling path. - - Each random percentile and adversarial group runs the same five - exact-arithmetic benches (`det_sign_exact`, `det_exact`, `solve_exact`, - `solve_exact_f64_result`, `solve_exact_rounded_f64`) so the resulting tables - are directly comparable across input classes. 
Rows with a `_result` suffix - measure the strict fallible conversion path, including valid - `Err(Unrepresentable)` outcomes when the exact answer is not - finite-binary64 representable. Rows with a `_rounded_f64` suffix measure the - intentionally lossy finite-binary64 conversion path. - -## `vs_linalg` methodology - -`vs_linalg` is a per-kernel comparison, not a single aggregate score. Each -reported row compares one operation for one dimension `D`, using Criterion's -selected statistic from `target/criterion/d{D}/{benchmark}/{sample}/estimates.json`. -The default report and README table use Criterion's `median.point_estimate` -in nanoseconds. Lower is better. - -All three crates receive equivalent deterministic inputs for a given -dimension: +This guide explains how to run, compare, and publish performance results for +`la-stack`. Start with the workflow table below; the later sections explain what +the commands measure and where their outputs go. + +## Contents + +- [Start Here](#start-here) +- [Benchmark Suites](#benchmark-suites) +- [Common Workflows](#common-workflows) + - [Compare Current Code With The Latest Release](#compare-current-code-with-the-latest-release) + - [Compare Current Code With A Specific Release](#compare-current-code-with-a-specific-release) + - [Iterate Against A Local Saved Baseline](#iterate-against-a-local-saved-baseline) + - [Update The README nalgebra/faer Table](#update-the-readme-nalgebrafaer-table) + - [Create The Release Performance Report](#create-the-release-performance-report) + - [Compare Published Release Artifacts](#compare-published-release-artifacts) +- [Output Locations](#output-locations) +- [`vs_linalg` Methodology](#vs_linalg-methodology) +- [Exact-Arithmetic Notes](#exact-arithmetic-notes) +- [Release Notes](#release-notes) + +## Start Here + +| Goal | Use | Output | Notes | +|------|-----|--------|-------| +| Clean local audit against the latest published release | `just performance-local` | 
`target/bench-reports/performance.md` | Self-contained; creates temporary worktrees and regenerates the release baseline locally. | +| Non-exact release-signal check against a specific release | `just performance-local-vs-linalg v0.4.3 v0.4.2` | `target/bench-reports/performance.md` | Narrower than `performance-local`; useful for LU/LDLT/dot/norm work. | +| Fast repeated comparisons while tuning one kernel | `just bench-save-baseline <name> <suite>` then `just bench-compare <name> <suite> all-benches` | `target/bench-reports/performance.md` | Uses local `target/criterion/`; fastest loop after the baseline exists. | +| Full current la-stack vs nalgebra/faer comparison | `just bench-vs-linalg` | `target/criterion/` | Measures current la-stack, nalgebra, and faer rows. | +| README benchmark table and SVG plot | `just plot-vs-linalg-readme` after `just bench-vs-linalg` | `README.md`, `docs/assets/bench/` | Uses current `target/criterion` data. | +| Release PR performance artifact | `just performance-release v0.4.3 v0.4.2` | `docs/PERFORMANCE.md`, `docs/archive/performance/` | Mutates committed docs. Run during release preparation. | +| Compare already-published release assets | `just performance-github-assets v0.4.3 v0.4.2` | `target/bench-reports/github-assets-performance.md` | Uses GitHub Release baseline assets instead of local cargo runs. | + +Rule of thumb: + +- Use `performance-local*` for clean, self-contained answers. +- Use `bench-save-*` plus `bench-compare` for tight local optimization loops. +- Use `bench-vs-linalg` plus plotting when updating README crate-to-crate + comparisons. +- Use `performance-release` only when preparing committed release artifacts. + +## Benchmark Suites + +`la-stack` has two Criterion benchmark suites. + +**`vs_linalg`** (`benches/vs_linalg.rs`) compares `la-stack` against +`nalgebra` and `faer` across D=2-64 for LU, solve, determinant, dot, norm, and +SPD factorization operations. Use this suite to answer "why choose la-stack over +other crates?" 
+ +The SPD rows compare la-stack LDLT, faer LDLT, and nalgebra Cholesky. They are +labelled by algorithm because nalgebra does not expose a dense LDLT +factorization in the dependency version used here. + +**`exact`** (`benches/exact.rs`) measures exact-arithmetic methods +(`det_exact`, `solve_exact`, `det_sign_exact`, strict `*_result` conversions, +and lossy `*_rounded_f64` conversions) alongside f64 baselines (`det`, +`det_direct`) across D=2-5. Use this suite to understand exact-arithmetic cost +and track optimization progress. + +## Common Workflows + +### Compare Current Code With The Latest Release + +Use this when you want a clean local answer to "how does this checkout compare +with the latest published release?" -- matrix entries come from the same strictly diagonally-dominant generator - (`matrix_entry::`) -- right-hand sides and vector inputs come from the same deterministic vector - generator -- each benchmark uses `black_box` around inputs and outputs to keep the - measured operation visible to the optimizer +```bash +just performance-local +``` -The integration smoke test `tests/vs_linalg_inputs.rs` reuses the benchmark -input helpers and verifies that la-stack, nalgebra, and faer agree on the -determinant, solve, dot, and infinity-norm results for D=2..=5. Run it with -`cargo test --features bench --test vs_linalg_inputs` when changing benchmark -input construction, adding comparable kernels, or updating the `faer` or -`nalgebra` benchmark dependencies. +This creates isolated temporary worktrees, generates the latest published +release baseline locally, benchmarks the current tree on the same machine, and +writes `target/bench-reports/performance.md`. -The main comparable metrics are: +This command does not depend on existing local `target/criterion/` baselines. +It is slower than reusing a saved baseline, but less sensitive to stale local +benchmark state. 
-- `det_via_lu` — factor the matrix and compute determinant from the LU factor -- `lu` — factorization only -- `lu_solve` — factor the matrix and solve one right-hand side -- `solve_from_lu` — solve one right-hand side using a precomputed LU factor -- `det_from_lu` — compute determinant using a precomputed LU factor -- `dot` — vector dot product -- `norm2_sq` — squared Euclidean vector norm -- `inf_norm` — matrix infinity norm, implemented as maximum absolute row sum +### Compare Current Code With A Specific Release -Additional SPD metrics compare la-stack LDLT against faer LDLT and nalgebra -Cholesky. These rows are labelled by algorithm (`ldlt` or `cholesky`) because -nalgebra does not expose a dense LDLT factorization in the dependency version -used here. They should be read as SPD factorization/solve/determinant -comparisons, not as identical algorithm comparisons across all three crates. +For a narrower non-exact check against a known release pair, run: -Release-signal reports compare latest la-stack measurements against a saved -la-stack baseline, and show saved nalgebra/faer baseline timings as context -where a matching peer benchmark exists. That keeps iteration cheap while still -making the release signal auditable. The full `vs_linalg` run remains the -source of README plots and crate-to-crate comparison tables. +```bash +just performance-local-vs-linalg v0.4.3 v0.4.2 +``` + +This generates a local `v0.4.2` `vs_linalg` baseline, measures the current +la-stack `vs_linalg` rows, and renders a `vs_linalg` report. The report includes +saved baseline nalgebra/faer timings as context where matching peer rows exist, +without rerunning current peer crates. + +### Iterate Against A Local Saved Baseline + +Use local saved baselines when tuning one kernel and comparing several edits +against the same starting point. These baselines are local scratch data, not +release artifacts. 
-## Quick reference +For example, before optimizing `Matrix::inf_norm`, save a named baseline: ```bash -# Run vs_linalg benchmarks -just bench-vs-linalg +just bench-save-baseline inf-norm-before vs_linalg +``` + +Then make a change, rerun only the current measurements you care about, and +compare: -# Run only la-stack rows from vs_linalg +```bash just bench-vs-linalg-la-stack +just bench-compare inf-norm-before vs_linalg all-benches +``` -# Run exact-arithmetic benchmarks -just bench-exact +The `just bench-compare` recipe uses positional arguments: +`just bench-compare <baseline> <suite> <scope>`. The underlying +`uv run bench-compare` CLI accepts the explicit `--suite` and `--scope` flags. -# Run the cheaper latest measurements used for latest-vs-last reports -just bench-latest +`just bench-save-baseline <name>` writes Criterion samples under +`target/criterion/`. `just bench-save-last` saves the conventional local +baseline named `last`, which enables shortcuts such as: -# Save a full baseline named "last" -just bench-save-last - -# Compare latest measurements against the saved "last" baseline +```bash +just bench-latest-vs-last +just bench-vs-linalg-latest-vs just bench-compare +``` -# Run latest measurements and compare against "last" -just bench-latest-vs-last +Saved baselines persist across `git checkout` but not across `cargo clean`, and +they are not pushed to GitHub. -# Run only non-exact la-stack rows from vs_linalg and compare against "last" -just bench-vs-linalg-latest-vs +### Update The README nalgebra/faer Table + +The README benchmark table and SVG plot are crate-to-crate comparisons from the +current checkout: + +```bash +just bench-vs-linalg +just plot-vs-linalg-readme ``` -## Comparing performance across releases +`just bench-vs-linalg` measures current la-stack, nalgebra, and faer rows. 
+`just plot-vs-linalg-readme` reads those Criterion results and updates: -Criterion baselines are saved into `target/criterion/` and persist across -`git checkout` but **not** across `cargo clean`. Published releases also attach -a compressed Criterion baseline to the GitHub Release so historical release -baselines can be restored later. +- `README.md` +- `docs/assets/bench/vs_linalg_lu_solve_median.csv` +- `docs/assets/bench/vs_linalg_lu_solve_median.svg` -### Latest vs last +See `scripts/criterion_dim_plot.py --help` for plotting options. -The default workflow is optimized for the common maintenance question: -"how does latest la-stack compare to the last release?" +### Create The Release Performance Report -At release time, save a full baseline: +Release PRs promote one curated release-to-release comparison into committed +docs: ```bash -just bench-save-last +just performance-release v0.4.3 v0.4.2 ``` -During development, run the cheaper latest path: +With no arguments, `just performance-release` infers the current release tag +from `Cargo.toml` and discovers the previous stable published release. During +release preparation, passing both tags explicitly removes ambiguity. -```bash -just bench-latest-vs-last -``` +This command creates temporary worktrees, generates the comparison, writes +`docs/PERFORMANCE.md`, and archives the previous committed report under +`docs/archive/performance/`. Archive filenames are release-pair names such as +`v0.4.2-vs-v0.4.1.md`. -`bench-latest` runs exact arithmetic plus only the la-stack rows from -`vs_linalg`. The comparison report still shows the last-release nalgebra -and faer timings for matching rows, so you can see whether a la-stack -change improves or weakens the release signal without rerunning third-party -benchmarks on every iteration. 
+### Compare Published Release Artifacts -For a faster non-exact check, run: +After releases are published, the GitHub Release benchmark workflow attaches a +compressed Criterion baseline artifact. To compare those stored artifacts +without running cargo locally: ```bash -just performance-local-vs-linalg v0.4.3 v0.4.2 +just performance-github-assets v0.4.3 v0.4.2 ``` -This generates a local `v0.4.2` baseline for `vs_linalg`, measures only the -current la-stack rows from `vs_linalg`, then compares them using `--suite -vs_linalg`. The report shows saved baseline nalgebra/faer timings as context -without rerunning the peer crates on the current checkout. +With no arguments, the recipe discovers the latest stable published GitHub +release and its previous stable release automatically. -### Workflow +## Output Locations -```bash -# Current in-tree code vs latest published release, all measured locally -just performance-local +| Path | Committed? | Producer | Purpose | +|------|------------|----------|---------| +| `target/criterion/` | No | `cargo bench`, `bench-save-*` | Local Criterion measurements and named baselines. | +| `target/bench-reports/performance.md` | No | `bench-compare`, `performance-local*` | Local comparison report. | +| `target/bench-reports/github-assets-performance.md` | No | `performance-github-assets` | Local report from published release artifacts. | +| `docs/PERFORMANCE.md` | Yes | `performance-release` | Latest curated release-to-release comparison. | +| `docs/archive/performance/` | Yes | `performance-release` | Older curated release-to-release comparisons. | +| `docs/assets/bench/` | Yes | `plot-vs-linalg-readme` | README benchmark CSV/SVG assets. | +| GitHub Release asset `la-stack-$TAG-criterion-baseline.tar.gz` | Remote release artifact | `.github/workflows/release-benchmarks.yml` | Durable Criterion baseline archive for published releases. 
| -# Current in-tree non-exact kernels vs a release baseline -just performance-local-vs-linalg v0.4.3 v0.4.2 +## `vs_linalg` Methodology -# Stored GitHub Actions release assets, no local cargo runs -just performance-github-assets -``` +`vs_linalg` is a per-kernel comparison, not a single aggregate score. Each row +compares one operation for one dimension `D`, using Criterion's selected +statistic from `target/criterion/d{D}/{benchmark}/{sample}/estimates.json`. +The README table uses `median.point_estimate` in nanoseconds. Lower is better. -`performance-local` creates isolated temporary worktrees, generates the latest -published release baseline locally, then benchmarks the current in-tree code on -the same machine. It uses the current checkout's Rust toolchain for both sides -unless `RUSTUP_TOOLCHAIN` is already set. `performance-github-assets` compares -stored GitHub Actions release artifacts and does not run cargo locally. +All three crates receive equivalent deterministic inputs for a given dimension: -For local scratch comparisons, you can save multiple baselines and compare -against any of them. If the release baseline is already present in -`target/criterion/`, compare directly: +- matrix entries come from the same strictly diagonally-dominant generator + (`matrix_entry::<D>`) +- right-hand sides and vector inputs come from the same deterministic vector + generator +- each benchmark uses `black_box` around inputs and outputs to keep the + measured operation visible to the optimizer + +The integration smoke test `tests/vs_linalg_inputs.rs` reuses the benchmark +input helpers and verifies that la-stack, nalgebra, and faer agree on the +determinant, solve, dot, and infinity-norm results for D=2..=5. 
Run it with: ```bash -just bench-latest # gather latest la-stack measurements -just bench-compare v0.4.2 # compare latest measurements against v0.4.2 +cargo test --features bench --test vs_linalg_inputs ``` -### Output - -`just bench-compare` writes `target/bench-reports/performance.md` by -default. The file contains machine-specific timings and is intentionally -local. The report includes per-dimension tables showing median times, -percent change, speedup, and last-release nalgebra/faer context where a -matching `vs_linalg` peer exists. +Run that test when changing benchmark input construction, adding comparable +kernels, or updating the `faer` or `nalgebra` benchmark dependencies. -Release PRs promote one curated comparison into committed docs: +The main comparable metrics are: -```bash -just performance-release -``` +- `det_via_lu` — factor the matrix and compute determinant from the LU factor +- `lu` — LU factorization only +- `lu_solve` — factor the matrix and solve one right-hand side +- `solve_from_lu` — solve one right-hand side using a precomputed LU factor +- `det_from_lu` — compute determinant using a precomputed LU factor +- `dot` — vector dot product +- `norm2_sq` — squared Euclidean vector norm +- `inf_norm` — matrix infinity norm, implemented as maximum absolute row sum -This infers the current release tag from `Cargo.toml`, discovers the previous -stable published release, generates both sides locally in temporary worktrees, -copies the finished report to `docs/PERFORMANCE.md`, and archives the previous -committed report under `docs/archive/performance/`. Archive filenames are -release-pair names such as `v0.4.2-vs-v0.4.1.md`, so the directory and generated -index stay lexicographically sorted. For explicit release repair, pass both -tags: `just performance-release v0.4.3 v0.4.2`. 
+Additional SPD metrics compare la-stack LDLT against faer LDLT and nalgebra +Cholesky: -To compare the latest stored GitHub Actions release assets without touching the -current checkout: +- `ldlt` / `cholesky` — SPD factorization only +- `ldlt_solve` / `cholesky_solve` — factor and solve one right-hand side +- `solve_from_ldlt` / `solve_from_cholesky` — solve using a precomputed factor +- `det_from_ldlt` / `det_from_cholesky` — determinant from a precomputed factor -```bash -just performance-github-assets -``` +Read these as SPD factorization/solve/determinant comparisons, not as identical +algorithm comparisons across all three crates. -The recipe discovers the latest stable published GitHub release and its previous -stable release automatically. For explicit historical repair, pass both tags: -`just performance-github-assets v0.4.2 v0.4.1`. +Release-signal reports compare latest la-stack measurements against a saved +la-stack baseline, and show saved nalgebra/faer baseline timings as context +where a matching peer benchmark exists. That keeps iteration cheap while still +making the release signal auditable. The full `vs_linalg` run remains the source +of README plots and crate-to-crate comparison tables. + +## Exact-Arithmetic Notes + +The exact suite includes fixed per-dimension groups (`exact_d{2..5}`), random +percentile groups, and adversarial-input groups: + +- `exact_random_percentile_d{2..5}` — fixed-seed corpora of 50 strictly + diagonally-dominant random matrices per dimension. Each operation is + pre-timed across the corpus to select representative p50/p95/p99 inputs, then + Criterion measures those inputs normally. +- `exact_near_singular_3x3` — a 2^-50 perturbation of a singular base matrix; + forces the Bareiss fallback in `det_sign_exact` and exercises the largest + intermediate `BigInt` values in `solve_exact`. +- `exact_large_entries_3x3` — diagonal entries near `f64::MAX / 2` stress + `BigInt` growth during Bareiss forward elimination. 
+- `exact_hilbert_4x4` / `exact_hilbert_5x5` — classically ill-conditioned + matrices whose non-terminating-in-binary entries stress the + `f64_decompose -> BigInt` scaling path. + +Each random percentile and adversarial group runs the same exact-arithmetic +benches (`det_sign_exact`, `det_exact`, `solve_exact`, +`solve_exact_f64_result`, `solve_exact_rounded_f64`) so tables are comparable +across input classes. For exact-arithmetic comparisons against v0.4.2 or older baselines, rows such as `det_exact_rounded_f64 (vs det_exact_f64)` mean the current rounded API is being compared to the historical lossy `*_exact_f64` benchmark. Rows such as -`det_exact_f64_result (vs det_exact_f64)` intentionally show the overhead of -the new strict conversion contract against that same historical baseline. +`det_exact_f64_result (vs det_exact_f64)` intentionally show the overhead of the +new strict conversion contract against that same historical baseline. The default `release-signal` scope reports exact-arithmetic rows whose inputs are fixed across versions: deterministic D=2..=5 cases plus adversarial fixed @@ -246,30 +291,16 @@ To generate a current snapshot without a saved baseline: uv run bench-compare --snapshot ``` -## vs\_linalg plotting - -The `criterion_dim_plot.py` script generates CSV/SVG plots and updates the -README benchmark table from vs\_linalg results: - -```bash -# Run benchmarks + update README table and SVG plot -just bench-vs-linalg -just plot-vs-linalg-readme -``` - -See `scripts/criterion_dim_plot.py --help` for options. - -## Release workflow +## Release Notes -At release time, save a local baseline so future work can compare against it: +Local Criterion baselines are optional during release. 
Save them only if you +want convenience baselines for follow-up development on the same machine: ```bash just bench-save-baseline just bench-save-last ``` -When the GitHub Release is published, `.github/workflows/release-benchmarks.yml` -saves a full release baseline and attaches -`la-stack-$TAG-criterion-baseline.tar.gz` to the release as the durable archive. -See the `just performance-release` step in `docs/RELEASING.md` for where the -curated `docs/PERFORMANCE.md` comparison fits in the release process. +The durable published baseline is the GitHub Release artifact created by +`.github/workflows/release-benchmarks.yml`. The committed release comparison is +`docs/PERFORMANCE.md`, created by `just performance-release`. diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md index 961762b..205cd47 100644 --- a/docs/PERFORMANCE.md +++ b/docs/PERFORMANCE.md @@ -1,97 +1,113 @@ -# Exact Arithmetic Performance +# Benchmark Performance -**la-stack** v0.4.2 · `7e11f93` (HEAD) · 2026-06-08 20:39:03 UTC +**la-stack** v0.4.3 · `45affa8` (HEAD) · 2026-06-09 08:41:32 UTC **Statistic**: median +**Suite**: all +**Scope**: release-signal ## Benchmark Results -Comparison against baseline **v0.4.1**: +Comparison against baseline **v0.4.2**: Negative change = faster. Speedup > 1.00x = improvement. 
+## Exact arithmetic + ### D=2 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det | 0.6 ns | 0.9 ns | +61.1% | 0.62x | -| det_direct | 0.7 ns | 1.0 ns | +44.7% | 0.69x | -| det_exact | 315.5 ns | 318.4 ns | +0.9% | 0.99x | -| det_exact_f64 | 555.7 ns | 555.7 ns | -0.0% | 1.00x | -| det_sign_exact | 0.7 ns | 1.5 ns | +128.2% | 0.44x | -| solve_exact | 7.05 µs | 7.06 µs | +0.2% | 1.00x | -| solve_exact_f64 | 7.50 µs | 7.67 µs | +2.3% | 0.98x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det | 0.9 ns | 0.7 ns | **-24.0%** | 1.32x | +| det_direct | 1.0 ns | 1.0 ns | +2.1% | 0.98x | +| det_exact | 248.9 ns | 195.6 ns | **-21.4%** | 1.27x | +| det_exact_f64_result (vs det_exact_f64) | 429.1 ns | 167.6 ns | **-60.9%** | 2.56x | +| det_exact_rounded_f64 (vs det_exact_f64) | 429.1 ns | 375.2 ns | **-12.6%** | 1.14x | +| det_sign_exact | 1.5 ns | 3.2 ns | +115.9% | 0.46x | +| solve_exact | 6.53 µs | 6.45 µs | **-1.1%** | 1.01x | +| solve_exact_f64_result (vs solve_exact_f64) | 6.90 µs | 6.60 µs | **-4.4%** | 1.05x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 6.90 µs | 7.02 µs | +1.7% | 0.98x | ### D=3 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det | 1.3 ns | 1.8 ns | +36.3% | 0.73x | -| det_direct | 4.7 ns | 2.2 ns | **-51.9%** | 2.08x | -| det_exact | 936.9 ns | 924.3 ns | **-1.3%** | 1.01x | -| det_exact_f64 | 1.18 µs | 1.19 µs | +1.1% | 0.99x | -| det_sign_exact | 2.4 ns | 4.2 ns | +78.1% | 0.56x | -| solve_exact | 27.02 µs | 27.41 µs | +1.5% | 0.99x | -| solve_exact_f64 | 28.06 µs | 27.98 µs | -0.3% | 1.00x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det | 1.8 ns | 1.5 ns | **-19.4%** | 1.24x | +| det_direct | 2.0 ns | 2.0 ns | +2.4% | 0.98x | +| det_exact | 739.0 ns | 468.6 ns | **-36.6%** | 1.58x | +| 
det_exact_f64_result (vs det_exact_f64) | 913.1 ns | 435.6 ns | **-52.3%** | 2.10x | +| det_exact_rounded_f64 (vs det_exact_f64) | 913.1 ns | 648.1 ns | **-29.0%** | 1.41x | +| det_sign_exact | 4.2 ns | 5.5 ns | +30.9% | 0.76x | +| solve_exact | 25.69 µs | 25.16 µs | **-2.1%** | 1.02x | +| solve_exact_f64_result (vs solve_exact_f64) | 26.16 µs | 25.42 µs | **-2.8%** | 1.03x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 26.16 µs | 25.67 µs | **-1.9%** | 1.02x | ### D=4 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det | 2.4 ns | 3.3 ns | +36.8% | 0.73x | -| det_direct | 2.4 ns | 4.1 ns | +70.2% | 0.59x | -| det_exact | 2.33 µs | 2.33 µs | -0.0% | 1.00x | -| det_exact_f64 | 2.59 µs | 2.58 µs | -0.7% | 1.01x | -| det_sign_exact | 5.3 ns | 6.9 ns | +30.5% | 0.77x | -| solve_exact | 67.14 µs | 67.99 µs | +1.3% | 0.99x | -| solve_exact_f64 | 67.86 µs | 68.51 µs | +1.0% | 0.99x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det | 3.3 ns | 4.5 ns | +38.1% | 0.72x | +| det_direct | 3.7 ns | 4.3 ns | +17.6% | 0.85x | +| det_exact | 1.87 µs | 1.47 µs | **-21.8%** | 1.28x | +| det_exact_f64_result (vs det_exact_f64) | 2.04 µs | 1.47 µs | **-27.9%** | 1.39x | +| det_exact_rounded_f64 (vs det_exact_f64) | 2.04 µs | 1.63 µs | **-19.8%** | 1.25x | +| det_sign_exact | 6.9 ns | 11.5 ns | +67.1% | 0.60x | +| solve_exact | 64.95 µs | 61.67 µs | **-5.1%** | 1.05x | +| solve_exact_f64_result (vs solve_exact_f64) | 66.35 µs | 62.37 µs | **-6.0%** | 1.06x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 66.35 µs | 63.59 µs | **-4.2%** | 1.04x | ### D=5 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det | 21.6 ns | 24.5 ns | +13.7% | 0.88x | -| det_direct | 2.3 ns | 4.7 ns | +104.8% | 0.49x | -| det_exact | 5.04 µs | 4.99 µs | -1.0% | 1.01x | -| det_exact_f64 | 5.32 µs | 5.31 µs | -0.1% | 1.00x | 
-| det_sign_exact | 4.97 µs | 4.99 µs | +0.3% | 1.00x | -| solve_exact | 134.99 µs | 136.04 µs | +0.8% | 0.99x | -| solve_exact_f64 | 137.11 µs | 138.97 µs | +1.4% | 0.99x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det | 26.0 ns | 23.3 ns | **-10.6%** | 1.12x | +| det_direct | 4.5 ns | 2.5 ns | **-44.2%** | 1.79x | +| det_exact | 4.10 µs | 4.05 µs | **-1.3%** | 1.01x | +| det_exact_f64_result (vs det_exact_f64) | 4.21 µs | 4.02 µs | **-4.4%** | 1.05x | +| det_exact_rounded_f64 (vs det_exact_f64) | 4.21 µs | 4.33 µs | +2.8% | 0.97x | +| det_sign_exact | 3.94 µs | 3.96 µs | +0.6% | 0.99x | +| solve_exact | 130.82 µs | 126.75 µs | **-3.1%** | 1.03x | +| solve_exact_f64_result (vs solve_exact_f64) | 132.70 µs | 127.37 µs | **-4.0%** | 1.04x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 132.70 µs | 128.15 µs | **-3.4%** | 1.04x | ### Near-singular 3x3 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det_sign_exact | 871.8 ns | 877.6 ns | +0.7% | 0.99x | -| det_exact | 907.3 ns | 904.4 ns | -0.3% | 1.00x | -| solve_exact | 4.31 µs | 4.25 µs | **-1.5%** | 1.02x | -| solve_exact_f64 | 4.29 µs | 4.32 µs | +0.7% | 0.99x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det_sign_exact | 705.2 ns | 444.2 ns | **-37.0%** | 1.59x | +| det_exact | 724.0 ns | 478.9 ns | **-33.9%** | 1.51x | +| solve_exact | 3.44 µs | 3.39 µs | **-1.6%** | 1.02x | +| solve_exact_f64_result (vs solve_exact_f64) | 3.47 µs | 3.36 µs | **-3.2%** | 1.03x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 3.47 µs | 3.39 µs | **-2.5%** | 1.03x | ### Large entries 3x3 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det_sign_exact | 3.14 µs | 3.09 µs | **-1.3%** | 1.01x | -| det_exact | 3.19 µs | 3.11 µs | **-2.3%** | 1.02x | -| solve_exact | 84.77 µs | 83.89 
µs | **-1.0%** | 1.01x | -| solve_exact_f64 | 84.62 µs | 83.92 µs | -0.8% | 1.01x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det_sign_exact | 2.91 µs | 402.4 ns | **-86.2%** | 7.23x | +| det_exact | 2.94 µs | 434.0 ns | **-85.2%** | 6.76x | +| solve_exact | 82.81 µs | 81.57 µs | **-1.5%** | 1.02x | +| solve_exact_f64_result (vs solve_exact_f64) | 84.32 µs | 81.66 µs | **-3.1%** | 1.03x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 84.32 µs | 82.04 µs | **-2.7%** | 1.03x | ### Hilbert 4x4 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det_sign_exact | 5.3 ns | 6.9 ns | +30.4% | 0.77x | -| det_exact | 2.39 µs | 2.31 µs | **-3.2%** | 1.03x | -| solve_exact | 51.69 µs | 52.27 µs | +1.1% | 0.99x | -| solve_exact_f64 | 52.90 µs | 53.26 µs | +0.7% | 0.99x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det_sign_exact | 6.9 ns | 11.5 ns | +66.4% | 0.60x | +| det_exact | 1.91 µs | 1.50 µs | **-21.7%** | 1.28x | +| solve_exact | 49.42 µs | 47.77 µs | **-3.3%** | 1.03x | +| solve_exact_f64_result (vs solve_exact_f64) | 50.38 µs | 47.67 µs | **-5.4%** | 1.06x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 50.38 µs | 48.17 µs | **-4.4%** | 1.05x | ### Hilbert 5x5 -| Benchmark | v0.4.1 | Current | Change | Speedup | -|-----------|-------:|--------:|-------:|--------:| -| det_sign_exact | 5.03 µs | 4.88 µs | **-2.9%** | 1.03x | -| det_exact | 5.07 µs | 4.96 µs | **-2.1%** | 1.02x | -| solve_exact | 105.35 µs | 102.72 µs | **-2.5%** | 1.03x | -| solve_exact_f64 | 104.99 µs | 103.94 µs | -1.0% | 1.01x | +| Benchmark | v0.4.2 | Latest | Change | Speedup | +|-----------|-------:|-------:|-------:|--------:| +| det_sign_exact | 4.09 µs | 3.91 µs | **-4.6%** | 1.05x | +| det_exact | 4.00 µs | 4.02 µs | +0.6% | 0.99x | +| solve_exact | 98.71 µs | 95.41 µs | **-3.4%** | 1.03x | +| 
solve_exact_f64_result (vs solve_exact_f64) | 99.88 µs | 98.14 µs | **-1.7%** | 1.02x | +| solve_exact_rounded_f64 (vs solve_exact_f64) | 99.88 µs | 97.50 µs | **-2.4%** | 1.02x | ## How to Update @@ -114,4 +130,6 @@ just performance-release `just performance-local` writes `target/bench-reports/performance.md`. `just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`. +Older curated release-to-release reports are archived in `docs/archive/performance/`. + See `docs/BENCHMARKING.md` for the full comparison workflow. diff --git a/docs/RELEASING.md b/docs/RELEASING.md index afd9712..07d7c00 100644 --- a/docs/RELEASING.md +++ b/docs/RELEASING.md @@ -118,40 +118,24 @@ repair, run `just performance-release `. To compare the stored GitHub Actions release assets instead of running cargo locally, use `just performance-github-assets`. -6. Save benchmark baselines for this release - -```bash -# Save a named full baseline for this release -just bench-save-baseline $TAG - -# Also refresh the conventional "last" baseline used by local -# latest-vs-last performance checks -just bench-save-last -``` - -These baselines can be compared against in future optimization work on the -release branch. The default local report command, `just bench-compare`, compares -latest measurements against `last` and writes -`target/bench-reports/performance.md`; it does not update README benchmark -tables or committed release artifacts. - After the GitHub Release is published, the `Release Benchmarks` workflow checks out the release tag, saves a full Criterion baseline, and attaches `la-stack-$TAG-criterion-baseline.tar.gz` to the release. That release asset is the durable archive for historical baseline comparisons; the workflow also uploads a short-lived Actions artifact for debugging the run. -See `docs/BENCHMARKING.md` for the full comparison workflow. +See `docs/BENCHMARKING.md` for local saved-baseline workflows and the full +comparison command reference. -7. 
Validate the release branch +6. Validate the release branch ```bash just ci just citation-check -cargo publish --locked --dry-run +cargo publish --locked --allow-dirty --dry-run ``` -8. Stage and commit release artifacts +7. Stage and commit release artifacts ```bash git add Cargo.toml Cargo.lock CITATION.cff pyproject.toml CHANGELOG.md README.md docs/ @@ -165,7 +149,7 @@ git commit -m "chore(release): release $TAG - Update documentation for release" ``` -9. Push the branch and open a PR +8. Push the branch and open a PR ```bash git push -u origin "release/$TAG" diff --git a/docs/archive/performance/README.md b/docs/archive/performance/README.md index dd5fc68..087b380 100644 --- a/docs/archive/performance/README.md +++ b/docs/archive/performance/README.md @@ -4,3 +4,4 @@ Older release-to-release benchmark comparisons are archived here. `docs/PERFORMANCE.md` contains the latest curated comparison. - [v0.4.1-vs-v0.4.0](v0.4.1-vs-v0.4.0.md) +- [v0.4.2-vs-v0.4.1](v0.4.2-vs-v0.4.1.md) diff --git a/docs/archive/performance/v0.4.2-vs-v0.4.1.md b/docs/archive/performance/v0.4.2-vs-v0.4.1.md new file mode 100644 index 0000000..01a68be --- /dev/null +++ b/docs/archive/performance/v0.4.2-vs-v0.4.1.md @@ -0,0 +1,119 @@ +# Exact Arithmetic Performance + +**la-stack** v0.4.2 · `7e11f93` (HEAD) · 2026-06-08 20:39:03 UTC +**Statistic**: median + +## Benchmark Results + +Comparison against baseline **v0.4.1**: + +Negative change = faster. Speedup > 1.00x = improvement. 
+ +### D=2 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 0.6 ns | 0.9 ns | +61.1% | 0.62x | +| det_direct | 0.7 ns | 1.0 ns | +44.7% | 0.69x | +| det_exact | 315.5 ns | 318.4 ns | +0.9% | 0.99x | +| det_exact_f64 | 555.7 ns | 555.7 ns | -0.0% | 1.00x | +| det_sign_exact | 0.7 ns | 1.5 ns | +128.2% | 0.44x | +| solve_exact | 7.05 µs | 7.06 µs | +0.2% | 1.00x | +| solve_exact_f64 | 7.50 µs | 7.67 µs | +2.3% | 0.98x | + +### D=3 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 1.3 ns | 1.8 ns | +36.3% | 0.73x | +| det_direct | 4.7 ns | 2.2 ns | **-51.9%** | 2.08x | +| det_exact | 936.9 ns | 924.3 ns | **-1.3%** | 1.01x | +| det_exact_f64 | 1.18 µs | 1.19 µs | +1.1% | 0.99x | +| det_sign_exact | 2.4 ns | 4.2 ns | +78.1% | 0.56x | +| solve_exact | 27.02 µs | 27.41 µs | +1.5% | 0.99x | +| solve_exact_f64 | 28.06 µs | 27.98 µs | -0.3% | 1.00x | + +### D=4 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 2.4 ns | 3.3 ns | +36.8% | 0.73x | +| det_direct | 2.4 ns | 4.1 ns | +70.2% | 0.59x | +| det_exact | 2.33 µs | 2.33 µs | -0.0% | 1.00x | +| det_exact_f64 | 2.59 µs | 2.58 µs | -0.7% | 1.01x | +| det_sign_exact | 5.3 ns | 6.9 ns | +30.5% | 0.77x | +| solve_exact | 67.14 µs | 67.99 µs | +1.3% | 0.99x | +| solve_exact_f64 | 67.86 µs | 68.51 µs | +1.0% | 0.99x | + +### D=5 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det | 21.6 ns | 24.5 ns | +13.7% | 0.88x | +| det_direct | 2.3 ns | 4.7 ns | +104.8% | 0.49x | +| det_exact | 5.04 µs | 4.99 µs | -1.0% | 1.01x | +| det_exact_f64 | 5.32 µs | 5.31 µs | -0.1% | 1.00x | +| det_sign_exact | 4.97 µs | 4.99 µs | +0.3% | 1.00x | +| solve_exact | 134.99 µs | 136.04 µs | +0.8% | 0.99x | +| solve_exact_f64 | 137.11 µs | 138.97 µs | +1.4% | 0.99x | + +### Near-singular 3x3 
+ +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 871.8 ns | 877.6 ns | +0.7% | 0.99x | +| det_exact | 907.3 ns | 904.4 ns | -0.3% | 1.00x | +| solve_exact | 4.31 µs | 4.25 µs | **-1.5%** | 1.02x | +| solve_exact_f64 | 4.29 µs | 4.32 µs | +0.7% | 0.99x | + +### Large entries 3x3 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 3.14 µs | 3.09 µs | **-1.3%** | 1.01x | +| det_exact | 3.19 µs | 3.11 µs | **-2.3%** | 1.02x | +| solve_exact | 84.77 µs | 83.89 µs | **-1.0%** | 1.01x | +| solve_exact_f64 | 84.62 µs | 83.92 µs | -0.8% | 1.01x | + +### Hilbert 4x4 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 5.3 ns | 6.9 ns | +30.4% | 0.77x | +| det_exact | 2.39 µs | 2.31 µs | **-3.2%** | 1.03x | +| solve_exact | 51.69 µs | 52.27 µs | +1.1% | 0.99x | +| solve_exact_f64 | 52.90 µs | 53.26 µs | +0.7% | 0.99x | + +### Hilbert 5x5 + +| Benchmark | v0.4.1 | Current | Change | Speedup | +|-----------|-------:|--------:|-------:|--------:| +| det_sign_exact | 5.03 µs | 4.88 µs | **-2.9%** | 1.03x | +| det_exact | 5.07 µs | 4.96 µs | **-2.1%** | 1.02x | +| solve_exact | 105.35 µs | 102.72 µs | **-2.5%** | 1.03x | +| solve_exact_f64 | 104.99 µs | 103.94 µs | -1.0% | 1.01x | + +## How to Update + +Local performance reports are generated in isolated temporary worktrees: + +```bash +# Local development: compare the current tree with the latest release +just performance-local + +# Release PR: update docs/PERFORMANCE.md and archive the previous report +just performance-release + +# GitHub Actions release assets +just performance-github-assets + +# Explicit repair +just performance-release +``` + +`just performance-local` writes `target/bench-reports/performance.md`. 
+`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`. + +Older curated release-to-release reports are archived in `docs/archive/performance/`. + +See `docs/BENCHMARKING.md` for the full comparison workflow. diff --git a/docs/assets/bench/vs_linalg_lu_solve_median.csv b/docs/assets/bench/vs_linalg_lu_solve_median.csv index b47bf82..55a3e34 100644 --- a/docs/assets/bench/vs_linalg_lu_solve_median.csv +++ b/docs/assets/bench/vs_linalg_lu_solve_median.csv @@ -1,9 +1,9 @@ D,la_stack,la_lo,la_hi,nalgebra,na_lo,na_hi,faer,fa_lo,fa_hi -2,2.585300857336246,2.582809844321982,2.586669800817735,4.486088257090016,4.479229681572247,4.492362317908142,137.6529309582465,137.31998885139873,137.93586339517975 -3,12.203757609083176,12.183054403617847,12.224101270062874,22.990257557091617,22.91154128675699,23.12912235259086,182.61806110946827,181.880047375976,183.37040632194638 -4,27.227520807109173,27.202299533674857,27.266723521091606,51.66011127074051,51.555715086488036,51.840625331770724,208.18091469437718,207.68326634854134,209.09477296109242 -5,53.14134517762996,53.09941497989714,53.281547651078945,68.71356228143745,68.56318667878111,68.77114355114793,272.1171465104768,270.47097586206894,274.01601221264366 -8,141.2790253849833,140.95206481826406,141.68049543712607,162.22478601970167,161.86865626610683,162.56549560477754,348.21559402824914,347.18022267396856,349.1754040441267 -16,626.5606138541871,624.4956536182669,627.6291586692258,574.1147371436412,572.7849165188117,575.6888051044084,854.9407805348176,853.314408186121,859.1660992820005 -32,2862.794789510007,2860.7804785793915,2864.5392419175027,2709.532471877147,2702.3908894171072,2716.620725916311,2806.6981316154734,2802.5897840011316,2814.7045584045586 -64,19703.238514957266,19501.774005848587,19923.90512820513,14388.28538961039,14376.077992277993,14407.440476190477,12085.452737127373,12070.825515947467,12098.666085946574 
+2,2.0437070181606085,2.0425631264853448,2.04607019228461,4.542175209414122,4.538937481352276,4.545658680820327,143.95823454981254,143.35650513307058,144.55261268043853 +3,9.595660662683073,9.588797748121927,9.613326467320519,23.59900523298633,23.193334672880862,23.89156203624332,185.46552169511642,184.30770285322595,186.6833373684646 +4,23.338056895226114,23.241755833807627,23.450110414132816,50.71697549754131,50.61114270941055,50.84226416178329,210.97635296260154,210.09558569349872,211.8172518966618 +5,45.36802780858268,45.33368856824674,45.45857791494792,69.06519772701617,68.91705833943838,69.22990310070215,277.56412002670754,276.40090137857896,278.9175994658497 +8,127.86115337657483,127.67784037443938,127.97259027100068,164.41175356549803,163.98918099473198,164.89947719932118,364.864449238967,363.9575332348597,365.67189807976365 +16,631.9974053918763,630.5943769720783,634.117961205346,663.82156895873,608.9343639517663,684.2256402276,882.673648246476,880.5187023299551,885.2058986456484 +32,2745.604342979343,2733.882221204255,2755.327066196631,2424.5398969497583,2422.7844615775653,2425.665479115479,2867.431290847727,2862.8613304608957,2879.1905179193345 +64,17543.03432206594,17378.64912280702,17669.562753036436,14747.730784813924,14732.373598480488,14759.246031746032,12266.27068329904,12250.220156695157,12279.630177029401 diff --git a/docs/assets/bench/vs_linalg_lu_solve_median.svg b/docs/assets/bench/vs_linalg_lu_solve_median.svg index a7e281e..deb7b3e 100644 --- a/docs/assets/bench/vs_linalg_lu_solve_median.svg +++ b/docs/assets/bench/vs_linalg_lu_solve_median.svg @@ -244,51 +244,51 @@ - la-stack v0.4.2 + la-stack v0.4.3 - + - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + @@ -302,47 +302,47 @@ - + - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + @@ -356,47 +356,47 @@ - + - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + diff --git a/docs/roadmap.md b/docs/roadmap.md index ae3d49f..7e81b93 100644 --- a/docs/roadmap.md 
+++ b/docs/roadmap.md @@ -100,6 +100,35 @@ a modern typed baseline. - [#142](https://github.com/acgetchell/la-stack/issues/142) - Update Python tooling to 3.13 and parse scripts at boundaries. +Release posture: + +- Release `v0.4.3` before starting another performance-focused implementation + branch. The current release-signal comparison against `v0.4.2` shows broad + improvement across LU, solve, determinant-via-LU, and vector helper rows. +- Treat the remaining `D=4` direct determinant regression as a tracked + performance note rather than a release blocker because the LU-backed + determinant and solve paths improved. +- Defer `Matrix::inf_norm` optimization to follow-up work after `v0.4.3`. + Larger-dimension `vs_linalg` measurements suggest it is the most interesting + leaf-kernel target, but it is not required for the release. + +### v0.4.4 Focused Leaf-Kernel Performance + +After `v0.4.3`, use the improved benchmark workflow to investigate narrow +leaf-kernel performance gaps without broadening the crate's scope or weakening +the small fixed-dimension API model. + +- [#154](https://github.com/acgetchell/la-stack/issues/154) - Investigate + `Matrix::inf_norm` performance against `nalgebra` and `faer`. +- [#155](https://github.com/acgetchell/la-stack/issues/155) - Investigate + `Vector::dot` and `Vector::norm2_sq` performance against `nalgebra` and + `faer`. + +The goal is targeted profiling and implementation cleanup for operations where +`vs_linalg` shows a meaningful peer-crate gap. Release scope should stay limited +to changes that preserve numerical behavior, allocation-free fixed-size storage, +and clear const-generic code. + ### v0.5.0 Generic Const Expressions `v0.5.0` is reserved for the post-stabilization const-generic API revision. 
diff --git a/pyproject.toml b/pyproject.toml index 9136cf4..bd35633 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "la-stack-scripts" -version = "0.4.2" +version = "0.4.3" description = "Python utility scripts for the la-stack Rust library" readme = "README.md" requires-python = ">=3.13" diff --git a/scripts/archive_performance.py b/scripts/archive_performance.py index 8e0d8d1..1caf958 100644 --- a/scripts/archive_performance.py +++ b/scripts/archive_performance.py @@ -309,6 +309,8 @@ def _how_to_update_section() -> str: "`just performance-local` writes `target/bench-reports/performance.md`.", "`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.", "", + "Older curated release-to-release reports are archived in `docs/archive/performance/`.", + "", "See `docs/BENCHMARKING.md` for the full comparison workflow.", "", ] diff --git a/scripts/bench_compare.py b/scripts/bench_compare.py index df9b689..8a1776c 100644 --- a/scripts/bench_compare.py +++ b/scripts/bench_compare.py @@ -726,6 +726,8 @@ def _generate_markdown( "`just performance-local` writes `target/bench-reports/performance.md`.", "`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.", "", + "Older curated release-to-release reports are archived in `docs/archive/performance/`.", + "", "See `docs/BENCHMARKING.md` for the full comparison workflow.", ] ) diff --git a/scripts/tests/test_archive_performance.py b/scripts/tests/test_archive_performance.py index 78f04fb..cc7665b 100644 --- a/scripts/tests/test_archive_performance.py +++ b/scripts/tests/test_archive_performance.py @@ -53,6 +53,11 @@ def _normalized_report(version: str, baseline: str) -> str: return archive_performance._normalize_how_to_update(_report(version, baseline)) +def test_normalized_report_links_archived_performance_reports() -> None: + text = _normalized_report("0.4.3", "v0.4.2") + assert 
"Older curated release-to-release reports are archived in `docs/archive/performance/`." in text + + def _legacy_report(version: str, baseline: str) -> str: return ( _report(version, baseline) diff --git a/scripts/tests/test_bench_compare.py b/scripts/tests/test_bench_compare.py index 04c59bc..1d78e98 100644 --- a/scripts/tests/test_bench_compare.py +++ b/scripts/tests/test_bench_compare.py @@ -379,6 +379,7 @@ def test_main_snapshot_writes_output(tmp_path: Path) -> None: assert "just performance-release" in text assert "just performance-github-assets" in text assert "just performance-release " in text + assert "Older curated release-to-release reports are archived in `docs/archive/performance/`." in text assert "git checkout" not in text diff --git a/uv.lock b/uv.lock index c8f71d0..6259155 100644 --- a/uv.lock +++ b/uv.lock @@ -409,7 +409,7 @@ wheels = [ [[package]] name = "la-stack-scripts" -version = "0.4.2" +version = "0.4.3" source = { editable = "." } [package.dev-dependencies]