From 925cbb72be101aa10e612f80b07c1cad8e298434 Mon Sep 17 00:00:00 2001
From: Adam Getchell <acgetchell@ucdavis.edu>
Date: Tue, 9 Jun 2026 01:48:37 -0700
Subject: [PATCH] chore(release): release v0.4.3

- Bump crate, script package, lockfile, README, and citation metadata to v0.4.3
- Promote the v0.4.3 performance report and archive the v0.4.2 comparison
- Refresh README nalgebra/faer benchmark data and plot assets
- Reorganize benchmark documentation around release, local, and artifact workflows
- Update release guidance and roadmap follow-up items for v0.4.4 performance work
---
 CHANGELOG.md                                  |  42 +-
 CITATION.cff                                  |   2 +-
 Cargo.lock                                    |   2 +-
 Cargo.toml                                    |   2 +-
 README.md                                     |  22 +-
 docs/BENCHMARKING.md                          | 419 ++++++++++--------
 docs/PERFORMANCE.md                           | 144 +++---
 docs/RELEASING.md                             |  28 +-
 docs/archive/performance/README.md            |   1 +
 docs/archive/performance/v0.4.2-vs-v0.4.1.md  | 119 +++++
 .../bench/vs_linalg_lu_solve_median.csv       |  16 +-
 .../bench/vs_linalg_lu_solve_median.svg       | 146 +++---
 docs/roadmap.md                               |  29 ++
 pyproject.toml                                |   2 +-
 scripts/archive_performance.py                |   2 +
 scripts/bench_compare.py                      |   2 +
 scripts/tests/test_archive_performance.py     |   5 +
 scripts/tests/test_bench_compare.py           |   1 +
 uv.lock                                       |   2 +-
 19 files changed, 609 insertions(+), 377 deletions(-)
 create mode 100644 docs/archive/performance/v0.4.2-vs-v0.4.1.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eed0080..8d09cb9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [0.4.3] - 2026-06-09
 
 ### ⚠️ Breaking Changes
 
@@ -49,6 +49,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Return typed Unrepresentable reasons when strict exact-to-f64 conversion would round or become non-finite.
   - Specialize D4 exact determinants and keep determinant/error-bound zero coefficients from evaluating overflowing absent terms.
   - Update exact benchmark comparison reporting to compare strict and rounded APIs against legacy v0.4.2 rows.
+- Archive release performance reports [`2817d01`](https://github.com/acgetchell/la-stack/commit/2817d01374ad0aeab98d6f48a3dae9b30f878a8a)
+  - Add an archive-performance utility that promotes curated benchmark reports into docs/PERFORMANCE.md while archiving prior release comparisons
+  - Generate release comparisons in isolated temporary worktrees, including legacy command fallback for published tags
+  - Wire release and historical archive recipes into just, Python packaging, and release documentation
+- Automate published performance report archiving [`d31e26a`](https://github.com/acgetchell/la-stack/commit/d31e26a9d7a47a6c3089028630640bcff5afe7c0)
+  - Track the latest curated release comparison in docs/PERFORMANCE.md and archive older comparisons under docs/archive/performance/
+  - Let performance-archive-published discover the latest stable GitHub release and previous stable baseline automatically
+  - Generate release comparisons in isolated temporary worktrees, with release-asset restore and local baseline fallback paths
+  - Update benchmark and release docs to use the scripted workflow instead of manual checkout steps
+- Split local and release performance comparisons [`7258525`](https://github.com/acgetchell/la-stack/commit/7258525590f2ed68d41879e71c833010e408e7f7)
+  - Add default performance-local and performance-release workflows that infer the relevant release tags and run in temporary worktrees.
+  - Add a performance-github-assets workflow for comparing stored GitHub Actions release benchmark assets without local cargo runs.
+  - Normalize release tags before fetching, downloading assets, or checking out detached worktrees.
+  - Update performance docs, release guidance, and generated report instructions to use the new benchmark workflows.
+- Add vs_linalg-only performance checks [`d7c1487`](https://github.com/acgetchell/la-stack/commit/d7c1487115e1a8e5bb1ec4fcc7592786e300e2ce)
+  - Add local workflows for comparing current non-exact la-stack kernels against a release baseline without rerunning current nalgebra/faer or exact benchmarks.
+  - Route archive-performance baseline and current benchmark commands by suite, with legacy fallback support for older release worktrees.
+  - Document the faster release-signal workflow and expand Semgrep fixtures for benchmark, example, doctest, and public panic-path rules.
 
 ### Changed
 
@@ -61,6 +79,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Replace mypy with strict Ty checking in the Python workflow.
   - Parse TOML, JSON, argparse, and Semgrep inputs into typed boundary objects before downstream use.
   - Reject malformed Criterion estimates, non-finite timings, invalid confidence intervals, and malformed Semgrep result shapes.
+- Harden Rust release hygiene [`8e12c93`](https://github.com/acgetchell/la-stack/commit/8e12c935fe54e265e8ceb640702267ec0e71b7b1)
+  - Promote missing documentation and dead code lints to deny-level checks.
+  - Forbid unsafe code explicitly across Rust modules and benchmark targets.
+  - Document the LU/LDLT empty-matrix convention for D=0.
+  - Move exact benchmark input generation into typed helpers and consolidate exact benchmark operation dispatch.
 
 ### Documentation
 
@@ -82,6 +105,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
   - Return matrix-cell metadata when inf-norm row sums or symmetry tolerance scaling overflow.
   - Avoid reparsing finite-by-construction RHS vectors in LU and LDLT solves.
+- Re-raise unexpected archive failures [`7938386`](https://github.com/acgetchell/la-stack/commit/7938386166f1f3f5cf594c5def67458d48e19a98)
+  - Limit archive-performance CLI error handling to expected validation, filesystem, subprocess, and runtime failures.
+  - Let unexpected exceptions propagate so benchmark archiving bugs surface during development.
+
+### Performance
+
+- Improve factorization kernel [`8837df1`](https://github.com/acgetchell/la-stack/commit/8837df1f54a9fa2c20abc1487cfce4de8c8e09c5)
+
+  - Preserve the tiny-dimension update shape for D2-D5 to avoid regressing the core fixed-size path
+  - Fuse multiplier computation with trailing updates for larger dimensions to reduce extra column walks
+  - Rely on the LDLT factorization proof instead of a redundant final finite-storage scan
+- Optimize exact and factorized solve kernels [`1690355`](https://github.com/acgetchell/la-stack/commit/1690355bf27c2cbba685ba0cd70486275c7620b8)
+  - Split LU and LDLT solve paths so tiny matrices keep the direct kernels while larger fixed dimensions avoid extra substitution work.
+  - Convert dyadic exact solve results directly to finite f64 and preserve UnrepresentableReason recovery semantics on strict conversion failures.
+  - Modernize release branch commands and keep just recipes sorted.
 
 ## [0.4.2] - 2026-06-04
 
@@ -634,7 +672,7 @@ Older releases are archived by minor series:
 - [0.2.x](docs/archive/changelog/0.2.md)
 - [0.1.x](docs/archive/changelog/0.1.md)
 
-[Unreleased]: https://github.com/acgetchell/la-stack/compare/v0.4.2...HEAD
+[0.4.3]: https://github.com/acgetchell/la-stack/compare/v0.4.2...v0.4.3
 [0.4.2]: https://github.com/acgetchell/la-stack/compare/v0.4.1...v0.4.2
 [0.4.1]: https://github.com/acgetchell/la-stack/compare/v0.4.0...v0.4.1
 [0.4.0]: https://github.com/acgetchell/la-stack/compare/v0.3.0...v0.4.0
diff --git a/CITATION.cff b/CITATION.cff
index cde70f6..4f7b4db 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -2,7 +2,7 @@ cff-version: 1.2.0
 message: "If you use this software, please cite it as below."
 type: software
 title: "la-stack: Fast, stack-allocated linear algebra for fixed dimensions in Rust"
-version: 0.4.2
+version: 0.4.3
-date-released: 2026-06-04
+date-released: 2026-06-09
 url: "https://github.com/acgetchell/la-stack"
 repository-code: "https://github.com/acgetchell/la-stack"
diff --git a/Cargo.lock b/Cargo.lock
index ae84265..b0dcf0c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -603,7 +603,7 @@ dependencies = [
 
 [[package]]
 name = "la-stack"
-version = "0.4.2"
+version = "0.4.3"
 dependencies = [
  "approx",
  "criterion",
diff --git a/Cargo.toml b/Cargo.toml
index 083635a..b64550f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "la-stack"
-version = "0.4.2"
+version = "0.4.3"
 edition = "2024"
 rust-version = "1.96"
 license = "BSD-3-Clause"
diff --git a/README.md b/README.md
index 7c92b06..9a2aee0 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ Add this to your `Cargo.toml`:
 
 ```toml
 [dependencies]
-la-stack = "0.4.2"
+la-stack = "0.4.3"
 ```
 
 ### Feature flags
@@ -207,7 +207,7 @@ rationals (this pulls in `num-bigint`, `num-rational`, and `num-traits` for
 
 ```toml
 [dependencies]
-la-stack = { version = "0.4.2", features = ["exact"] }
+la-stack = { version = "0.4.3", features = ["exact"] }
 ```
 
 **Determinants:**
@@ -383,19 +383,21 @@ operations.
 
 For the full per-kernel comparison methodology, input construction, and
 release-comparison workflow details, see [docs/BENCHMARKING.md](docs/BENCHMARKING.md).
+For the current release-to-release performance snapshot, see
+[docs/PERFORMANCE.md](docs/PERFORMANCE.md).
 
 <!-- BENCH_TABLE:lu_solve:median:new:BEGIN -->
 
 | D | la-stack median (ns) | nalgebra median (ns) | faer median (ns) | la-stack vs nalgebra | la-stack vs faer |
 |---:|--------------------:|--------------------:|----------------:|---------------------:|----------------:|
-| 2 | 2.585 | 4.486 | 137.653 | +42.4% | +98.1% |
-| 3 | 12.204 | 22.990 | 182.618 | +46.9% | +93.3% |
-| 4 | 27.228 | 51.660 | 208.181 | +47.3% | +86.9% |
-| 5 | 53.141 | 68.714 | 272.117 | +22.7% | +80.5% |
-| 8 | 141.279 | 162.225 | 348.216 | +12.9% | +59.4% |
-| 16 | 626.561 | 574.115 | 854.941 | -9.1% | +26.7% |
-| 32 | 2,862.795 | 2,709.532 | 2,806.698 | -5.7% | -2.0% |
-| 64 | 19,703.239 | 14,388.285 | 12,085.453 | -36.9% | -63.0% |
+| 2 | 2.044 | 4.542 | 143.958 | +55.0% | +98.6% |
+| 3 | 9.596 | 23.599 | 185.466 | +59.3% | +94.8% |
+| 4 | 23.338 | 50.717 | 210.976 | +54.0% | +88.9% |
+| 5 | 45.368 | 69.065 | 277.564 | +34.3% | +83.7% |
+| 8 | 127.861 | 164.412 | 364.864 | +22.2% | +65.0% |
+| 16 | 631.997 | 663.822 | 882.674 | +4.8% | +28.4% |
+| 32 | 2,745.604 | 2,424.540 | 2,867.431 | -13.2% | +4.2% |
+| 64 | 17,543.034 | 14,747.731 | 12,266.271 | -19.0% | -43.0% |
 
 <!-- BENCH_TABLE:lu_solve:median:new:END -->
 
diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md
index 4eb3c66..15e9738 100644
--- a/docs/BENCHMARKING.md
+++ b/docs/BENCHMARKING.md
@@ -1,233 +1,278 @@
 # Benchmarking
 
-This guide covers how to run, compare, and track performance for la-stack.
-
-## Benchmark suites
-
-la-stack has two Criterion benchmark suites:
-
-- **`vs_linalg`** (`benches/vs_linalg.rs`) — compares la-stack against
-  nalgebra and faer across D=2–64 for LU, solve, det, dot, norm, etc.
-  Use this to answer "why choose la-stack over other crates?"
-  The suite also includes SPD factorization rows for la-stack LDLT, faer
-  LDLT, and nalgebra Cholesky. The nalgebra rows are labelled Cholesky
-  because nalgebra does not expose a dense LDLT factorization in the
-  dependency version used here.
-
-- **`exact`** (`benches/exact.rs`) — measures exact-arithmetic methods
-  (`det_exact`, `solve_exact`, `det_sign_exact`, strict `*_result`
-  conversions, and lossy `*_rounded_f64` conversions) alongside f64
-  baselines (`det`, `det_direct`) across D=2–5. Use this to understand
-  the cost of exact arithmetic and track optimization progress.
-  In addition to the fixed per-dimension groups (`exact_d{2..5}`), the
-  suite includes random percentile and adversarial-input groups designed
-  to capture variance and stress specific corners of the pipeline:
-
-  - `exact_random_percentile_d{2..5}` — fixed-seed corpora of 50
-    strictly diagonally-dominant random matrices per dimension. Each
-    operation is pre-timed across the corpus to select representative
-    p50/p95/p99 inputs, then Criterion measures those inputs normally.
-  - `exact_near_singular_3x3` — a 2^-50 perturbation of a singular base
-    matrix; forces the Bareiss fallback in `det_sign_exact` and
-    exercises the largest intermediate `BigInt` values in `solve_exact`.
-  - `exact_large_entries_3x3` — diagonal entries near `f64::MAX / 2`
-    stress `BigInt` growth during Bareiss forward elimination.
-  - `exact_hilbert_4x4` / `exact_hilbert_5x5` — classically
-    ill-conditioned matrices whose non-terminating-in-binary entries
-    stress the `f64_decompose → BigInt` scaling path.
-
-  Each random percentile and adversarial group runs the same five
-  exact-arithmetic benches (`det_sign_exact`, `det_exact`, `solve_exact`,
-  `solve_exact_f64_result`, `solve_exact_rounded_f64`) so the resulting tables
-  are directly comparable across input classes. Rows with a `_result` suffix
-  measure the strict fallible conversion path, including valid
-  `Err(Unrepresentable)` outcomes when the exact answer is not
-  finite-binary64 representable. Rows with a `_rounded_f64` suffix measure the
-  intentionally lossy finite-binary64 conversion path.
-
-## `vs_linalg` methodology
-
-`vs_linalg` is a per-kernel comparison, not a single aggregate score. Each
-reported row compares one operation for one dimension `D`, using Criterion's
-selected statistic from `target/criterion/d{D}/{benchmark}/{sample}/estimates.json`.
-The default report and README table use Criterion's `median.point_estimate`
-in nanoseconds. Lower is better.
-
-All three crates receive equivalent deterministic inputs for a given
-dimension:
+This guide explains how to run, compare, and publish performance results for
+`la-stack`. Start with the workflow table below; the later sections explain what
+the commands measure and where their outputs go.
+
+## Contents
+
+- [Start Here](#start-here)
+- [Benchmark Suites](#benchmark-suites)
+- [Common Workflows](#common-workflows)
+  - [Compare Current Code With The Latest Release](#compare-current-code-with-the-latest-release)
+  - [Compare Current Code With A Specific Release](#compare-current-code-with-a-specific-release)
+  - [Iterate Against A Local Saved Baseline](#iterate-against-a-local-saved-baseline)
+  - [Update The README nalgebra/faer Table](#update-the-readme-nalgebrafaer-table)
+  - [Create The Release Performance Report](#create-the-release-performance-report)
+  - [Compare Published Release Artifacts](#compare-published-release-artifacts)
+- [Output Locations](#output-locations)
+- [`vs_linalg` Methodology](#vs_linalg-methodology)
+- [Exact-Arithmetic Notes](#exact-arithmetic-notes)
+- [Release Notes](#release-notes)
+
+## Start Here
+
+| Goal | Use | Output | Notes |
+|------|-----|--------|-------|
+| Clean local audit against the latest published release | `just performance-local` | `target/bench-reports/performance.md` | Self-contained; creates temporary worktrees and regenerates the release baseline locally. |
+| Non-exact release-signal check against a specific release | `just performance-local-vs-linalg v0.4.3 v0.4.2` | `target/bench-reports/performance.md` | Narrower than `performance-local`; useful for LU/LDLT/dot/norm work. |
+| Fast repeated comparisons while tuning one kernel | `just bench-save-baseline <name> <suite>` then `just bench-compare <name> <suite> all-benches` | `target/bench-reports/performance.md` | Uses local `target/criterion/`; fastest loop after the baseline exists. |
+| Full current la-stack vs nalgebra/faer comparison | `just bench-vs-linalg` | `target/criterion/` | Measures current la-stack, nalgebra, and faer rows. |
+| README benchmark table and SVG plot | `just plot-vs-linalg-readme` after `just bench-vs-linalg` | `README.md`, `docs/assets/bench/` | Uses current `target/criterion` data. |
+| Release PR performance artifact | `just performance-release v0.4.3 v0.4.2` | `docs/PERFORMANCE.md`, `docs/archive/performance/` | Mutates committed docs. Run during release preparation. |
+| Compare already-published release assets | `just performance-github-assets v0.4.3 v0.4.2` | `target/bench-reports/github-assets-performance.md` | Uses GitHub Release baseline assets instead of local cargo runs. |
+
+Rule of thumb:
+
+- Use `performance-local*` for clean, self-contained answers.
+- Use `bench-save-*` plus `bench-compare` for tight local optimization loops.
+- Use `bench-vs-linalg` plus plotting when updating README crate-to-crate
+  comparisons.
+- Use `performance-release` only when preparing committed release artifacts.
+
+## Benchmark Suites
+
+`la-stack` has two Criterion benchmark suites.
+
+**`vs_linalg`** (`benches/vs_linalg.rs`) compares `la-stack` against
+`nalgebra` and `faer` across D=2-64 for LU, solve, determinant, dot, norm, and
+SPD factorization operations. Use this suite to answer "why choose la-stack over
+other crates?"
+
+The SPD rows compare la-stack LDLT, faer LDLT, and nalgebra Cholesky. They are
+labelled by algorithm because nalgebra does not expose a dense LDLT
+factorization in the dependency version used here.
+
+**`exact`** (`benches/exact.rs`) measures exact-arithmetic methods
+(`det_exact`, `solve_exact`, `det_sign_exact`, strict `*_result` conversions,
+and lossy `*_rounded_f64` conversions) alongside f64 baselines (`det`,
+`det_direct`) across D=2-5. Use this suite to understand exact-arithmetic cost
+and track optimization progress.
+
+## Common Workflows
+
+### Compare Current Code With The Latest Release
+
+Use this when you want a clean local answer to "how does this checkout compare
+with the latest published release?"
 
-- matrix entries come from the same strictly diagonally-dominant generator
-  (`matrix_entry::<D>`)
-- right-hand sides and vector inputs come from the same deterministic vector
-  generator
-- each benchmark uses `black_box` around inputs and outputs to keep the
-  measured operation visible to the optimizer
+```bash
+just performance-local
+```
 
-The integration smoke test `tests/vs_linalg_inputs.rs` reuses the benchmark
-input helpers and verifies that la-stack, nalgebra, and faer agree on the
-determinant, solve, dot, and infinity-norm results for D=2..=5. Run it with
-`cargo test --features bench --test vs_linalg_inputs` when changing benchmark
-input construction, adding comparable kernels, or updating the `faer` or
-`nalgebra` benchmark dependencies.
+This creates isolated temporary worktrees, generates the latest published
+release baseline locally, benchmarks the current tree on the same machine, and
+writes `target/bench-reports/performance.md`.
 
-The main comparable metrics are:
+This command does not depend on existing local `target/criterion/` baselines.
+It is slower than reusing a saved baseline, but less sensitive to stale local
+benchmark state.
 
-- `det_via_lu` — factor the matrix and compute determinant from the LU factor
-- `lu` — factorization only
-- `lu_solve` — factor the matrix and solve one right-hand side
-- `solve_from_lu` — solve one right-hand side using a precomputed LU factor
-- `det_from_lu` — compute determinant using a precomputed LU factor
-- `dot` — vector dot product
-- `norm2_sq` — squared Euclidean vector norm
-- `inf_norm` — matrix infinity norm, implemented as maximum absolute row sum
+### Compare Current Code With A Specific Release
 
-Additional SPD metrics compare la-stack LDLT against faer LDLT and nalgebra
-Cholesky. These rows are labelled by algorithm (`ldlt` or `cholesky`) because
-nalgebra does not expose a dense LDLT factorization in the dependency version
-used here. They should be read as SPD factorization/solve/determinant
-comparisons, not as identical algorithm comparisons across all three crates.
+For a narrower non-exact check against a known release pair, run:
 
-Release-signal reports compare latest la-stack measurements against a saved
-la-stack baseline, and show saved nalgebra/faer baseline timings as context
-where a matching peer benchmark exists. That keeps iteration cheap while still
-making the release signal auditable. The full `vs_linalg` run remains the
-source of README plots and crate-to-crate comparison tables.
+```bash
+just performance-local-vs-linalg v0.4.3 v0.4.2
+```
+
+This generates a local `v0.4.2` `vs_linalg` baseline, measures the current
+la-stack `vs_linalg` rows, and renders a `vs_linalg` report. The report includes
+saved baseline nalgebra/faer timings as context where matching peer rows exist,
+without rerunning current peer crates.
+
+### Iterate Against A Local Saved Baseline
+
+Use local saved baselines when tuning one kernel and comparing several edits
+against the same starting point. These baselines are local scratch data, not
+release artifacts.
 
-## Quick reference
+For example, before optimizing `Matrix::inf_norm`, save a named baseline:
 
 ```bash
-# Run vs_linalg benchmarks
-just bench-vs-linalg
+just bench-save-baseline inf-norm-before vs_linalg
+```
+
+Then make a change, rerun only the current measurements you care about, and
+compare:
 
-# Run only la-stack rows from vs_linalg
+```bash
 just bench-vs-linalg-la-stack
+just bench-compare inf-norm-before vs_linalg all-benches
+```
 
-# Run exact-arithmetic benchmarks
-just bench-exact
+The `just bench-compare` recipe uses positional arguments:
+`just bench-compare <baseline> <suite> <scope>`. The underlying
+`uv run bench-compare` CLI accepts the explicit `--suite` and `--scope` flags.
 
-# Run the cheaper latest measurements used for latest-vs-last reports
-just bench-latest
+`just bench-save-baseline <name>` writes Criterion samples under
+`target/criterion/`. `just bench-save-last` saves the conventional local
+baseline named `last`, which enables shortcuts such as:
 
-# Save a full baseline named "last"
-just bench-save-last
-
-# Compare latest measurements against the saved "last" baseline
+```bash
+just bench-latest-vs-last
+just bench-vs-linalg-latest-vs
 just bench-compare
+```
 
-# Run latest measurements and compare against "last"
-just bench-latest-vs-last
+Saved baselines persist across `git checkout` but not across `cargo clean`, and
+they are not pushed to GitHub.
 
-# Run only non-exact la-stack rows from vs_linalg and compare against "last"
-just bench-vs-linalg-latest-vs
+### Update The README nalgebra/faer Table
+
+The README benchmark table and SVG plot are crate-to-crate comparisons from the
+current checkout:
+
+```bash
+just bench-vs-linalg
+just plot-vs-linalg-readme
 ```
 
-## Comparing performance across releases
+`just bench-vs-linalg` measures current la-stack, nalgebra, and faer rows.
+`just plot-vs-linalg-readme` reads those Criterion results and updates:
 
-Criterion baselines are saved into `target/criterion/` and persist across
-`git checkout` but **not** across `cargo clean`. Published releases also attach
-a compressed Criterion baseline to the GitHub Release so historical release
-baselines can be restored later.
+- `README.md`
+- `docs/assets/bench/vs_linalg_lu_solve_median.csv`
+- `docs/assets/bench/vs_linalg_lu_solve_median.svg`
 
-### Latest vs last
+See `scripts/criterion_dim_plot.py --help` for plotting options.
 
-The default workflow is optimized for the common maintenance question:
-"how does latest la-stack compare to the last release?"
+### Create The Release Performance Report
 
-At release time, save a full baseline:
+Release PRs promote one curated release-to-release comparison into committed
+docs:
 
 ```bash
-just bench-save-last
+just performance-release v0.4.3 v0.4.2
 ```
 
-During development, run the cheaper latest path:
+With no arguments, `just performance-release` infers the current release tag
+from `Cargo.toml` and discovers the previous stable published release. During
+release preparation, passing both tags explicitly removes ambiguity.
 
-```bash
-just bench-latest-vs-last
-```
+This command creates temporary worktrees, generates the comparison, writes
+`docs/PERFORMANCE.md`, and archives the previous committed report under
+`docs/archive/performance/`. Archive filenames are release-pair names such as
+`v0.4.2-vs-v0.4.1.md`.
 
-`bench-latest` runs exact arithmetic plus only the la-stack rows from
-`vs_linalg`. The comparison report still shows the last-release nalgebra
-and faer timings for matching rows, so you can see whether a la-stack
-change improves or weakens the release signal without rerunning third-party
-benchmarks on every iteration.
+### Compare Published Release Artifacts
 
-For a faster non-exact check, run:
+After releases are published, the GitHub Release benchmark workflow attaches a
+compressed Criterion baseline artifact. To compare those stored artifacts
+without running cargo locally:
 
 ```bash
-just performance-local-vs-linalg v0.4.3 v0.4.2
+just performance-github-assets v0.4.3 v0.4.2
 ```
 
-This generates a local `v0.4.2` baseline for `vs_linalg`, measures only the
-current la-stack rows from `vs_linalg`, then compares them using `--suite
-vs_linalg`. The report shows saved baseline nalgebra/faer timings as context
-without rerunning the peer crates on the current checkout.
+With no arguments, the recipe discovers the latest stable published GitHub
+release and its previous stable release automatically.
 
-### Workflow
+## Output Locations
 
-```bash
-# Current in-tree code vs latest published release, all measured locally
-just performance-local
+| Path | Committed? | Producer | Purpose |
+|------|------------|----------|---------|
+| `target/criterion/` | No | `cargo bench`, `bench-save-*` | Local Criterion measurements and named baselines. |
+| `target/bench-reports/performance.md` | No | `bench-compare`, `performance-local*` | Local comparison report. |
+| `target/bench-reports/github-assets-performance.md` | No | `performance-github-assets` | Local report from published release artifacts. |
+| `docs/PERFORMANCE.md` | Yes | `performance-release` | Latest curated release-to-release comparison. |
+| `docs/archive/performance/` | Yes | `performance-release` | Older curated release-to-release comparisons. |
+| `docs/assets/bench/` | Yes | `plot-vs-linalg-readme` | README benchmark CSV/SVG assets. |
+| GitHub Release asset `la-stack-$TAG-criterion-baseline.tar.gz` | Remote release artifact | `.github/workflows/release-benchmarks.yml` | Durable Criterion baseline archive for published releases. |
 
-# Current in-tree non-exact kernels vs a release baseline
-just performance-local-vs-linalg v0.4.3 v0.4.2
+## `vs_linalg` Methodology
 
-# Stored GitHub Actions release assets, no local cargo runs
-just performance-github-assets
-```
+`vs_linalg` is a per-kernel comparison, not a single aggregate score. Each row
+compares one operation for one dimension `D`, using Criterion's selected
+statistic from `target/criterion/d{D}/{benchmark}/{sample}/estimates.json`.
+The README table uses `median.point_estimate` in nanoseconds. Lower is better.
 
-`performance-local` creates isolated temporary worktrees, generates the latest
-published release baseline locally, then benchmarks the current in-tree code on
-the same machine. It uses the current checkout's Rust toolchain for both sides
-unless `RUSTUP_TOOLCHAIN` is already set. `performance-github-assets` compares
-stored GitHub Actions release artifacts and does not run cargo locally.
+All three crates receive equivalent deterministic inputs for a given dimension:
 
-For local scratch comparisons, you can save multiple baselines and compare
-against any of them. If the release baseline is already present in
-`target/criterion/`, compare directly:
+- matrix entries come from the same strictly diagonally-dominant generator
+  (`matrix_entry::<D>`)
+- right-hand sides and vector inputs come from the same deterministic vector
+  generator
+- each benchmark uses `black_box` around inputs and outputs to keep the
+  measured operation visible to the optimizer
+
+The integration smoke test `tests/vs_linalg_inputs.rs` reuses the benchmark
+input helpers and verifies that la-stack, nalgebra, and faer agree on the
+determinant, solve, dot, and infinity-norm results for D=2..=5. Run it with:
 
 ```bash
-just bench-latest          # gather latest la-stack measurements
-just bench-compare v0.4.2  # compare latest measurements against v0.4.2
+cargo test --features bench --test vs_linalg_inputs
 ```
 
-### Output
-
-`just bench-compare` writes `target/bench-reports/performance.md` by
-default. The file contains machine-specific timings and is intentionally
-local. The report includes per-dimension tables showing median times,
-percent change, speedup, and last-release nalgebra/faer context where a
-matching `vs_linalg` peer exists.
+Run that test when changing benchmark input construction, adding comparable
+kernels, or updating the `faer` or `nalgebra` benchmark dependencies.
 
-Release PRs promote one curated comparison into committed docs:
+The main comparable metrics are:
 
-```bash
-just performance-release
-```
+- `det_via_lu` — factor the matrix and compute determinant from the LU factor
+- `lu` — LU factorization only
+- `lu_solve` — factor the matrix and solve one right-hand side
+- `solve_from_lu` — solve one right-hand side using a precomputed LU factor
+- `det_from_lu` — compute determinant using a precomputed LU factor
+- `dot` — vector dot product
+- `norm2_sq` — squared Euclidean vector norm
+- `inf_norm` — matrix infinity norm, implemented as maximum absolute row sum
 
-This infers the current release tag from `Cargo.toml`, discovers the previous
-stable published release, generates both sides locally in temporary worktrees,
-copies the finished report to `docs/PERFORMANCE.md`, and archives the previous
-committed report under `docs/archive/performance/`. Archive filenames are
-release-pair names such as `v0.4.2-vs-v0.4.1.md`, so the directory and generated
-index stay lexicographically sorted. For explicit release repair, pass both
-tags: `just performance-release v0.4.3 v0.4.2`.
+Additional SPD metrics compare la-stack LDLT against faer LDLT and nalgebra
+Cholesky:
 
-To compare the latest stored GitHub Actions release assets without touching the
-current checkout:
+- `ldlt` / `cholesky` — SPD factorization only
+- `ldlt_solve` / `cholesky_solve` — factor and solve one right-hand side
+- `solve_from_ldlt` / `solve_from_cholesky` — solve using a precomputed factor
+- `det_from_ldlt` / `det_from_cholesky` — determinant from a precomputed factor
 
-```bash
-just performance-github-assets
-```
+Read these as SPD factorization/solve/determinant comparisons, not as identical
+algorithm comparisons across all three crates.
 
-The recipe discovers the latest stable published GitHub release and its previous
-stable release automatically. For explicit historical repair, pass both tags:
-`just performance-github-assets v0.4.2 v0.4.1`.
+Release-signal reports compare latest la-stack measurements against a saved
+la-stack baseline, and show saved nalgebra/faer baseline timings as context
+where a matching peer benchmark exists. That keeps iteration cheap while still
+making the release signal auditable. The full `vs_linalg` run remains the source
+of README plots and crate-to-crate comparison tables.
+
+## Exact-Arithmetic Notes
+
+The exact suite includes fixed per-dimension groups (`exact_d{2..5}`), random
+percentile groups, and adversarial-input groups:
+
+- `exact_random_percentile_d{2..5}` — fixed-seed corpora of 50 strictly
+  diagonally-dominant random matrices per dimension. Each operation is
+  pre-timed across the corpus to select representative p50/p95/p99 inputs, then
+  Criterion measures those inputs normally.
+- `exact_near_singular_3x3` — a 2^-50 perturbation of a singular base matrix;
+  forces the Bareiss fallback in `det_sign_exact` and exercises the largest
+  intermediate `BigInt` values in `solve_exact`.
+- `exact_large_entries_3x3` — diagonal entries near `f64::MAX / 2` stress
+  `BigInt` growth during Bareiss forward elimination.
+- `exact_hilbert_4x4` / `exact_hilbert_5x5` — classically ill-conditioned
+  matrices whose non-terminating-in-binary entries stress the
+  `f64_decompose -> BigInt` scaling path.
+
+Each random percentile and adversarial group runs the same exact-arithmetic
+benches (`det_sign_exact`, `det_exact`, `solve_exact`,
+`solve_exact_f64_result`, `solve_exact_rounded_f64`) so tables are comparable
+across input classes.
 
 For exact-arithmetic comparisons against v0.4.2 or older baselines, rows such
 as `det_exact_rounded_f64 (vs det_exact_f64)` mean the current rounded API is
 being compared to the historical lossy `*_exact_f64` benchmark. Rows such as
-`det_exact_f64_result (vs det_exact_f64)` intentionally show the overhead of
-the new strict conversion contract against that same historical baseline.
+`det_exact_f64_result (vs det_exact_f64)` intentionally show the overhead of the
+new strict conversion contract against that same historical baseline.
 
 The default `release-signal` scope reports exact-arithmetic rows whose inputs
 are fixed across versions: deterministic D=2..=5 cases plus adversarial fixed
@@ -246,30 +291,16 @@ To generate a current snapshot without a saved baseline:
 uv run bench-compare --snapshot
 ```
 
-## vs\_linalg plotting
-
-The `criterion_dim_plot.py` script generates CSV/SVG plots and updates the
-README benchmark table from vs\_linalg results:
-
-```bash
-# Run benchmarks + update README table and SVG plot
-just bench-vs-linalg
-just plot-vs-linalg-readme
-```
-
-See `scripts/criterion_dim_plot.py --help` for options.
-
-## Release workflow
+## Release Notes
 
-At release time, save a local baseline so future work can compare against it:
+Local Criterion baselines are optional during release. Save them only if you
+want convenience baselines for follow-up development on the same machine:
 
 ```bash
 just bench-save-baseline <tag>
 just bench-save-last
 ```
 
-When the GitHub Release is published, `.github/workflows/release-benchmarks.yml`
-saves a full release baseline and attaches
-`la-stack-$TAG-criterion-baseline.tar.gz` to the release as the durable archive.
-See the `just performance-release` step in `docs/RELEASING.md` for where the
-curated `docs/PERFORMANCE.md` comparison fits in the release process.
+The durable published baseline is the GitHub Release artifact created by
+`.github/workflows/release-benchmarks.yml`. The committed release comparison is
+`docs/PERFORMANCE.md`, created by `just performance-release`.
diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md
index 961762b..205cd47 100644
--- a/docs/PERFORMANCE.md
+++ b/docs/PERFORMANCE.md
@@ -1,97 +1,113 @@
-# Exact Arithmetic Performance
+# Benchmark Performance
 
-**la-stack** v0.4.2 · `7e11f93` (HEAD) · 2026-06-08 20:39:03 UTC
+**la-stack** v0.4.3 · `45affa8` (HEAD) · 2026-06-09 08:41:32 UTC
 **Statistic**: median
+**Suite**: all
+**Scope**: release-signal
 
 ## Benchmark Results
 
-Comparison against baseline **v0.4.1**:
+Comparison against baseline **v0.4.2**:
 
 Negative change = faster. Speedup > 1.00x = improvement.
 
+## Exact arithmetic
+
 ### D=2
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det | 0.6 ns | 0.9 ns | +61.1% | 0.62x |
-| det_direct | 0.7 ns | 1.0 ns | +44.7% | 0.69x |
-| det_exact | 315.5 ns | 318.4 ns | +0.9% | 0.99x |
-| det_exact_f64 | 555.7 ns | 555.7 ns | -0.0% | 1.00x |
-| det_sign_exact | 0.7 ns | 1.5 ns | +128.2% | 0.44x |
-| solve_exact | 7.05 µs | 7.06 µs | +0.2% | 1.00x |
-| solve_exact_f64 | 7.50 µs | 7.67 µs | +2.3% | 0.98x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det | 0.9 ns | 0.7 ns | **-24.0%** | 1.32x |
+| det_direct | 1.0 ns | 1.0 ns | +2.1% | 0.98x |
+| det_exact | 248.9 ns | 195.6 ns | **-21.4%** | 1.27x |
+| det_exact_f64_result (vs det_exact_f64) | 429.1 ns | 167.6 ns | **-60.9%** | 2.56x |
+| det_exact_rounded_f64 (vs det_exact_f64) | 429.1 ns | 375.2 ns | **-12.6%** | 1.14x |
+| det_sign_exact | 1.5 ns | 3.2 ns | +115.9% | 0.46x |
+| solve_exact | 6.53 µs | 6.45 µs | **-1.1%** | 1.01x |
+| solve_exact_f64_result (vs solve_exact_f64) | 6.90 µs | 6.60 µs | **-4.4%** | 1.05x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 6.90 µs | 7.02 µs | +1.7% | 0.98x |
 
 ### D=3
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det | 1.3 ns | 1.8 ns | +36.3% | 0.73x |
-| det_direct | 4.7 ns | 2.2 ns | **-51.9%** | 2.08x |
-| det_exact | 936.9 ns | 924.3 ns | **-1.3%** | 1.01x |
-| det_exact_f64 | 1.18 µs | 1.19 µs | +1.1% | 0.99x |
-| det_sign_exact | 2.4 ns | 4.2 ns | +78.1% | 0.56x |
-| solve_exact | 27.02 µs | 27.41 µs | +1.5% | 0.99x |
-| solve_exact_f64 | 28.06 µs | 27.98 µs | -0.3% | 1.00x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det | 1.8 ns | 1.5 ns | **-19.4%** | 1.24x |
+| det_direct | 2.0 ns | 2.0 ns | +2.4% | 0.98x |
+| det_exact | 739.0 ns | 468.6 ns | **-36.6%** | 1.58x |
+| det_exact_f64_result (vs det_exact_f64) | 913.1 ns | 435.6 ns | **-52.3%** | 2.10x |
+| det_exact_rounded_f64 (vs det_exact_f64) | 913.1 ns | 648.1 ns | **-29.0%** | 1.41x |
+| det_sign_exact | 4.2 ns | 5.5 ns | +30.9% | 0.76x |
+| solve_exact | 25.69 µs | 25.16 µs | **-2.1%** | 1.02x |
+| solve_exact_f64_result (vs solve_exact_f64) | 26.16 µs | 25.42 µs | **-2.8%** | 1.03x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 26.16 µs | 25.67 µs | **-1.9%** | 1.02x |
 
 ### D=4
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det | 2.4 ns | 3.3 ns | +36.8% | 0.73x |
-| det_direct | 2.4 ns | 4.1 ns | +70.2% | 0.59x |
-| det_exact | 2.33 µs | 2.33 µs | -0.0% | 1.00x |
-| det_exact_f64 | 2.59 µs | 2.58 µs | -0.7% | 1.01x |
-| det_sign_exact | 5.3 ns | 6.9 ns | +30.5% | 0.77x |
-| solve_exact | 67.14 µs | 67.99 µs | +1.3% | 0.99x |
-| solve_exact_f64 | 67.86 µs | 68.51 µs | +1.0% | 0.99x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det | 3.3 ns | 4.5 ns | +38.1% | 0.72x |
+| det_direct | 3.7 ns | 4.3 ns | +17.6% | 0.85x |
+| det_exact | 1.87 µs | 1.47 µs | **-21.8%** | 1.28x |
+| det_exact_f64_result (vs det_exact_f64) | 2.04 µs | 1.47 µs | **-27.9%** | 1.39x |
+| det_exact_rounded_f64 (vs det_exact_f64) | 2.04 µs | 1.63 µs | **-19.8%** | 1.25x |
+| det_sign_exact | 6.9 ns | 11.5 ns | +67.1% | 0.60x |
+| solve_exact | 64.95 µs | 61.67 µs | **-5.1%** | 1.05x |
+| solve_exact_f64_result (vs solve_exact_f64) | 66.35 µs | 62.37 µs | **-6.0%** | 1.06x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 66.35 µs | 63.59 µs | **-4.2%** | 1.04x |
 
 ### D=5
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det | 21.6 ns | 24.5 ns | +13.7% | 0.88x |
-| det_direct | 2.3 ns | 4.7 ns | +104.8% | 0.49x |
-| det_exact | 5.04 µs | 4.99 µs | -1.0% | 1.01x |
-| det_exact_f64 | 5.32 µs | 5.31 µs | -0.1% | 1.00x |
-| det_sign_exact | 4.97 µs | 4.99 µs | +0.3% | 1.00x |
-| solve_exact | 134.99 µs | 136.04 µs | +0.8% | 0.99x |
-| solve_exact_f64 | 137.11 µs | 138.97 µs | +1.4% | 0.99x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det | 26.0 ns | 23.3 ns | **-10.6%** | 1.12x |
+| det_direct | 4.5 ns | 2.5 ns | **-44.2%** | 1.79x |
+| det_exact | 4.10 µs | 4.05 µs | **-1.3%** | 1.01x |
+| det_exact_f64_result (vs det_exact_f64) | 4.21 µs | 4.02 µs | **-4.4%** | 1.05x |
+| det_exact_rounded_f64 (vs det_exact_f64) | 4.21 µs | 4.33 µs | +2.8% | 0.97x |
+| det_sign_exact | 3.94 µs | 3.96 µs | +0.6% | 0.99x |
+| solve_exact | 130.82 µs | 126.75 µs | **-3.1%** | 1.03x |
+| solve_exact_f64_result (vs solve_exact_f64) | 132.70 µs | 127.37 µs | **-4.0%** | 1.04x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 132.70 µs | 128.15 µs | **-3.4%** | 1.04x |
 
 ### Near-singular 3x3
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det_sign_exact | 871.8 ns | 877.6 ns | +0.7% | 0.99x |
-| det_exact | 907.3 ns | 904.4 ns | -0.3% | 1.00x |
-| solve_exact | 4.31 µs | 4.25 µs | **-1.5%** | 1.02x |
-| solve_exact_f64 | 4.29 µs | 4.32 µs | +0.7% | 0.99x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det_sign_exact | 705.2 ns | 444.2 ns | **-37.0%** | 1.59x |
+| det_exact | 724.0 ns | 478.9 ns | **-33.9%** | 1.51x |
+| solve_exact | 3.44 µs | 3.39 µs | **-1.6%** | 1.02x |
+| solve_exact_f64_result (vs solve_exact_f64) | 3.47 µs | 3.36 µs | **-3.2%** | 1.03x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 3.47 µs | 3.39 µs | **-2.5%** | 1.03x |
 
 ### Large entries 3x3
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det_sign_exact | 3.14 µs | 3.09 µs | **-1.3%** | 1.01x |
-| det_exact | 3.19 µs | 3.11 µs | **-2.3%** | 1.02x |
-| solve_exact | 84.77 µs | 83.89 µs | **-1.0%** | 1.01x |
-| solve_exact_f64 | 84.62 µs | 83.92 µs | -0.8% | 1.01x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det_sign_exact | 2.91 µs | 402.4 ns | **-86.2%** | 7.23x |
+| det_exact | 2.94 µs | 434.0 ns | **-85.2%** | 6.76x |
+| solve_exact | 82.81 µs | 81.57 µs | **-1.5%** | 1.02x |
+| solve_exact_f64_result (vs solve_exact_f64) | 84.32 µs | 81.66 µs | **-3.1%** | 1.03x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 84.32 µs | 82.04 µs | **-2.7%** | 1.03x |
 
 ### Hilbert 4x4
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det_sign_exact | 5.3 ns | 6.9 ns | +30.4% | 0.77x |
-| det_exact | 2.39 µs | 2.31 µs | **-3.2%** | 1.03x |
-| solve_exact | 51.69 µs | 52.27 µs | +1.1% | 0.99x |
-| solve_exact_f64 | 52.90 µs | 53.26 µs | +0.7% | 0.99x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det_sign_exact | 6.9 ns | 11.5 ns | +66.4% | 0.60x |
+| det_exact | 1.91 µs | 1.50 µs | **-21.7%** | 1.28x |
+| solve_exact | 49.42 µs | 47.77 µs | **-3.3%** | 1.03x |
+| solve_exact_f64_result (vs solve_exact_f64) | 50.38 µs | 47.67 µs | **-5.4%** | 1.06x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 50.38 µs | 48.17 µs | **-4.4%** | 1.05x |
 
 ### Hilbert 5x5
 
-| Benchmark | v0.4.1 | Current | Change | Speedup |
-|-----------|-------:|--------:|-------:|--------:|
-| det_sign_exact | 5.03 µs | 4.88 µs | **-2.9%** | 1.03x |
-| det_exact | 5.07 µs | 4.96 µs | **-2.1%** | 1.02x |
-| solve_exact | 105.35 µs | 102.72 µs | **-2.5%** | 1.03x |
-| solve_exact_f64 | 104.99 µs | 103.94 µs | -1.0% | 1.01x |
+| Benchmark | v0.4.2 | Latest | Change | Speedup |
+|-----------|-------:|-------:|-------:|--------:|
+| det_sign_exact | 4.09 µs | 3.91 µs | **-4.6%** | 1.05x |
+| det_exact | 4.00 µs | 4.02 µs | +0.6% | 0.99x |
+| solve_exact | 98.71 µs | 95.41 µs | **-3.4%** | 1.03x |
+| solve_exact_f64_result (vs solve_exact_f64) | 99.88 µs | 98.14 µs | **-1.7%** | 1.02x |
+| solve_exact_rounded_f64 (vs solve_exact_f64) | 99.88 µs | 97.50 µs | **-2.4%** | 1.02x |
 
 ## How to Update
 
@@ -114,4 +130,6 @@ just performance-release <current-tag> <previous-tag>
 `just performance-local` writes `target/bench-reports/performance.md`.
 `just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.
 
+Older curated release-to-release reports are archived in `docs/archive/performance/`.
+
 See `docs/BENCHMARKING.md` for the full comparison workflow.
diff --git a/docs/RELEASING.md b/docs/RELEASING.md
index afd9712..07d7c00 100644
--- a/docs/RELEASING.md
+++ b/docs/RELEASING.md
@@ -118,40 +118,24 @@ repair, run `just performance-release <current-tag> <previous-tag>`. To compare
 the stored GitHub Actions release assets instead of running cargo locally, use
 `just performance-github-assets`.
 
-6. Save benchmark baselines for this release
-
-```bash
-# Save a named full baseline for this release
-just bench-save-baseline $TAG
-
-# Also refresh the conventional "last" baseline used by local
-# latest-vs-last performance checks
-just bench-save-last
-```
-
-These baselines can be compared against in future optimization work on the
-release branch. The default local report command, `just bench-compare`, compares
-latest measurements against `last` and writes
-`target/bench-reports/performance.md`; it does not update README benchmark
-tables or committed release artifacts.
-
 After the GitHub Release is published, the `Release Benchmarks` workflow checks
 out the release tag, saves a full Criterion baseline, and attaches
 `la-stack-$TAG-criterion-baseline.tar.gz` to the release. That release asset is
 the durable archive for historical baseline comparisons; the workflow also
 uploads a short-lived Actions artifact for debugging the run.
 
-See `docs/BENCHMARKING.md` for the full comparison workflow.
+See `docs/BENCHMARKING.md` for local saved-baseline workflows and the full
+comparison command reference.
 
-7. Validate the release branch
+6. Validate the release branch
 
 ```bash
 just ci
 just citation-check
-cargo publish --locked --dry-run
+cargo publish --locked --allow-dirty --dry-run
 ```
 
-8. Stage and commit release artifacts
+7. Stage and commit release artifacts
 
 ```bash
 git add Cargo.toml Cargo.lock CITATION.cff pyproject.toml CHANGELOG.md README.md docs/
@@ -165,7 +149,7 @@ git commit -m "chore(release): release $TAG
 - Update documentation for release"
 ```
 
-9. Push the branch and open a PR
+8. Push the branch and open a PR
 
 ```bash
 git push -u origin "release/$TAG"
diff --git a/docs/archive/performance/README.md b/docs/archive/performance/README.md
index dd5fc68..087b380 100644
--- a/docs/archive/performance/README.md
+++ b/docs/archive/performance/README.md
@@ -4,3 +4,4 @@ Older release-to-release benchmark comparisons are archived here.
 `docs/PERFORMANCE.md` contains the latest curated comparison.
 
 - [v0.4.1-vs-v0.4.0](v0.4.1-vs-v0.4.0.md)
+- [v0.4.2-vs-v0.4.1](v0.4.2-vs-v0.4.1.md)
diff --git a/docs/archive/performance/v0.4.2-vs-v0.4.1.md b/docs/archive/performance/v0.4.2-vs-v0.4.1.md
new file mode 100644
index 0000000..01a68be
--- /dev/null
+++ b/docs/archive/performance/v0.4.2-vs-v0.4.1.md
@@ -0,0 +1,119 @@
+# Exact Arithmetic Performance
+
+**la-stack** v0.4.2 · `7e11f93` (HEAD) · 2026-06-08 20:39:03 UTC
+**Statistic**: median
+
+## Benchmark Results
+
+Comparison against baseline **v0.4.1**:
+
+Negative change = faster. Speedup > 1.00x = improvement.
+
+### D=2
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det | 0.6 ns | 0.9 ns | +61.1% | 0.62x |
+| det_direct | 0.7 ns | 1.0 ns | +44.7% | 0.69x |
+| det_exact | 315.5 ns | 318.4 ns | +0.9% | 0.99x |
+| det_exact_f64 | 555.7 ns | 555.7 ns | -0.0% | 1.00x |
+| det_sign_exact | 0.7 ns | 1.5 ns | +128.2% | 0.44x |
+| solve_exact | 7.05 µs | 7.06 µs | +0.2% | 1.00x |
+| solve_exact_f64 | 7.50 µs | 7.67 µs | +2.3% | 0.98x |
+
+### D=3
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det | 1.3 ns | 1.8 ns | +36.3% | 0.73x |
+| det_direct | 4.7 ns | 2.2 ns | **-51.9%** | 2.08x |
+| det_exact | 936.9 ns | 924.3 ns | **-1.3%** | 1.01x |
+| det_exact_f64 | 1.18 µs | 1.19 µs | +1.1% | 0.99x |
+| det_sign_exact | 2.4 ns | 4.2 ns | +78.1% | 0.56x |
+| solve_exact | 27.02 µs | 27.41 µs | +1.5% | 0.99x |
+| solve_exact_f64 | 28.06 µs | 27.98 µs | -0.3% | 1.00x |
+
+### D=4
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det | 2.4 ns | 3.3 ns | +36.8% | 0.73x |
+| det_direct | 2.4 ns | 4.1 ns | +70.2% | 0.59x |
+| det_exact | 2.33 µs | 2.33 µs | -0.0% | 1.00x |
+| det_exact_f64 | 2.59 µs | 2.58 µs | -0.7% | 1.01x |
+| det_sign_exact | 5.3 ns | 6.9 ns | +30.5% | 0.77x |
+| solve_exact | 67.14 µs | 67.99 µs | +1.3% | 0.99x |
+| solve_exact_f64 | 67.86 µs | 68.51 µs | +1.0% | 0.99x |
+
+### D=5
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det | 21.6 ns | 24.5 ns | +13.7% | 0.88x |
+| det_direct | 2.3 ns | 4.7 ns | +104.8% | 0.49x |
+| det_exact | 5.04 µs | 4.99 µs | -1.0% | 1.01x |
+| det_exact_f64 | 5.32 µs | 5.31 µs | -0.1% | 1.00x |
+| det_sign_exact | 4.97 µs | 4.99 µs | +0.3% | 1.00x |
+| solve_exact | 134.99 µs | 136.04 µs | +0.8% | 0.99x |
+| solve_exact_f64 | 137.11 µs | 138.97 µs | +1.4% | 0.99x |
+
+### Near-singular 3x3
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det_sign_exact | 871.8 ns | 877.6 ns | +0.7% | 0.99x |
+| det_exact | 907.3 ns | 904.4 ns | -0.3% | 1.00x |
+| solve_exact | 4.31 µs | 4.25 µs | **-1.5%** | 1.02x |
+| solve_exact_f64 | 4.29 µs | 4.32 µs | +0.7% | 0.99x |
+
+### Large entries 3x3
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det_sign_exact | 3.14 µs | 3.09 µs | **-1.3%** | 1.01x |
+| det_exact | 3.19 µs | 3.11 µs | **-2.3%** | 1.02x |
+| solve_exact | 84.77 µs | 83.89 µs | **-1.0%** | 1.01x |
+| solve_exact_f64 | 84.62 µs | 83.92 µs | -0.8% | 1.01x |
+
+### Hilbert 4x4
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det_sign_exact | 5.3 ns | 6.9 ns | +30.4% | 0.77x |
+| det_exact | 2.39 µs | 2.31 µs | **-3.2%** | 1.03x |
+| solve_exact | 51.69 µs | 52.27 µs | +1.1% | 0.99x |
+| solve_exact_f64 | 52.90 µs | 53.26 µs | +0.7% | 0.99x |
+
+### Hilbert 5x5
+
+| Benchmark | v0.4.1 | Current | Change | Speedup |
+|-----------|-------:|--------:|-------:|--------:|
+| det_sign_exact | 5.03 µs | 4.88 µs | **-2.9%** | 1.03x |
+| det_exact | 5.07 µs | 4.96 µs | **-2.1%** | 1.02x |
+| solve_exact | 105.35 µs | 102.72 µs | **-2.5%** | 1.03x |
+| solve_exact_f64 | 104.99 µs | 103.94 µs | -1.0% | 1.01x |
+
+## How to Update
+
+Local performance reports are generated in isolated temporary worktrees:
+
+```bash
+# Local development: compare the current tree with the latest release
+just performance-local
+
+# Release PR: update docs/PERFORMANCE.md and archive the previous report
+just performance-release
+
+# GitHub Actions release assets
+just performance-github-assets
+
+# Explicit repair
+just performance-release <current-tag> <previous-tag>
+```
+
+`just performance-local` writes `target/bench-reports/performance.md`.
+`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.
+
+Older curated release-to-release reports are archived in `docs/archive/performance/`.
+
+See `docs/BENCHMARKING.md` for the full comparison workflow.
diff --git a/docs/assets/bench/vs_linalg_lu_solve_median.csv b/docs/assets/bench/vs_linalg_lu_solve_median.csv
index b47bf82..55a3e34 100644
--- a/docs/assets/bench/vs_linalg_lu_solve_median.csv
+++ b/docs/assets/bench/vs_linalg_lu_solve_median.csv
@@ -1,9 +1,9 @@
 D,la_stack,la_lo,la_hi,nalgebra,na_lo,na_hi,faer,fa_lo,fa_hi
-2,2.585300857336246,2.582809844321982,2.586669800817735,4.486088257090016,4.479229681572247,4.492362317908142,137.6529309582465,137.31998885139873,137.93586339517975
-3,12.203757609083176,12.183054403617847,12.224101270062874,22.990257557091617,22.91154128675699,23.12912235259086,182.61806110946827,181.880047375976,183.37040632194638
-4,27.227520807109173,27.202299533674857,27.266723521091606,51.66011127074051,51.555715086488036,51.840625331770724,208.18091469437718,207.68326634854134,209.09477296109242
-5,53.14134517762996,53.09941497989714,53.281547651078945,68.71356228143745,68.56318667878111,68.77114355114793,272.1171465104768,270.47097586206894,274.01601221264366
-8,141.2790253849833,140.95206481826406,141.68049543712607,162.22478601970167,161.86865626610683,162.56549560477754,348.21559402824914,347.18022267396856,349.1754040441267
-16,626.5606138541871,624.4956536182669,627.6291586692258,574.1147371436412,572.7849165188117,575.6888051044084,854.9407805348176,853.314408186121,859.1660992820005
-32,2862.794789510007,2860.7804785793915,2864.5392419175027,2709.532471877147,2702.3908894171072,2716.620725916311,2806.6981316154734,2802.5897840011316,2814.7045584045586
-64,19703.238514957266,19501.774005848587,19923.90512820513,14388.28538961039,14376.077992277993,14407.440476190477,12085.452737127373,12070.825515947467,12098.666085946574
+2,2.0437070181606085,2.0425631264853448,2.04607019228461,4.542175209414122,4.538937481352276,4.545658680820327,143.95823454981254,143.35650513307058,144.55261268043853
+3,9.595660662683073,9.588797748121927,9.613326467320519,23.59900523298633,23.193334672880862,23.89156203624332,185.46552169511642,184.30770285322595,186.6833373684646
+4,23.338056895226114,23.241755833807627,23.450110414132816,50.71697549754131,50.61114270941055,50.84226416178329,210.97635296260154,210.09558569349872,211.8172518966618
+5,45.36802780858268,45.33368856824674,45.45857791494792,69.06519772701617,68.91705833943838,69.22990310070215,277.56412002670754,276.40090137857896,278.9175994658497
+8,127.86115337657483,127.67784037443938,127.97259027100068,164.41175356549803,163.98918099473198,164.89947719932118,364.864449238967,363.9575332348597,365.67189807976365
+16,631.9974053918763,630.5943769720783,634.117961205346,663.82156895873,608.9343639517663,684.2256402276,882.673648246476,880.5187023299551,885.2058986456484
+32,2745.604342979343,2733.882221204255,2755.327066196631,2424.5398969497583,2422.7844615775653,2425.665479115479,2867.431290847727,2862.8613304608957,2879.1905179193345
+64,17543.03432206594,17378.64912280702,17669.562753036436,14747.730784813924,14732.373598480488,14759.246031746032,12266.27068329904,12250.220156695157,12279.630177029401
diff --git a/docs/assets/bench/vs_linalg_lu_solve_median.svg b/docs/assets/bench/vs_linalg_lu_solve_median.svg
index a7e281e..deb7b3e 100644
--- a/docs/assets/bench/vs_linalg_lu_solve_median.svg
+++ b/docs/assets/bench/vs_linalg_lu_solve_median.svg
@@ -244,51 +244,51 @@
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
 	<g transform="translate(240.33,75.91)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
-		<text>la-stack v0.4.2</text>
+		<text>la-stack v0.4.3</text>
 	</g>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<path stroke='rgb( 31, 119, 180)'  d='M248.72,72.01 L291.28,72.01 M248.72,67.51 L248.72,76.51 M291.28,67.51 L291.28,76.51 M97.70,447.06 L111.20,389.31
-		L124.70,359.45 L138.21,334.57 L178.71,298.19 L286.73,242.76 L502.76,186.23 L934.82,114.45 M97.70,447.09 L97.70,447.04
-		M97.70,447.09 L102.20,447.09 M97.70,447.04 L102.20,447.04 M111.20,389.37 L111.20,389.25 M106.70,389.37 L115.70,389.37
-		M106.70,389.25 L115.70,389.25 M124.70,359.49 L124.70,359.40 M120.20,359.49 L129.20,359.49 M120.20,359.40 L129.20,359.40
-		M138.21,334.60 L138.21,334.47 M133.71,334.60 L142.71,334.60 M133.71,334.47 L142.71,334.47 M178.71,298.27 L178.71,298.08
-		M174.21,298.27 L183.21,298.27 M174.21,298.08 L183.21,298.08 M286.73,242.88 L286.73,242.70 M282.23,242.88 L291.23,242.88
-		M282.23,242.70 L291.23,242.70 M502.76,186.26 L502.76,186.21 M498.26,186.26 L507.26,186.26 M498.26,186.21 L507.26,186.21
-		M934.82,114.84 L934.82,114.04 M930.32,114.84 L934.82,114.84 M930.32,114.04 L934.82,114.04  '/></g>
+	<path stroke='rgb( 31, 119, 180)'  d='M248.72,72.01 L291.28,72.01 M248.72,67.51 L248.72,76.51 M291.28,67.51 L291.28,76.51 M97.70,455.80 L111.20,398.26
+		L124.70,365.19 L138.21,340.45 L178.71,301.90 L286.73,242.44 L502.76,187.78 L934.82,118.77 M97.70,455.82 L97.70,455.76
+		M97.70,455.82 L102.20,455.82 M97.70,455.76 L102.20,455.76 M111.20,398.28 L111.20,398.19 M106.70,398.28 L115.70,398.28
+		M106.70,398.19 L115.70,398.19 M124.70,365.34 L124.70,365.01 M120.20,365.34 L129.20,365.34 M120.20,365.01 L129.20,365.01
+		M138.21,340.48 L138.21,340.38 M133.71,340.48 L142.71,340.48 M133.71,340.38 L142.71,340.38 M178.71,301.95 L178.71,301.87
+		M174.21,301.95 L183.21,301.95 M174.21,301.87 L183.21,301.87 M286.73,242.52 L286.73,242.32 M282.23,242.52 L291.23,242.52
+		M282.23,242.32 L291.23,242.32 M502.76,187.94 L502.76,187.65 M498.26,187.94 L507.26,187.94 M498.26,187.65 L507.26,187.65
+		M934.82,119.12 L934.82,118.51 M930.32,119.12 L934.82,119.12 M930.32,118.51 L934.82,118.51  '/></g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(97.70,447.06) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(97.70,455.80) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(97.70,447.06) scale(4.50)' color='rgb( 31, 119, 180)'/>
-	<use xlink:href='#gpPt6' transform='translate(111.20,389.31) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(97.70,455.80) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(111.20,398.26) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(111.20,389.31) scale(4.50)' color='rgb( 31, 119, 180)'/>
-	<use xlink:href='#gpPt6' transform='translate(124.70,359.45) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(111.20,398.26) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(124.70,365.19) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(124.70,359.45) scale(4.50)' color='rgb( 31, 119, 180)'/>
-	<use xlink:href='#gpPt6' transform='translate(138.21,334.57) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(124.70,365.19) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(138.21,340.45) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(138.21,334.57) scale(4.50)' color='rgb( 31, 119, 180)'/>
-	<use xlink:href='#gpPt6' transform='translate(178.71,298.19) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(138.21,340.45) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(178.71,301.90) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(178.71,298.19) scale(4.50)' color='rgb( 31, 119, 180)'/>
-	<use xlink:href='#gpPt6' transform='translate(286.73,242.76) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(178.71,301.90) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(286.73,242.44) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(286.73,242.76) scale(4.50)' color='rgb( 31, 119, 180)'/>
-	<use xlink:href='#gpPt6' transform='translate(502.76,186.23) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(286.73,242.44) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(502.76,187.78) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(502.76,186.23) scale(4.50)' color='rgb( 31, 119, 180)'/>
-	<use xlink:href='#gpPt6' transform='translate(934.82,114.45) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(502.76,187.78) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(934.82,118.77) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(934.82,114.45) scale(4.50)' color='rgb( 31, 119, 180)'/>
+	<use xlink:href='#gpPt6' transform='translate(934.82,118.77) scale(4.50)' color='rgb( 31, 119, 180)'/>
 	<use xlink:href='#gpPt6' transform='translate(270.00,72.01) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
@@ -302,47 +302,47 @@
 	</g>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<path stroke='rgb(255, 127,  14)'  d='M248.72,90.01 L291.28,90.01 M248.72,85.51 L248.72,94.51 M291.28,85.51 L291.28,94.51 M97.70,426.55 L111.20,365.75
-		L124.70,335.62 L138.21,325.01 L178.71,293.04 L286.73,246.01 L502.76,188.28 L934.82,126.15 M97.70,426.61 L97.70,426.50
-		M97.70,426.61 L102.20,426.61 M97.70,426.50 L102.20,426.50 M111.20,365.87 L111.20,365.52 M106.70,365.87 L115.70,365.87
-		M106.70,365.52 L115.70,365.52 M124.70,335.70 L124.70,335.49 M120.20,335.70 L129.20,335.70 M120.20,335.49 L129.20,335.49
-		M138.21,325.09 L138.21,324.97 M133.71,325.09 L142.71,325.09 M133.71,324.97 L142.71,324.97 M178.71,293.12 L178.71,292.96
-		M174.21,293.12 L183.21,293.12 M174.21,292.96 L183.21,292.96 M286.73,246.10 L286.73,245.91 M282.23,246.10 L291.23,246.10
-		M282.23,245.91 L291.23,245.91 M502.76,188.37 L502.76,188.18 M498.26,188.37 L507.26,188.37 M498.26,188.18 L507.26,188.18
-		M934.82,126.18 L934.82,126.10 M930.32,126.18 L934.82,126.18 M930.32,126.10 L934.82,126.10  '/></g>
+	<path stroke='rgb(255, 127,  14)'  d='M248.72,90.01 L291.28,90.01 M248.72,85.51 L248.72,94.51 M291.28,85.51 L291.28,94.51 M97.70,426.09 L111.20,364.77
+		L124.70,336.31 L138.21,324.82 L178.71,292.54 L286.73,240.61 L502.76,192.41 L934.82,125.23 M97.70,426.11 L97.70,426.06
+		M97.70,426.11 L102.20,426.11 M97.70,426.06 L102.20,426.06 M111.20,365.42 L111.20,364.31 M106.70,365.42 L115.70,365.42
+		M106.70,364.31 L115.70,364.31 M124.70,336.38 L124.70,336.21 M120.20,336.38 L129.20,336.38 M120.20,336.21 L129.20,336.21
+		M138.21,324.90 L138.21,324.73 M133.71,324.90 L142.71,324.90 M133.71,324.73 L142.71,324.73 M178.71,292.64 L178.71,292.43
+		M174.21,292.64 L183.21,292.64 M174.21,292.43 L183.21,292.43 M286.73,243.82 L286.73,239.49 M282.23,243.82 L291.23,243.82
+		M282.23,239.49 L291.23,239.49 M502.76,192.44 L502.76,192.39 M498.26,192.44 L507.26,192.44 M498.26,192.39 L507.26,192.39
+		M934.82,125.27 L934.82,125.20 M930.32,125.27 L934.82,125.27 M930.32,125.20 L934.82,125.20  '/></g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(97.70,426.55) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(97.70,426.09) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(97.70,426.55) scale(4.50)' color='rgb(255, 127,  14)'/>
-	<use xlink:href='#gpPt6' transform='translate(111.20,365.75) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt4' transform='translate(97.70,426.09) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt6' transform='translate(111.20,364.77) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(111.20,365.75) scale(4.50)' color='rgb(255, 127,  14)'/>
-	<use xlink:href='#gpPt6' transform='translate(124.70,335.62) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt4' transform='translate(111.20,364.77) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt6' transform='translate(124.70,336.31) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(124.70,335.62) scale(4.50)' color='rgb(255, 127,  14)'/>
-	<use xlink:href='#gpPt6' transform='translate(138.21,325.01) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt4' transform='translate(124.70,336.31) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt6' transform='translate(138.21,324.82) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(138.21,325.01) scale(4.50)' color='rgb(255, 127,  14)'/>
-	<use xlink:href='#gpPt6' transform='translate(178.71,293.04) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt4' transform='translate(138.21,324.82) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt6' transform='translate(178.71,292.54) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(178.71,293.04) scale(4.50)' color='rgb(255, 127,  14)'/>
-	<use xlink:href='#gpPt6' transform='translate(286.73,246.01) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt4' transform='translate(178.71,292.54) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt6' transform='translate(286.73,240.61) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(286.73,246.01) scale(4.50)' color='rgb(255, 127,  14)'/>
-	<use xlink:href='#gpPt6' transform='translate(502.76,188.28) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt4' transform='translate(286.73,240.61) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt6' transform='translate(502.76,192.41) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(502.76,188.28) scale(4.50)' color='rgb(255, 127,  14)'/>
-	<use xlink:href='#gpPt6' transform='translate(934.82,126.15) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt4' transform='translate(502.76,192.41) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt6' transform='translate(934.82,125.23) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt4' transform='translate(934.82,126.15) scale(4.50)' color='rgb(255, 127,  14)'/>
+	<use xlink:href='#gpPt4' transform='translate(934.82,125.23) scale(4.50)' color='rgb(255, 127,  14)'/>
 	<use xlink:href='#gpPt6' transform='translate(270.00,90.01) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
@@ -356,47 +356,47 @@
 	</g>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<path stroke='rgb( 44, 160,  44)'  d='M248.72,108.01 L291.28,108.01 M248.72,103.51 L248.72,112.51 M291.28,103.51 L291.28,112.51 M97.70,299.15 L111.20,288.64
-		L124.70,283.76 L138.21,273.79 L178.71,264.62 L286.73,231.20 L502.76,186.97 L934.82,132.64 M97.70,299.24 L97.70,299.08
-		M97.70,299.24 L102.20,299.24 M97.70,299.08 L102.20,299.08 M111.20,288.79 L111.20,288.48 M106.70,288.79 L115.70,288.79
-		M106.70,288.48 L115.70,288.48 M124.70,283.85 L124.70,283.60 M120.20,283.85 L129.20,283.85 M120.20,283.60 L129.20,283.60
-		M138.21,274.02 L138.21,273.54 M133.71,274.02 L142.71,274.02 M133.71,273.54 L142.71,273.54 M178.71,264.73 L178.71,264.52
-		M174.21,264.73 L183.21,264.73 M174.21,264.52 L183.21,264.52 M286.73,231.27 L286.73,231.01 M282.23,231.27 L291.23,231.27
-		M282.23,231.01 L291.23,231.01 M502.76,187.02 L502.76,186.86 M498.26,187.02 L507.26,187.02 M498.26,186.86 L507.26,186.86
-		M934.82,132.68 L934.82,132.60 M930.32,132.68 L934.82,132.68 M930.32,132.60 L934.82,132.60  '/></g>
+	<path stroke='rgb( 44, 160,  44)'  d='M248.72,108.01 L291.28,108.01 M248.72,103.51 L248.72,112.51 M291.28,103.51 L291.28,112.51 M97.70,297.49 L111.20,288.06
+		L124.70,283.26 L138.21,273.06 L178.71,262.88 L286.73,230.01 L502.76,186.17 L934.82,132.09 M97.70,297.64 L97.70,297.33
+		M97.70,297.64 L102.20,297.64 M97.70,297.33 L102.20,297.33 M111.20,288.29 L111.20,287.82 M106.70,288.29 L115.70,288.29
+		M106.70,287.82 L115.70,287.82 M124.70,283.42 L124.70,283.12 M120.20,283.42 L129.20,283.42 M120.20,283.12 L129.20,283.12
+		M138.21,273.21 L138.21,272.88 M133.71,273.21 L142.71,273.21 M133.71,272.88 L142.71,272.88 M178.71,262.97 L178.71,262.80
+		M174.21,262.97 L183.21,262.97 M174.21,262.80 L183.21,262.80 M286.73,230.10 L286.73,229.90 M282.23,230.10 L291.23,230.10
+		M282.23,229.90 L291.23,229.90 M502.76,186.23 L502.76,186.02 M498.26,186.23 L507.26,186.23 M498.26,186.02 L507.26,186.02
+		M934.82,132.14 L934.82,132.05 M930.32,132.14 L934.82,132.14 M930.32,132.05 L934.82,132.05  '/></g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt6' transform='translate(97.70,299.15) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt6' transform='translate(97.70,297.49) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(97.70,299.15) scale(4.50)' color='rgb( 44, 160,  44)'/>
-	<use xlink:href='#gpPt6' transform='translate(111.20,288.64) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt8' transform='translate(97.70,297.49) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt6' transform='translate(111.20,288.06) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(111.20,288.64) scale(4.50)' color='rgb( 44, 160,  44)'/>
-	<use xlink:href='#gpPt6' transform='translate(124.70,283.76) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt8' transform='translate(111.20,288.06) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt6' transform='translate(124.70,283.26) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(124.70,283.76) scale(4.50)' color='rgb( 44, 160,  44)'/>
-	<use xlink:href='#gpPt6' transform='translate(138.21,273.79) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt8' transform='translate(124.70,283.26) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt6' transform='translate(138.21,273.06) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(138.21,273.79) scale(4.50)' color='rgb( 44, 160,  44)'/>
-	<use xlink:href='#gpPt6' transform='translate(178.71,264.62) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt8' transform='translate(138.21,273.06) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt6' transform='translate(178.71,262.88) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(178.71,264.62) scale(4.50)' color='rgb( 44, 160,  44)'/>
-	<use xlink:href='#gpPt6' transform='translate(286.73,231.20) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt8' transform='translate(178.71,262.88) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt6' transform='translate(286.73,230.01) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(286.73,231.20) scale(4.50)' color='rgb( 44, 160,  44)'/>
-	<use xlink:href='#gpPt6' transform='translate(502.76,186.97) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt8' transform='translate(286.73,230.01) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt6' transform='translate(502.76,186.17) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(502.76,186.97) scale(4.50)' color='rgb( 44, 160,  44)'/>
-	<use xlink:href='#gpPt6' transform='translate(934.82,132.64) scale(4.50)' color='white'/>
+	<use xlink:href='#gpPt8' transform='translate(502.76,186.17) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt6' transform='translate(934.82,132.09) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
-	<use xlink:href='#gpPt8' transform='translate(934.82,132.64) scale(4.50)' color='rgb( 44, 160,  44)'/>
+	<use xlink:href='#gpPt8' transform='translate(934.82,132.09) scale(4.50)' color='rgb( 44, 160,  44)'/>
 	<use xlink:href='#gpPt6' transform='translate(270.00,108.01) scale(4.50)' color='white'/>
 </g>
 <g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
diff --git a/docs/roadmap.md b/docs/roadmap.md
index ae3d49f..7e81b93 100644
--- a/docs/roadmap.md
+++ b/docs/roadmap.md
@@ -100,6 +100,35 @@ a modern typed baseline.
 - [#142](https://github.com/acgetchell/la-stack/issues/142) - Update Python
   tooling to 3.13 and parse scripts at boundaries.
 
+Release posture:
+
+- Release `v0.4.3` before starting another performance-focused implementation
+  branch. The current release-signal comparison against `v0.4.2` shows broad
+  improvement across LU, solve, determinant-via-LU, and vector helper rows.
+- Treat the remaining `D=4` direct determinant regression as a tracked
+  performance note rather than a release blocker because the LU-backed
+  determinant and solve paths improved.
+- Defer `Matrix::inf_norm` optimization to follow-up work after `v0.4.3`.
+  Larger-dimension `vs_linalg` measurements suggest it is the most interesting
+  leaf-kernel target, but it is not required for the release.
+
+### v0.4.4 Focused Leaf-Kernel Performance
+
+After `v0.4.3`, use the improved benchmark workflow to investigate narrow
+leaf-kernel performance gaps without broadening the crate's scope or weakening
+the small fixed-dimension API model.
+
+- [#154](https://github.com/acgetchell/la-stack/issues/154) - Investigate
+  `Matrix::inf_norm` performance against `nalgebra` and `faer`.
+- [#155](https://github.com/acgetchell/la-stack/issues/155) - Investigate
+  `Vector::dot` and `Vector::norm2_sq` performance against `nalgebra` and
+  `faer`.
+
+The goal is targeted profiling and implementation cleanup for operations where
+`vs_linalg` shows a meaningful peer-crate gap. Release scope should stay limited
+to changes that preserve numerical behavior, allocation-free fixed-size storage,
+and clear const-generic code.
+
 ### v0.5.0 Generic Const Expressions
 
 `v0.5.0` is reserved for the post-stabilization const-generic API revision.
diff --git a/pyproject.toml b/pyproject.toml
index 9136cf4..bd35633 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "la-stack-scripts"
-version = "0.4.2"
+version = "0.4.3"
 description = "Python utility scripts for the la-stack Rust library"
 readme = "README.md"
 requires-python = ">=3.13"
diff --git a/scripts/archive_performance.py b/scripts/archive_performance.py
index 8e0d8d1..1caf958 100644
--- a/scripts/archive_performance.py
+++ b/scripts/archive_performance.py
@@ -309,6 +309,8 @@ def _how_to_update_section() -> str:
         "`just performance-local` writes `target/bench-reports/performance.md`.",
         "`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.",
         "",
+        "Older curated release-to-release reports are archived in `docs/archive/performance/`.",
+        "",
         "See `docs/BENCHMARKING.md` for the full comparison workflow.",
         "",
     ]
diff --git a/scripts/bench_compare.py b/scripts/bench_compare.py
index df9b689..8a1776c 100644
--- a/scripts/bench_compare.py
+++ b/scripts/bench_compare.py
@@ -726,6 +726,8 @@ def _generate_markdown(
             "`just performance-local` writes `target/bench-reports/performance.md`.",
             "`just performance-github-assets` writes `target/bench-reports/github-assets-performance.md`.",
             "",
+            "Older curated release-to-release reports are archived in `docs/archive/performance/`.",
+            "",
             "See `docs/BENCHMARKING.md` for the full comparison workflow.",
         ]
     )
diff --git a/scripts/tests/test_archive_performance.py b/scripts/tests/test_archive_performance.py
index 78f04fb..cc7665b 100644
--- a/scripts/tests/test_archive_performance.py
+++ b/scripts/tests/test_archive_performance.py
@@ -53,6 +53,11 @@ def _normalized_report(version: str, baseline: str) -> str:
     return archive_performance._normalize_how_to_update(_report(version, baseline))
 
 
+def test_normalized_report_links_archived_performance_reports() -> None:
+    text = _normalized_report("0.4.3", "v0.4.2")
+    assert "Older curated release-to-release reports are archived in `docs/archive/performance/`." in text
+
+
 def _legacy_report(version: str, baseline: str) -> str:
     return (
         _report(version, baseline)
diff --git a/scripts/tests/test_bench_compare.py b/scripts/tests/test_bench_compare.py
index 04c59bc..1d78e98 100644
--- a/scripts/tests/test_bench_compare.py
+++ b/scripts/tests/test_bench_compare.py
@@ -379,6 +379,7 @@ def test_main_snapshot_writes_output(tmp_path: Path) -> None:
     assert "just performance-release" in text
     assert "just performance-github-assets" in text
     assert "just performance-release <current-tag> <previous-tag>" in text
+    assert "Older curated release-to-release reports are archived in `docs/archive/performance/`." in text
     assert "git checkout" not in text
 
 
diff --git a/uv.lock b/uv.lock
index c8f71d0..6259155 100644
--- a/uv.lock
+++ b/uv.lock
@@ -409,7 +409,7 @@ wheels = [
 
 [[package]]
 name = "la-stack-scripts"
-version = "0.4.2"
+version = "0.4.3"
 source = { editable = "." }
 
 [package.dev-dependencies]