From 5bbbf0fa6117be9ea99e06176e4522821d5afba4 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Mon, 27 Apr 2026 04:56:36 +0900
Subject: [PATCH 1/3] test(keyviz): add Phase 2-A bench coverage + concurrent-burst exact-count test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the "Benchmark gate green" half of the Phase 2-A exit criteria
in docs/admin_ui_key_visualizer_design.md §12.

Adds four benchmarks that pin the cost shape of the keyviz package:

- BenchmarkObserveParallel — contention profile of the hot path. Each
  goroutine hits a distinct slot, so any future change that introduces
  a shared mutex on Observe shows up as a sharp jump in ns/op.
- BenchmarkRegisterRoute — COW route-table mutation cost; flat ns/op
  (vs growing-with-N) would mean the immutable-table invariant has
  been silently traded for a shared mutable map.
- BenchmarkFlush — per-step drain over 1024 live routes; pins the
  swap-and-publish cost and catches an O(N^2) scan regression.
- BenchmarkSnapshot — read-side pivot over 1024 routes x 64 columns;
  the dominant term in the admin handler's response latency.

Local Apple M1 Max baselines (200 ms benchtime):

- ObserveParallel: 20.6 ns/op, 0 allocs
- RegisterRoute: 123.9 us/op, 24 allocs
- Flush: 96.4 us/op
- Snapshot: 1.86 ms/op

Adds TestObserveExactCountUnderConcurrentBurst — 32 routes x 8
writers x 4000 ops, all gated on a sync.WaitGroup so every goroutine
starts simultaneously. The test pins the Phase 2-A invariant that no
counts are lost under concurrent burst (sampleRate is 1 today, so
every Observe must be reflected exactly in the post-Flush Snapshot).
When the design §5.2 sub-sampling estimator lands, this test must be
updated alongside the new +/-5% / 95% CI assertion.

Pure-test addition; no production code changes.
---
 keyviz/sampler_test.go | 189 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)

diff --git a/keyviz/sampler_test.go b/keyviz/sampler_test.go
index da70424d6..8bf8ddef8 100644
--- a/keyviz/sampler_test.go
+++ b/keyviz/sampler_test.go
@@ -1093,3 +1093,192 @@ func BenchmarkObserveMiss(b *testing.B) {
 		s.Observe(99, OpRead, 16, 64)
 	}
 }
+
+// BenchmarkObserveParallel pins the contention profile of the hot
+// path. Each goroutine hits a distinct slot so the only shared work
+// is the atomic.Pointer load of the route table; per-slot atomic
+// adds do not contend across goroutines. A regression that adds a
+// shared mutex (or a global counter) on the hot path will show up
+// as a sharp jump in ns/op as the parallelism rises.
+func BenchmarkObserveParallel(b *testing.B) {
+	const numRoutes = 64
+	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: numRoutes})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
+			b.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		var i uint64
+		for pb.Next() {
+			i++
+			s.Observe((i%numRoutes)+1, OpWrite, 16, 64)
+		}
+	})
+}
+
+// BenchmarkRegisterRoute pins the route-mutation path: each call
+// takes routesMu, deep-copies the immutable routeTable, mutates, and
+// republishes via atomic.Store. The b.N route IDs grow monotonically
+// so each iteration sees the previous Register's larger table — the
+// shape that exercises the COW cost. A regression that switches to
+// a shared mutable map shows up as a flat ns/op (no growth with N).
+func BenchmarkRegisterRoute(b *testing.B) {
+	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: b.N + 1})
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		id := uint64(i + 1) //nolint:gosec // i bounded by b.N
+		if !s.RegisterRoute(id, []byte{byte(id)}, []byte{byte(id) + 1}) {
+			b.Fatalf("RegisterRoute(%d) returned false at i=%d", id, i)
+		}
+	}
+}
+
+// BenchmarkFlush pins the per-step drain cost. Flush walks every
+// live slot, atomic.SwapUint64s its four counters, and pushes a new
+// MatrixColumn into the ring buffer. The hot path Observe must not
+// race with this drain (atomic-only for both sides), but Flush itself
+// scales with the live route count — pin its cost so we notice if
+// a future change adds e.g. an O(N²) slot scan.
+func BenchmarkFlush(b *testing.B) {
+	const numRoutes = 1024
+	clk := &fakeClock{now: time.Unix(1_700_000_000, 0)}
+	s := NewMemSampler(MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   16,
+		MaxTrackedRoutes: numRoutes,
+		Now:              clk.Now,
+	})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
+			b.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+		// Pre-seed every slot with traffic so Flush has work to swap.
+		s.Observe(r, OpWrite, 16, 64)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Flush()
+		clk.Advance(time.Second)
+		// Reseed so the next Flush still has counters to drain.
+		for r := uint64(1); r <= numRoutes; r++ {
+			s.Observe(r, OpWrite, 16, 64)
+		}
+	}
+}
+
+// BenchmarkSnapshot pins the read-side pivot cost. Snapshot copies
+// every column in the requested window into freshly allocated
+// MatrixRows so the caller can freely mutate the result. With
+// numRoutes×numColumns cells the pivot is O(N×C) and its cost is the
+// dominant term in the admin handler's response latency.
+func BenchmarkSnapshot(b *testing.B) {
+	const (
+		numRoutes  = 1024
+		numColumns = 64
+	)
+	clk := &fakeClock{now: time.Unix(1_700_000_000, 0)}
+	s := NewMemSampler(MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   numColumns,
+		MaxTrackedRoutes: numRoutes,
+		Now:              clk.Now,
+	})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
+			b.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+	}
+	for c := 0; c < numColumns; c++ {
+		for r := uint64(1); r <= numRoutes; r++ {
+			s.Observe(r, OpWrite, 16, 64)
+		}
+		s.Flush()
+		clk.Advance(time.Second)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = s.Snapshot(time.Time{}, time.Time{})
+	}
+}
+
+// TestObserveExactCountUnderConcurrentBurst pins the Phase 2-A
+// "no counts lost" invariant under the kind of workload §5.2 calls
+// out as the SLO target: many goroutines hammering many routes
+// simultaneously. With sub-sampling not yet implemented (sampleRate
+// = 1 always), every Observe must be reflected exactly in the
+// post-Flush Snapshot. When sub-sampling lands and this invariant
+// no longer holds, this test must be updated alongside the new
+// estimator-based ±5% / 95%-CI assertion described in §5.2.
+func TestObserveExactCountUnderConcurrentBurst(t *testing.T) {
+	t.Parallel()
+	const (
+		numRoutes       = 32
+		writersPerRoute = 8
+		opsPerWriter    = 4_000
+		keyLen          = 16
+		valueLen        = 64
+	)
+	s, clk := newTestSampler(t, MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   8,
+		MaxTrackedRoutes: numRoutes,
+	})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
+			t.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+	}
+
+	runConcurrentBurst(s, numRoutes, writersPerRoute, opsPerWriter, keyLen, valueLen)
+	clk.Advance(time.Second)
+	s.Flush()
+
+	cols := s.Snapshot(time.Time{}, time.Time{})
+	if len(cols) != 1 {
+		t.Fatalf("expected 1 column after a single Flush, got %d", len(cols))
+	}
+	const expectedPerRoute = uint64(writersPerRoute * opsPerWriter)
+	const expectedBytesPerRoute = expectedPerRoute * uint64(keyLen+valueLen)
+	for _, row := range cols[0].Rows {
+		if row.Writes != expectedPerRoute {
+			t.Errorf("route %d: writes = %d, want exactly %d (no counts must be lost under concurrent burst)",
+				row.RouteID, row.Writes, expectedPerRoute)
+		}
+		if row.WriteBytes != expectedBytesPerRoute {
+			t.Errorf("route %d: writeBytes = %d, want exactly %d",
+				row.RouteID, row.WriteBytes, expectedBytesPerRoute)
+		}
+	}
+}
+
+// runConcurrentBurst spawns numRoutes×writersPerRoute goroutines and
+// releases them simultaneously so every route sees genuinely
+// concurrent Observe traffic. Returns once every writer has finished.
+func runConcurrentBurst(s *MemSampler, numRoutes, writersPerRoute, opsPerWriter, keyLen, valueLen int) {
+	var ready, start, done sync.WaitGroup
+	ready.Add(numRoutes * writersPerRoute)
+	done.Add(numRoutes * writersPerRoute)
+	start.Add(1)
+	for r := 1; r <= numRoutes; r++ {
+		routeID := uint64(r) //nolint:gosec // r bounded by numRoutes
+		for w := 0; w < writersPerRoute; w++ {
+			go func() {
+				defer done.Done()
+				ready.Done()
+				start.Wait()
+				for op := 0; op < opsPerWriter; op++ {
+					s.Observe(routeID, OpWrite, keyLen, valueLen)
+				}
+			}()
+		}
+	}
+	ready.Wait()
+	start.Done()
+	done.Wait()
+}

From 19abb415cca291c95233ff757357242bc6e248c5 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Mon, 27 Apr 2026 17:58:19 +0900
Subject: [PATCH 2/3] test(keyviz): address PR #682 round-1 review (Gemini + Codex)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four items from the round-1 reviews:

1. Codex P1: TestObserveExactCountUnderConcurrentBurst only validated
   rows that appeared in cols[0].Rows. A future Flush regression that
   silently dropped a registered route would still pass — the loop
   would just iterate fewer times. Build a route ID -> row index and
   assert every registered route is present, then validate counters.

2. Gemini medium / Codex P2: BenchmarkObserveParallel claimed each
   goroutine hit a distinct slot, but the per-worker counter started
   at zero so all workers walked the same route sequence in lockstep.
   That defeated the "no contention" claim. Use atomic.Add to assign
   each worker a disjoint shard of route IDs (4 routes per shard,
   shardIndex * 4 base offset modulo numRoutes).

3. Gemini medium: BenchmarkRegisterRoute used b.N as both the
   iteration count AND the table size, producing O(N^2) total work.
   Go reports ns/op = total / b.N, so a constant-per-call cost read
   as growing with N. Use a fixed table size (1024 pre-loaded) and
   churn one ID in/out per iteration so b.N controls only the
   iteration count.

4. Gemini medium: BenchmarkFlush included the per-iteration reseed
   loop in the timed range, inflating ns/op by ~25% and letting a
   real Flush regression hide behind Observe variance. Bracket the
   reseed with b.StopTimer / b.StartTimer.

New M1 Max baselines (200 ms benchtime):

- ObserveParallel: 4.4 ns/op (was 20.6 — workers now genuinely parallel)
- RegisterRoute: 162.2 us/op (constant; was reading as growing-with-N)
- Flush: 82.3 us/op (was 96.4 — reseed excluded)
- Snapshot: 1.92 ms/op (unchanged; was timed correctly already)
---
 keyviz/sampler_test.go | 84 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 68 insertions(+), 16 deletions(-)

diff --git a/keyviz/sampler_test.go b/keyviz/sampler_test.go
index 8bf8ddef8..9abdc9df7 100644
--- a/keyviz/sampler_test.go
+++ b/keyviz/sampler_test.go
@@ -1095,44 +1095,74 @@ func BenchmarkObserveMiss(b *testing.B) {
 }
 
 // BenchmarkObserveParallel pins the contention profile of the hot
-// path. Each goroutine hits a distinct slot so the only shared work
-// is the atomic.Pointer load of the route table; per-slot atomic
-// adds do not contend across goroutines. A regression that adds a
-// shared mutex (or a global counter) on the hot path will show up
-// as a sharp jump in ns/op as the parallelism rises.
+// path. Each goroutine is shard-pinned to a disjoint range of route
+// IDs so per-slot atomic adds genuinely never contend across
+// goroutines and the only shared work is the atomic.Pointer load of
+// the route table. A regression that adds a shared mutex (or a
+// global counter) on the hot path will show up as a sharp jump in
+// ns/op as parallelism rises.
+//
+// The earlier draft used a per-worker `var i uint64` counter which
+// always started at zero, so all workers walked the same route
+// sequence in lockstep — that defeated the "distinct slot" claim
+// and the benchmark would have masked false-sharing regressions
+// (Codex P2 / Gemini medium on PR #682).
 func BenchmarkObserveParallel(b *testing.B) {
-	const numRoutes = 64
+	const (
+		numRoutes      = 64
+		routesPerShard = 4
+	)
 	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: numRoutes})
 	for r := uint64(1); r <= numRoutes; r++ {
 		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
 			b.Fatalf("RegisterRoute(%d) returned false", r)
 		}
 	}
+	var nextShard atomic.Uint64
 	b.ReportAllocs()
 	b.ResetTimer()
 	b.RunParallel(func(pb *testing.PB) {
+		shardIndex := nextShard.Add(1) - 1
+		shardBase := (shardIndex * routesPerShard) % numRoutes
 		var i uint64
 		for pb.Next() {
+			s.Observe(shardBase+(i%routesPerShard)+1, OpWrite, 16, 64)
 			i++
-			s.Observe((i%numRoutes)+1, OpWrite, 16, 64)
 		}
 	})
 }
 
 // BenchmarkRegisterRoute pins the route-mutation path: each call
 // takes routesMu, deep-copies the immutable routeTable, mutates, and
-// republishes via atomic.Store. The b.N route IDs grow monotonically
-// so each iteration sees the previous Register's larger table — the
-// shape that exercises the COW cost. A regression that switches to
-// a shared mutable map shows up as a flat ns/op (no growth with N).
+// republishes via atomic.Store. The benchmark holds a fixed-size
+// table (registerBenchTableSize routes pre-loaded) and toggles a
+// route in/out on each iteration so b.N controls the iteration count
+// only — not the table size. The earlier draft made b.N drive both
+// the iteration count and the MaxTrackedRoutes cap, which produced an
+// O(N²) total cost and made ns/op (which Go derives by dividing total
+// time by b.N) read as growing-with-N even though per-call cost was
+// constant (Gemini medium on PR #682).
 func BenchmarkRegisterRoute(b *testing.B) {
-	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: b.N + 1})
+	const registerBenchTableSize = 1024
+	s := NewMemSampler(MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   4,
+		MaxTrackedRoutes: registerBenchTableSize + 1,
+	})
+	for r := uint64(1); r <= registerBenchTableSize; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
+			b.Fatalf("seed RegisterRoute(%d) returned false", r)
+		}
+	}
+	churnID := uint64(registerBenchTableSize + 1)
+	startKey := []byte{byte(churnID >> 8), byte(churnID & 0xFF)}
+	endKey := []byte{byte((churnID + 1) >> 8), byte((churnID + 1) & 0xFF)}
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		id := uint64(i + 1) //nolint:gosec // i bounded by b.N
-		if !s.RegisterRoute(id, []byte{byte(id)}, []byte{byte(id) + 1}) {
-			b.Fatalf("RegisterRoute(%d) returned false at i=%d", id, i)
+		s.RemoveRoute(churnID)
+		if !s.RegisterRoute(churnID, startKey, endKey) {
+			b.Fatalf("RegisterRoute(%d) returned false at i=%d", churnID, i)
 		}
 	}
 }
@@ -1143,6 +1173,12 @@ func BenchmarkRegisterRoute(b *testing.B) {
 // race with this drain (atomic-only for both sides), but Flush itself
 // scales with the live route count — pin its cost so we notice if
 // a future change adds e.g. an O(N²) slot scan.
+//
+// b.StopTimer brackets the per-iteration reseed loop so the reported
+// ns/op reflects the Flush cost only. Including the 1024 Observe
+// calls in the timed range inflated the number by ~25% and let a
+// real Flush regression hide behind Observe variance (Gemini medium
+// on PR #682).
 func BenchmarkFlush(b *testing.B) {
 	const numRoutes = 1024
 	clk := &fakeClock{now: time.Unix(1_700_000_000, 0)}
@@ -1156,18 +1192,20 @@ func BenchmarkFlush(b *testing.B) {
 		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
 			b.Fatalf("RegisterRoute(%d) returned false", r)
 		}
-		// Pre-seed every slot with traffic so Flush has work to swap.
+		// Pre-seed every slot with traffic so the first Flush has work to swap.
 		s.Observe(r, OpWrite, 16, 64)
 	}
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		s.Flush()
+		b.StopTimer()
 		clk.Advance(time.Second)
 		// Reseed so the next Flush still has counters to drain.
 		for r := uint64(1); r <= numRoutes; r++ {
 			s.Observe(r, OpWrite, 16, 64)
 		}
+		b.StartTimer()
 	}
 }
 
@@ -1245,7 +1283,21 @@ func TestObserveExactCountUnderConcurrentBurst(t *testing.T) {
 	}
 	const expectedPerRoute = uint64(writersPerRoute * opsPerWriter)
 	const expectedBytesPerRoute = expectedPerRoute * uint64(keyLen+valueLen)
+
+	// Codex P1 on PR #682: assert every registered route appears in
+	// the column. Without this index, a future Flush regression that
+	// silently drops a route's row would still pass the per-row
+	// counter checks below — the loop would just iterate fewer times.
+	rowsByRoute := make(map[uint64]MatrixRow, numRoutes)
 	for _, row := range cols[0].Rows {
+		rowsByRoute[row.RouteID] = row
+	}
+	for r := uint64(1); r <= numRoutes; r++ {
+		row, ok := rowsByRoute[r]
+		if !ok {
+			t.Errorf("route %d: missing from Snapshot rows; Flush must not silently drop a registered route under burst", r)
+			continue
+		}
 		if row.Writes != expectedPerRoute {
 			t.Errorf("route %d: writes = %d, want exactly %d (no counts must be lost under concurrent burst)",
 				row.RouteID, row.Writes, expectedPerRoute)

From 3310a1a6befad69505cf54344f5e8e4725f03911 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Mon, 27 Apr 2026 18:08:38 +0900
Subject: [PATCH 3/3] test(keyviz): address PR #682 round-2 (Claude bot P1+P2)

Two items from Claude bot's round-2 on 19abb415:

- P1: drop the //nolint:gosec in runConcurrentBurst. Per CLAUDE.md
  ("avoid //nolint - refactor instead"), change the helper signature
  to take numRoutes as uint64 and iterate in that type from the
  start, eliminating the int->uint64 conversion that triggered gosec
  G115.

- P2: bump BenchmarkObserveParallel's numRoutes from 64 to 256 so
  that numRoutes / routesPerShard yields >= 64 disjoint shards. With
  16 shards, RunParallel on a host with GOMAXPROCS > 16 wrapped
  multiple workers onto the same 4-route shard, silently
  reintroducing the shared-counter contention this benchmark exists
  to catch. 256 keeps the benchmark meaningful up to GOMAXPROCS = 64
  without changing its character.
---
 keyviz/sampler_test.go | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/keyviz/sampler_test.go b/keyviz/sampler_test.go
index 9abdc9df7..f1ed28f92 100644
--- a/keyviz/sampler_test.go
+++ b/keyviz/sampler_test.go
@@ -1102,19 +1102,21 @@ func BenchmarkObserveMiss(b *testing.B) {
 // global counter) on the hot path will show up as a sharp jump in
 // ns/op as parallelism rises.
 //
-// The earlier draft used a per-worker `var i uint64` counter which
-// always started at zero, so all workers walked the same route
-// sequence in lockstep — that defeated the "distinct slot" claim
-// and the benchmark would have masked false-sharing regressions
-// (Codex P2 / Gemini medium on PR #682).
+// numRoutes is sized for numRoutes / routesPerShard >= 64
+// disjoint shards so the benchmark stays meaningful up to
+// GOMAXPROCS = 64. The previous draft (numRoutes = 64,
+// routesPerShard = 4) only had 16 shards and silently regressed
+// to shared-counter contention on bigger CI runners — the very
+// regression class this benchmark exists to detect (Claude bot
+// round-2 P2 on PR #682).
 func BenchmarkObserveParallel(b *testing.B) {
 	const (
-		numRoutes      = 64
+		numRoutes      = 256
 		routesPerShard = 4
 	)
 	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: numRoutes})
 	for r := uint64(1); r <= numRoutes; r++ {
-		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
 			b.Fatalf("RegisterRoute(%d) returned false", r)
 		}
 	}
@@ -1312,13 +1314,20 @@ func TestObserveExactCountUnderConcurrentBurst(t *testing.T) {
 // runConcurrentBurst spawns numRoutes×writersPerRoute goroutines and
 // releases them simultaneously so every route sees genuinely
 // concurrent Observe traffic. Returns once every writer has finished.
-func runConcurrentBurst(s *MemSampler, numRoutes, writersPerRoute, opsPerWriter, keyLen, valueLen int) {
+//
+// numRoutes is uint64 so the loop iterates in the same type the
+// sampler uses for RouteID; the earlier `int` form needed a
+// per-iteration `uint64(r)` cast that triggered gosec G115 plus a
+// nolint suppression that CLAUDE.md says to avoid by refactoring
+// instead (Claude bot round-2 P1 on PR #682).
+func runConcurrentBurst(s *MemSampler, numRoutes uint64, writersPerRoute, opsPerWriter, keyLen, valueLen int) {
 	var ready, start, done sync.WaitGroup
-	ready.Add(numRoutes * writersPerRoute)
-	done.Add(numRoutes * writersPerRoute)
+	total := int(numRoutes) * writersPerRoute
+	ready.Add(total)
+	done.Add(total)
 	start.Add(1)
-	for r := 1; r <= numRoutes; r++ {
-		routeID := uint64(r) //nolint:gosec // r bounded by numRoutes
+	for r := uint64(1); r <= numRoutes; r++ {
+		routeID := r
 		for w := 0; w < writersPerRoute; w++ {
 			go func() {
 				defer done.Done()