From 5bbbf0fa6117be9ea99e06176e4522821d5afba4 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Mon, 27 Apr 2026 04:56:36 +0900
Subject: [PATCH 1/3] test(keyviz): add Phase 2-A bench coverage + concurrent-burst exact-count test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the "Benchmark gate green" half of the Phase 2-A exit criteria
in docs/admin_ui_key_visualizer_design.md §12.

Adds four benchmarks that pin the cost shape of the keyviz package:

- BenchmarkObserveParallel — contention profile of the hot path. Each
  goroutine hits a distinct slot, so any future change that introduces
  a shared mutex on Observe shows up as a sharp jump in ns/op.
- BenchmarkRegisterRoute — COW route-table mutation cost; flat ns/op
  (vs growing-with-N) would mean the immutable-table invariant has
  been silently traded for a shared mutable map.
- BenchmarkFlush — per-step drain over 1024 live routes; pins the
  swap-and-publish cost and catches an O(N^2) scan regression.
- BenchmarkSnapshot — read-side pivot over 1024 routes x 64 columns;
  the dominant term in the admin handler's response latency.

Local Apple M1 Max baselines (200 ms benchtime):

- ObserveParallel: 20.6 ns/op, 0 allocs
- RegisterRoute: 123.9 us/op, 24 allocs
- Flush: 96.4 us/op
- Snapshot: 1.86 ms/op

Adds TestObserveExactCountUnderConcurrentBurst — 32 routes x 8
writers x 4000 ops, all gated on a sync.WaitGroup so every goroutine
starts simultaneously. The test pins the Phase 2-A invariant that no
counts are lost under concurrent burst (sampleRate is 1 today, so
every Observe must be reflected exactly in the post-Flush Snapshot).
When the design §5.2 sub-sampling estimator lands, this test must be
updated alongside the new +/-5% / 95% CI assertion.

Pure-test addition; no production code changes.
---
 keyviz/sampler_test.go | 189 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)

diff --git a/keyviz/sampler_test.go b/keyviz/sampler_test.go
index da70424d6..8bf8ddef8 100644
--- a/keyviz/sampler_test.go
+++ b/keyviz/sampler_test.go
@@ -1093,3 +1093,192 @@ func BenchmarkObserveMiss(b *testing.B) {
 		s.Observe(99, OpRead, 16, 64)
 	}
 }
+
+// BenchmarkObserveParallel pins the contention profile of the hot
+// path. Each goroutine hits a distinct slot so the only shared work
+// is the atomic.Pointer load of the route table; per-slot atomic
+// adds do not contend across goroutines. A regression that adds a
+// shared mutex (or a global counter) on the hot path will show up
+// as a sharp jump in ns/op as the parallelism rises.
+func BenchmarkObserveParallel(b *testing.B) {
+	const numRoutes = 64
+	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: numRoutes})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
+			b.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		var i uint64
+		for pb.Next() {
+			i++
+			s.Observe((i%numRoutes)+1, OpWrite, 16, 64)
+		}
+	})
+}
+
+// BenchmarkRegisterRoute pins the route-mutation path: each call
+// takes routesMu, deep-copies the immutable routeTable, mutates, and
+// republishes via atomic.Store. The b.N route IDs grow monotonically
+// so each iteration sees the previous Register's larger table — the
+// shape that exercises the COW cost. A regression that switches to
+// a shared mutable map shows up as a flat ns/op (no growth with N).
+func BenchmarkRegisterRoute(b *testing.B) {
+	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: b.N + 1})
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		id := uint64(i + 1) //nolint:gosec // i bounded by b.N
+		if !s.RegisterRoute(id, []byte{byte(id)}, []byte{byte(id) + 1}) {
+			b.Fatalf("RegisterRoute(%d) returned false at i=%d", id, i)
+		}
+	}
+}
+
+// BenchmarkFlush pins the per-step drain cost. Flush walks every
+// live slot, atomic.SwapUint64s its four counters, and pushes a new
+// MatrixColumn into the ring buffer. The hot path Observe must not
+// race with this drain (atomic-only for both sides), but Flush itself
+// scales with the live route count — pin its cost so we notice if
+// a future change adds e.g. an O(N²) slot scan.
+func BenchmarkFlush(b *testing.B) {
+	const numRoutes = 1024
+	clk := &fakeClock{now: time.Unix(1_700_000_000, 0)}
+	s := NewMemSampler(MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   16,
+		MaxTrackedRoutes: numRoutes,
+		Now:              clk.Now,
+	})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
+			b.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+		// Pre-seed every slot with traffic so Flush has work to swap.
+		s.Observe(r, OpWrite, 16, 64)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Flush()
+		clk.Advance(time.Second)
+		// Reseed so the next Flush still has counters to drain.
+		for r := uint64(1); r <= numRoutes; r++ {
+			s.Observe(r, OpWrite, 16, 64)
+		}
+	}
+}
+
+// BenchmarkSnapshot pins the read-side pivot cost. Snapshot copies
+// every column in the requested window into freshly allocated
+// MatrixRows so the caller can freely mutate the result. With
+// numRoutes×numColumns cells the pivot is O(N×C) and its cost is the
+// dominant term in the admin handler's response latency.
+func BenchmarkSnapshot(b *testing.B) {
+	const (
+		numRoutes  = 1024
+		numColumns = 64
+	)
+	clk := &fakeClock{now: time.Unix(1_700_000_000, 0)}
+	s := NewMemSampler(MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   numColumns,
+		MaxTrackedRoutes: numRoutes,
+		Now:              clk.Now,
+	})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
+			b.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+	}
+	for c := 0; c < numColumns; c++ {
+		for r := uint64(1); r <= numRoutes; r++ {
+			s.Observe(r, OpWrite, 16, 64)
+		}
+		s.Flush()
+		clk.Advance(time.Second)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = s.Snapshot(time.Time{}, time.Time{})
+	}
+}
+
+// TestObserveExactCountUnderConcurrentBurst pins the Phase 2-A
+// "no counts lost" invariant under the kind of workload §5.2 calls
+// out as the SLO target: many goroutines hammering many routes
+// simultaneously. With sub-sampling not yet implemented (sampleRate
+// = 1 always), every Observe must be reflected exactly in the
+// post-Flush Snapshot. When sub-sampling lands and this invariant
+// no longer holds, this test must be updated alongside the new
+// estimator-based ±5% / 95%-CI assertion described in §5.2.
+func TestObserveExactCountUnderConcurrentBurst(t *testing.T) {
+	t.Parallel()
+	const (
+		numRoutes       = 32
+		writersPerRoute = 8
+		opsPerWriter    = 4_000
+		keyLen          = 16
+		valueLen        = 64
+	)
+	s, clk := newTestSampler(t, MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   8,
+		MaxTrackedRoutes: numRoutes,
+	})
+	for r := uint64(1); r <= numRoutes; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
+			t.Fatalf("RegisterRoute(%d) returned false", r)
+		}
+	}
+
+	runConcurrentBurst(s, numRoutes, writersPerRoute, opsPerWriter, keyLen, valueLen)
+	clk.Advance(time.Second)
+	s.Flush()
+
+	cols := s.Snapshot(time.Time{}, time.Time{})
+	if len(cols) != 1 {
+		t.Fatalf("expected 1 column after a single Flush, got %d", len(cols))
+	}
+	const expectedPerRoute = uint64(writersPerRoute * opsPerWriter)
+	const expectedBytesPerRoute = expectedPerRoute * uint64(keyLen+valueLen)
+	for _, row := range cols[0].Rows {
+		if row.Writes != expectedPerRoute {
+			t.Errorf("route %d: writes = %d, want exactly %d (no counts must be lost under concurrent burst)",
+				row.RouteID, row.Writes, expectedPerRoute)
+		}
+		if row.WriteBytes != expectedBytesPerRoute {
+			t.Errorf("route %d: writeBytes = %d, want exactly %d",
+				row.RouteID, row.WriteBytes, expectedBytesPerRoute)
+		}
+	}
+}
+
+// runConcurrentBurst spawns numRoutes×writersPerRoute goroutines and
+// releases them simultaneously so every route sees genuinely
+// concurrent Observe traffic. Returns once every writer has finished.
+func runConcurrentBurst(s *MemSampler, numRoutes, writersPerRoute, opsPerWriter, keyLen, valueLen int) {
+	var ready, start, done sync.WaitGroup
+	ready.Add(numRoutes * writersPerRoute)
+	done.Add(numRoutes * writersPerRoute)
+	start.Add(1)
+	for r := 1; r <= numRoutes; r++ {
+		routeID := uint64(r) //nolint:gosec // r bounded by numRoutes
+		for w := 0; w < writersPerRoute; w++ {
+			go func() {
+				defer done.Done()
+				ready.Done()
+				start.Wait()
+				for op := 0; op < opsPerWriter; op++ {
+					s.Observe(routeID, OpWrite, keyLen, valueLen)
+				}
+			}()
+		}
+	}
+	ready.Wait()
+	start.Done()
+	done.Wait()
+}

From 19abb415cca291c95233ff757357242bc6e248c5 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Mon, 27 Apr 2026 17:58:19 +0900
Subject: [PATCH 2/3] test(keyviz): address PR #682 round-1 review (Gemini + Codex)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four items from the round-1 reviews:

1. Codex P1: TestObserveExactCountUnderConcurrentBurst only validated
   rows that appeared in cols[0].Rows. A future Flush regression that
   silently dropped a registered route would still pass — the loop
   would just iterate fewer times. Build a route ID -> row index and
   assert every registered route is present, then validate counters.

2. Gemini medium / Codex P2: BenchmarkObserveParallel claimed each
   goroutine hit a distinct slot, but the per-worker counter started
   at zero so all workers walked the same route sequence in lockstep.
   That defeated the "no contention" claim. Use atomic.Add to assign
   each worker a disjoint shard of route IDs (4 routes per shard,
   shardIndex * 4 base offset modulo numRoutes).

3. Gemini medium: BenchmarkRegisterRoute used b.N as both the
   iteration count AND the table size, producing O(N^2) total work.
   Go reports ns/op = total / b.N, so a constant-per-call cost read
   as growing with N. Use a fixed table size (1024 pre-loaded) and
   churn one ID in/out per iteration so b.N controls only the
   iteration count.

4. Gemini medium: BenchmarkFlush included the per-iteration reseed
   loop in the timed range, inflating ns/op by ~25% and letting a
   real Flush regression hide behind Observe variance. Bracket the
   reseed with b.StopTimer / b.StartTimer.

New M1 Max baselines (200 ms benchtime):

- ObserveParallel: 4.4 ns/op (was 20.6 — workers now genuinely parallel)
- RegisterRoute: 162.2 us/op (constant; was reading as growing-with-N)
- Flush: 82.3 us/op (was 96.4 — reseed excluded)
- Snapshot: 1.92 ms/op (unchanged; was timed correctly already)
---
 keyviz/sampler_test.go | 84 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 68 insertions(+), 16 deletions(-)

diff --git a/keyviz/sampler_test.go b/keyviz/sampler_test.go
index 8bf8ddef8..9abdc9df7 100644
--- a/keyviz/sampler_test.go
+++ b/keyviz/sampler_test.go
@@ -1095,44 +1095,74 @@ func BenchmarkObserveMiss(b *testing.B) {
 }
 
 // BenchmarkObserveParallel pins the contention profile of the hot
-// path. Each goroutine hits a distinct slot so the only shared work
-// is the atomic.Pointer load of the route table; per-slot atomic
-// adds do not contend across goroutines. A regression that adds a
-// shared mutex (or a global counter) on the hot path will show up
-// as a sharp jump in ns/op as the parallelism rises.
+// path. Each goroutine is shard-pinned to a disjoint range of route
+// IDs so per-slot atomic adds genuinely never contend across
+// goroutines and the only shared work is the atomic.Pointer load of
+// the route table. A regression that adds a shared mutex (or a
+// global counter) on the hot path will show up as a sharp jump in
+// ns/op as parallelism rises.
+//
+// The earlier draft used a per-worker `var i uint64` counter which
+// always started at zero, so all workers walked the same route
+// sequence in lockstep — that defeated the "distinct slot" claim
+// and the benchmark would have masked false-sharing regressions
+// (Codex P2 / Gemini medium on PR #682).
 func BenchmarkObserveParallel(b *testing.B) {
-	const numRoutes = 64
+	const (
+		numRoutes      = 64
+		routesPerShard = 4
+	)
 	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: numRoutes})
 	for r := uint64(1); r <= numRoutes; r++ {
 		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
 			b.Fatalf("RegisterRoute(%d) returned false", r)
 		}
 	}
+	var nextShard atomic.Uint64
 	b.ReportAllocs()
 	b.ResetTimer()
 	b.RunParallel(func(pb *testing.PB) {
+		shardIndex := nextShard.Add(1) - 1
+		shardBase := (shardIndex * routesPerShard) % numRoutes
 		var i uint64
 		for pb.Next() {
+			s.Observe(shardBase+(i%routesPerShard)+1, OpWrite, 16, 64)
 			i++
-			s.Observe((i%numRoutes)+1, OpWrite, 16, 64)
 		}
 	})
 }
 
 // BenchmarkRegisterRoute pins the route-mutation path: each call
 // takes routesMu, deep-copies the immutable routeTable, mutates, and
-// republishes via atomic.Store. The b.N route IDs grow monotonically
-// so each iteration sees the previous Register's larger table — the
-// shape that exercises the COW cost. A regression that switches to
-// a shared mutable map shows up as a flat ns/op (no growth with N).
+// republishes via atomic.Store. The benchmark holds a fixed-size
+// table (registerBenchTableSize routes pre-loaded) and toggles a
+// route in/out on each iteration so b.N controls the iteration count
+// only — not the table size. The earlier draft made b.N drive both
+// the iteration count and the MaxTrackedRoutes cap, which produced an
+// O(N²) total cost and made ns/op (which Go derives by dividing total
+// time by b.N) read as growing-with-N even though per-call cost was
+// constant (Gemini medium on PR #682).
 func BenchmarkRegisterRoute(b *testing.B) {
-	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: b.N + 1})
+	const registerBenchTableSize = 1024
+	s := NewMemSampler(MemSamplerOptions{
+		Step:             time.Second,
+		HistoryColumns:   4,
+		MaxTrackedRoutes: registerBenchTableSize + 1,
+	})
+	for r := uint64(1); r <= registerBenchTableSize; r++ {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
+			b.Fatalf("seed RegisterRoute(%d) returned false", r)
+		}
+	}
+	churnID := uint64(registerBenchTableSize + 1)
+	startKey := []byte{byte(churnID >> 8), byte(churnID & 0xFF)}
+	endKey := []byte{byte((churnID + 1) >> 8), byte((churnID + 1) & 0xFF)}
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		id := uint64(i + 1) //nolint:gosec // i bounded by b.N
-		if !s.RegisterRoute(id, []byte{byte(id)}, []byte{byte(id) + 1}) {
-			b.Fatalf("RegisterRoute(%d) returned false at i=%d", id, i)
+		s.RemoveRoute(churnID)
+		if !s.RegisterRoute(churnID, startKey, endKey) {
+			b.Fatalf("RegisterRoute(%d) returned false at i=%d", churnID, i)
 		}
 	}
 }
@@ -1143,6 +1173,12 @@ func BenchmarkRegisterRoute(b *testing.B) {
 // race with this drain (atomic-only for both sides), but Flush itself
 // scales with the live route count — pin its cost so we notice if
 // a future change adds e.g. an O(N²) slot scan.
+//
+// b.StopTimer brackets the per-iteration reseed loop so the reported
+// ns/op reflects the Flush cost only. Including the 1024 Observe
+// calls in the timed range inflated the number by ~25% and let a
+// real Flush regression hide behind Observe variance (Gemini medium
+// on PR #682).
 func BenchmarkFlush(b *testing.B) {
 	const numRoutes = 1024
 	clk := &fakeClock{now: time.Unix(1_700_000_000, 0)}
@@ -1156,18 +1192,20 @@ func BenchmarkFlush(b *testing.B) {
 		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
 			b.Fatalf("RegisterRoute(%d) returned false", r)
 		}
-		// Pre-seed every slot with traffic so Flush has work to swap.
+		// Pre-seed every slot with traffic so the first Flush has work to swap.
 		s.Observe(r, OpWrite, 16, 64)
 	}
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		s.Flush()
+		b.StopTimer()
 		clk.Advance(time.Second)
 		// Reseed so the next Flush still has counters to drain.
 		for r := uint64(1); r <= numRoutes; r++ {
 			s.Observe(r, OpWrite, 16, 64)
 		}
+		b.StartTimer()
 	}
 }
 
@@ -1245,7 +1283,21 @@ func TestObserveExactCountUnderConcurrentBurst(t *testing.T) {
 	}
 	const expectedPerRoute = uint64(writersPerRoute * opsPerWriter)
 	const expectedBytesPerRoute = expectedPerRoute * uint64(keyLen+valueLen)
+
+	// Codex P1 on PR #682: assert every registered route appears in
+	// the column. Without this index, a future Flush regression that
+	// silently drops a route's row would still pass the per-row
+	// counter checks below — the loop would just iterate fewer times.
+	rowsByRoute := make(map[uint64]MatrixRow, numRoutes)
 	for _, row := range cols[0].Rows {
+		rowsByRoute[row.RouteID] = row
+	}
+	for r := uint64(1); r <= numRoutes; r++ {
+		row, ok := rowsByRoute[r]
+		if !ok {
+			t.Errorf("route %d: missing from Snapshot rows; Flush must not silently drop a registered route under burst", r)
+			continue
+		}
 		if row.Writes != expectedPerRoute {
 			t.Errorf("route %d: writes = %d, want exactly %d (no counts must be lost under concurrent burst)",
 				row.RouteID, row.Writes, expectedPerRoute)

From 3310a1a6befad69505cf54344f5e8e4725f03911 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Mon, 27 Apr 2026 18:08:38 +0900
Subject: [PATCH 3/3] test(keyviz): address PR #682 round-2 (Claude bot P1+P2)

Two items from Claude bot's round-2 on 19abb415:

- P1: drop the //nolint:gosec in runConcurrentBurst. Per CLAUDE.md
  ("avoid //nolint - refactor instead"), change the helper signature
  to take numRoutes as uint64 and iterate in that type from the
  start, eliminating the int->uint64 conversion that triggered gosec
  G115.

- P2: bump BenchmarkObserveParallel's numRoutes from 64 to 256 so
  that numRoutes / routesPerShard yields >= 64 disjoint shards. With
  16 shards, RunParallel on a host with GOMAXPROCS > 16 wrapped
  multiple workers onto the same 4-route shard, silently
  reintroducing the shared-counter contention this benchmark exists
  to catch. 256 keeps the benchmark meaningful up to GOMAXPROCS = 64
  without changing its character.
---
 keyviz/sampler_test.go | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/keyviz/sampler_test.go b/keyviz/sampler_test.go
index 9abdc9df7..f1ed28f92 100644
--- a/keyviz/sampler_test.go
+++ b/keyviz/sampler_test.go
@@ -1102,19 +1102,21 @@ func BenchmarkObserveMiss(b *testing.B) {
 // global counter) on the hot path will show up as a sharp jump in
 // ns/op as parallelism rises.
 //
-// The earlier draft used a per-worker `var i uint64` counter which
-// always started at zero, so all workers walked the same route
-// sequence in lockstep — that defeated the "distinct slot" claim
-// and the benchmark would have masked false-sharing regressions
-// (Codex P2 / Gemini medium on PR #682).
+// numRoutes is sized for numRoutes / routesPerShard >= 64
+// disjoint shards so the benchmark stays meaningful up to
+// GOMAXPROCS = 64. The previous draft (numRoutes = 64,
+// routesPerShard = 4) only had 16 shards and silently regressed
+// to shared-counter contention on bigger CI runners — the very
+// regression class this benchmark exists to detect (Claude bot
+// round-2 P2 on PR #682).
 func BenchmarkObserveParallel(b *testing.B) {
 	const (
-		numRoutes      = 64
+		numRoutes      = 256
 		routesPerShard = 4
 	)
 	s := NewMemSampler(MemSamplerOptions{Step: time.Second, HistoryColumns: 4, MaxTrackedRoutes: numRoutes})
 	for r := uint64(1); r <= numRoutes; r++ {
-		if !s.RegisterRoute(r, []byte{byte(r)}, []byte{byte(r) + 1}) {
+		if !s.RegisterRoute(r, []byte{byte(r >> 8), byte(r)}, []byte{byte((r + 1) >> 8), byte(r + 1)}) {
 			b.Fatalf("RegisterRoute(%d) returned false", r)
 		}
 	}
@@ -1312,13 +1314,20 @@ func TestObserveExactCountUnderConcurrentBurst(t *testing.T) {
 // runConcurrentBurst spawns numRoutes×writersPerRoute goroutines and
 // releases them simultaneously so every route sees genuinely
 // concurrent Observe traffic. Returns once every writer has finished.
-func runConcurrentBurst(s *MemSampler, numRoutes, writersPerRoute, opsPerWriter, keyLen, valueLen int) {
+//
+// numRoutes is uint64 so the loop iterates in the same type the
+// sampler uses for RouteID; the earlier `int` form needed a
+// per-iteration `uint64(r)` cast that triggered gosec G115 plus a
+// nolint suppression that CLAUDE.md says to avoid by refactoring
+// instead (Claude bot round-2 P1 on PR #682).
+func runConcurrentBurst(s *MemSampler, numRoutes uint64, writersPerRoute, opsPerWriter, keyLen, valueLen int) {
 	var ready, start, done sync.WaitGroup
-	ready.Add(numRoutes * writersPerRoute)
-	done.Add(numRoutes * writersPerRoute)
+	total := int(numRoutes) * writersPerRoute
+	ready.Add(total)
+	done.Add(total)
 	start.Add(1)
-	for r := 1; r <= numRoutes; r++ {
-		routeID := uint64(r) //nolint:gosec // r bounded by numRoutes
+	for r := uint64(1); r <= numRoutes; r++ {
+		routeID := r
 		for w := 0; w < writersPerRoute; w++ {
 			go func() {
 				defer done.Done()