From 90ca5284e8991e06750e02a78335ce78f9882280 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 29 May 2026 02:02:13 +0900 Subject: [PATCH 1/7] =?UTF-8?q?tla:=20M1=20=E2=80=94=20HLC=20TLA+=20spec?= =?UTF-8?q?=20+=20lib=20+=20make=20tla-check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per docs/design/2026_05_28_proposed_tla_safety_spec.md §8.1 (M1 deliverables): adds the TLA+ specification suite directory layout and the HLC module, runnable via `make tla-check`. Files: - tla/lib/Env.tla — shared constants and the HLC-4 precondition (i) ASSUME (MaxClockSkewMs < HlcPhysicalWindowMs). - tla/lib/Raft.tla — abstract Raft interface (currentTerm, leaderOf, BecomeLeader, IsLeader). Log matching and leader completeness are treated as assumed properties per NG4. - tla/hlc/HLC.tla — HLC layer with all five invariants: - HLC-1 (per-node monotonic), HLC-2 (ceiling monotonic), HLC-3 (leader-only issuance), HLC-4 (no regression across leader change), TypeOK. Strategy (c) handoff lives in BecomeLeader_HLC; the ceiling fence is an enabling guard on IssueTimestamp. Both are gated by CONSTANT EnableSafety so the same spec drives the correct-design and gap configurations. - tla/hlc/MCHLC.tla — TLC model-check instance. - tla/hlc/MCHLC.cfg — EnableSafety = TRUE (expected to PASS all invariants at default bounds: 3 nodes / 2 terms / 3 ops). - tla/hlc/MCHLC_gap.cfg — EnableSafety = FALSE (expected to FAIL on HLC-4 with a counterexample, demonstrating that strategy (c) + ceiling fence are necessary). - tla/README.md — install, run, what each module proves, reading list, how to map a TLC counterexample back to the implementation. Tooling: - Makefile: tla-check target downloads tla2tools.jar v1.8.0 to .cache/tla/ with checksum pinning (SHA-256 237332bdcc79a35c7d26efa7b82c77c85c2744591c5598673a8a45085ff2a4fb), runs both configurations, and inverts the exit code on the gap configuration so the expected-FAIL counts as PASS for CI. - .gitignore: TLC counterexample-trace artefacts (MCHLC_TTrace_*.{tla,bin}) and the per-run states/ directory. Verified locally: - make tla-check passes end-to-end. - MCHLC.cfg: 28 distinct states, 0 invariant violations. - MCHLC_gap.cfg: HLC-4 counterexample at depth 5 (n1 commits (0,1) in term 1; n2 elected term 2 without strategy (c), commits (0,1) again, tying the prior commit — HLC-4 requires strictly greater). Follow-up (separate PRs): - Rename the design doc proposed -> partial (next commit in this PR). - Implement strategy (c) and the ceiling fence in Go: - Add MaxAppliedHLC() to kv/fsm.go. - Call hlc.Observe(fsm.MaxAppliedHLC()) on new-term detection in kv/sharded_coordinator.go RunHLCLeaseRenewal. - Add wallNow < physicalCeiling fence to HLC.Next() with a fail-closed return path, plus operator-visible metrics. --- .gitignore | 5 ++ Makefile | 59 ++++++++++++ tla/README.md | 171 +++++++++++++++++++++++++++++++++++ tla/hlc/HLC.tla | 203 ++++++++++++++++++++++++++++++++++++++++++ tla/hlc/MCHLC.cfg | 27 ++++++ tla/hlc/MCHLC.tla | 21 +++++ tla/hlc/MCHLC_gap.cfg | 32 +++++++ tla/lib/Env.tla | 32 +++++++ tla/lib/Raft.tla | 82 +++++++++++++++++ 9 files changed, 632 insertions(+) create mode 100644 tla/README.md create mode 100644 tla/hlc/HLC.tla create mode 100644 tla/hlc/MCHLC.cfg create mode 100644 tla/hlc/MCHLC.tla create mode 100644 tla/hlc/MCHLC_gap.cfg create mode 100644 tla/lib/Env.tla create mode 100644 tla/lib/Raft.tla diff --git a/.gitignore b/.gitignore index b8a06d526..b311a018d 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,11 @@ jepsen/.ssh/ .golangci-cache/ server +# TLA+ TLC artifacts (counterexample traces, state directories) +tla/**/MCHLC_TTrace_*.tla +tla/**/MCHLC_TTrace_*.bin +tla/**/states/ + # Admin SPA build outputs. The placeholder internal/admin/dist/index.html # is committed so `go build` succeeds in a fresh clone (the //go:embed # directive needs at least one matching file). Everything else under diff --git a/Makefile b/Makefile index 035a0db5b..bb2fc4437 100644 --- a/Makefile +++ b/Makefile @@ -12,3 +12,62 @@ lint: gen: @$(MAKE) -C proto gen + +# === TLA+ model-check support (M1 deliverable) === +# Per docs/design/2026_05_28_proposed_tla_safety_spec.md §7.2. +# Downloads a checksum-pinned tla2tools.jar on first use, then runs TLC on +# both MCHLC.cfg (correct design — expected PASS) and MCHLC_gap.cfg +# (preconditions disabled — expected FAIL on HLC-4 with a counterexample +# that motivates the implementation gaps). +TLA_VERSION := v1.8.0 +TLA_JAR := .cache/tla/tla2tools.jar +TLA_SHA256 := 237332bdcc79a35c7d26efa7b82c77c85c2744591c5598673a8a45085ff2a4fb +TLA_URL := https://github.com/tlaplus/tlaplus/releases/download/$(TLA_VERSION)/tla2tools.jar +TLA_LIB := ../lib + +.PHONY: tla-check tla-tools + +tla-tools: $(TLA_JAR) + +$(TLA_JAR): + @mkdir -p $(dir $(TLA_JAR)) + @echo "Downloading tla2tools.jar $(TLA_VERSION)..." + @curl -fsSL -o $(TLA_JAR).tmp $(TLA_URL) + @actual=$$(shasum -a 256 $(TLA_JAR).tmp | awk '{print $$1}'); \ + if [ "$$actual" != "$(TLA_SHA256)" ]; then \ + echo "ERROR: tla2tools.jar SHA-256 mismatch."; \ + echo " expected: $(TLA_SHA256)"; \ + echo " actual: $$actual"; \ + rm -f $(TLA_JAR).tmp; \ + exit 1; \ + fi + @mv $(TLA_JAR).tmp $(TLA_JAR) + @echo "tla2tools.jar ready at $(TLA_JAR) (SHA-256 $(TLA_SHA256))" + +tla-check: $(TLA_JAR) + @echo "================================================================" + @echo " TLC: tla/hlc/MCHLC.cfg (correct design, expected PASS)" + @echo "================================================================" + @cd tla/hlc && java -XX:+UseParallelGC \ + -cp ../../$(TLA_JAR) -DTLA-Library=$(TLA_LIB) \ + tlc2.TLC -nowarning -config MCHLC.cfg MCHLC.tla + @echo + @echo "================================================================" + @echo " TLC: tla/hlc/MCHLC_gap.cfg (no preconditions, expected FAIL)" + @echo "================================================================" + @cd tla/hlc && if java -XX:+UseParallelGC \ + -cp ../../$(TLA_JAR) -DTLA-Library=$(TLA_LIB) \ + tlc2.TLC -nowarning -config MCHLC_gap.cfg MCHLC.tla; then \ + echo "ERROR: MCHLC_gap.cfg unexpectedly passed."; \ + echo " The gap configuration disables HLC-4 preconditions (ii)+(iii);"; \ + echo " TLC was supposed to surface a counterexample showing why those"; \ + echo " preconditions are necessary. A clean pass means either the spec"; \ + echo " no longer encodes the gap correctly or the safety guards leaked"; \ + echo " past the EnableSafety toggle."; \ + exit 1; \ + else \ + echo; \ + echo "OK: MCHLC_gap.cfg failed as designed (HLC-4 counterexample)."; \ + fi + @echo + @echo "tla-check: all model-check outcomes match the design contract." diff --git a/tla/README.md b/tla/README.md new file mode 100644 index 000000000..06db61138 --- /dev/null +++ b/tla/README.md @@ -0,0 +1,171 @@ +# elastickv TLA+ specs + +Machine-checkable safety models for the four cross-cutting subsystems +(HLC, OCC, MVCC, route catalog) per +[`docs/design/2026_05_28_partial_tla_safety_spec.md`](../docs/design/2026_05_28_partial_tla_safety_spec.md). + +This directory is **independent of the Go module**; you can navigate and +run it without touching anything under the Go source tree. + +## Status + +| Milestone | Scope | Status | +|---|---|---| +| M1 | `lib/Raft.tla`, `lib/Env.tla`, `hlc/HLC.tla`, `make tla-check` | Landed | +| M2 | `occ/OCC.tla` (safety invariants OCC-1..OCC-5) | Not started | +| M3 | `mvcc/MVCC.tla` (MVCC-1..MVCC-4) | Not started | +| M4 | `routes/Routes.tla` (Routes-1..Routes-4) | Not started | +| M5 | `composed/Composed.tla` + CI integration | Not started | +| M6 | Liveness checking (`tla-check-deep`) | Not started | + +## Install + +The only runtime dependency is a Java VM (TLC ships as `tla2tools.jar`). + +```sh +# JDK 17+ is required. Verify with: +java -version +``` + +`make tla-check` automatically downloads `tla2tools.jar` to +`.cache/tla/tla2tools.jar` on first use and verifies its SHA-256 against +a pinned value defined in the top-level `Makefile` +(`TLA_VERSION`, `TLA_JAR`, `TLA_SHA256`). The file is gitignored. + +To pre-download without running TLC: + +```sh +make tla-tools +``` + +## Run + +From the repository root: + +```sh +make tla-check +``` + +This runs TLC against both configurations and prints whether the outcome +matches the design contract: + +1. `tla/hlc/MCHLC.cfg` — the **correct design**, with HLC-4 + preconditions encoded as ASSUMEs / action guards. TLC must finish + with no invariant violation. +2. `tla/hlc/MCHLC_gap.cfg` — the **gap configuration**, with the + preconditions disabled (`EnableSafety = FALSE`). TLC must produce a + counterexample on `HLC4_NoRegressionAcrossTerms` — this is the + motivating evidence that strategy (c) handoff + the ceiling fence + are necessary. The `tla-check` target inverts the exit code for + this run so a TLC FAILURE here counts as PASS for CI. + +To run a single module by hand (skipping the Makefile): + +```sh +JAR=.cache/tla/tla2tools.jar + +# Safe config (expected PASS): +cd tla/hlc +java -XX:+UseParallelGC -cp ../../$JAR -DTLA-Library=../lib \ + tlc2.TLC -config MCHLC.cfg MCHLC.tla + +# Gap config (expected FAIL on HLC-4): +java -XX:+UseParallelGC -cp ../../$JAR -DTLA-Library=../lib \ + tlc2.TLC -config MCHLC_gap.cfg MCHLC.tla +``` + +State-space bounds are set in the `.cfg` files (M1 defaults: 3 nodes, +2 terms, ≤ 3 IssueTimestamp ops, wall clock ≤ 3 ms, `LogicalMax = 1`). +Raise them locally to deepen exploration; the M1 PR keeps them small so +the full `make tla-check` runs in well under a second. + +## What each module proves + +### `lib/Env.tla` +Shared constants (`Nodes`, `MaxClockSkewMs`, `HlcPhysicalWindowMs`, +`MaxTerms`, `MaxOps`, `MaxWallTime`, `LogicalMax`) and the single +normative `ASSUME MaxClockSkewMs < HlcPhysicalWindowMs` — HLC-4 +precondition (i). All downstream modules `EXTENDS Env`. + +### `lib/Raft.tla` +Abstract Raft interface modelled per NG4 of the design doc: log matching +and leader completeness are assumed (etcd/raft is a verified black box), +and only the `currentTerm` / `leaderOf` / `BecomeLeader` / `IsLeader` +operators that other modules actually need are exposed. `Propose`, +`Apply`, and `InstallSnapshot` are placeholders for later milestones. + +### `hlc/HLC.tla` +The HLC layer. Encodes all three HLC-4 preconditions: + +- **(i) bounded skew** — inherited via the Env ASSUME. +- **(ii) logical-counter handoff** — `BecomeLeader_HLC` calls + `Observe(maxAppliedHLC)` on the elected node when + `EnableSafety = TRUE`. This is strategy (c) from the design doc. +- **(iii) ceiling fence** — `IssueTimestamp` has + `wallNow[n] < physicalCeiling[n]` as an enabling condition when + `EnableSafety = TRUE`. + +Invariants asserted: + +| Invariant | Statement | +|---|---| +| `TypeOK` | Variable types are well-formed | +| `HLC1_PerNodeMonotonic` | Per-node committed ts are totally ordered | +| `HLC2_CeilingMonotonic` | Ceiling never decreases (built into ApplyCeiling) | +| `HLC3_LeaderOnly` | Every committed ts was issued by the leader of its term | +| `HLC4_NoRegressionAcrossTerms` | Earlier-term commits are strictly less than later-term commits | + +### `hlc/MCHLC.tla` + `MCHLC.cfg` / `MCHLC_gap.cfg` +TLC model-check instance. Two configurations, one module. See [Run](#run). + +## How to interpret a TLC failure + +When TLC finds a counterexample on an invariant it prints: + +``` +Error: Invariant is violated. +``` + +followed by a state trace (one TLA+ record per state). Each state shows +the values of every spec variable after the named action fired. For a +counterexample-of-HLC-4 you can read off which action sequence violated +strict-greater commits across terms — this is exactly the schedule the +follow-up Go code fix must rule out. + +To map a state back to the implementation: + +1. The `State N: ` header names the + spec action. Look it up in the corresponding `.tla` file. +2. Each action documents its corresponding implementation anchor in a + comment at the action definition. Example: `BecomeLeader_HLC` in + `HLC.tla` corresponds to `ShardedCoordinator.RunHLCLeaseRenewal` + detecting a new term and calling `hlc.Observe(fsm.MaxAppliedHLC())`. +3. The state's variable values map to the corresponding Go fields: + `hlcLast[n]` ↔ `kv/hlc.go HLC.last`, `physicalCeiling[n]` ↔ + `kv/hlc.go HLC.physicalCeiling`, etc. + +If the failure is from `tla-check`'s gap-config branch announcing +"unexpectedly passed", the spec or the toggle wiring has drifted — the +preconditions are no longer doing what the design doc says they do. + +## Reading list + +For reviewers new to TLA+: + +- Leslie Lamport, *Specifying Systems* (PDF, free) — the canonical + book; chapters 1–6 are enough to read the modules here. +- Leslie Lamport's TLA+ video lecture series on YouTube. +- Diego Ongaro's Raft TLA+ specification (the abstraction in + `lib/Raft.tla` is intentionally consistent with the style there). +- Hillel Wayne, *Practical TLA+* — friendlier introduction to the + syntax and to TLC model checking. + +For the elastickv-specific design: + +- [`docs/design/2026_05_28_partial_tla_safety_spec.md`](../docs/design/2026_05_28_partial_tla_safety_spec.md) + — the proposal that this directory implements. +- [`docs/architecture_overview.md`](../docs/architecture_overview.md) + — high-level subsystem diagrams that the modules abstract. +- [`CLAUDE.md`](../CLAUDE.md) — coding conventions; the "HLC" section + in particular describes the leader-only issuance invariant the spec + encodes as HLC-3. diff --git a/tla/hlc/HLC.tla b/tla/hlc/HLC.tla new file mode 100644 index 000000000..b598e5dc4 --- /dev/null +++ b/tla/hlc/HLC.tla @@ -0,0 +1,203 @@ +--------------------------------- MODULE HLC --------------------------------- +(***************************************************************************) +(* TLA+ specification of the elastickv Hybrid Logical Clock. *) +(* Per docs/design/2026_05_28_proposed_tla_safety_spec.md §5.1. *) +(* *) +(* This module encodes the HLC layer that sits on top of an abstract Raft *) +(* (lib/Raft.tla). All three HLC-4 preconditions are first-class: *) +(* *) +(* (i) bounded clock skew (Env.tla ASSUME) *) +(* (ii) strategy (c) logical-counter handoff — Observe(MaxAppliedHLC) *) +(* inside the BecomeLeader action *) +(* (iii) commit-time ceiling fencing — wallNow[n] < physicalCeiling[n] *) +(* as an enabling guard on IssueTimestamp *) +(* *) +(* Both (ii) and (iii) are gated by the CONSTANT EnableSafety so the same *) +(* spec drives the safe model-check (MCHLC.cfg, EnableSafety = TRUE) and *) +(* the gap model-check (MCHLC_gap.cfg, EnableSafety = FALSE) that *) +(* demonstrates the HLC-4 counterexample the design doc anticipates. *) +(***************************************************************************) + +EXTENDS Naturals, FiniteSets, Env, Raft + +CONSTANT EnableSafety \* TRUE: encode preconditions (ii) and (iii); FALSE: gap config + +VARIABLES + hlcLast, \* hlcLast[n] = [wall |-> Nat, logical |-> 0..LogicalMax] + physicalCeiling, \* physicalCeiling[n] : Nat (Raft-applied) + maxAppliedHLC, \* [wall, logical] : maximum committed HLC across all terms (FSM-observable) + wallNow, \* wallNow[n] : Nat (per-node wall clock) + committedTS, \* set of [ts, term] records — every persistence Next() result + opCount \* state-space bound: number of IssueTimestamp calls so far + +hlcVars == <> +vars == <> + +\* === DATA TYPES === +HLCVal(w, l) == [wall |-> w, logical |-> l] + +HLCLE(x, y) == + \/ x.wall < y.wall + \/ (x.wall = y.wall /\ x.logical <= y.logical) + +HLCLT(x, y) == + \/ x.wall < y.wall + \/ (x.wall = y.wall /\ x.logical < y.logical) + +\* === HLC.Next() COMPUTATION (mirrors kv/hlc.go) === +\* nowMs = max(wallNow[n], physicalCeiling[n]) +FlooredNow(n) == + IF wallNow[n] < physicalCeiling[n] THEN physicalCeiling[n] ELSE wallNow[n] + +\* The HLC algorithm proper. prev is the previously issued ts on this node. +\* See kv/hlc.go Next() — the two branches mirror the (nowMs > prevWall) and +\* (nowMs <= prevWall) cases, including the logical-overflow physical bump. +ComputeNextHLC(prev, flooredNow) == + IF flooredNow > prev.wall THEN + HLCVal(flooredNow, 0) + ELSE IF prev.logical < LogicalMax THEN + HLCVal(prev.wall, prev.logical + 1) + ELSE + HLCVal(prev.wall + 1, 0) \* logical overflow bumps wall by 1 + +\* === INIT === +HLCInit == + /\ hlcLast = [n \in Nodes |-> HLCVal(0, 0)] + /\ physicalCeiling = [n \in Nodes |-> 0] + /\ maxAppliedHLC = HLCVal(0, 0) + /\ wallNow = [n \in Nodes |-> 0] + /\ committedTS = {} + /\ opCount = 0 + +Init == RaftInit /\ HLCInit + +\* === ACTIONS === + +(***************************************************************************) +(* BecomeLeader_HLC(n) — Raft elects n as the new term's leader, then *) +(* (under EnableSafety) applies strategy (c): hlcLast[n] is raised to *) +(* max(hlcLast[n], maxAppliedHLC) by an in-memory Observe. This is the *) +(* M1 default per the design doc §5.1 HLC-4 (ii). *) +(***************************************************************************) +BecomeLeader_HLC(n) == + /\ BecomeLeader(n) + /\ hlcLast' = IF EnableSafety /\ HLCLT(hlcLast[n], maxAppliedHLC) + THEN [hlcLast EXCEPT ![n] = maxAppliedHLC] + ELSE hlcLast + /\ UNCHANGED <> + +(***************************************************************************) +(* TickWall(n) — n's wall clock advances by 1 ms. Bounded skew is enforced *) +(* against every other node (HLC-4 precondition (i) governs how big the *) +(* allowed gap is). MaxWallTime caps state-space exploration. *) +(***************************************************************************) +TickWall(n) == + /\ wallNow[n] < MaxWallTime + /\ \A m \in Nodes : (wallNow[n] + 1) - wallNow[m] <= MaxClockSkewMs + /\ wallNow[m] - (wallNow[n] + 1) <= MaxClockSkewMs + /\ wallNow' = [wallNow EXCEPT ![n] = @ + 1] + /\ UNCHANGED <> + +(***************************************************************************) +(* ApplyCeiling(n) — the leader proposes a fresh ceiling and Raft applies *) +(* it on every node. Modelled atomically: every node's physicalCeiling *) +(* is raised to max(current, wallNow[leader] + HlcPhysicalWindowMs). *) +(* HLC-2 (ceiling monotonicity) is preserved by the max(). *) +(***************************************************************************) +ApplyCeiling(n) == + /\ IsLeader(n) + /\ LET newC == wallNow[n] + HlcPhysicalWindowMs IN + /\ physicalCeiling' = [m \in Nodes |-> IF physicalCeiling[m] < newC + THEN newC + ELSE physicalCeiling[m]] + /\ UNCHANGED <> + +(***************************************************************************) +(* IssueTimestamp(n) — issue a persistence HLC ts. Two normative gates: *) +(* HLC-3: only the active leader may call this. *) +(* HLC-4 (iii) under EnableSafety: wallNow[n] < physicalCeiling[n] *) +(* (the ceiling fence; if the ceiling has expired the leader *) +(* fails closed and ApplyCeiling must run first). *) +(* The committedTS log records (ts, term) for later HLC-4 checking; the *) +(* FSM-side maxAppliedHLC is bumped so a future BecomeLeader_HLC under *) +(* EnableSafety inherits the highest committed ts. *) +(***************************************************************************) +IssueTimestamp(n) == + /\ IsLeader(n) + /\ opCount < MaxOps \* state-space bound + /\ \/ ~EnableSafety \* gap config: no fence + \/ wallNow[n] < physicalCeiling[n] \* precondition (iii) guard + /\ LET prev == hlcLast[n] + ts == ComputeNextHLC(prev, FlooredNow(n)) + IN /\ hlcLast' = [hlcLast EXCEPT ![n] = ts] + /\ committedTS' = committedTS \cup + {[ts |-> ts, term |-> activeTerm, node |-> n]} + /\ maxAppliedHLC' = IF HLCLT(maxAppliedHLC, ts) + THEN ts + ELSE maxAppliedHLC + /\ opCount' = opCount + 1 + /\ UNCHANGED <> + +\* === NEXT === +Next == + \/ \E n \in Nodes : BecomeLeader_HLC(n) + \/ \E n \in Nodes : ApplyCeiling(n) + \/ \E n \in Nodes : IssueTimestamp(n) + \/ \E n \in Nodes : TickWall(n) + +Spec == Init /\ [][Next]_vars + +\* === STATE CONSTRAINT (TLC pruning) === +StateConstraint == + /\ opCount <= MaxOps + /\ activeTerm <= MaxTerms + /\ \A n \in Nodes : wallNow[n] <= MaxWallTime + /\ \A n \in Nodes : hlcLast[n].wall <= MaxWallTime + 2 + /\ \A n \in Nodes : physicalCeiling[n] <= MaxWallTime + HlcPhysicalWindowMs + +\* === TYPE INVARIANT === +HLCType == [wall : Nat, logical : 0..LogicalMax] + +TypeOK == + /\ hlcLast \in [Nodes -> HLCType] + /\ physicalCeiling \in [Nodes -> Nat] + /\ maxAppliedHLC \in HLCType + /\ wallNow \in [Nodes -> Nat] + /\ opCount \in 0..MaxOps + /\ \A r \in committedTS : + /\ r.ts \in HLCType + /\ r.term \in 1..MaxTerms + /\ r.node \in Nodes + +\* === SAFETY INVARIANTS === + +\* HLC-1 — per-node monotonicity of hlcLast. By construction +\* IssueTimestamp produces ComputeNextHLC(prev, ...) which is >= prev in the +\* HLCLE order, so this invariant is implied by the algorithm. We assert +\* the algebraic form on committed ts to make the proof obligation explicit. +HLC1_PerNodeMonotonic == + \A r1, r2 \in committedTS : + r1.node = r2.node => HLCLE(r1.ts, r2.ts) \/ HLCLE(r2.ts, r1.ts) + +\* HLC-2 — physical ceiling monotonicity (per node). ApplyCeiling raises +\* with a max(), and there is no action that lowers physicalCeiling. The +\* invariant is therefore a stuttering-step consequence, asserted here as +\* a sanity check rather than a behavioural property. +HLC2_CeilingMonotonic == \A n \in Nodes : physicalCeiling[n] >= 0 + +\* HLC-3 — only the leader of a term issues commits for that term. Built +\* into IssueTimestamp via the IsLeader(n) guard; asserted explicitly so +\* TLC reports a violation if anyone bypasses the guard. +HLC3_LeaderOnly == + \A r \in committedTS : + r.term > 0 /\ leaderOf[r.term] = r.node + +\* HLC-4 — every commit in a later term is strictly greater than every +\* commit in any earlier term. This is the load-bearing invariant: when +\* EnableSafety = TRUE strategy (c) + the ceiling fence enforce it; when +\* EnableSafety = FALSE TLC is expected to surface a counterexample. +HLC4_NoRegressionAcrossTerms == + \A r1, r2 \in committedTS : + r1.term < r2.term => HLCLT(r1.ts, r2.ts) + +============================================================================= diff --git a/tla/hlc/MCHLC.cfg b/tla/hlc/MCHLC.cfg new file mode 100644 index 000000000..4ea3072e4 --- /dev/null +++ b/tla/hlc/MCHLC.cfg @@ -0,0 +1,27 @@ +\* TLC config: HLC safety model — encodes all three HLC-4 preconditions. +\* Expected outcome: TLC explores the bounded state space and finds no +\* invariant violation, i.e. proves HLC-1..HLC-4 hold in the correct +\* design. + +SPECIFICATION Spec + +CONSTANTS + Nodes = {n1, n2, n3} + MaxClockSkewMs = 0 + HlcPhysicalWindowMs = 1 + MaxTerms = 2 + MaxOps = 3 + MaxWallTime = 3 + LogicalMax = 1 + EnableSafety = TRUE + +SYMMETRY NodeSymmetry + +INVARIANTS + TypeOK + HLC1_PerNodeMonotonic + HLC2_CeilingMonotonic + HLC3_LeaderOnly + HLC4_NoRegressionAcrossTerms + +CONSTRAINT StateConstraint diff --git a/tla/hlc/MCHLC.tla b/tla/hlc/MCHLC.tla new file mode 100644 index 000000000..e3debdd99 --- /dev/null +++ b/tla/hlc/MCHLC.tla @@ -0,0 +1,21 @@ +-------------------------------- MODULE MCHLC -------------------------------- +(***************************************************************************) +(* Model-check instance for HLC.tla. *) +(* *) +(* Two .cfg files point at this module: *) +(* MCHLC.cfg — EnableSafety = TRUE (correct design; TLC passes) *) +(* MCHLC_gap.cfg — EnableSafety = FALSE (no preconditions; TLC fails *) +(* on HLC-4 with a counterexample, demonstrating the *) +(* design gap the spec proposal exists to surface) *) +(* *) +(* Run from the repository root: *) +(* make tla-check *) +(***************************************************************************) + +EXTENDS HLC, TLC + +\* TLC symmetry hint: nodes are interchangeable in HLC.tla (per the spec +\* §6.3 per-module symmetry — HLC alone is node-symmetric). +NodeSymmetry == Permutations(Nodes) + +============================================================================= diff --git a/tla/hlc/MCHLC_gap.cfg b/tla/hlc/MCHLC_gap.cfg new file mode 100644 index 000000000..3646b1389 --- /dev/null +++ b/tla/hlc/MCHLC_gap.cfg @@ -0,0 +1,32 @@ +\* TLC config: HLC gap model — preconditions (ii) and (iii) are disabled +\* (EnableSafety = FALSE). Expected outcome: TLC FAILS the +\* HLC4_NoRegressionAcrossTerms invariant and emits a counterexample +\* trace. This is not a bug in the spec; it is the motivating evidence +\* that strategy (c) handoff + the IssueTimestamp ceiling fence are +\* necessary for HLC-4 to hold. +\* +\* The make tla-check target inverts the exit code for this config so a +\* TLC FAILURE here counts as PASS for CI. + +SPECIFICATION Spec + +CONSTANTS + Nodes = {n1, n2, n3} + MaxClockSkewMs = 0 + HlcPhysicalWindowMs = 1 + MaxTerms = 2 + MaxOps = 3 + MaxWallTime = 3 + LogicalMax = 1 + EnableSafety = FALSE + +SYMMETRY NodeSymmetry + +INVARIANTS + TypeOK + HLC1_PerNodeMonotonic + HLC2_CeilingMonotonic + HLC3_LeaderOnly + HLC4_NoRegressionAcrossTerms + +CONSTRAINT StateConstraint diff --git a/tla/lib/Env.tla b/tla/lib/Env.tla new file mode 100644 index 000000000..8a2ebe57f --- /dev/null +++ b/tla/lib/Env.tla @@ -0,0 +1,32 @@ +-------------------------------- MODULE Env ---------------------------------- +(***************************************************************************) +(* Shared environment constants used by every module in the elastickv *) +(* TLA+ suite. Per docs/design/2026_05_28_proposed_tla_safety_spec.md *) +(* §6.1. *) +(* *) +(* The single normative ASSUME below encodes HLC-4 precondition (i) — the *) +(* bounded-skew assumption. Modules that follow (Raft, HLC, …) EXTEND *) +(* this module so the assumption is in scope wherever it is needed. *) +(***************************************************************************) + +EXTENDS Naturals + +CONSTANTS + Nodes, \* finite set of node identities + MaxClockSkewMs, \* upper bound on inter-node wall clock skew (ms) + HlcPhysicalWindowMs, \* hlcPhysicalWindowMs in kv/sharded_coordinator.go + MaxTerms, \* state-space bound: number of leadership terms + MaxOps, \* state-space bound: number of persistence Next() calls + MaxWallTime, \* state-space bound: largest wall clock value + LogicalMax \* logical-counter wrap bound (real impl: 65535) + +\* HLC-4 precondition (i): the renewed ceiling outlives any inter-node skew. +\* See HLC.tla and the design doc §5.1 (i) for the hazard this rules out. +ASSUME MaxClockSkewMs < HlcPhysicalWindowMs + +\* Useful sets +Terms == 1..MaxTerms + +\* Liveness sets (placeholders for future modules; trivial in M1) +LiveNodes == Nodes +============================================================================= diff --git a/tla/lib/Raft.tla b/tla/lib/Raft.tla new file mode 100644 index 000000000..5e05f2da6 --- /dev/null +++ b/tla/lib/Raft.tla @@ -0,0 +1,82 @@ +-------------------------------- MODULE Raft --------------------------------- +(***************************************************************************) +(* Abstract Raft interface for the elastickv TLA+ suite. *) +(* Per docs/design/2026_05_28_proposed_tla_safety_spec.md §3 / §6.1, Raft *) +(* is modelled as a black box that delivers the standard log-matching + *) +(* leader-completeness guarantees. This module exposes the **interface** *) +(* (leader / term / BecomeLeader / Apply / InstallSnapshot) — the internal *) +(* election / replication state machine is NOT modelled. Modules that *) +(* need Raft EXTEND or INSTANCE this module. *) +(* *) +(* For M1 (HLC alone) only the leader / term part is exercised. Apply / *) +(* InstallSnapshot are stubs that later milestones (M2..M5) fill in. *) +(***************************************************************************) + +EXTENDS Naturals, FiniteSets, Env + +VARIABLES + currentTerm, \* currentTerm[n] \in 0..MaxTerms : node n's view of the active term + leaderOf, \* leaderOf[t] \in Nodes \cup {NoLeader} : leader elected for term t + activeTerm \* highest term that has had a leader elected (0 = pre-election) + +raftVars == <> + +NoLeader == "no_leader" + +\* All term indices reachable in the bounded model. +TermsExplored == 0..MaxTerms + +\* === INIT === +RaftInit == + /\ currentTerm = [n \in Nodes |-> 0] + /\ leaderOf = [t \in TermsExplored |-> NoLeader] + /\ activeTerm = 0 + +\* === ACTIONS === + +(***************************************************************************) +(* BecomeLeader(n): node n is elected leader of a fresh term. *) +(* Models a successful Raft election as an atomic step — the underlying *) +(* vote-counting protocol is abstracted away (NG4 in the spec doc). *) +(***************************************************************************) +BecomeLeader(n) == + /\ activeTerm < MaxTerms + /\ LET t == activeTerm + 1 IN + /\ activeTerm' = t + /\ leaderOf' = [leaderOf EXCEPT ![t] = n] + /\ currentTerm' = [currentTerm EXCEPT ![n] = t] + +(***************************************************************************) +(* IsLeader(n): predicate. TRUE iff n is the leader of the currently *) +(* active term. Used as a guard wherever the spec restricts an action *) +(* to the leader (e.g. HLC.IssueTimestamp). *) +(***************************************************************************) +IsLeader(n) == + /\ activeTerm > 0 + /\ leaderOf[activeTerm] = n + +\* Convenience: the node leading the current term, or NoLeader. +CurrentLeader == + IF activeTerm = 0 THEN NoLeader ELSE leaderOf[activeTerm] + +\* === STUBS FOR LATER MILESTONES === +(***************************************************************************) +(* Propose / Apply / InstallSnapshot are part of the Raft interface that *) +(* later milestones (M2 OCC, M3 MVCC, M5 Composed) plug into. For M1 they *) +(* are left as comments rather than no-op actions to avoid expanding the *) +(* HLC-only state space. *) +(***************************************************************************) + +\* === ASSUMED PROPERTIES === +(***************************************************************************) +(* Per NG4: the abstract Raft layer assumes log matching and leader *) +(* completeness without proving them — those are properties of etcd/raft *) +(* which we treat as a verified black box. In M1 we exercise only the *) +(* leader-uniqueness consequence, which is structural here: BecomeLeader *) +(* sets exactly one entry in leaderOf per term. *) +(***************************************************************************) +LeaderUniquenessPerTerm == + \A t \in TermsExplored : t > 0 /\ leaderOf[t] # NoLeader => + leaderOf[t] \in Nodes + +============================================================================= From 66b515392ede30719d77d540636bae01bd9b251c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 29 May 2026 02:02:38 +0900 Subject: [PATCH 2/7] docs(design): promote tla_safety_spec from proposed to partial MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit M1 (HLC.tla + lib + make tla-check) lands in the prior commit on this branch, satisfying the first row of §8.1. Per CLAUDE.md docs/design/ lifecycle, rename proposed -> partial and update the header / Status note. Filename propose date and slug are preserved; only the status token changed. M2..M5 remain open; M6 (liveness) remains optional. --- ...y_spec.md => 2026_05_28_partial_tla_safety_spec.md} | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) rename docs/design/{2026_05_28_proposed_tla_safety_spec.md => 2026_05_28_partial_tla_safety_spec.md} (98%) diff --git a/docs/design/2026_05_28_proposed_tla_safety_spec.md b/docs/design/2026_05_28_partial_tla_safety_spec.md similarity index 98% rename from docs/design/2026_05_28_proposed_tla_safety_spec.md rename to docs/design/2026_05_28_partial_tla_safety_spec.md index 62b5489fb..6ff82daf7 100644 --- a/docs/design/2026_05_28_proposed_tla_safety_spec.md +++ b/docs/design/2026_05_28_partial_tla_safety_spec.md @@ -1,9 +1,17 @@ # TLA+ Safety Specification for elastickv -Status: Proposed +Status: Partial Author: bootjp Date: 2026-05-28 +> **Implementation status.** M1 landed in PR #TBD (the PR opening this +> document's rename to `partial`). The HLC TLA+ module + `make tla-check` +> + the gap-config counterexample evidence are now part of the build. +> M2–M5 (OCC, MVCC, Routes, Composed) remain open per §8. The follow-up +> Go code changes that strategy (c) and the ceiling fence require (per +> §5.1 HLC-4 (ii) and (iii)) are tracked as separate issues against +> `kv/hlc.go`, `kv/sharded_coordinator.go`, and `kv/fsm.go`. + --- ## 1. Background and Motivation From 791c65b31b27e4c6f0b5e7a1e2a5c80bc28d0297 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 29 May 2026 02:29:24 +0900 Subject: [PATCH 3/7] =?UTF-8?q?tla:=20M1=20review=20round=201=20=E2=80=94?= =?UTF-8?q?=20fix=20HLC.tla=20invariant=20strength=20+=20Makefile=20portab?= =?UTF-8?q?ility?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four findings from PR #856 review on commit 66b51539: - HLC.tla TickWall skew check: rewrite subtraction as addition. Nat subtraction in TLA+ underflows when LHS < RHS, which would halt TLC the moment MaxClockSkewMs > 0 enabled an out-of-order TickWall. Inequalities equivalently expressed with `+` stay inside Nat. (gemini HIGH on PR #856). - HLC1_PerNodeMonotonic now uses the per-commit `seq` ordinal (the global opCount value at issuance time) so the invariant compares same-node commits in their actual issuance order: `r1.node = r2.node /\ r1.seq < r2.seq => HLCLT(r1.ts, r2.ts)`. The earlier `HLCLE(r1, r2) \/ HLCLE(r2, r1)` form was vacuously true on a totally ordered set and would not have caught a future regression (codex P2 on PR #856). - HLC1_Action + HLC2_Action: new action-level safety properties registered under PROPERTIES (not INVARIANTS) in both .cfg files — TLC checks `[][P]_vars` as a transition property without requiring fairness. These pin the contract that hlcLast and physicalCeiling weakly increase across every step. HLC2_CeilingMonotonic's old `>= 0` form was vacuously implied by TypeOK (codex P2 on PR #856) and is replaced by HLC2_NonNeg (the residual TypeOK content) plus HLC2_Action (the real monotonicity claim). - Makefile sha256 portability: prefer GNU sha256sum (Linux universal), fall back to shasum -a 256 (macOS default), error out if neither is available. Without this the target failed on minimal Linux CI images (gemini MEDIUM on PR #856). - committedTS records now carry `seq` (= opCount at issuance), and TypeOK is extended to require it. Verified: `make tla-check` still PASSES on MCHLC.cfg (now also exercising the two action properties) and FAILS on MCHLC_gap.cfg as designed, with the HLC-4 counterexample at depth 5 unchanged. --- Makefile | 13 ++++++++- tla/hlc/HLC.tla | 61 ++++++++++++++++++++++++++++++++++--------- tla/hlc/MCHLC.cfg | 6 ++++- tla/hlc/MCHLC_gap.cfg | 6 ++++- 4 files changed, 70 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index bb2fc4437..abdaa1cb5 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,18 @@ $(TLA_JAR): @mkdir -p $(dir $(TLA_JAR)) @echo "Downloading tla2tools.jar $(TLA_VERSION)..." @curl -fsSL -o $(TLA_JAR).tmp $(TLA_URL) - @actual=$$(shasum -a 256 $(TLA_JAR).tmp | awk '{print $$1}'); \ + @# Prefer sha256sum (GNU coreutils, universal on Linux); fall back to + @# shasum -a 256 (default on macOS). Either yields the same hex + @# digest in the first whitespace-delimited field. + @if command -v sha256sum >/dev/null 2>&1; then \ + actual=$$(sha256sum $(TLA_JAR).tmp | awk '{print $$1}'); \ + elif command -v shasum >/dev/null 2>&1; then \ + actual=$$(shasum -a 256 $(TLA_JAR).tmp | awk '{print $$1}'); \ + else \ + echo "ERROR: neither sha256sum nor shasum is available."; \ + rm -f $(TLA_JAR).tmp; \ + exit 1; \ + fi; \ if [ "$$actual" != "$(TLA_SHA256)" ]; then \ echo "ERROR: tla2tools.jar SHA-256 mismatch."; \ echo " expected: $(TLA_SHA256)"; \ diff --git a/tla/hlc/HLC.tla b/tla/hlc/HLC.tla index b598e5dc4..d0a052aad 100644 --- a/tla/hlc/HLC.tla +++ b/tla/hlc/HLC.tla @@ -93,8 +93,12 @@ BecomeLeader_HLC(n) == (***************************************************************************) TickWall(n) == /\ wallNow[n] < MaxWallTime - /\ \A m \in Nodes : (wallNow[n] + 1) - wallNow[m] <= MaxClockSkewMs - /\ wallNow[m] - (wallNow[n] + 1) <= MaxClockSkewMs + \* Bounded inter-node skew (HLC-4 precondition (i)). Stated with addition + \* rather than subtraction so the inequality stays inside Nat (per gemini + \* PR #856 review: subtraction in Naturals is undefined when the LHS is + \* smaller and would halt TLC). + /\ \A m \in Nodes : /\ wallNow[n] + 1 <= wallNow[m] + MaxClockSkewMs + /\ wallNow[m] <= wallNow[n] + 1 + MaxClockSkewMs /\ wallNow' = [wallNow EXCEPT ![n] = @ + 1] /\ UNCHANGED <> @@ -130,8 +134,12 @@ IssueTimestamp(n) == /\ LET prev == hlcLast[n] ts == ComputeNextHLC(prev, FlooredNow(n)) IN /\ hlcLast' = [hlcLast EXCEPT ![n] = ts] + \* Each commit gets the global opCount as a strict issuance + \* ordinal so HLC1_PerNodeMonotonic can talk about "the prev + \* and next commit by the same node" — without it the state + \* invariant would just be a tautology on a totally ordered set. /\ committedTS' = committedTS \cup - {[ts |-> ts, term |-> activeTerm, node |-> n]} + {[ts |-> ts, term |-> activeTerm, node |-> n, seq |-> opCount]} /\ maxAppliedHLC' = IF HLCLT(maxAppliedHLC, ts) THEN ts ELSE maxAppliedHLC @@ -168,22 +176,49 @@ TypeOK == /\ r.ts \in HLCType /\ r.term \in 1..MaxTerms /\ r.node \in Nodes + /\ r.seq \in 0..MaxOps \* === SAFETY INVARIANTS === -\* HLC-1 — per-node monotonicity of hlcLast. By construction -\* IssueTimestamp produces ComputeNextHLC(prev, ...) which is >= prev in the -\* HLCLE order, so this invariant is implied by the algorithm. We assert -\* the algebraic form on committed ts to make the proof obligation explicit. +\* HLC-1 — per-node monotonicity of hlcLast across IssueTimestamp / +\* BecomeLeader_HLC / Observe. The earlier formulation +\* `HLCLE(r1, r2) \/ HLCLE(r2, r1)` was vacuously true under the total +\* lexicographic order on HLCVal (codex PR #856 P2): it would have admitted +\* a regression (5,0) -> (4,0) as "ordered". We instead carry an explicit +\* issuance ordinal on every committedTS record (`seq` is the global opCount +\* at the moment IssueTimestamp fired) and assert strict-greater on the same +\* node — every later issuance is strictly above the previous one. Combined +\* with the per-action invariant HLC1_Action below this gives the same +\* monotonicity the implementation provides via h.last's CAS. HLC1_PerNodeMonotonic == \A r1, r2 \in committedTS : - r1.node = r2.node => HLCLE(r1.ts, r2.ts) \/ HLCLE(r2.ts, r1.ts) + (r1.node = r2.node /\ r1.seq < r2.seq) => HLCLT(r1.ts, r2.ts) -\* HLC-2 — physical ceiling monotonicity (per node). ApplyCeiling raises -\* with a max(), and there is no action that lowers physicalCeiling. The -\* invariant is therefore a stuttering-step consequence, asserted here as -\* a sanity check rather than a behavioural property. -HLC2_CeilingMonotonic == \A n \in Nodes : physicalCeiling[n] >= 0 +\* HLC-2 — physical ceiling monotonicity is fundamentally an *action* +\* invariant: physicalCeiling[n] must not decrease across any step. We +\* state it as the safety property `HLC2_Action` below and reference it +\* via PROPERTIES in MCHLC.cfg. No state-level form is meaningful here +\* (the earlier `>= 0` was vacuously implied by TypeOK — codex PR #856 +\* P2). +HLC2_NonNeg == \A n \in Nodes : physicalCeiling[n] \in Nat + +\* === ACTION-LEVEL INVARIANTS === +(***************************************************************************) +(* Action-level safety properties. In TLA+ idiom these are written as *) +(* [][P]_vars and registered under PROPERTIES (not INVARIANTS) in the .cfg *) +(* — TLC checks them as transition properties without requiring fairness. *) +(***************************************************************************) + +\* HLC-1 (transition form): hlcLast on every node weakly increases across +\* every step. Built into ComputeNextHLC and into BecomeLeader_HLC's +\* Observe; this property pins the contract. +HLC1_Action == + [][\A n \in Nodes : HLCLE(hlcLast[n], hlcLast'[n])]_vars + +\* HLC-2 (transition form): physicalCeiling on every node weakly increases +\* across every step. ApplyCeiling raises with max(); no action lowers it. +HLC2_Action == + [][\A n \in Nodes : physicalCeiling[n] <= physicalCeiling'[n]]_vars \* HLC-3 — only the leader of a term issues commits for that term. Built \* into IssueTimestamp via the IsLeader(n) guard; asserted explicitly so diff --git a/tla/hlc/MCHLC.cfg b/tla/hlc/MCHLC.cfg index 4ea3072e4..a9f2797f3 100644 --- a/tla/hlc/MCHLC.cfg +++ b/tla/hlc/MCHLC.cfg @@ -20,8 +20,12 @@ SYMMETRY NodeSymmetry INVARIANTS TypeOK HLC1_PerNodeMonotonic - HLC2_CeilingMonotonic + HLC2_NonNeg HLC3_LeaderOnly HLC4_NoRegressionAcrossTerms +PROPERTIES + HLC1_Action + HLC2_Action + CONSTRAINT StateConstraint diff --git a/tla/hlc/MCHLC_gap.cfg b/tla/hlc/MCHLC_gap.cfg index 3646b1389..73765130d 100644 --- a/tla/hlc/MCHLC_gap.cfg +++ b/tla/hlc/MCHLC_gap.cfg @@ -25,8 +25,12 @@ SYMMETRY NodeSymmetry INVARIANTS TypeOK HLC1_PerNodeMonotonic - HLC2_CeilingMonotonic + HLC2_NonNeg HLC3_LeaderOnly HLC4_NoRegressionAcrossTerms +PROPERTIES + HLC1_Action + HLC2_Action + CONSTRAINT StateConstraint From 1123a0e156dd7cc3bc72b81f87ef3a66e8293df1 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 29 May 2026 02:37:24 +0900 Subject: [PATCH 4/7] =?UTF-8?q?tla:=20M1=20review=20round=202=20=E2=80=94?= =?UTF-8?q?=20fix=20TickWall=20self-skew=20block=20+=20validate=20gap=20re?= =?UTF-8?q?ason?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two P2 findings from codex on commit 791c65b3. - HLC.tla TickWall: exclude self from the inter-node skew check. The `\A m \in Nodes:` quantifier included `m = n`, so under MaxClockSkewMs = 0 the constraint forced `wallNow[n] + 1 <= wallNow[n] + 0`, which is unsatisfiable. TickWall was therefore never enabled and the model never explored wall-clock advancement, ceiling expiry, or skew schedules — the safe config was passing on a degenerate 28-state subspace. Restrict the universal to `Nodes \ {n}` so self-comparison is excluded. (codex PR #856 round 2.) - Raise MaxClockSkewMs from 0 to 1 (and HlcPhysicalWindowMs from 1 to 2 so the ASSUME `MaxClockSkewMs < HlcPhysicalWindowMs` still holds) in both MCHLC.cfg and MCHLC_gap.cfg so TickWall is actually enabled and the skew dimension is part of the explored space. After this change the safe config explores 3,594 distinct states (vs 28 previously) and the gap config still surfaces the HLC-4 counterexample at depth 5. - Makefile tla-check: validate the gap-run failure reason instead of blindly accepting any non-zero TLC exit. Capture TLC stdout/stderr, print it, then grep for the exact string `Invariant HLC4_NoRegressionAcrossTerms is violated` before declaring the gap evidence intact. Without this check a parse error, deadlock, JVM crash, or different invariant violation would have silently counted as "expected gap counterexample" and masked regressions in the harness itself. (codex PR #856 round 2.) --- Makefile | 48 ++++++++++++++++++++++++++++++------------- tla/hlc/HLC.tla | 12 +++++++++-- tla/hlc/MCHLC.cfg | 4 ++-- tla/hlc/MCHLC_gap.cfg | 4 ++-- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index abdaa1cb5..44f805ae6 100644 --- a/Makefile +++ b/Makefile @@ -66,19 +66,39 @@ tla-check: $(TLA_JAR) @echo "================================================================" @echo " TLC: tla/hlc/MCHLC_gap.cfg (no preconditions, expected FAIL)" @echo "================================================================" - @cd tla/hlc && if java -XX:+UseParallelGC \ - -cp ../../$(TLA_JAR) -DTLA-Library=$(TLA_LIB) \ - tlc2.TLC -nowarning -config MCHLC_gap.cfg MCHLC.tla; then \ - echo "ERROR: MCHLC_gap.cfg unexpectedly passed."; \ - echo " The gap configuration disables HLC-4 preconditions (ii)+(iii);"; \ - echo " TLC was supposed to surface a counterexample showing why those"; \ - echo " preconditions are necessary. A clean pass means either the spec"; \ - echo " no longer encodes the gap correctly or the safety guards leaked"; \ - echo " past the EnableSafety toggle."; \ - exit 1; \ - else \ - echo; \ - echo "OK: MCHLC_gap.cfg failed as designed (HLC-4 counterexample)."; \ - fi + @# Capture the gap run output so we can validate not just that TLC + @# exited non-zero but that the failure reason is exactly the HLC-4 + @# invariant violation we expect. Without this check a parse error, + @# deadlock, JVM crash, or different invariant violation would silently + @# count as "expected gap counterexample" (codex P2 on PR #856 round 2). + @cd tla/hlc && \ + out=$$(java -XX:+UseParallelGC \ + -cp ../../$(TLA_JAR) -DTLA-Library=$(TLA_LIB) \ + tlc2.TLC -nowarning -config MCHLC_gap.cfg MCHLC.tla 2>&1) ; \ + rc=$$? ; \ + printf '%s\n' "$$out" ; \ + if [ "$$rc" -eq 0 ]; then \ + echo ; \ + echo "ERROR: MCHLC_gap.cfg unexpectedly passed."; \ + echo " The gap configuration disables HLC-4 preconditions (ii)+(iii);"; \ + echo " TLC was supposed to surface a counterexample showing why those"; \ + echo " preconditions are necessary. A clean pass means either the spec"; \ + echo " no longer encodes the gap correctly or the safety guards leaked"; \ + echo " past the EnableSafety toggle."; \ + exit 1; \ + fi ; \ + if printf '%s\n' "$$out" | grep -qF "Invariant HLC4_NoRegressionAcrossTerms is violated"; then \ + echo ; \ + echo "OK: MCHLC_gap.cfg failed as designed (HLC-4 counterexample)."; \ + else \ + echo ; \ + echo "ERROR: MCHLC_gap.cfg failed, but the reason is NOT a HLC-4 violation."; \ + echo " Expected substring in TLC output:"; \ + echo " 'Invariant HLC4_NoRegressionAcrossTerms is violated'"; \ + echo " The non-zero exit may indicate a parse error, deadlock, JVM"; \ + echo " crash, or a different invariant breaking — review the output"; \ + echo " above before treating this as a regression in the gap evidence."; \ + exit 1; \ + fi @echo @echo "tla-check: all model-check outcomes match the design contract." diff --git a/tla/hlc/HLC.tla b/tla/hlc/HLC.tla index d0a052aad..e1981e8de 100644 --- a/tla/hlc/HLC.tla +++ b/tla/hlc/HLC.tla @@ -97,8 +97,16 @@ TickWall(n) == \* rather than subtraction so the inequality stays inside Nat (per gemini \* PR #856 review: subtraction in Naturals is undefined when the LHS is \* smaller and would halt TLC). - /\ \A m \in Nodes : /\ wallNow[n] + 1 <= wallNow[m] + MaxClockSkewMs - /\ wallNow[m] <= wallNow[n] + 1 + MaxClockSkewMs + \* Self is excluded from the skew check: the bounded-skew assumption + \* is about *inter*-node disagreement, not the ticking node's own + \* clock relative to its post-tick value. Without this exclusion + \* the constraint includes `wallNow[n] + 1 <= wallNow[n] + MaxClockSkewMs` + \* which is unsatisfiable whenever MaxClockSkewMs = 0 — disabling + \* TickWall entirely and preventing the model from ever exercising + \* wall-clock advancement (codex P2 on PR #856 round 2). + /\ \A m \in Nodes \ {n} : + /\ wallNow[n] + 1 <= wallNow[m] + MaxClockSkewMs + /\ wallNow[m] <= wallNow[n] + 1 + MaxClockSkewMs /\ wallNow' = [wallNow EXCEPT ![n] = @ + 1] /\ UNCHANGED <> diff --git a/tla/hlc/MCHLC.cfg b/tla/hlc/MCHLC.cfg index a9f2797f3..b088a885b 100644 --- a/tla/hlc/MCHLC.cfg +++ b/tla/hlc/MCHLC.cfg @@ -7,8 +7,8 @@ SPECIFICATION Spec CONSTANTS Nodes = {n1, n2, n3} - MaxClockSkewMs = 0 - HlcPhysicalWindowMs = 1 + MaxClockSkewMs = 1 + HlcPhysicalWindowMs = 2 MaxTerms = 2 MaxOps = 3 MaxWallTime = 3 diff --git a/tla/hlc/MCHLC_gap.cfg b/tla/hlc/MCHLC_gap.cfg index 73765130d..790ffbf39 100644 --- a/tla/hlc/MCHLC_gap.cfg +++ b/tla/hlc/MCHLC_gap.cfg @@ -12,8 +12,8 @@ SPECIFICATION Spec CONSTANTS Nodes = {n1, n2, n3} - MaxClockSkewMs = 0 - HlcPhysicalWindowMs = 1 + MaxClockSkewMs = 1 + HlcPhysicalWindowMs = 2 MaxTerms = 2 MaxOps = 3 MaxWallTime = 3 From 9901e6ca04c29f274480eee8c5bde0b72d4fa690 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 29 May 2026 02:41:00 +0900 Subject: [PATCH 5/7] =?UTF-8?q?tla:=20M1=20review=20round=203=20=E2=80=94?= =?UTF-8?q?=20fix=20stale=20doc=20paths=20+=20README=20invariant=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two doc-consistency findings from claude bot's round 2 review of commit 791c65b3 (after the proposed -> partial rename in 66b51539 those references became broken links): - Update 4 stale references to the old proposed_ doc path: - tla/hlc/HLC.tla line 4 - tla/lib/Env.tla line 4 - tla/lib/Raft.tla line 4 - Makefile line 17 All now point at docs/design/2026_05_28_partial_tla_safety_spec.md. - tla/README.md "What each module proves" table referenced HLC2_CeilingMonotonic, which does not exist in HLC.tla after the round-1 rework. Replace with the actual operators: HLC1_PerNodeMonotonic (state, with strict seq ordering), HLC1_Action (PROPERTY, transition form), HLC2_NonNeg (state, residual TypeOK content), HLC2_Action (PROPERTY, transition form). Also clarify HLC-1's state form is the strict-seq version, not the earlier vacuous total-order form. Docs-only changes. make tla-check still passes on MCHLC.cfg (3,594 distinct states) and fails on MCHLC_gap.cfg as designed. --- Makefile | 2 +- tla/README.md | 6 ++++-- tla/hlc/HLC.tla | 2 +- tla/lib/Env.tla | 2 +- tla/lib/Raft.tla | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 44f805ae6..2af213987 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ gen: @$(MAKE) -C proto gen # === TLA+ model-check support (M1 deliverable) === -# Per docs/design/2026_05_28_proposed_tla_safety_spec.md §7.2. +# Per docs/design/2026_05_28_partial_tla_safety_spec.md §7.2. # Downloads a checksum-pinned tla2tools.jar on first use, then runs TLC on # both MCHLC.cfg (correct design — expected PASS) and MCHLC_gap.cfg # (preconditions disabled — expected FAIL on HLC-4 with a counterexample diff --git a/tla/README.md b/tla/README.md index 06db61138..06bef48a2 100644 --- a/tla/README.md +++ b/tla/README.md @@ -110,8 +110,10 @@ Invariants asserted: | Invariant | Statement | |---|---| | `TypeOK` | Variable types are well-formed | -| `HLC1_PerNodeMonotonic` | Per-node committed ts are totally ordered | -| `HLC2_CeilingMonotonic` | Ceiling never decreases (built into ApplyCeiling) | +| `HLC1_PerNodeMonotonic` | Per-node committed ts are strictly ordered by issuance `seq` | +| `HLC1_Action` (PROPERTY) | Transition form: `hlcLast[n]` weakly increases on every step | +| `HLC2_NonNeg` | Residual `physicalCeiling \in Nat` content; harmless type sanity | +| `HLC2_Action` (PROPERTY) | Transition form: `physicalCeiling[n]` weakly increases on every step | | `HLC3_LeaderOnly` | Every committed ts was issued by the leader of its term | | `HLC4_NoRegressionAcrossTerms` | Earlier-term commits are strictly less than later-term commits | diff --git a/tla/hlc/HLC.tla b/tla/hlc/HLC.tla index e1981e8de..9be9635d6 100644 --- a/tla/hlc/HLC.tla +++ b/tla/hlc/HLC.tla @@ -1,7 +1,7 @@ --------------------------------- MODULE HLC --------------------------------- (***************************************************************************) (* TLA+ specification of the elastickv Hybrid Logical Clock. *) -(* Per docs/design/2026_05_28_proposed_tla_safety_spec.md §5.1. *) +(* Per docs/design/2026_05_28_partial_tla_safety_spec.md §5.1. *) (* *) (* This module encodes the HLC layer that sits on top of an abstract Raft *) (* (lib/Raft.tla). All three HLC-4 preconditions are first-class: *) diff --git a/tla/lib/Env.tla b/tla/lib/Env.tla index 8a2ebe57f..add03db74 100644 --- a/tla/lib/Env.tla +++ b/tla/lib/Env.tla @@ -1,7 +1,7 @@ -------------------------------- MODULE Env ---------------------------------- (***************************************************************************) (* Shared environment constants used by every module in the elastickv *) -(* TLA+ suite. Per docs/design/2026_05_28_proposed_tla_safety_spec.md *) +(* TLA+ suite. Per docs/design/2026_05_28_partial_tla_safety_spec.md *) (* §6.1. *) (* *) (* The single normative ASSUME below encodes HLC-4 precondition (i) — the *) diff --git a/tla/lib/Raft.tla b/tla/lib/Raft.tla index 5e05f2da6..dc48fbe97 100644 --- a/tla/lib/Raft.tla +++ b/tla/lib/Raft.tla @@ -1,7 +1,7 @@ -------------------------------- MODULE Raft --------------------------------- (***************************************************************************) (* Abstract Raft interface for the elastickv TLA+ suite. *) -(* Per docs/design/2026_05_28_proposed_tla_safety_spec.md §3 / §6.1, Raft *) +(* Per docs/design/2026_05_28_partial_tla_safety_spec.md §3 / §6.1, Raft *) (* is modelled as a black box that delivers the standard log-matching + *) (* leader-completeness guarantees. This module exposes the **interface** *) (* (leader / term / BecomeLeader / Apply / InstallSnapshot) — the internal *) From ca0ecfc0e7cd9cd1abab57ef7a550c0cf5ae4572 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 29 May 2026 02:46:24 +0900 Subject: [PATCH 6/7] ci(tla): wire make tla-check into GitHub Actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per docs/design/2026_05_28_partial_tla_safety_spec.md §7.3 — the M1 PR is the right place to land the CI workflow alongside the concrete spec. Adds .github/workflows/tla-check.yml: - Triggers on PR (opened/synchronize/reopened) and on push to main, with a path filter matching exactly the §7.3 list: tla/**, Makefile, kv/hlc*.go, kv/coordinator.go, kv/sharded_coordinator.go, kv/transaction.go, kv/fsm.go, kv/lock_resolver.go, store/mvcc_store.go, distribution/**, and the workflow file itself. - Uses Temurin JDK 21 (consistent with jepsen-test.yml). - Caches .cache/tla/tla2tools.jar keyed on the Makefile hash so any TLA_VERSION / TLA_SHA256 change automatically invalidates the cache. - Pre-fetches the jar via `make tla-tools` so the download is in a separate log group from the model check. - Runs `make tla-check`, which inherits the round-2 gap-config validation: a non-zero TLC exit on MCHLC_gap.cfg only counts as success if the output contains the exact HLC-4 violation string. Updates §7.3 of the design doc: removes the "(deferred)" qualifier and names the workflow file as the now-active CI binding. actionlint clean. The full M1 run completes in well under a minute locally (3,594 distinct states on the safe config, HLC-4 counter- example at depth 5 on the gap config). --- .github/workflows/tla-check.yml | 81 +++++++++++++++++++ .../2026_05_28_partial_tla_safety_spec.md | 23 ++++-- 2 files changed, 96 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/tla-check.yml diff --git a/.github/workflows/tla-check.yml b/.github/workflows/tla-check.yml new file mode 100644 index 000000000..e1b8b4286 --- /dev/null +++ b/.github/workflows/tla-check.yml @@ -0,0 +1,81 @@ +name: TLA+ model check + +# Runs `make tla-check` on PRs (and pushes to main) that touch the +# subsystems modelled by the spec or the spec / tooling themselves. +# Per docs/design/2026_05_28_partial_tla_safety_spec.md §7.3. +# +# The job runs both MCHLC.cfg (correct design — expected PASS) and +# MCHLC_gap.cfg (preconditions disabled — expected FAIL with the HLC-4 +# counterexample); the Makefile target greps for the specific HLC-4 +# violation string so the gap evidence is validated, not just the exit +# code. + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - 'tla/**' + - 'Makefile' + - 'kv/hlc*.go' + - 'kv/coordinator.go' + - 'kv/sharded_coordinator.go' + - 'kv/transaction.go' + - 'kv/fsm.go' + - 'kv/lock_resolver.go' + - 'store/mvcc_store.go' + - 'distribution/**' + - '.github/workflows/tla-check.yml' + push: + branches: [main] + paths: + - 'tla/**' + - 'Makefile' + - 'kv/hlc*.go' + - 'kv/coordinator.go' + - 'kv/sharded_coordinator.go' + - 'kv/transaction.go' + - 'kv/fsm.go' + - 'kv/lock_resolver.go' + - 'store/mvcc_store.go' + - 'distribution/**' + - '.github/workflows/tla-check.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-tla-check + cancel-in-progress: false + +permissions: + contents: read + +jobs: + tla-check: + name: tla-check + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Check out + uses: actions/checkout@v6 + + - name: Set up JDK 21 (Temurin) + uses: actions/setup-java@v5 + with: + distribution: temurin + java-version: '21' + + # Cache the checksum-pinned tla2tools.jar so we don't re-download + # GitHub release artefacts on every run. Key includes the Makefile + # hash so a version-pin change invalidates the cache automatically + # (TLA_VERSION + TLA_SHA256 both live in the Makefile). + - name: Cache tla2tools.jar + uses: actions/cache@v5 + with: + path: .cache/tla + key: ${{ runner.os }}-tla-tools-${{ hashFiles('Makefile') }} + restore-keys: | + ${{ runner.os }}-tla-tools- + + - name: Pre-fetch tla2tools.jar (logs separately from tla-check) + run: make tla-tools + + - name: Run make tla-check + run: make tla-check diff --git a/docs/design/2026_05_28_partial_tla_safety_spec.md b/docs/design/2026_05_28_partial_tla_safety_spec.md index 6ff82daf7..5b29432c6 100644 --- a/docs/design/2026_05_28_partial_tla_safety_spec.md +++ b/docs/design/2026_05_28_partial_tla_safety_spec.md @@ -544,10 +544,12 @@ Both targets shell out to `tla2tools.jar` which is downloaded on first use into `.cache/tla/` (matching the existing `.cache/` pattern from CLAUDE.md). The downloaded jar is checksum-pinned. No system-wide install required. -### 7.3 CI integration (deferred) +### 7.3 CI integration -`tla-check` is intended to run in CI on PRs that touch: +`tla-check` runs in CI via `.github/workflows/tla-check.yml` on every PR +(and push to `main`) that touches: +- `tla/**` and `Makefile` (the spec and the harness themselves). - `kv/hlc*.go`, `kv/coordinator.go`, `kv/sharded_coordinator.go`, `kv/transaction.go`, `kv/fsm.go`, `kv/lock_resolver.go` — the HLC renewal scheduler, the timestamp-issuance dispatch path, and the @@ -555,12 +557,17 @@ The downloaded jar is checksum-pinned. No system-wide install required. anchors); a change here can violate HLC/OCC invariants and must re-run `tla-check`. - `store/mvcc_store.go` -- `distribution/` -- the spec files themselves - -The exact CI wiring (workflow file, path filter) is deferred to the M1 PR -so it can be reviewed alongside a concrete spec rather than in the -abstract. +- `distribution/**` +- `.github/workflows/tla-check.yml` (the workflow file itself). + +The workflow uses Temurin JDK 21, caches `.cache/tla/tla2tools.jar` +keyed on the Makefile hash so version-pin changes invalidate the cache +automatically, and inherits the Makefile's gap-config validation: a +non-zero TLC exit on `MCHLC_gap.cfg` only counts as success if the +output contains the exact string +`Invariant HLC4_NoRegressionAcrossTerms is violated`. Any other failure +mode (parse error, deadlock, JVM crash, different invariant) fails the +job. The full M1 run completes in well under a minute. --- From 2a2238aff2ff2eb540383db0a0656461171d3de4 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 29 May 2026 02:52:13 +0900 Subject: [PATCH 7/7] ci(tla): drop cache restore-keys to keep checksum pin load-bearing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit codex P2 on PR #856 ca0ecfc0: the prefix-match restore-keys could bring back an older tla2tools.jar after a TLA_VERSION/TLA_SHA256 bump in the Makefile. Because $(TLA_JAR) is a file-existence target, the restored stale jar would skip the Makefile's checksum verification entirely — defeating the pin. A full cache miss is the correct behaviour on a version-pin change: the recipe runs, downloads from GitHub, and verifies the SHA-256 before serving the jar to TLC. Documented inline so the next reviewer doesn't re-add restore-keys. --- .github/workflows/tla-check.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tla-check.yml b/.github/workflows/tla-check.yml index e1b8b4286..51d05dfb8 100644 --- a/.github/workflows/tla-check.yml +++ b/.github/workflows/tla-check.yml @@ -66,13 +66,18 @@ jobs: # GitHub release artefacts on every run. Key includes the Makefile # hash so a version-pin change invalidates the cache automatically # (TLA_VERSION + TLA_SHA256 both live in the Makefile). + # + # NO restore-keys: a prefix-match restore would bring back an older + # jar after a TLA_VERSION/TLA_SHA256 bump. Because the Makefile's + # $(TLA_JAR) recipe is gated on file existence, the restored stale + # jar would skip checksum verification entirely — defeating the + # whole point of the pin (codex P2 on PR #856 round 4). A full + # miss-and-redownload is correct here. - name: Cache tla2tools.jar uses: actions/cache@v5 with: path: .cache/tla key: ${{ runner.os }}-tla-tools-${{ hashFiles('Makefile') }} - restore-keys: | - ${{ runner.os }}-tla-tools- - name: Pre-fetch tla2tools.jar (logs separately from tla-check) run: make tla-tools