diff --git a/.gitignore b/.gitignore index b311a018d..0d56ce1fc 100644 --- a/.gitignore +++ b/.gitignore @@ -40,8 +40,8 @@ jepsen/.ssh/ server # TLA+ TLC artifacts (counterexample traces, state directories) -tla/**/MCHLC_TTrace_*.tla -tla/**/MCHLC_TTrace_*.bin +tla/**/MC*_TTrace_*.tla +tla/**/MC*_TTrace_*.bin tla/**/states/ # Admin SPA build outputs. The placeholder internal/admin/dist/index.html diff --git a/Makefile b/Makefile index 2af213987..9c9bcb620 100644 --- a/Makefile +++ b/Makefile @@ -56,49 +56,9 @@ $(TLA_JAR): @echo "tla2tools.jar ready at $(TLA_JAR) (SHA-256 $(TLA_SHA256))" tla-check: $(TLA_JAR) - @echo "================================================================" - @echo " TLC: tla/hlc/MCHLC.cfg (correct design, expected PASS)" - @echo "================================================================" - @cd tla/hlc && java -XX:+UseParallelGC \ - -cp ../../$(TLA_JAR) -DTLA-Library=$(TLA_LIB) \ - tlc2.TLC -nowarning -config MCHLC.cfg MCHLC.tla - @echo - @echo "================================================================" - @echo " TLC: tla/hlc/MCHLC_gap.cfg (no preconditions, expected FAIL)" - @echo "================================================================" - @# Capture the gap run output so we can validate not just that TLC - @# exited non-zero but that the failure reason is exactly the HLC-4 - @# invariant violation we expect. Without this check a parse error, - @# deadlock, JVM crash, or different invariant violation would silently - @# count as "expected gap counterexample" (codex P2 on PR #856 round 2). - @cd tla/hlc && \ - out=$$(java -XX:+UseParallelGC \ - -cp ../../$(TLA_JAR) -DTLA-Library=$(TLA_LIB) \ - tlc2.TLC -nowarning -config MCHLC_gap.cfg MCHLC.tla 2>&1) ; \ - rc=$$? ; \ - printf '%s\n' "$$out" ; \ - if [ "$$rc" -eq 0 ]; then \ - echo ; \ - echo "ERROR: MCHLC_gap.cfg unexpectedly passed."; \ - echo " The gap configuration disables HLC-4 preconditions (ii)+(iii);"; \ - echo " TLC was supposed to surface a counterexample showing why those"; \ - echo " preconditions are necessary. A clean pass means either the spec"; \ - echo " no longer encodes the gap correctly or the safety guards leaked"; \ - echo " past the EnableSafety toggle."; \ - exit 1; \ - fi ; \ - if printf '%s\n' "$$out" | grep -qF "Invariant HLC4_NoRegressionAcrossTerms is violated"; then \ - echo ; \ - echo "OK: MCHLC_gap.cfg failed as designed (HLC-4 counterexample)."; \ - else \ - echo ; \ - echo "ERROR: MCHLC_gap.cfg failed, but the reason is NOT a HLC-4 violation."; \ - echo " Expected substring in TLC output:"; \ - echo " 'Invariant HLC4_NoRegressionAcrossTerms is violated'"; \ - echo " The non-zero exit may indicate a parse error, deadlock, JVM"; \ - echo " crash, or a different invariant breaking — review the output"; \ - echo " above before treating this as a regression in the gap evidence."; \ - exit 1; \ - fi - @echo - @echo "tla-check: all model-check outcomes match the design contract." + @# Per-module orchestration lives in scripts/tla-check.sh so adding + @# M3..M5 only needs an entry in that script's TLA_MODULES array + @# and a `case` line for the gap-invariant string. The script does + @# the safe-config / gap-config pair per module, validates the gap + @# failure reason via string match, and aggregates exit codes. + @bash scripts/tla-check.sh diff --git a/scripts/tla-check.sh b/scripts/tla-check.sh new file mode 100755 index 000000000..8942a9bde --- /dev/null +++ b/scripts/tla-check.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# tla-check.sh — run TLC on every module in the elastickv TLA+ suite. +# +# Per docs/design/2026_05_28_partial_tla_safety_spec.md §7.2. Each +# module under tla// ships: +# MC.tla — the TLC model module +# MC.cfg — the correct-design config (expected PASS) +# MC_gap.cfg — the preconditions-disabled config (expected +# FAIL with a SPECIFIC invariant counterexample, +# checked by string match against +# TLA__GAP_INVARIANT below) +# +# Adding a new module (M3..M5): append to TLA_MODULES, drop the per- +# module gap-invariant string into the case statement, and `make +# tla-check` picks it up automatically. + +set -uo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +TLA_JAR="${REPO_ROOT}/.cache/tla/tla2tools.jar" + +if [ ! -f "$TLA_JAR" ]; then + echo "ERROR: ${TLA_JAR} not found — run 'make tla-tools' first." >&2 + exit 1 +fi + +# Modules to check, in dependency order (libs before consumers). +TLA_MODULES=( "hlc" "occ" ) + +# The invariant the gap config is expected to break, per module. +# These strings are matched against TLC stdout via grep -F (literal). +gap_invariant_for() { + case "$1" in + hlc) echo "Invariant HLC4_NoRegressionAcrossTerms is violated" ;; + occ) echo "Invariant OCC1_CommitTsAboveStart is violated" ;; + *) + echo "ERROR: no gap-invariant string registered for module '$1'." \ + "Add a case to scripts/tla-check.sh." >&2 + exit 64 + ;; + esac +} + +mc_basename() { + # tla//MC.tla — module name in UPPERCASE + printf 'MC%s' "$(printf '%s' "$1" | tr '[:lower:]' '[:upper:]')" +} + +run_tlc() { + local module="$1" + local cfg="$2" + local mc + mc="$(mc_basename "$module")" + ( cd "${REPO_ROOT}/tla/${module}" && \ + java -XX:+UseParallelGC \ + -cp "${TLA_JAR}" -DTLA-Library=../lib \ + tlc2.TLC -nowarning -config "${cfg}" "${mc}.tla" ) +} + +overall_rc=0 + +for module in "${TLA_MODULES[@]}"; do + mc="$(mc_basename "$module")" + safe_cfg="${mc}.cfg" + gap_cfg="${mc}_gap.cfg" + # `set -e` is not in effect (the script uses `set -uo pipefail`). + # Without explicit checking, a non-zero exit from gap_invariant_for + # inside the command substitution would only terminate the + # subshell — the parent loop would continue with gap_inv="" and the + # later `grep -qF "$gap_inv"` would always match (empty pattern), + # silently passing the gap check. Check explicitly (gemini HIGH + # on PR #858). + if ! gap_inv="$(gap_invariant_for "$module")"; then + echo "ERROR: gap_invariant_for failed for module '${module}' — see error above." >&2 + overall_rc=1 + continue + fi + + echo "================================================================" + echo " TLC: tla/${module}/${safe_cfg} (correct design, expected PASS)" + echo "================================================================" + if ! run_tlc "$module" "$safe_cfg"; then + echo + echo "ERROR: ${safe_cfg} did not pass — see TLC output above." >&2 + overall_rc=1 + continue + fi + echo + + echo "================================================================" + echo " TLC: tla/${module}/${gap_cfg} (no preconditions, expected FAIL on ${gap_inv})" + echo "================================================================" + # Capture stdout+stderr so we can validate both the exit code AND the + # specific invariant string. Without the string match, a parse + # error / deadlock / JVM crash / different invariant would silently + # count as the expected counterexample (codex P2 on PR #856 round 2). + gap_out=$(run_tlc "$module" "$gap_cfg" 2>&1) + gap_rc=$? + printf '%s\n' "$gap_out" + if [ "$gap_rc" -eq 0 ]; then + echo + echo "ERROR: ${gap_cfg} unexpectedly passed." >&2 + echo " The gap configuration disables the safety guard; TLC was" >&2 + echo " supposed to surface a counterexample. A clean pass means" >&2 + echo " either the spec no longer encodes the gap correctly or the" >&2 + echo " safety guards leaked past the EnableSafety toggle." >&2 + overall_rc=1 + continue + fi + if printf '%s\n' "$gap_out" | grep -qF "$gap_inv"; then + echo + echo "OK: ${gap_cfg} failed as designed (${gap_inv})." + else + echo + echo "ERROR: ${gap_cfg} failed, but the reason is NOT \"${gap_inv}\"." >&2 + echo " The non-zero exit may indicate a parse error, deadlock, JVM" >&2 + echo " crash, or a different invariant breaking — review the output" >&2 + echo " above before treating this as a regression in the gap evidence." >&2 + overall_rc=1 + continue + fi + echo +done + +if [ "$overall_rc" -eq 0 ]; then + echo "tla-check: all model-check outcomes match the design contract." +else + echo "tla-check: at least one module did not match the design contract." >&2 +fi +exit "$overall_rc" diff --git a/tla/README.md b/tla/README.md index 06bef48a2..55d4794db 100644 --- a/tla/README.md +++ b/tla/README.md @@ -12,7 +12,7 @@ run it without touching anything under the Go source tree. | Milestone | Scope | Status | |---|---|---| | M1 | `lib/Raft.tla`, `lib/Env.tla`, `hlc/HLC.tla`, `make tla-check` | Landed | -| M2 | `occ/OCC.tla` (safety invariants OCC-1..OCC-5) | Not started | +| M2 | `occ/OCC.tla` (safety invariants OCC-1..OCC-5) | Landed | | M3 | `mvcc/MVCC.tla` (MVCC-1..MVCC-4) | Not started | | M4 | `routes/Routes.tla` (Routes-1..Routes-4) | Not started | | M5 | `composed/Composed.tla` + CI integration | Not started | @@ -46,38 +46,47 @@ From the repository root: make tla-check ``` -This runs TLC against both configurations and prints whether the outcome +This runs TLC against the safe + gap configuration pair for every +module declared in `scripts/tla-check.sh` (currently HLC and OCC; M3 / +M4 / M5 will be added as they land), and prints whether the outcome matches the design contract: -1. `tla/hlc/MCHLC.cfg` — the **correct design**, with HLC-4 - preconditions encoded as ASSUMEs / action guards. TLC must finish - with no invariant violation. -2. `tla/hlc/MCHLC_gap.cfg` — the **gap configuration**, with the - preconditions disabled (`EnableSafety = FALSE`). TLC must produce a - counterexample on `HLC4_NoRegressionAcrossTerms` — this is the - motivating evidence that strategy (c) handoff + the ceiling fence - are necessary. The `tla-check` target inverts the exit code for - this run so a TLC FAILURE here counts as PASS for CI. - -To run a single module by hand (skipping the Makefile): +- **Safe config (`tla//MC.cfg`)** — the correct + design, with each module's preconditions or commit guards enabled. + TLC must finish with no invariant violation. +- **Gap config (`tla//MC_gap.cfg`)** — the + preconditions or guards disabled (`EnableSafety = FALSE`). TLC must + produce a counterexample on the module's load-bearing invariant + (`HLC4_NoRegressionAcrossTerms` for HLC; `OCC1_CommitTsAboveStart` + for OCC). The harness inverts the exit code for the gap run AND + greps for the specific invariant-violation string, so any other + failure mode (parse error, deadlock, JVM crash, different + invariant) fails the job rather than silently being treated as the + expected gap evidence. + +To run a single module by hand (skipping the harness): ```sh JAR=.cache/tla/tla2tools.jar -# Safe config (expected PASS): +# HLC safe (expected PASS): cd tla/hlc java -XX:+UseParallelGC -cp ../../$JAR -DTLA-Library=../lib \ tlc2.TLC -config MCHLC.cfg MCHLC.tla -# Gap config (expected FAIL on HLC-4): +# HLC gap (expected FAIL on HLC4_NoRegressionAcrossTerms): java -XX:+UseParallelGC -cp ../../$JAR -DTLA-Library=../lib \ tlc2.TLC -config MCHLC_gap.cfg MCHLC.tla ``` +The OCC module follows the same pattern under `tla/occ/` with +`MCOCC.cfg` / `MCOCC_gap.cfg`; future M3 / M4 / M5 modules will too. + State-space bounds are set in the `.cfg` files (M1 defaults: 3 nodes, -2 terms, ≤ 3 IssueTimestamp ops, wall clock ≤ 3 ms, `LogicalMax = 1`). -Raise them locally to deepen exploration; the M1 PR keeps them small so -the full `make tla-check` runs in well under a second. +2 terms, ≤ 3 IssueTimestamp ops, wall clock ≤ 3 ms, `LogicalMax = 1`; +M2 defaults: 2 keys, 2 txns, `MaxTs = 6`, `MaxOps = 3`). Raise them +locally to deepen exploration; the bounded configs keep `make +tla-check` end-to-end in well under a second. ## What each module proves @@ -120,6 +129,36 @@ Invariants asserted: ### `hlc/MCHLC.tla` + `MCHLC.cfg` / `MCHLC_gap.cfg` TLC model-check instance. Two configurations, one module. See [Run](#run). +### `occ/OCC.tla` +The OCC layer. Models the Percolator-style 2PC transaction lifecycle +`Idle → Active → Prepared → Committed / Aborted` and the lock map +`(key, lock_ts) → start_ts`. The lock-resolver +(`kv/lock_resolver.go`) is abstracted into the atomic lock drain +inside `Commit`/`Abort` for M2; the async-resolver case is deferred +to M5 (composed) — see the block comment in `OCC.tla` where +`LockResolve` would have lived. HLC is abstracted to a single global +monotonic counter for M2; M5 will INSTANCE `HLC.tla` for the real +48/16 layout. The `EnableSafety` CONSTANT gates the OCC-1 commit +guard so the same module drives the safe and gap configurations. + +Invariants asserted: + +| Invariant | Statement | +|---|---| +| `TypeOK` | Variable types are well-formed | +| `OCC1_CommitTsAboveStart` | Every committed txn has `commit_ts > start_ts` | +| `OCC2_NoWriteWriteConflict` | Two committed txns sharing a write key have distinct commit_ts and one started after the other committed (`commit_ts[earlier] <= start_ts[later]`) | +| `OCC3_ReadSnapshotStability` | Every read observation equals `LatestVisible(k, start_ts(t))` (the snapshot value remains stable under monotonic-ts allocation in BeginTxn / Commit) | +| `OCC4_NoStrandedLockAtQuiescence` | When all txns are in a terminal state, no lock remains | +| `OCC5_StartTsConsistency` | Every read observation is bounded by the txn's `start_ts` (= `read_ts` by OCC-5) | +| `OCC5_Action` (PROPERTY) | Transition form: `start_ts[t]` is assigned once at `BeginTxn` and never updated | +| `CommitTsAssignedOnce` (PROPERTY) | Transition form: `commit_ts[t]` is assigned once at `Commit` and never updated | + +### `occ/MCOCC.tla` + `MCOCC.cfg` / `MCOCC_gap.cfg` +TLC model-check instance for OCC. Same one-module / two-config layout +as MCHLC. The gap config disables the OCC-1 commit guard; TLC +produces an `OCC1_CommitTsAboveStart` counterexample at depth ≈ 5. + ## How to interpret a TLC failure When TLC finds a counterexample on an invariant it prints: diff --git a/tla/occ/MCOCC.cfg b/tla/occ/MCOCC.cfg new file mode 100644 index 000000000..fbe96b4dd --- /dev/null +++ b/tla/occ/MCOCC.cfg @@ -0,0 +1,36 @@ +\* TLC config: OCC safety model — encodes the OCC-1 commit guard. +\* Expected outcome: TLC explores the bounded state space and finds no +\* invariant violation, i.e. proves OCC-1..OCC-5 hold in the correct +\* design. + +SPECIFICATION Spec + +CONSTANTS + Keys = {k1, k2} + TxnIds = {t1, t2} + MaxTs = 6 + MaxOps = 3 + EnableSafety = TRUE + +SYMMETRY Symmetry + +INVARIANTS + TypeOK + OCC1_CommitTsAboveStart + OCC2_NoWriteWriteConflict + OCC3_ReadSnapshotStability + OCC4_NoStrandedLockAtQuiescence + OCC5_StartTsConsistency + +PROPERTIES + OCC5_Action + CommitTsAssignedOnce + +CONSTRAINT StateConstraint + +\* Quiescence is a valid OCC end state (OCC-4 specifically checks it); +\* a state with no enabled action is "all txns terminal and no locks", +\* which is the bounded-safety form we want to verify, not a model bug. +CHECK_DEADLOCK + FALSE + diff --git a/tla/occ/MCOCC.tla b/tla/occ/MCOCC.tla new file mode 100644 index 000000000..abf8b0406 --- /dev/null +++ b/tla/occ/MCOCC.tla @@ -0,0 +1,23 @@ +-------------------------------- MODULE MCOCC -------------------------------- +(***************************************************************************) +(* Model-check instance for OCC.tla. *) +(* *) +(* Two .cfg files point at this module: *) +(* MCOCC.cfg — EnableSafety = TRUE (correct design; TLC passes) *) +(* MCOCC_gap.cfg — EnableSafety = FALSE (Commit's OCC-1 guard removed; *) +(* TLC fails on OCC1_CommitTsAboveStart and emits a *) +(* counterexample, motivating the spec's existence) *) +(* *) +(* Run via `make tla-check` from the repository root. *) +(***************************************************************************) + +EXTENDS OCC, TLC + +\* TLC symmetry hint: TxnIds and Keys are interchangeable in OCC.tla +\* per the spec doc §6.3 (OCC alone is both txn- and key-symmetric — +\* keys participate only as indices, not in an order). +TxnSymmetry == Permutations(TxnIds) +KeySymmetry == Permutations(Keys) +Symmetry == TxnSymmetry \cup KeySymmetry + +============================================================================= diff --git a/tla/occ/MCOCC_gap.cfg b/tla/occ/MCOCC_gap.cfg new file mode 100644 index 000000000..3dde6218d --- /dev/null +++ b/tla/occ/MCOCC_gap.cfg @@ -0,0 +1,41 @@ +\* TLC config: OCC gap model — EnableSafety = FALSE. The Commit action +\* skips the OCC-1 strict-greater guard, so TLC is expected to FAIL on +\* the OCC1_CommitTsAboveStart invariant and emit a counterexample. +\* This is not a bug in the spec; it is the motivating evidence that +\* the commit-ts gating is necessary. +\* +\* The make tla-check target inverts the exit code for this config and +\* greps for the specific OCC-1 violation string, so any other failure +\* mode (parse error, deadlock, JVM crash, different invariant) fails +\* the harness rather than being silently treated as the expected +\* counterexample. + +SPECIFICATION Spec + +CONSTANTS + Keys = {k1, k2} + TxnIds = {t1, t2} + MaxTs = 6 + MaxOps = 3 + EnableSafety = FALSE + +SYMMETRY Symmetry + +INVARIANTS + TypeOK + OCC1_CommitTsAboveStart + OCC2_NoWriteWriteConflict + OCC3_ReadSnapshotStability + OCC4_NoStrandedLockAtQuiescence + OCC5_StartTsConsistency + +PROPERTIES + OCC5_Action + CommitTsAssignedOnce + +CONSTRAINT StateConstraint + +\* See MCOCC.cfg — quiescence is a valid end state, not a deadlock. +CHECK_DEADLOCK + FALSE + diff --git a/tla/occ/OCC.tla b/tla/occ/OCC.tla new file mode 100644 index 000000000..133fde4c4 --- /dev/null +++ b/tla/occ/OCC.tla @@ -0,0 +1,341 @@ +--------------------------------- MODULE OCC --------------------------------- +(***************************************************************************) +(* TLA+ specification of the elastickv OCC layer (Percolator-style 2PC). *) +(* Per docs/design/2026_05_28_partial_tla_safety_spec.md §5.2. *) +(* *) +(* Models the transaction lifecycle Idle -> Active -> Prepared -> *) +(* Committed/Aborted and the lock-map (key, lock_ts) -> start_ts. In M2 *) +(* the lock-resolver (kv/lock_resolver.go) is abstracted into the atomic *) +(* lock drain inside Commit/Abort; see the block comment further down for *) +(* why a separate LockResolve action is unreachable here and deferred to *) +(* M5 (composed). Encodes all five OCC safety invariants OCC-1..OCC-5: *) +(* *) +(* OCC-1 Commit_ts > read_ts for every committed transaction. *) +(* OCC-2 No write-write conflict on intersecting write sets. *) +(* OCC-3 Snapshot read stability under the lock-encoding clause. *) +(* OCC-4 No stranded lock at quiescence (bounded-safety form). *) +(* OCC-5 start_ts = read_ts for every operation in the same txn. *) +(* *) +(* The single CONSTANT EnableSafety gates two normative guards so the *) +(* same module drives the safe model-check (MCOCC.cfg) and the gap *) +(* model-check (MCOCC_gap.cfg) — when off, the Commit action skips the *) +(* OCC-1 strict-greater check, which TLC surfaces as an explicit *) +(* counterexample (the design doc's motivating evidence pattern). *) +(* *) +(* HLC is abstracted to a single global monotonic counter `tsCounter`; *) +(* M5 (composed) will INSTANCE HLC.tla for the real 48/16 layout. *) +(***************************************************************************) + +EXTENDS Naturals, FiniteSets, Sequences + +CONSTANTS + Keys, \* finite set of keys exercised in the model + TxnIds, \* finite set of transaction identifiers + MaxTs, \* upper bound on the abstract HLC counter (state-space) + MaxOps, \* upper bound on per-txn Write actions (state-space) + EnableSafety \* TRUE: encode OCC-1 commit guard; FALSE: gap config + +\* Transaction lifecycle states. +States == {"Idle", "Active", "Prepared", "Committed", "Aborted"} + +\* Sentinel for "no lock on this key". +NoLock == [owner |-> "none", lockTs |-> 0] + +VARIABLES + tsCounter, \* Nat. Abstract HLC counter; advances on BeginTxn + Commit. + txnState, \* TxnIds -> States + startTs, \* TxnIds -> Nat (== read_ts by OCC-5) + commitTs, \* TxnIds -> Nat (0 if not committed) + writeSet, \* TxnIds -> SUBSET Keys + readObs, \* TxnIds -> [Keys -> Nat] (commit_ts observed; 0 = nothing) + lockMap, \* Keys -> [owner: TxnIds \cup {"none"}, lockTs: Nat] + versions, \* Keys -> SUBSET [commitTs: Nat, owner: TxnIds] + opCount \* state-space bound + +vars == <> + +\* === HELPERS === +HasLock(k) == lockMap[k] # NoLock +LockedBy(k, t) == HasLock(k) /\ lockMap[k].owner = t + +\* For OCC-3 lock encoding: a read on k at start_ts must skip the version +\* read whenever a lock with lockTs <= start_ts exists (cf. §5.2 OCC-3). +ConflictingLockExists(k, ts) == + HasLock(k) /\ lockMap[k].lockTs <= ts + +\* The committed version visible to a snapshot at read_ts on key k: the +\* largest commitTs strictly <= read_ts among versions[k], 0 if none. +LatestVisible(k, ts) == + LET candidates == { v.commitTs : v \in {w \in versions[k] : w.commitTs <= ts} } + IN IF candidates = {} THEN 0 + ELSE CHOOSE c \in candidates : \A o \in candidates : o <= c + +\* === INIT === +Init == + /\ tsCounter = 0 + /\ txnState = [t \in TxnIds |-> "Idle"] + /\ startTs = [t \in TxnIds |-> 0] + /\ commitTs = [t \in TxnIds |-> 0] + /\ writeSet = [t \in TxnIds |-> {}] + /\ readObs = [t \in TxnIds |-> [k \in Keys |-> 0]] + /\ lockMap = [k \in Keys |-> NoLock] + /\ versions = [k \in Keys |-> {}] + /\ opCount = 0 + +\* === ACTIONS === + +(***************************************************************************) +(* BeginTxn: an Idle txn becomes Active and pins its start_ts. *) +(* tsCounter is advanced as part of the action so the assigned startTs *) +(* is a fresh, monotonically increasing value across all BeginTxn calls. *) +(* This models the real HLC behaviour where start_ts is allocated from a *) +(* leader-issued HLC ts that strictly increases per allocation. *) +(* *) +(* The monotonic-tsCounter property is load-bearing for OCC-3 (snapshot *) +(* stability — strict equality form): a reader's `LatestVisible(k, T)` *) +(* cannot change after the read because no future Commit can produce a *) +(* commit_ts <= T (Commit also auto-advances tsCounter under *) +(* EnableSafety; see Commit below). *) +(* *) +(* OCC-5 is structurally satisfied: every later op on this txn reads *) +(* startTs[t] directly — there is no separate read_ts variable. *) +(***************************************************************************) +BeginTxn(t) == + /\ txnState[t] = "Idle" + /\ tsCounter < MaxTs + /\ tsCounter' = tsCounter + 1 + /\ txnState' = [txnState EXCEPT ![t] = "Active"] + /\ startTs' = [startTs EXCEPT ![t] = tsCounter + 1] + /\ UNCHANGED <> + +(***************************************************************************) +(* ReadKey: snapshot read on key k at startTs[t]. Gated by OCC-3 lock *) +(* encoding — if any uncommitted lock with lockTs <= startTs[t] is *) +(* outstanding, the read is blocked (modelled here as the action being *) +(* disabled until the lock holder's Commit or Abort drains the lock in *) +(* that same atomic step). *) +(***************************************************************************) +ReadKey(t, k) == + /\ txnState[t] = "Active" + /\ readObs[t][k] = 0 \* not yet read in this txn + /\ ~ConflictingLockExists(k, startTs[t]) + /\ readObs' = [readObs EXCEPT ![t] = [@ EXCEPT ![k] = LatestVisible(k, startTs[t])]] + /\ UNCHANGED <> + +(***************************************************************************) +(* WriteIntent: buffer a write to k locally in the txn's writeSet. *) +(* No lock is taken yet (Percolator pre-prepare). *) +(***************************************************************************) +WriteIntent(t, k) == + /\ txnState[t] = "Active" + /\ k \notin writeSet[t] + /\ opCount < MaxOps + /\ writeSet' = [writeSet EXCEPT ![t] = @ \cup {k}] + /\ opCount' = opCount + 1 + /\ UNCHANGED <> + +(***************************************************************************) +(* Prepare: acquire locks on every key in writeSet at lockTs = startTs[t]. *) +(* The conflict check (no existing lock and no version with *) +(* commitTs > startTs[t]) prevents two concurrent writers from prepar- *) +(* ing on the same key — this is the OCC-2 enforcement point. *) +(***************************************************************************) +Prepare(t) == + /\ txnState[t] = "Active" + /\ writeSet[t] # {} + /\ \A k \in writeSet[t] : + /\ ~HasLock(k) + /\ \A v \in versions[k] : v.commitTs <= startTs[t] + /\ txnState' = [txnState EXCEPT ![t] = "Prepared"] + /\ lockMap' = [k \in Keys |-> + IF k \in writeSet[t] + THEN [owner |-> t, lockTs |-> startTs[t]] + ELSE lockMap[k]] + /\ UNCHANGED <> + +(***************************************************************************) +(* Commit: promote every lock owned by t into a version at commitTs. *) +(* *) +(* Under EnableSafety, Commit advances tsCounter and uses the new value *) +(* as commit_ts — this models the HLC behaviour where commit_ts is *) +(* allocated as the next fresh ts > all prior start_ts and commit_ts. *) +(* The OCC-1 invariant (`commit_ts > start_ts`) is therefore a consequence *) +(* of the monotonic ts allocation rather than an explicit comparison. *) +(* *) +(* Under ~EnableSafety (gap config), Commit borrows the *current* *) +(* tsCounter without advancing. Combined with BeginTxn's auto-advance, *) +(* this lets commit_ts == startTs[t] (the BeginTxn's freshly assigned *) +(* value), violating OCC-1. TLC surfaces the (committed, commit_ts == *) +(* start_ts) counterexample. *) +(***************************************************************************) +Commit(t) == + /\ txnState[t] = "Prepared" + /\ LET ct == IF EnableSafety THEN tsCounter + 1 ELSE tsCounter IN + /\ EnableSafety => tsCounter < MaxTs \* keep ct inside the bounded model + /\ tsCounter' = IF EnableSafety THEN tsCounter + 1 ELSE tsCounter + /\ commitTs' = [commitTs EXCEPT ![t] = ct] + /\ txnState' = [txnState EXCEPT ![t] = "Committed"] + /\ versions' = [k \in Keys |-> + IF k \in writeSet[t] + THEN versions[k] \cup {[commitTs |-> ct, owner |-> t]} + ELSE versions[k]] + /\ lockMap' = [k \in Keys |-> + IF LockedBy(k, t) THEN NoLock ELSE lockMap[k]] + /\ UNCHANGED <> + +(***************************************************************************) +(* Abort: release all locks owned by t. Versions are NOT created. *) +(***************************************************************************) +Abort(t) == + /\ txnState[t] \in {"Active", "Prepared"} + /\ txnState' = [txnState EXCEPT ![t] = "Aborted"] + /\ lockMap' = [k \in Keys |-> + IF LockedBy(k, t) THEN NoLock ELSE lockMap[k]] + /\ UNCHANGED <> + +(***************************************************************************) +(* LockResolve is intentionally NOT part of this M2 model. *) +(* *) +(* The real implementation has an asynchronous lock-resolver *) +(* (kv/lock_resolver.go): when a reader encounters a lock whose owner is *) +(* in a terminal state, the resolver either promotes the lock to a *) +(* version (Committed owner) or clears it (Aborted owner). Modelling *) +(* that resolver as a TLA+ action only makes sense if Commit/Abort can *) +(* leave orphan locks behind — which in our `Commit`/`Abort` above is *) +(* not the case: both actions atomically drain every lock they own as *) +(* part of the same step. A LockResolve action whose guard requires *) +(* HasLock(k) /\ txnState[owner] \in {Committed, Aborted} would *) +(* therefore be unreachable in this model (gemini HIGH on PR #858). *) +(* *) +(* This is a deliberate M2 abstraction: the atomic-drain in Commit/Abort *) +(* subsumes the resolver's effect on the durable state without modelling *) +(* the asynchronous interleaving. The bounded-safety form of OCC-4 (no *) +(* stranded lock at quiescence) is therefore trivially preserved. *) +(* OCC-L1 (eventual lock release, the liveness counterpart) is deferred *) +(* to M6 per §5.6 of the design doc. M5 (composed) is the right place *) +(* to model the async resolver: cross-module interactions with OCC-3 *) +(* (lock encoding for reads) and Routes (where the resolver issues *) +(* secondary RPCs) make sense to test together. *) +(***************************************************************************) + +\* === NEXT === +Next == + \/ \E t \in TxnIds : BeginTxn(t) + \/ \E t \in TxnIds, k \in Keys : ReadKey(t, k) + \/ \E t \in TxnIds, k \in Keys : WriteIntent(t, k) + \/ \E t \in TxnIds : Prepare(t) + \/ \E t \in TxnIds : Commit(t) + \/ \E t \in TxnIds : Abort(t) + +Spec == Init /\ [][Next]_vars + +\* === STATE CONSTRAINT === +StateConstraint == + /\ tsCounter <= MaxTs + /\ opCount <= MaxOps + +\* === TYPE INVARIANT === +\* Every state variable is exhaustively typed (gemini medium on PR +\* #858). Without coverage of `lockMap[k].lockTs` and `versions`, +\* TypeOK would let a buggy action smuggle in a malformed lock-ts or +\* version record undetected. +TypeOK == + /\ tsCounter \in 0..MaxTs + /\ txnState \in [TxnIds -> States] + /\ startTs \in [TxnIds -> 0..MaxTs] + /\ commitTs \in [TxnIds -> 0..MaxTs] + /\ writeSet \in [TxnIds -> SUBSET Keys] + /\ readObs \in [TxnIds -> [Keys -> 0..MaxTs]] + /\ opCount \in 0..MaxOps + /\ \A k \in Keys : + /\ lockMap[k].owner \in TxnIds \cup {"none"} + /\ lockMap[k].lockTs \in 0..MaxTs + /\ versions \in [Keys -> SUBSET [commitTs: 0..MaxTs, owner: TxnIds]] + +\* Set of transactions in a terminal state. +CommittedTxns == { t \in TxnIds : txnState[t] = "Committed" } + +\* === SAFETY INVARIANTS === + +\* OCC-1 — every committed transaction has commit_ts strictly greater +\* than its start_ts. Enforced at the Commit action under EnableSafety; +\* removed in the gap config so TLC produces the motivating counter- +\* example. +OCC1_CommitTsAboveStart == + \A t \in CommittedTxns : commitTs[t] > startTs[t] + +\* OCC-2 — no write-write conflict on intersecting write sets. When two +\* committed transactions share a write key, (a) their commit +\* timestamps are distinct and (b) the later starter saw the earlier +\* one (`commitTs[earlier] <= startTs[later]`). The non-strict +\* `<=` is correct: a transaction with `start_ts = T` sees every +\* version with `commit_ts <= T`, so `start_ts = commit_ts[earlier]` +\* still counts as "the later one ran in the world after the earlier +\* one committed". +OCC2_NoWriteWriteConflict == + \A t1, t2 \in CommittedTxns : + (t1 # t2 /\ writeSet[t1] \cap writeSet[t2] # {}) => + /\ commitTs[t1] # commitTs[t2] + /\ (commitTs[t1] <= startTs[t2] \/ commitTs[t2] <= startTs[t1]) + +\* OCC-3 — snapshot-read stability under the lock encoding. The +\* recorded `readObs[t][k]` equals the current `LatestVisible(k, +\* startTs[t])` for every key the transaction has read. The previous +\* weaker `<=` form (gemini medium on PR #858) only asserted bounds +\* and overlapped with OCC-5; this strict equality is the real +\* snapshot-stability claim — the read value MUST remain consistent +\* with the canonical latest-visible version at the txn's snapshot. +\* +\* Soundness in the safe config rests on the monotonic ts allocation +\* in BeginTxn/Commit: every BeginTxn issues a fresh startTs > +\* tsCounter, and every Commit issues a commit_ts > tsCounter > every +\* prior startTs. So no future Commit can produce a commit_ts <= +\* startTs[t] for any t already past BeginTxn, hence LatestVisible(k, +\* startTs[t]) cannot change after the read. +\* +\* The `readObs[t][k] # 0` precondition conflates "did not read" and +\* "read but observed no version"; this is acceptable because the +\* latter case is vacuously stable (no version <= startTs[t] either +\* before or after, under the monotonic-ts allocation). +OCC3_ReadSnapshotStability == + \A t \in TxnIds, k \in Keys : + (txnState[t] \in {"Active", "Prepared", "Committed"} /\ readObs[t][k] # 0) => + readObs[t][k] = LatestVisible(k, startTs[t]) + +\* OCC-4 — no stranded lock at quiescence. Bounded safety form: in any +\* state where every txn is in a terminal state (Idle/Committed/Aborted), +\* no lock remains on any key. Under the M2 atomic-drain abstraction +\* (Commit/Abort clear every lock they own in the same step), this is +\* maintained by construction rather than by a separate resolver action. +\* The async resolver case is deferred to M5 — see the block comment +\* above where LockResolve would have lived. +OCC4_NoStrandedLockAtQuiescence == + LET AllTerminal == \A t \in TxnIds : txnState[t] \in {"Idle", "Committed", "Aborted"} + IN AllTerminal => + \A k \in Keys : ~HasLock(k) + +\* OCC-5 — start_ts equals read_ts for every operation. Structurally +\* satisfied by the model (no separate read_ts variable; every read uses +\* startTs[t] directly). Asserted here as a sanity check on the +\* invariant: every read observation is bounded by the txn's startTs. +\* (OCC-3 already implies this; OCC-5 is the named invariant.) +OCC5_StartTsConsistency == + \A t \in TxnIds, k \in Keys : + readObs[t][k] # 0 => readObs[t][k] <= startTs[t] + +\* === ACTION-LEVEL INVARIANTS (registered under PROPERTIES) === + +\* Each txn's startTs is assigned once (at BeginTxn) and never updated. +OCC5_Action == + [][\A t \in TxnIds : startTs[t] # 0 => startTs'[t] = startTs[t]]_vars + +\* commitTs is assigned once (at Commit) and never updated. +CommitTsAssignedOnce == + [][\A t \in TxnIds : commitTs[t] # 0 => commitTs'[t] = commitTs[t]]_vars + +=============================================================================