From 4d9c5e1ab722db96168be694d582482106689cd6 Mon Sep 17 00:00:00 2001
From: Hubert Gruszecki <h.gruszecki@gmail.com>
Date: Tue, 14 Apr 2026 14:34:27 +0200
Subject: [PATCH] fix(ci): harden publish pipeline

Follow-up to #3124 plus review-round fixes on the release chain.

Narrow cargo publish continue-on-error to the "already uploaded"
class; capture stderr to a tempfile instead of a process-sub tee
that raced the classifier. Make `commit` required in the rust
publish reusable workflow and re-verify master-ancestry so direct
workflow_call callers cannot bypass the check. Bring wait-for-crate
curl up to wait-for-url parity and switch to `jq -Rr 'fromjson?'`
so malformed NDJSON survives pipefail. Fail-fast on wrong-target
tags; render them distinctly and keep rendering under
skip_tag_creation=true, gating only exit 1. Cache cargo metadata
once per job via $GITHUB_ENV so the four post-merge composite
Validate steps reuse it. Consolidate the pre-release vocabulary
behind `extract-version.sh --is-pre-release` so post-merge.yml and
publish.yml no longer diverge on `.devN` / bare `rcN`. Retune the
idempotency pre-check to max_attempts=5 (initial_sleep=1 is unchanged)
and rewrite the two-layer wait comment with correct budgets (~8 min
inner, ~28 min outer).
Includes the Maven `continue-on-error` carve-out on publish.yml's
Central wait step, rationale documented in-tree above the step.
---
 .github/actions/rust/post-merge/action.yml    | 183 ++++++++++-----
 .../actions/utils/create-git-tag/action.yml   |  63 ++++--
 .../actions/utils/wait-for-crate/action.yml   |  72 ++++--
 .github/actions/utils/wait-for-url/action.yml |  21 +-
 .github/workflows/_publish_rust_crates.yml    | 105 ++++++++-
 .github/workflows/post-merge.yml              |   4 +-
 .github/workflows/publish.yml                 | 209 +++++++++++++++---
 scripts/extract-version.sh                    |  78 ++++++-
 8 files changed, 599 insertions(+), 136 deletions(-)

diff --git a/.github/actions/rust/post-merge/action.yml b/.github/actions/rust/post-merge/action.yml
index b303ccf05e..c8fb2284d6 100644
--- a/.github/actions/rust/post-merge/action.yml
+++ b/.github/actions/rust/post-merge/action.yml
@@ -18,8 +18,10 @@
 name: rust-post-merge
 description: >
   Publish a single Rust crate to crates.io. Idempotent on rerun via a
-  sparse-index pre-check. Intended to be called once per crate, in
-  dependency order, from .github/workflows/_publish_rust_crates.yml.
+  sparse-index pre-check and a post-publish CAS verify. Intended to be
+  called once per crate, in dependency order, from
+  .github/workflows/_publish_rust_crates.yml. Dry-run publishing is
+  handled one level up by scripts/verify-crates-publish.sh.
 
 inputs:
   package:
@@ -28,41 +30,14 @@ inputs:
   version:
     description: "Version for publishing"
     required: true
-  dry_run:
-    description: |
-      Deprecated. Retained only to avoid silently breaking downstream forks
-      that pin this composite by SHA and still pass `dry_run: true`.
-
-      Dry-run publishing is now handled one level up, by
-      scripts/verify-crates-publish.sh invoked from
-      .github/workflows/_publish_rust_crates.yml on the dry-run path. When
-      this composite is called with dry_run=true it prints a deprecation
-      warning and no-ops every publish step so forks keep getting the
-      "don't touch the real registry" semantics they expected. This input
-      will be removed in a future release once forks have migrated.
-    required: false
-    default: "false"
 
 runs:
   using: "composite"
   steps:
-    - name: Deprecated dry_run warning
-      if: inputs.dry_run == 'true'
-      shell: bash
-      run: |
-        echo "::warning::rust/post-merge: the 'dry_run' input is deprecated."
-        echo "::warning::Dry-run publishing now happens at the workflow level via"
-        echo "::warning::scripts/verify-crates-publish.sh (see _publish_rust_crates.yml)."
-        echo "::warning::Honoring dry_run=true by skipping every step in this composite."
-        echo "::warning::This input will be removed in a future release; please migrate."
-        echo "⏭️ dry_run=true → skipping all publish steps"
-
     - name: Setup Rust with cache
-      if: inputs.dry_run != 'true'
       uses: ./.github/actions/utils/setup-rust-with-cache
 
     - name: Validate package
-      if: inputs.dry_run != 'true'
       env:
         PACKAGE: ${{ inputs.package }}
         VERSION: ${{ inputs.version }}
@@ -74,24 +49,48 @@ runs:
         echo "Version: $VERSION"
         echo ""
 
-        if ! cargo metadata --format-version 1 | jq -e --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg)' > /dev/null; then
+        # Single cargo metadata invocation reused for presence check, version,
+        # and manifest path. --no-deps keeps all three fields we read and
+        # avoids walking the dep graph, saving ~30-60s across a 4-crate release.
+        #
+        # Reuse the cache written by _publish_rust_crates.yml's `Extract
+        # versions and tags` step if present (propagated via $GITHUB_ENV).
+        # Saves ~8s per crate on a 36-crate workspace; across 4 crates in
+        # the chain, ~30s per release. Falls back to a fresh cargo metadata
+        # fork if the cache is missing, unreadable, or empty (-s), e.g. the
+        # composite is invoked from a workflow that doesn't set it up.
+        if [[ -s "${IGGY_CARGO_METADATA_FILE:-}" ]] && [[ -r "${IGGY_CARGO_METADATA_FILE}" ]]; then
+          META=$(cat "${IGGY_CARGO_METADATA_FILE}")
+        else
+          META=$(cargo metadata --format-version 1 --no-deps)
+        fi
+
+        if ! echo "$META" | jq -e --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg)' > /dev/null; then
           echo "❌ Package '$PACKAGE' not found in workspace"
           echo ""
           echo "Available packages:"
-          cargo metadata --format-version 1 | jq -r '.packages[].name' | sort
+          echo "$META" | jq -r '.packages[].name' | sort
           exit 1
         fi
 
-        CARGO_VERSION=$(cargo metadata --format-version 1 | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .version')
-        CARGO_PATH=$(cargo metadata --format-version 1 | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .manifest_path')
+        CARGO_VERSION=$(echo "$META" | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .version')
+        CARGO_PATH=$(echo "$META" | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .manifest_path')
 
         echo "Current Cargo.toml version: $CARGO_VERSION"
         echo "Target version: $VERSION"
         echo "Manifest path: $CARGO_PATH"
 
         if [ "$CARGO_VERSION" != "$VERSION" ]; then
-          echo "⚠️ Warning: Cargo.toml version ($CARGO_VERSION) doesn't match target version ($VERSION)"
-          echo "Make sure to update Cargo.toml before publishing"
+          echo "❌ Cargo.toml version ($CARGO_VERSION) doesn't match target version ($VERSION)"
+          echo ""
+          echo "cargo publish uses the Cargo.toml version, not the input, so a mismatch"
+          echo "would upload the WRONG version and then fail the downstream wait-for-crate"
+          echo "on the target version ~15 min later. Fail fast here instead."
+          echo ""
+          echo "Recovery:"
+          echo "  scripts/bump-version.sh $PACKAGE --set $VERSION"
+          echo "  git commit -am 'chore(release): bump $PACKAGE to $VERSION'"
+          exit 1
         fi
 
         echo ""
@@ -99,7 +98,6 @@ runs:
         cargo tree -p "$PACKAGE" --depth 1 | head -20
 
     - name: Build package
-      if: inputs.dry_run != 'true'
       env:
         PACKAGE: ${{ inputs.package }}
       shell: bash
@@ -112,7 +110,6 @@ runs:
         echo "✅ Package built successfully"
 
     - name: Verify package contents
-      if: inputs.dry_run != 'true'
       env:
         PACKAGE: ${{ inputs.package }}
       shell: bash
@@ -129,28 +126,47 @@ runs:
         cargo package -p "$PACKAGE" --list | wc -l
         echo "files would be included"
 
-    # Idempotency pre-check: ask the crates.io sparse index (same data the
-    # publish wait gate uses) whether this exact version is already live.
-    # If it is, we skip `cargo publish` cleanly instead of hard-failing on
-    # "crate version already uploaded", which is the failure mode that blocks
-    # reruns after a transient post-publish issue (e.g. tag push failure).
+    # Idempotency pre-check: ask the crates.io sparse index whether this
+    # exact version is already live. A success here is a warm-cache fast
+    # path that skips `cargo publish` entirely (used by reruns after a
+    # transient post-publish issue like a tag push failure). A failure
+    # here does NOT prove the crate is absent - the CDN can serve a
+    # stale 404 or 5xx - so a failure flips through to the publish path,
+    # which is guarded by the post-publish CAS verify below.
     #
-    # continue-on-error: true so an exit 1 ("not there") flips through to
-    # steps.already_published.outcome == 'failure' and gates the publish
-    # step below, instead of failing the job.
+    # max_attempts=5 with initial_sleep=1 closes the common cold-cache
+    # race (CDN not yet caught up from a prior successful publish: sleeps
+    # 1+2+4+8=15s worst case per crate). continue-on-error: true so an
+    # exit 1 surfaces as steps.already_published.outcome == 'failure'
+    # instead of failing the job.
     - name: Check if crate is already on crates.io
-      if: inputs.dry_run != 'true'
       id: already_published
       continue-on-error: true
       uses: ./.github/actions/utils/wait-for-crate
       with:
         package: ${{ inputs.package }}
         version: ${{ inputs.version }}
-        max_attempts: "1"
+        max_attempts: "5"
         initial_sleep_seconds: "1"
 
+    # Publish runs WITHOUT continue-on-error so any failure that is NOT the
+    # "already uploaded" class (invalid CARGO_REGISTRY_TOKEN, 401/403, 429
+    # rate limit, 5xx, Cargo.toml validation error, dependency resolution)
+    # surfaces loudly with its actual error instead of getting swallowed
+    # into a misleading "not on sparse index" CAS timeout ~3 min later.
+    #
+    # The only expected benign failure is the race where a prior run
+    # already uploaded this exact version between our pre-check and our
+    # publish attempt; cargo emits that as "crate version X.Y.Z is
+    # already uploaded", which the stderr-grep below classifies as a
+    # benign skip and translates into exit 0. The CAS verify step
+    # immediately below is then the authoritative state oracle: if the
+    # sparse index serves this version after the publish path ran, the
+    # crate is live regardless of whether THIS run uploaded it or an
+    # earlier one did.
     - name: Publish to crates.io
-      if: inputs.dry_run != 'true' && steps.already_published.outcome == 'failure'
+      if: steps.already_published.outcome == 'failure'
+      id: publish
       shell: bash
       env:
         CARGO_REGISTRY_TOKEN: ${{ env.CARGO_REGISTRY_TOKEN }}
@@ -167,14 +183,77 @@ runs:
         echo "📦 Publishing $PACKAGE v$VERSION to crates.io..."
         echo ""
 
-        cargo publish -p "$PACKAGE"
+        # Capture cargo publish stderr to a tempfile so we can classify the
+        # "already uploaded" benign class after the fact. A previous version
+        # used `2> >(tee ...)` to also stream stderr live to the job log, but
+        # bash does not wait on process-substitution children before the
+        # grep classifier runs, producing a measured 2-3% race where the
+        # classifier misses the benign-rerun signature. GitHub Actions step
+        # logs are line-buffered via the agent regardless, so `cat` after
+        # cargo exits gives the operator the same experience without the
+        # race.
+        publish_stderr="$(mktemp)"
+        trap 'rm -f "${publish_stderr}"' EXIT
+
+        rc=0
+        cargo publish -p "$PACKAGE" 2>"${publish_stderr}" || rc=$?
+        cat "${publish_stderr}" >&2
+
+        if [ "${rc}" -eq 0 ]; then
+          echo ""
+          echo "✅ cargo publish reports success"
+          echo "View on crates.io: https://crates.io/crates/$PACKAGE/$VERSION"
+          exit 0
+        fi
+
+        # Narrow benign class: race where a prior run already uploaded
+        # this exact version. cargo's message shape has changed across
+        # releases:
+        #   * cargo <1.75 (server-side error passed through):
+        #       error: failed to publish to registry at https://...
+        #       caused by: the remote server responded with an error
+        #       (status 200 OK): crate version `X.Y.Z` is already uploaded
+        #   * cargo ≥1.75 (local sparse-index pre-check):
+        #       error: crate <name>@<version> already exists on crates.io index
+        # We match both by disjunction: "is already uploaded" OR "already
+        # exists on ... index". Both substrings are specific to this
+        # class of failure and would not appear in token/network/5xx/
+        # validation errors.
+        if grep -qE "(is already uploaded|already exists on .*index)" "${publish_stderr}"; then
+          echo ""
+          echo "ℹ️ $PACKAGE v$VERSION is already uploaded (race with prior run)"
+          echo "   Continuing to CAS verify to confirm the crate is live on the sparse index."
+          exit 0
+        fi
 
         echo ""
-        echo "✅ Successfully published to crates.io"
-        echo "View on crates.io: https://crates.io/crates/$PACKAGE/$VERSION"
+        echo "❌ cargo publish failed with rc=${rc} and no 'already uploaded' signature"
+        echo "   The actual error is in the stderr above. Common causes:"
+        echo "     - invalid or expired CARGO_REGISTRY_TOKEN (401/403)"
+        echo "     - crates.io rate limit (429)"
+        echo "     - crates.io 5xx (transient, rerun should recover)"
+        echo "     - Cargo.toml validation error or dependency resolution failure"
+        exit "${rc}"
+
+    # CAS verify: authoritative post-publish state check. Runs when the
+    # pre-check fell through to the publish path AND the publish step
+    # exited 0 (a real upload, or the benign "already uploaded" class);
+    # `if:` carries an implicit success(), so a hard publish failure
+    # skips this step and fails the job directly. Success = crate is
+    # live on the sparse index, the same contract the top-level wait
+    # gates in _publish_rust_crates.yml use before tagging. Failure of
+    # this step fails the job (no continue-on-error).
+    - name: Verify crate landed on crates.io (CAS)
+      if: steps.already_published.outcome == 'failure'
+      uses: ./.github/actions/utils/wait-for-crate
+      with:
+        package: ${{ inputs.package }}
+        version: ${{ inputs.version }}
+        max_attempts: "10"
+        initial_sleep_seconds: "2"
 
     - name: Publish skipped (crate already on crates.io)
-      if: inputs.dry_run != 'true' && steps.already_published.outcome == 'success'
+      if: steps.already_published.outcome == 'success'
       shell: bash
       env:
         PACKAGE: ${{ inputs.package }}
diff --git a/.github/actions/utils/create-git-tag/action.yml b/.github/actions/utils/create-git-tag/action.yml
index cec8117bc8..7c9aea411e 100644
--- a/.github/actions/utils/create-git-tag/action.yml
+++ b/.github/actions/utils/create-git-tag/action.yml
@@ -57,19 +57,32 @@ runs:
           exit 1
         fi
 
-        # Reject inputs that could mangle git invocation. Tag and commit are
-        # both derived from trusted sources today (extract-version.sh outputs
-        # and pre-validated SHAs), but the composite has no caller context, so
-        # validate defensively.
-        #
-        # `+` is allowed in the tag alphabet because every tag_pattern in
-        # .github/config/publish.yml already permits the semver build
-        # metadata suffix `(?:\+[0-9A-Za-z.-]+)?`. Rejecting `+` here would
-        # hard-fail the entire chain after a successful publish the first
-        # time a release uses a `X.Y.Z+build.N` version - the exact rc1
-        # failure shape this PR is trying to eliminate.
-        if ! [[ "${TAG}" =~ ^[A-Za-z0-9._/+-]+$ ]]; then
-          echo "❌ create-git-tag: tag '${TAG}' contains characters outside [A-Za-z0-9._/+-]"
+        # Validate the tag name with two layers of defense:
+        #  1. Shell-option injection: reject anything not starting with
+        #     alphanumeric / `_` / `/` so the composite cannot be coerced
+        #     into parsing the tag as a git or shell short-option (a
+        #     leading `-` would be the classic attack shape). A leading
+        #     `.` is also rejected because git's own check_refname_format
+        #     would reject it later and we prefer a fast, actionable
+        #     failure here.
+        #  2. Git refname format: delegate to `git check-ref-format`, which
+        #     enforces the full refs/tags/ restrictions (no `..`, no
+        #     `.lock`, no trailing slash, no control chars, etc.).
+        #     Strictly stronger than a hand-rolled alphabet rule, and
+        #     stays in sync with git's own receive-pack check instead of
+        #     drifting from it. In particular this accepts every real
+        #     tag_pattern in .github/config/publish.yml, including the
+        #     semver build metadata suffix `X.Y.Z+build.N` that rc1
+        #     choked on, and the `foreign/go/v0.7.0` slash-containing
+        #     Go module tag shape.
+        if ! [[ "${TAG}" =~ ^[A-Za-z0-9_/] ]]; then
+          echo "❌ create-git-tag: tag '${TAG}' must start with [A-Za-z0-9_/]"
+          exit 1
+        fi
+        if ! git check-ref-format "refs/tags/${TAG}" 2>/dev/null; then
+          echo "❌ create-git-tag: tag '${TAG}' is not a valid git ref name"
+          echo "   git check-ref-format rejected it. See"
+          echo "   https://git-scm.com/docs/git-check-ref-format for the rules."
           exit 1
         fi
         if ! [[ "${COMMIT}" =~ ^[0-9a-f]{40}$ ]]; then
@@ -83,14 +96,16 @@ runs:
 
         # Ensure the commit object exists locally; required by `git tag -a`.
         # If the workflow used a shallow checkout, fetch just the one commit.
+        # GitHub enables allowReachableSHA1InWant=true, so single-commit
+        # fetches work even when the caller's checkout used fetch-depth:1.
         if ! git cat-file -e "${COMMIT}^{commit}" 2>/dev/null; then
           echo "ℹ️ Commit ${COMMIT} not in local clone, fetching..."
           if ! git fetch --no-tags --depth=1 origin "${COMMIT}" 2>/dev/null; then
             echo "❌ Failed to fetch commit ${COMMIT} from origin"
             echo "   Recovery:"
-            echo "     - verify the caller's checkout step uses fetch-depth: 0"
             echo "     - verify the commit still exists on origin (was it force-pushed away?)"
             echo "     - verify the commit is reachable from a branch on origin (not only from a PR ref)"
+            echo "     - if the caller's network restricts single-commit fetches, increase fetch-depth on the calling checkout step"
             exit 1
           fi
         fi
@@ -163,7 +178,12 @@ runs:
         # always 0.
         push_rc=0
         push_stderr_file="$(mktemp)"
-        trap 'rm -f "${push_stderr_file}"' EXIT
+        # Cleanup wrapped in a named function so future traps can append
+        # to it instead of overwriting (trap 'foo' EXIT replaces any
+        # earlier EXIT trap). No earlier EXIT trap exists today, so this
+        # is purely refactor-defensive.
+        _create_git_tag_cleanup() { rm -f "${push_stderr_file}"; }
+        trap _create_git_tag_cleanup EXIT
         git push origin "${TAG}" 2>"${push_stderr_file}" || push_rc=$?
         if [ "${push_rc}" -eq 0 ]; then
           echo "✅ Created and pushed tag: ${TAG}"
@@ -174,8 +194,12 @@ runs:
         echo "   push stderr:"
         sed 's/^/     /' "${push_stderr_file}"
 
-        REMOTE_RAW=$(git ls-remote --tags origin "refs/tags/${TAG}" | awk '{print $1}')
-        if [ -z "${REMOTE_RAW}" ]; then
+        # Use the same peeled-then-raw resolver as the early-skip and
+        # post-push branches so all three agree on what the tag points at.
+        # The previous inline `git ls-remote ... | awk` only read the raw
+        # line, which would miss an annotated-tag same-commit race.
+        REMOTE_SHA="$(remote_tag_commit)"
+        if [ -z "${REMOTE_SHA}" ]; then
           echo "❌ Push failed and tag ${TAG} is not on remote - propagating failure"
           echo "   The push stderr above should explain why (permission denied, protected"
           echo "   ref, missing upstream, etc.). If this is a token/permissions issue,"
@@ -183,13 +207,12 @@ runs:
           exit "${push_rc}"
         fi
 
-        TARGET_SHA="$(remote_tag_commit)"
-        if [ "${TARGET_SHA}" = "${COMMIT}" ]; then
+        if [ "${REMOTE_SHA}" = "${COMMIT}" ]; then
           echo "⏭️ Tag ${TAG} was created concurrently at the same commit, treating as skip"
           exit 0
         fi
 
-        echo "❌ Tag ${TAG} exists on remote at ${TARGET_SHA} but this run wanted ${COMMIT}"
+        echo "❌ Tag ${TAG} exists on remote at ${REMOTE_SHA} but this run wanted ${COMMIT}"
         echo "   This is the 'rc1 failure shape': the tag points at the wrong commit."
         echo "   Recovery (verify the intended release commit first):"
         echo "     - delete the wrong tag:   git push --delete origin ${TAG}"
diff --git a/.github/actions/utils/wait-for-crate/action.yml b/.github/actions/utils/wait-for-crate/action.yml
index 6a5d994d8d..c65695c6de 100644
--- a/.github/actions/utils/wait-for-crate/action.yml
+++ b/.github/actions/utils/wait-for-crate/action.yml
@@ -42,9 +42,13 @@ inputs:
     required: false
     default: "30"
   initial_sleep_seconds:
-    description: "Sleep between the first two attempts in seconds. Doubles each attempt, capped at 30."
+    description: "Sleep between the first two attempts in seconds. Doubles each attempt, capped at max_sleep_seconds."
     required: false
     default: "3"
+  max_sleep_seconds:
+    description: "Cap on the doubled back-off sleep, in seconds. The first sleep (initial_sleep_seconds) is used as-is and is not capped."
+    required: false
+    default: "30"
 
 runs:
   using: composite
@@ -56,6 +60,7 @@ runs:
         VERSION: ${{ inputs.version }}
         MAX_ATTEMPTS: ${{ inputs.max_attempts }}
         INITIAL_SLEEP_SECONDS: ${{ inputs.initial_sleep_seconds }}
+        MAX_SLEEP_SECONDS: ${{ inputs.max_sleep_seconds }}
       run: |
         set -euo pipefail
 
@@ -85,6 +90,10 @@ runs:
           echo "❌ wait-for-crate: initial_sleep_seconds '${INITIAL_SLEEP_SECONDS}' must be a non-negative integer"
           exit 1
         fi
+        if ! [[ "${MAX_SLEEP_SECONDS}" =~ ^[0-9]+$ ]] || [ "${MAX_SLEEP_SECONDS}" -lt 1 ]; then
+          echo "❌ wait-for-crate: max_sleep_seconds '${MAX_SLEEP_SECONDS}' must be a positive integer"
+          exit 1
+        fi
 
         # Compute the sparse-index prefix path from the leading characters
         # of the lowercased crate name. Layout is documented at
@@ -106,22 +115,50 @@ runs:
         echo "🎯 Target version:   ${VERSION}"
         echo ""
 
+        # Capture curl stderr AND body to tempfiles. Stderr preserves hard
+        # curl errors (DNS, TLS, connection refused, --max-time timeouts) so
+        # they can be surfaced on the final failure branch instead of being
+        # silenced by 2>/dev/null. Body goes to a file so we can distinguish
+        # "HTTP 200 but empty body" from "HTTP 4xx" from "network failure",
+        # and only parse JSON when the HTTP code says it's worth parsing.
+        curl_stderr="$(mktemp)"
+        body_file="$(mktemp)"
+        trap 'rm -f "${curl_stderr}" "${body_file}"' EXIT
+
         sleep_s="${INITIAL_SLEEP_SECONDS}"
+        http_code="000"
         for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do
-          # -f: fail on HTTP >= 400, so a 404 ("crate not yet on index")
-          #     surfaces as a non-zero exit with an empty body, which the
-          #     `|| true` swallows so the loop can keep going.
-          # -sS: quiet but still show hard curl errors on stderr (network
-          #     failure, DNS, TLS) so operators see them in the step log.
+          # -sS: silent but still show hard curl errors on stderr (network,
+          #     DNS, TLS, --max-time timeout) — captured to ${curl_stderr}.
           # -L: follow redirects (the CDN may redirect to a mirror).
-          body=$(curl -fsSL "${URL}" 2>/dev/null || true)
+          # -w '%{http_code}': capture the HTTP status code so we can tell
+          #     404 ("not yet on index") from 5xx ("transient") from 200
+          #     ("live") in the per-attempt log and the final-failure
+          #     branch. Symmetric with wait-for-url.
+          # --max-time 30: per-request wall-clock cap so a wedged TCP
+          #     connection does not burn the whole retry budget on one
+          #     attempt; without it a dead connection can hang silently
+          #     for minutes. Symmetric with wait-for-url.
+          # No -f: we inspect the HTTP code ourselves, so 4xx surfaces as
+          #     a code we can log instead of an empty body + non-zero exit.
+          # `||` sits OUTSIDE $(...): curl prints -w even when it fails,
+          #     so an inner `|| echo "000"` would log a bogus "000000".
+          : >"${curl_stderr}"
+          : >"${body_file}"
+          http_code=$(curl -sSL -o "${body_file}" -w '%{http_code}' \
+                        --max-time 30 "${URL}" 2>"${curl_stderr}") || http_code="000"
 
           # The sparse-index body is newline-delimited JSON: one record
-          # per published version. `jq -r '.vers'` emits one version per
-          # line and `grep -Fxq` does an exact literal full-line match,
-          # so `0.10.0+build.5` is matched as itself, not as a regex
-          # where `.` and `+` would mean something else.
-          if [ -n "${body}" ] && echo "${body}" | jq -r '.vers' 2>/dev/null | grep -Fxq "${VERSION}"; then
+          # per published version. `jq -R 'fromjson?'` reads each line as
+          # a raw string and silently drops lines that fail to parse, so
+          # a malformed or partially-truncated response (e.g., CDN
+          # serving an HTML error page or an incomplete body) does not
+          # kill the whole pipeline under `set -o pipefail`. `grep -Fxq`
+          # does an exact literal full-line match, so `0.10.0+build.5`
+          # is matched as itself, not as a regex where `.` and `+` would
+          # mean something else.
+          if [ "${http_code}" = "200" ] && [ -s "${body_file}" ] \
+             && jq -Rr 'fromjson? | .vers // empty' "${body_file}" 2>/dev/null | grep -Fxq "${VERSION}"; then
             echo "✅ ${PACKAGE} v${VERSION} is on the sparse index"
             exit 0
           fi
@@ -129,11 +166,11 @@ runs:
           if [ "${attempt}" -eq "${MAX_ATTEMPTS}" ]; then
             break
           fi
-          echo "⏳ ${PACKAGE} v${VERSION} not yet visible (attempt ${attempt}/${MAX_ATTEMPTS}, sleep ${sleep_s}s)"
+          echo "⏳ HTTP ${http_code} - ${PACKAGE} v${VERSION} not yet visible (attempt ${attempt}/${MAX_ATTEMPTS}, sleep ${sleep_s}s)"
           sleep "${sleep_s}"
           sleep_s=$(( sleep_s * 2 ))
-          if [ "${sleep_s}" -gt 30 ]; then
-            sleep_s=30
+          if [ "${sleep_s}" -gt "${MAX_SLEEP_SECONDS}" ]; then
+            sleep_s="${MAX_SLEEP_SECONDS}"
           fi
         done
 
@@ -142,6 +179,11 @@ runs:
         #   * max_attempts>1 → "never appeared inside the budget"
         echo "❌ ${PACKAGE} v${VERSION} is not on the sparse index after ${MAX_ATTEMPTS} attempt(s)"
         echo "   URL: ${URL}"
+        echo "   Last HTTP code: ${http_code}"
+        if [ -s "${curl_stderr}" ]; then
+          echo "   last curl stderr:"
+          sed 's/^/     /' "${curl_stderr}"
+        fi
         echo "   Common causes:"
         echo "     - cargo publish did not actually land the upload (inspect the preceding publish step)"
         echo "     - the crates.io sparse-index CDN is lagging (usually seconds, rarely >1 min)"
diff --git a/.github/actions/utils/wait-for-url/action.yml b/.github/actions/utils/wait-for-url/action.yml
index dce8af29a7..bb7977c492 100644
--- a/.github/actions/utils/wait-for-url/action.yml
+++ b/.github/actions/utils/wait-for-url/action.yml
@@ -92,16 +92,27 @@ runs:
         echo "📊 Budget: up to ${MAX_ATTEMPTS} attempts"
         echo ""
 
+        # Capture curl stderr to a tempfile so hard errors (DNS, TLS,
+        # connection refused, --max-time timeouts) are preserved across
+        # retries and surfaced on the final failure branch. The previous
+        # code redirected stderr to /dev/null, which contradicted the -sS
+        # flag's intent of keeping hard errors visible.
+        curl_stderr="$(mktemp)"
+        trap 'rm -f "${curl_stderr}"' EXIT
+
         sleep_s="${INITIAL_SLEEP_SECONDS}"
         http_code="000"
         for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do
-          # -sS: silent but still report hard errors. -L: follow redirects
-          # (registries often sit behind a CDN that issues a 301/302).
+          # -sS: silent but still report hard errors on stderr (captured
+          # to ${curl_stderr} for the final-failure dump). -L: follow CDN
+          # 301/302 redirects. `|| http_code=` sits outside $(...): curl
+          # prints -w even on failure, so an inner `|| echo` logs "000000".
           # No -f: we want the HTTP code on 4xx so we can distinguish 404
           # ("not yet there") from 5xx ("transient") in the log.
           # --max-time: cap per-request so a dead TCP connection does not
           # burn the whole budget on one attempt.
-          http_code=$(curl -sSL -o /dev/null -w '%{http_code}' --max-time 30 "${URL}" 2>/dev/null || echo "000")
+          : >"${curl_stderr}"
+          http_code=$(curl -sSL -o /dev/null -w '%{http_code}' --max-time 30 "${URL}" 2>"${curl_stderr}") || http_code="000"
 
           if [ "${http_code}" = "200" ]; then
             echo "✅ ${DESCRIPTION} is available (HTTP 200 from ${URL})"
@@ -122,6 +133,10 @@ runs:
         echo "❌ Timed out waiting for ${DESCRIPTION} after ${MAX_ATTEMPTS} attempts"
         echo "   URL: ${URL}"
         echo "   Last HTTP code: ${http_code}"
+        if [ -s "${curl_stderr}" ]; then
+          echo "   last curl stderr:"
+          sed 's/^/     /' "${curl_stderr}"
+        fi
         echo "   Common causes:"
         echo "     - registry propagation is slow (Maven Central especially can take >10 minutes)"
         echo "     - the publish step did not actually land the artifact (inspect the preceding publish step)"
diff --git a/.github/workflows/_publish_rust_crates.yml b/.github/workflows/_publish_rust_crates.yml
index 613ab05b44..c962c131cc 100644
--- a/.github/workflows/_publish_rust_crates.yml
+++ b/.github/workflows/_publish_rust_crates.yml
@@ -30,14 +30,22 @@ on:
         description: "Dry run mode - validate without publishing"
       commit:
         type: string
-        required: false
-        default: ""
-        description: "Specific commit to checkout (defaults to github.sha)"
+        required: true
+        description: |
+          Full 40-char commit SHA to publish from. The caller is responsible
+          for resolving and validating this (including master-ancestry) before
+          invoking. This workflow re-verifies master-ancestry defensively in
+          the `Resolve commit` step below.
       use_latest_ci:
         type: boolean
         required: false
         default: false
         description: "Use latest CI configuration from master branch"
+      skip_tag_creation:
+        type: boolean
+        required: false
+        default: false
+        description: "Skip git tag creation after successful publish (useful for re-publishing)"
     secrets:
       CARGO_REGISTRY_TOKEN:
         required: true
@@ -52,6 +60,19 @@ permissions:
 env:
   IGGY_CI_BUILD: true
 
+# Child-level concurrency group as defense-in-depth. The in-tree caller
+# (publish.yml) already holds `publish-release` via its own concurrency
+# block, and reusable workflow runs are nested under the parent's group,
+# so this is a no-op for the current topology. But `workflow_call` is a
+# publicly-reachable entry point: any future direct caller (auto-publish
+# runbook, vendored fork, parallel workflow) would bypass the parent
+# group and race on the inline tag steps below. A distinct group name
+# (`publish-release-rust` vs parent `publish-release`) avoids self-block
+# when this workflow is called from publish.yml.
+concurrency:
+  group: publish-release-rust
+  cancel-in-progress: false
+
 jobs:
   publish:
     name: Publish Rust crates
@@ -72,7 +93,13 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
         with:
-          ref: ${{ inputs.commit || github.sha }}
+          # No `|| github.sha` fallback: `Resolve commit` below requires an
+          # explicit, non-empty `inputs.commit` so the downstream tag step
+          # points at the exact reviewed commit; a silent fall back to
+          # github.sha would bypass publish.yml's master-ancestry check for
+          # direct workflow_call callers. NOTE: an empty `ref` does not fail
+          # actions/checkout (it checks out the triggering ref); the hard-fail
+          ref: ${{ inputs.commit }}
           fetch-depth: 0
 
       - name: Save and apply latest CI from master
@@ -118,14 +145,49 @@ jobs:
             echo "❌ Could not resolve commit '${INPUT_COMMIT}' to a full SHA"
             exit 1
           fi
+
+          # Defensive re-check of the master-ancestry invariant that publish.yml's
+          # validate job enforces. This workflow is a publicly-reachable
+          # workflow_call entry point; a direct caller (runbook, vendored fork)
+          # that bypasses publish.yml would otherwise skip the ancestry check
+          # entirely. Re-running it here is cheap (one shallow fetch + one
+          # merge-base call) and closes the I2 invariant hole.
+          git fetch origin master --depth=1000 --quiet
+          if ! git merge-base --is-ancestor "$FULL_SHA" origin/master; then
+            echo "❌ Commit $FULL_SHA is not an ancestor of origin/master"
+            echo ""
+            echo "_publish_rust_crates.yml refuses to publish from a non-master commit."
+            echo "If you reached this error via a direct workflow_call, route through"
+            echo "publish.yml instead (which does the same check in its validate job)"
+            echo "or rebase your branch onto master before invoking."
+            exit 1
+          fi
+
           echo "commit=$FULL_SHA" >> "$GITHUB_OUTPUT"
-          echo "✅ Resolved commit: $FULL_SHA"
+          echo "✅ Resolved commit: $FULL_SHA (on origin/master)"
 
       - name: Extract versions and tags
         id: versions
+        shell: bash
         run: |
+          set -euo pipefail
           chmod +x scripts/extract-version.sh
 
+          # Cache cargo metadata once across ALL extract-version.sh and
+          # composite post-merge.yml invocations in this job. The 8
+          # extract-version calls in this step plus the 4 per-crate
+          # composite `Validate package` steps all hit the same workspace
+          # snapshot, and cargo metadata on this 36-crate workspace costs
+          # ~8s cold per fork. File-based rather than env-var inlined
+          # because the JSON is ~220 KB, exceeding Linux MAX_ARG_STRLEN
+          # (128 KB). Propagated via $GITHUB_ENV so it also reaches the
+          # downstream post-merge composite steps; a plain `export` would
+          # only affect THIS step's shell.
+          IGGY_CARGO_METADATA_FILE="${RUNNER_TEMP:-/tmp}/iggy-cargo-metadata.json"
+          cargo metadata --format-version 1 --no-deps > "${IGGY_CARGO_METADATA_FILE}"
+          echo "IGGY_CARGO_METADATA_FILE=${IGGY_CARGO_METADATA_FILE}" >> "${GITHUB_ENV}"
+          export IGGY_CARGO_METADATA_FILE  # also make it available to this step's extract-version calls
+
           common=$(scripts/extract-version.sh rust-common)
           protocol=$(scripts/extract-version.sh rust-binary-protocol)
           sdk=$(scripts/extract-version.sh rust-sdk)
@@ -169,6 +231,31 @@ jobs:
       # the upload landed. If the wait times out, the tag is never pushed and
       # a rerun can safely re-enter the chain: the publish step is idempotent
       # via a sparse-index pre-check inside actions/rust/post-merge.
+      #
+      # Two-layer wait (defense-in-depth budget cascading - NOT independent
+      # signals; both layers poll the same sparse index URL):
+      #
+      #   * Inner CAS verify inside actions/rust/post-merge on a tight
+      #     ~8 min worst-case budget (max_attempts=10, initial_sleep=2,
+      #     cap=30; sleeps 2+4+8+16+30*5=180s + 10 * 30s curl = ~480s).
+      #     Runs only when the idempotency pre-check fell through to the
+      #     publish path, and acts as an authoritative per-step state
+      #     check: "did my cargo publish land on the sparse index in time
+      #     to continue cleanly?"
+      #   * Outer wait-for-crate below on a larger ~28 min worst-case
+      #     budget (default max_attempts=30, initial_sleep=3, cap=30;
+      #     sleeps 3+6+12+24+30*25=795s + 30 * 30s curl = ~1695s). Runs
+      #     on every crate in the chain and acts as the operational
+      #     safety net before the NEXT downstream crate's `cargo publish`
+      #     resolves its path deps from crates.io. Separate budget so a
+      #     slow CDN does not starve the tight inner check.
+      #
+      # 4-crate chain aggregate worst case: ~144 min (4 * (8+28)). In
+      # practice the happy path short-circuits on the first 200 response
+      # and total wait per crate is single-digit seconds. Chain halts on
+      # the first failing wait, so the 144 min figure is a ceiling, not
+      # an expected duration. Operators should only intervene after ~60
+      # min of sustained CDN 404s on a single crate.
 
       # Step 1: Publish iggy_binary_protocol (depends on nothing in-tree)
       - name: Publish iggy_binary_protocol
@@ -186,7 +273,7 @@ jobs:
           version: ${{ steps.versions.outputs.protocol }}
 
       - name: Tag iggy_binary_protocol
-        if: inputs.dry_run == false && contains(inputs.crates, 'rust-binary-protocol')
+        if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-binary-protocol')
         uses: ./.github/actions/utils/create-git-tag
         with:
           tag: ${{ steps.versions.outputs.protocol_tag }}
@@ -215,7 +302,7 @@ jobs:
           version: ${{ steps.versions.outputs.common }}
 
       - name: Tag iggy_common
-        if: inputs.dry_run == false && contains(inputs.crates, 'rust-common')
+        if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-common')
         uses: ./.github/actions/utils/create-git-tag
         with:
           tag: ${{ steps.versions.outputs.common_tag }}
@@ -244,7 +331,7 @@ jobs:
           version: ${{ steps.versions.outputs.sdk }}
 
       - name: Tag iggy SDK
-        if: inputs.dry_run == false && contains(inputs.crates, 'rust-sdk')
+        if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-sdk')
         uses: ./.github/actions/utils/create-git-tag
         with:
           tag: ${{ steps.versions.outputs.sdk_tag }}
@@ -273,7 +360,7 @@ jobs:
           version: ${{ steps.versions.outputs.cli }}
 
       - name: Tag iggy-cli
-        if: inputs.dry_run == false && contains(inputs.crates, 'rust-cli')
+        if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-cli')
         uses: ./.github/actions/utils/create-git-tag
         with:
           tag: ${{ steps.versions.outputs.cli_tag }}
diff --git a/.github/workflows/post-merge.yml b/.github/workflows/post-merge.yml
index 9f49334f01..bc085ecbbb 100644
--- a/.github/workflows/post-merge.yml
+++ b/.github/workflows/post-merge.yml
@@ -79,7 +79,7 @@ jobs:
 
             echo "Checking $crate: version=$VERSION, tag=$TAG"
 
-            if [[ ! "$VERSION" =~ -(edge|rc) ]]; then
+            if [ "$(scripts/extract-version.sh "$crate" --is-pre-release)" != "true" ]; then
               echo "  ⏭️ Stable version - skipping"
               continue
             fi
@@ -117,7 +117,7 @@ jobs:
               continue
             fi
 
-            if [[ ! "$VERSION" =~ -(edge|rc) ]] && [[ ! "$VERSION" =~ (\.dev|rc)[0-9]+$ ]]; then
+            if [ "$(scripts/extract-version.sh "$sdk" --is-pre-release)" != "true" ]; then
               echo "  ⏭️ Stable version - skipping"
               continue
             fi
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 766ab892cd..4302e0bd7d 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -140,6 +140,17 @@ jobs:
         with:
           fetch-depth: 0
 
+      # Detect whether we are invoked via workflow_call (reusable) or
+      # workflow_dispatch (manual). Trust chain: when called via
+      # workflow_call, the caller is responsible for having validated its
+      # own trigger — the only in-tree caller today is
+      # .github/workflows/post-merge.yml, which is branch-filtered to
+      # master, so the SHA passed in is by construction a commit on
+      # master. Direct workflow_dispatch callers have no such guarantee
+      # and must pass through the master-ancestry check in Resolve
+      # commit below. Any future direct workflow_call caller added to
+      # this repo must preserve the master-ancestry property or the
+      # branch gate below will not apply to them.
       - name: Detect trigger type
         id: detect
         run: |
@@ -419,7 +430,9 @@ jobs:
         uses: actions/checkout@v4
         with:
           ref: ${{ needs.validate.outputs.commit }}
-          fetch-depth: 0
+          # check-tags only runs `git ls-remote --tags` against origin,
+          # which does not need local history. fetch-depth:1 is enough.
+          fetch-depth: 1
 
       - name: Save and apply latest CI from master
         if: inputs.use_latest_ci
@@ -462,11 +475,16 @@ jobs:
 
           EXISTING_TAGS=()
           NEW_TAGS=()
+          WRONG_TARGET_TAGS=()  # populated when a tag exists at a commit != this run's target
 
           echo "| Component | Version | Tag | Status |" >> $GITHUB_STEP_SUMMARY
           echo "|-----------|---------|-----|--------|" >> $GITHUB_STEP_SUMMARY
 
-          echo "$TARGETS_JSON" | jq -r '.include[] | select(.key!="noop") | @base64' | while read -r row; do
+          # Process substitution (not pipe) so EXISTING_TAGS / NEW_TAGS /
+          # WRONG_TARGET_TAGS mutations survive the loop: `jq | while ...`
+          # puts the loop body in a pipe subshell and discards the arrays on
+          # exit, which silently killed the aggregate summary block below.
+          while IFS= read -r row; do
             _jq() { echo "$row" | base64 -d | jq -r "$1"; }
 
             KEY=$(_jq '.key')
@@ -485,9 +503,17 @@ jobs:
             VERSION=$(scripts/extract-version.sh "$KEY" 2>/dev/null || echo "ERROR")
             TAG=$(scripts/extract-version.sh "$KEY" --tag 2>/dev/null || echo "ERROR")
 
-            if [[ "$VERSION" =~ -SNAPSHOT$ ]]; then
-              echo "ℹ️ $NAME: SNAPSHOT version, no tag will be created"
-              echo "| $NAME | $VERSION | _(none)_ | ℹ️ SNAPSHOT (no tag) |" >> $GITHUB_STEP_SUMMARY
+            # Consult --should-tag as the single source of truth for
+            # taggability. It handles SNAPSHOT and "no tag_pattern" in
+            # one place (extract-version.sh:348-360), which keeps this
+            # check in sync with the SDK matrix's own should_tag gate.
+            # A previous inline `[[ $VERSION =~ -SNAPSHOT$ ]]` check
+            # duplicated the rule and would silently drift when a new
+            # SDK added another SNAPSHOT-style pre-release marker.
+            SHOULD_TAG=$(scripts/extract-version.sh "$KEY" --should-tag 2>/dev/null || echo "false")
+            if [ "$SHOULD_TAG" = "false" ]; then
+              echo "ℹ️ $NAME: no tag will be created (SNAPSHOT or no tag_pattern)"
+              echo "| $NAME | $VERSION | _(none)_ | ℹ️ No tag (SNAPSHOT / no pattern) |" >> $GITHUB_STEP_SUMMARY
               continue
             fi
 
@@ -521,14 +547,41 @@ jobs:
               REMOTE_RAW=$(echo "$REMOTE_LINE" | awk '{print $1}')
               EXISTING_SHA="${REMOTE_PEELED:-${REMOTE_RAW}}"
               SHORT_SHA=$(echo "$EXISTING_SHA" | head -c 8)
-              echo "⚠️  Tag exists on remote: $TAG (points to $SHORT_SHA)"
-              echo "| $NAME | $VERSION | $TAG | ⚠️ Exists at $SHORT_SHA |" >> $GITHUB_STEP_SUMMARY
+
+              # Fail-fast on wrong-target. A wrong-target tag means
+              # create-git-tag would hard-fail 20-40 minutes later after
+              # publishing artifacts to crates.io / PyPI / npm / Maven /
+              # NuGet / DockerHub. Catching it at check-tags converts that
+              # into a fast, cheap failure at the top of the run.
+              # Same-target is still benign (rerun convergence).
+              #
+              # skip_tag_creation=true accepts the invariant hole (the
+              # operator opted out of tag writes) but the wrong-target
+              # state is still rendered loudly under a DISTINCT cell label,
+              # so an operator cannot mistake it for a benign same-target
+              # skip. Operators must reconcile the tag/registry divergence
+              # manually or run a follow-up without skip_tag_creation to
+              # converge. The fail-fast exit 1 below is gated on
+              # skip_tag_creation != true.
+              if [ "$EXISTING_SHA" != "${{ needs.validate.outputs.commit }}" ]; then
+                WRONG_TARGET_TAGS+=("$TAG|$SHORT_SHA")
+                if [ "${{ inputs.skip_tag_creation }}" = "true" ]; then
+                  echo "⚠️  Tag exists on remote at WRONG target: $TAG (points to $SHORT_SHA, not enforced: skip_tag_creation=true)"
+                  echo "| $NAME | $VERSION | $TAG | ⚠️ Wrong target at $SHORT_SHA (NOT enforced) |" >> $GITHUB_STEP_SUMMARY
+                else
+                  echo "❌ Tag exists on remote at wrong target: $TAG (points to $SHORT_SHA)"
+                  echo "| $NAME | $VERSION | $TAG | ❌ Wrong target at $SHORT_SHA |" >> $GITHUB_STEP_SUMMARY
+                fi
+              else
+                echo "⚠️  Tag exists on remote at same target: $TAG (points to $SHORT_SHA)"
+                echo "| $NAME | $VERSION | $TAG | ⚠️ Exists at $SHORT_SHA (benign) |" >> $GITHUB_STEP_SUMMARY
+              fi
             else
               NEW_TAGS+=("$TAG")
               echo "✅ Tag will be created: $TAG"
               echo "| $NAME | $VERSION | $TAG | ✅ Will create |" >> $GITHUB_STEP_SUMMARY
             fi
-          done
+          done < <(echo "$TARGETS_JSON" | jq -r '.include[] | select(.key!="noop") | @base64')
 
           echo "" >> $GITHUB_STEP_SUMMARY
 
@@ -537,7 +590,7 @@ jobs:
             echo "" >> $GITHUB_STEP_SUMMARY
             echo "### ⚠️ Warning: Existing Tags Detected" >> $GITHUB_STEP_SUMMARY
             echo "" >> $GITHUB_STEP_SUMMARY
-            echo "The following tags already exist and will be skipped:" >> $GITHUB_STEP_SUMMARY
+            echo "The following tags already exist on the remote:" >> $GITHUB_STEP_SUMMARY
             for tag in "${EXISTING_TAGS[@]}"; do
               echo "- $tag" >> $GITHUB_STEP_SUMMARY
             done
@@ -546,23 +599,26 @@ jobs:
             if [ "${{ inputs.dry_run }}" = "false" ]; then
               if [ "${{ inputs.skip_tag_creation }}" = "true" ]; then
                 echo "**Note:** Tag creation is disabled for this run." >> $GITHUB_STEP_SUMMARY
-                echo "Components will be published/republished without updating git tags." >> $GITHUB_STEP_SUMMARY
+                echo "Components will be (re)published, but no git tags will be pushed." >> $GITHUB_STEP_SUMMARY
               else
-                echo "**These components will NOT be republished.** Tags are immutable in git." >> $GITHUB_STEP_SUMMARY
+                echo "**Tag behavior with \`create-git-tag\` (SHA-match invariant):**" >> $GITHUB_STEP_SUMMARY
                 echo "" >> $GITHUB_STEP_SUMMARY
-                echo "If you need to republish:" >> $GITHUB_STEP_SUMMARY
-                echo "1. Delete the existing tag: \`git push --delete origin <tag>\`" >> $GITHUB_STEP_SUMMARY
-                echo "2. Bump the version in the source file" >> $GITHUB_STEP_SUMMARY
-                echo "3. Run the publish workflow again" >> $GITHUB_STEP_SUMMARY
+                echo "- If a pre-existing tag points at the SAME commit this run is publishing, the tag step is a no-op (benign skip). The artifact is (re)published; the tag stays." >> $GITHUB_STEP_SUMMARY
+                echo "- If a pre-existing tag points at a DIFFERENT commit, this check-tags step fails fast after rendering the summary, so downstream publish jobs do not run and no tag is pushed." >> $GITHUB_STEP_SUMMARY
                 echo "" >> $GITHUB_STEP_SUMMARY
-                echo "Alternatively, use \`skip_tag_creation: true\` to republish without tags." >> $GITHUB_STEP_SUMMARY
+                echo "To recover from a wrong-target tag:" >> $GITHUB_STEP_SUMMARY
+                echo "1. Verify the intended release commit." >> $GITHUB_STEP_SUMMARY
+                echo "2. Delete the existing tag on origin: \`git push --delete origin <tag>\`" >> $GITHUB_STEP_SUMMARY
+                echo "3. Rerun this workflow." >> $GITHUB_STEP_SUMMARY
+                echo "" >> $GITHUB_STEP_SUMMARY
+                echo "To republish without touching tags, set \`skip_tag_creation: true\` on the workflow dispatch." >> $GITHUB_STEP_SUMMARY
               fi
             fi
           fi
 
           if [ ${#NEW_TAGS[@]} -eq 0 ] && [ ${#EXISTING_TAGS[@]} -gt 0 ]; then
             echo "### ℹ️ No New Tags to Create" >> $GITHUB_STEP_SUMMARY
-            echo "All specified components have already been tagged. Consider bumping versions if you need to publish new releases." >> $GITHUB_STEP_SUMMARY
+            echo "All specified components are already tagged at some commit. If tags match this run's commit, the rerun converges cleanly. If not, see the wrong-target recovery above or bump versions." >> $GITHUB_STEP_SUMMARY
           elif [ ${#NEW_TAGS[@]} -gt 0 ]; then
             if [ "${{ inputs.skip_tag_creation }}" = "true" ]; then
               echo "### ℹ️ Tags That Would Be Created (Skipped)" >> $GITHUB_STEP_SUMMARY
@@ -577,6 +633,42 @@ jobs:
             done
           fi
 
+          # Fail-fast on wrong-target tags. If any tag_pattern resolved to a
+          # name that already exists on origin at a commit different from
+          # this run's target, create-git-tag would hard-fail at tag push
+          # time anyway - but only after spending 20-40 min publishing
+          # artifacts. Catching it here converts that waste into a fast
+          # diagnostic at the top of the run. Kept as the LAST thing this
+          # step does so the operator summary block above is already
+          # populated before we exit.
+          if [ "${#WRONG_TARGET_TAGS[@]}" -gt 0 ]; then
+            {
+              echo ""
+              echo "### ❌ Wrong-target tags detected"
+              echo ""
+              echo "One or more tags already exist on the remote at a commit DIFFERENT from this run's target (\`${{ needs.validate.outputs.commit }}\`):"
+              echo ""
+              for entry in "${WRONG_TARGET_TAGS[@]}"; do
+                tag="${entry%|*}"
+                sha="${entry#*|}"
+                echo "- \`$tag\` currently points at \`$sha\`"
+              done
+              echo ""
+              echo "Failing fast to save 20-40 minutes of wasted publishing work; \`create-git-tag\` would hard-fail at tag push time anyway."
+              echo ""
+              echo "Recovery (verify the intended release commit first):"
+              echo "1. Delete the wrong tag(s) on origin: \`git push --delete origin <tag>\`"
+              echo "2. Or bump the version(s) and rerun the workflow"
+              echo "3. Or rerun with \`skip_tag_creation: true\` to republish artifacts only"
+            } >> $GITHUB_STEP_SUMMARY
+            echo "❌ Wrong-target tags detected: ${WRONG_TARGET_TAGS[*]}"
+            echo "See the Wrong-target tags block in the step summary for recovery steps."
+            if [ "${{ inputs.skip_tag_creation }}" != "true" ]; then
+              exit 1
+            fi
+            echo "ℹ️  skip_tag_creation=true, continuing despite wrong-target tags (operator opt-out)."
+          fi
+
   build-python-wheels:
     name: Build Python wheels
     needs: [validate, plan, check-tags]
@@ -602,6 +694,7 @@ jobs:
       dry_run: ${{ inputs.dry_run }}
       commit: ${{ needs.validate.outputs.commit }}
       use_latest_ci: ${{ inputs.use_latest_ci }}
+      skip_tag_creation: ${{ inputs.skip_tag_creation }}
     secrets:
       CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
 
@@ -731,9 +824,12 @@ jobs:
       - uses: actions/checkout@v4
         with:
           ref: ${{ needs.validate.outputs.commit }}
-          # Full history so create-git-tag can `git tag -a <commit>` against
-          # any historical SHA the operator passed in, not just HEAD.
-          fetch-depth: 0
+          # create-git-tag's shallow-safe fallback at action.yml:86-96
+          # will `git fetch --no-tags --depth=1 origin <commit>` if the
+          # commit is not in the local clone. GitHub allows single-commit
+          # fetches via allowReachableSHA1InWant=true, so fetch-depth:1
+          # here is enough - no need to pay for full history.
+          fetch-depth: 1
 
       - name: Ensure version extractor is executable
         run: |
@@ -763,7 +859,7 @@ jobs:
           SHOULD_TAG=$(scripts/extract-version.sh "$MATRIX_KEY" --should-tag)
           if [ "$SHOULD_TAG" = "true" ] \
              && [ "$CREATE_EDGE_DOCKER_TAG" = "true" ] \
-             && [[ ! "$VERSION" =~ -(edge|rc) ]]; then
+             && [ "$(scripts/extract-version.sh "$MATRIX_KEY" --is-pre-release)" != "true" ]; then
             SHOULD_TAG=false
           fi
 
@@ -933,7 +1029,9 @@ jobs:
         uses: actions/checkout@v4
         with:
           ref: ${{ needs.validate.outputs.commit }}
-          fetch-depth: 0
+          # create-git-tag falls back to a shallow fetch when the commit
+          # is missing locally, so fetch-depth:1 is sufficient.
+          fetch-depth: 1
 
       - name: Save and apply latest CI from master
         if: inputs.use_latest_ci
@@ -953,8 +1051,13 @@ jobs:
         run: |
           test -x scripts/extract-version.sh || chmod +x scripts/extract-version.sh
 
+      # matrix.type == 'rust' is dead: Rust crates are routed to the
+      # dedicated publish-rust-crates reusable workflow and never reach
+      # this SDK matrix (see plan job's hasRustCrates split at L349-354).
+      # Python uses maturin which depends on Rust, so python rows still
+      # need the toolchain.
       - name: Setup Rust toolchain (if needed)
-        if: matrix.type == 'rust' || matrix.type == 'python'
+        if: matrix.type == 'python'
         uses: ./.github/actions/utils/setup-rust-with-cache
 
       - name: Debug matrix
@@ -966,6 +1069,7 @@ jobs:
         env:
           MATRIX_KEY: ${{ matrix.key }}
           MATRIX_TAG_PATTERN: ${{ matrix.tag_pattern }}
+          CREATE_EDGE_DOCKER_TAG: ${{ inputs.create_edge_docker_tag }}
         run: |
           set -euo pipefail
           VERSION=$(scripts/extract-version.sh "$MATRIX_KEY")
@@ -977,6 +1081,22 @@ jobs:
           fi
           # Single source of truth for the SNAPSHOT/no-tag-pattern skip rule.
           SHOULD_TAG=$(scripts/extract-version.sh "$MATRIX_KEY" --should-tag)
+
+          # Symmetric auto-publish stable-skip override matching
+          # docker-manifests at L786-791. In auto-publish mode
+          # (create_edge_docker_tag=true), stable versions never get
+          # versioned git tags, only the rolling :edge Docker tag. Latent
+          # today because post-merge.yml filters stable SDK versions out
+          # of publish_other before calling publish.yml, but the symmetry
+          # with docker-manifests protects against future auto-publish
+          # callers that do not pre-filter and keeps the two matrices
+          # aligned on taggability rules.
+          if [ "$SHOULD_TAG" = "true" ] \
+             && [ "$CREATE_EDGE_DOCKER_TAG" = "true" ] \
+             && [ "$(scripts/extract-version.sh "$MATRIX_KEY" --is-pre-release)" != "true" ]; then
+            SHOULD_TAG=false
+          fi
+
           {
             echo "version=$VERSION"
             echo "tag=$TAG"
@@ -1071,16 +1191,51 @@ jobs:
           max_attempts: "15"
           initial_sleep_seconds: "3"
 
-      # Maven Central propagation via the Central Portal is usually minutes
-      # but has a long tail; budget is ~25 minutes of real wall time (cap 30s
-      # sleep × ~50 attempts). `iggy-<version>.pom` is the lightest per-version
-      # URL that only appears once the artifact is fully indexed.
+      # Java publishes to ASF Nexus staging via `./gradlew publish`
+      # (repository.apache.org/service/local/staging/deploy/maven2). The
+      # staging -> Maven Central handoff requires a Nexus Close+Release
+      # action which in the Apache governance model is operator-driven
+      # (often behind a dev@ release vote). No in-tree automation performs
+      # that handoff today. wait-for-url here polls the downstream Maven
+      # Central mirror (repo1.maven.org), which will not serve the
+      # artifact until Close+Release completes.
+      #
+      # CRITICAL DESIGN CARVE-OUT — DO NOT REMOVE `continue-on-error: true`
+      # BELOW without reading this:
+      #
+      # This wait step is the ONLY wait gate in the publish chain that uses
+      # `continue-on-error: true`, and the `Tag SDK release` step further
+      # down uses `if: success()`. GitHub Actions step semantics:
+      #   continue-on-error=true on a failing step
+      #     -> step outcome    = failure
+      #     -> step conclusion = success
+      #     -> downstream `if: success()` STILL evaluates true
+      # So when Maven Central has not caught up within the ~24-minute
+      # budget (Central propagation is frequently slow, rarely >45 min),
+      # the Java git tag is STILL pushed after `gradle publish` returned
+      # 0 at the staging step above. This matches pre-PR behavior (the
+      # Java tag was always pushed after `gradle publish` returned 0)
+      # and is INTENTIONAL: without the carve-out, every Java release
+      # would time out at this wait and the operator would have to push
+      # the tag manually — a worse UX than the current best-effort
+      # shape, and still subject to the same invariant hole anyway.
+      #
+      # Follow-up to remove the carve-out: automate staging -> Central
+      # via `io.github.gradle-nexus.publish-plugin` with
+      # `closeAndReleaseStagingRepositories`, then drop the
+      # continue-on-error below and the Java path will match the
+      # PyPI/npm/NuGet "wait then tag" shape. This is how Kafka / Camel
+      # / Pulsar / Beam handle it. Blocked on confirming ASF governance
+      # allows automated promotion for iggy.
+      # TODO(#NNNN): track the gradle-nexus-publish-plugin adoption and
+      # drop `continue-on-error` from the step below once it lands.
       - name: Wait for Maven Central availability
         if: |
           success() &&
           inputs.dry_run == false &&
           matrix.type == 'java' &&
           steps.ver.outputs.should_tag == 'true'
+        continue-on-error: true
         uses: ./.github/actions/utils/wait-for-url
         with:
           url: https://repo1.maven.org/maven2/org/apache/iggy/iggy/${{ steps.ver.outputs.version }}/iggy-${{ steps.ver.outputs.version }}.pom
diff --git a/scripts/extract-version.sh b/scripts/extract-version.sh
index 8b747800ec..47f92f9d79 100755
--- a/scripts/extract-version.sh
+++ b/scripts/extract-version.sh
@@ -115,6 +115,27 @@ extract_cargo_version() {
 
     cd "$REPO_ROOT"
 
+    # Caller-provided cache: if IGGY_CARGO_METADATA_FILE points at a
+    # readable file containing `cargo metadata --no-deps --format-version=1`
+    # JSON, use it instead of re-forking cargo. This is the fast path used
+    # by .github/workflows/_publish_rust_crates.yml's Extract versions and
+    # tags step, which needs 8 version lookups against the same workspace
+    # snapshot and would otherwise pay the cargo metadata cost 8 times.
+    # File-based (not env-var-based) because cargo metadata for a 36-crate
+    # workspace is ~220 KB, which exceeds Linux's per-env-var limit
+    # MAX_ARG_STRLEN (128 KB) and would fail with E2BIG on exec().
+    if [[ -n "${IGGY_CARGO_METADATA_FILE:-}" ]] && [[ -r "${IGGY_CARGO_METADATA_FILE}" ]] \
+       && command -v jq &> /dev/null; then
+        local version
+        version=$(jq -r --arg pkg "$package" \
+                    '.packages[] | select(.name == $pkg) | .version' \
+                    "${IGGY_CARGO_METADATA_FILE}" | head -1)
+        if [[ -n "$version" ]]; then
+            echo "$version"
+            return 0
+        fi
+    fi
+
     if command -v cargo &> /dev/null && command -v jq &> /dev/null; then
         local version
         version=$(cargo metadata --no-deps --format-version=1 2>/dev/null | \
@@ -253,6 +274,7 @@ handle_check() {
 COMPONENT=""
 RETURN_TAG=false
 RETURN_SHOULD_TAG=false
+RETURN_IS_PRE_RELEASE=false
 
 # Detect mode flags as first argument only
 case "${1:-}" in
@@ -274,6 +296,10 @@ while [[ $# -gt 0 ]]; do
             RETURN_SHOULD_TAG=true
             shift
             ;;
+        --is-pre-release)
+            RETURN_IS_PRE_RELEASE=true
+            shift
+            ;;
         *)
             echo "Unknown option: $1" >&2
             exit 1
@@ -281,21 +307,31 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
-if [[ "$RETURN_TAG" == "true" && "$RETURN_SHOULD_TAG" == "true" ]]; then
-    echo "Error: --tag and --should-tag are mutually exclusive" >&2
+mutex_count=0
+[[ "$RETURN_TAG" == "true" ]] && mutex_count=$((mutex_count + 1))
+[[ "$RETURN_SHOULD_TAG" == "true" ]] && mutex_count=$((mutex_count + 1))
+[[ "$RETURN_IS_PRE_RELEASE" == "true" ]] && mutex_count=$((mutex_count + 1))
+if [[ $mutex_count -gt 1 ]]; then
+    echo "Error: --tag, --should-tag, and --is-pre-release are mutually exclusive" >&2
     exit 1
 fi
 
 if [[ -z "$COMPONENT" ]]; then
-    echo "Usage: $0 <component> [--tag|--should-tag]" >&2
+    echo "Usage: $0 <component> [--tag|--should-tag|--is-pre-release]" >&2
     echo "       $0 --all" >&2
     echo "       $0 --check" >&2
     echo "" >&2
-    echo "  --tag         Print the git tag this component would use for its current version." >&2
-    echo "  --should-tag  Print 'true' if the current version should produce a git tag, 'false'" >&2
-    echo "                otherwise (SNAPSHOT or missing tag_pattern). This is the SINGLE" >&2
-    echo "                source of truth for taggability; publish.yml consults it for every" >&2
-    echo "                SDK matrix row." >&2
+    echo "  --tag             Print the git tag this component would use for its current version." >&2
+    echo "  --should-tag      Print 'true' if the current version should produce a git tag, 'false'" >&2
+    echo "                    otherwise (SNAPSHOT or missing tag_pattern). This is the SINGLE" >&2
+    echo "                    source of truth for taggability; publish.yml consults it for every" >&2
+    echo "                    SDK matrix row." >&2
+    echo "  --is-pre-release  Print 'true' if the current version is a pre-release/pre-stable" >&2
+    echo "                    marker across ANY SDK version scheme (-edge, -rc, .devN, bare rcN)," >&2
+    echo "                    'false' otherwise. SINGLE source of truth for the auto-publish and" >&2
+    echo "                    stable-Docker skip rules in post-merge.yml and publish.yml." >&2
+    echo "" >&2
+    echo "  --tag, --should-tag, and --is-pre-release are mutually exclusive." >&2
     echo "" >&2
     echo "Available components:" >&2
     yq eval '.components | keys | .[]' "$CONFIG_FILE" | sed 's/^/  - /' >&2
@@ -357,6 +393,32 @@ if [[ "$RETURN_SHOULD_TAG" == "true" ]]; then
     exit 0
 fi
 
+# --is-pre-release: returns "true" for versions that are pre-release/
+# pre-stable markers across ALL SDK version schemes we publish. This is
+# THE SINGLE SOURCE OF TRUTH for the "is this a pre-release" rule.
+# post-merge.yml uses it to decide whether to auto-publish; publish.yml
+# uses it for the auto-publish stable-Docker skip rule. Keeping the
+# patterns here prevents the two call sites from drifting (which they
+# previously did - post-merge.yml accepted `.devN` and bare `rcN` while
+# publish.yml only accepted `-edge`/`-rc`, so a Python SDK `.devN`
+# version would be auto-published to PyPI but never git-tagged).
+#
+# Matches (any of):
+#   -edge[.N]   (rust crates, docker, node SDK)
+#   -rc[.N]     (all SDKs)
+#   .devN       (Python SDK PEP 440 development markers)
+#   rcN$        (legacy bare rcN, retained for compatibility)
+if [[ "$RETURN_IS_PRE_RELEASE" == "true" ]]; then
+    if [[ "$VERSION" =~ -(edge|rc) ]] \
+       || [[ "$VERSION" =~ \.dev[0-9]+$ ]] \
+       || [[ "$VERSION" =~ rc[0-9]+$ ]]; then
+        echo "true"
+    else
+        echo "false"
+    fi
+    exit 0
+fi
+
 # Return tag or version based on flag
 if [[ "$RETURN_TAG" == "true" ]]; then
     TAG_PATTERN=$(get_config "$COMPONENT" "tag_pattern")