From 4d9c5e1ab722db96168be694d582482106689cd6 Mon Sep 17 00:00:00 2001 From: Hubert Gruszecki Date: Tue, 14 Apr 2026 14:34:27 +0200 Subject: [PATCH] fix(ci): harden publish pipeline Follow-up to #3124 plus review-round fixes on the release chain. Narrow cargo publish continue-on-error to the "already uploaded" class; capture stderr to a tempfile instead of a process-sub tee that raced the classifier. Make `commit` required in the rust publish reusable workflow and re-verify master-ancestry so direct workflow_call callers cannot bypass the check. Bring wait-for-crate curl up to wait-for-url parity and switch to `jq -Rr 'fromjson?'` so malformed NDJSON survives pipefail. Fail-fast on wrong-target tags; render them distinctly and keep rendering under skip_tag_creation=true, gating only exit 1. Cache cargo metadata once per job via $GITHUB_ENV so the four post-merge composite Validate steps reuse it. Consolidate the pre-release vocabulary behind `extract-version.sh --is-pre-release` so post-merge.yml and publish.yml no longer diverge on `.devN` / bare `rcN`. Retune the idempotency pre-check to max_attempts=5 (initial_sleep=1 unchanged) and rewrite the two-layer wait comment with correct budgets (~8 min inner, ~28 min outer). Includes the Maven `continue-on-error` carve-out on publish.yml's Central wait step, rationale documented in-tree above the step.
--- .github/actions/rust/post-merge/action.yml | 183 ++++++++++----- .../actions/utils/create-git-tag/action.yml | 63 ++++-- .../actions/utils/wait-for-crate/action.yml | 72 ++++-- .github/actions/utils/wait-for-url/action.yml | 21 +- .github/workflows/_publish_rust_crates.yml | 105 ++++++++- .github/workflows/post-merge.yml | 4 +- .github/workflows/publish.yml | 209 +++++++++++++++--- scripts/extract-version.sh | 78 ++++++- 8 files changed, 599 insertions(+), 136 deletions(-) diff --git a/.github/actions/rust/post-merge/action.yml b/.github/actions/rust/post-merge/action.yml index b303ccf05e..c8fb2284d6 100644 --- a/.github/actions/rust/post-merge/action.yml +++ b/.github/actions/rust/post-merge/action.yml @@ -18,8 +18,10 @@ name: rust-post-merge description: > Publish a single Rust crate to crates.io. Idempotent on rerun via a - sparse-index pre-check. Intended to be called once per crate, in - dependency order, from .github/workflows/_publish_rust_crates.yml. + sparse-index pre-check and a post-publish CAS verify. Intended to be + called once per crate, in dependency order, from + .github/workflows/_publish_rust_crates.yml. Dry-run publishing is + handled one level up by scripts/verify-crates-publish.sh. inputs: package: @@ -28,41 +30,14 @@ inputs: version: description: "Version for publishing" required: true - dry_run: - description: | - Deprecated. Retained only to avoid silently breaking downstream forks - that pin this composite by SHA and still pass `dry_run: true`. - - Dry-run publishing is now handled one level up, by - scripts/verify-crates-publish.sh invoked from - .github/workflows/_publish_rust_crates.yml on the dry-run path. When - this composite is called with dry_run=true it prints a deprecation - warning and no-ops every publish step so forks keep getting the - "don't touch the real registry" semantics they expected. This input - will be removed in a future release once forks have migrated. 
- required: false - default: "false" runs: using: "composite" steps: - - name: Deprecated dry_run warning - if: inputs.dry_run == 'true' - shell: bash - run: | - echo "::warning::rust/post-merge: the 'dry_run' input is deprecated." - echo "::warning::Dry-run publishing now happens at the workflow level via" - echo "::warning::scripts/verify-crates-publish.sh (see _publish_rust_crates.yml)." - echo "::warning::Honoring dry_run=true by skipping every step in this composite." - echo "::warning::This input will be removed in a future release; please migrate." - echo "⏭️ dry_run=true → skipping all publish steps" - - name: Setup Rust with cache - if: inputs.dry_run != 'true' uses: ./.github/actions/utils/setup-rust-with-cache - name: Validate package - if: inputs.dry_run != 'true' env: PACKAGE: ${{ inputs.package }} VERSION: ${{ inputs.version }} @@ -74,24 +49,48 @@ runs: echo "Version: $VERSION" echo "" - if ! cargo metadata --format-version 1 | jq -e --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg)' > /dev/null; then + # Single cargo metadata invocation reused for presence check, version, + # and manifest path. --no-deps keeps all three fields we read and + # avoids walking the dep graph, saving ~30-60s across a 4-crate release. + # + # Reuse the cache written by _publish_rust_crates.yml's `Extract + # versions and tags` step if present (propagated via $GITHUB_ENV). + # Saves ~8s per crate on a 36-crate workspace; across 4 crates in + # the chain, ~30s per release. Falls back to a fresh cargo metadata + # fork if the cache is missing (e.g., the composite is invoked from + # a different workflow that doesn't set up the cache). + if [[ -n "${IGGY_CARGO_METADATA_FILE:-}" ]] && [[ -r "${IGGY_CARGO_METADATA_FILE}" ]]; then + META=$(cat "${IGGY_CARGO_METADATA_FILE}") + else + META=$(cargo metadata --format-version 1 --no-deps) + fi + + if ! 
echo "$META" | jq -e --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg)' > /dev/null; then echo "❌ Package '$PACKAGE' not found in workspace" echo "" echo "Available packages:" - cargo metadata --format-version 1 | jq -r '.packages[].name' | sort + echo "$META" | jq -r '.packages[].name' | sort exit 1 fi - CARGO_VERSION=$(cargo metadata --format-version 1 | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .version') - CARGO_PATH=$(cargo metadata --format-version 1 | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .manifest_path') + CARGO_VERSION=$(echo "$META" | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .version') + CARGO_PATH=$(echo "$META" | jq -r --arg pkg "$PACKAGE" '.packages[] | select(.name == $pkg) | .manifest_path') echo "Current Cargo.toml version: $CARGO_VERSION" echo "Target version: $VERSION" echo "Manifest path: $CARGO_PATH" if [ "$CARGO_VERSION" != "$VERSION" ]; then - echo "⚠️ Warning: Cargo.toml version ($CARGO_VERSION) doesn't match target version ($VERSION)" - echo "Make sure to update Cargo.toml before publishing" + echo "❌ Cargo.toml version ($CARGO_VERSION) doesn't match target version ($VERSION)" + echo "" + echo "cargo publish uses the Cargo.toml version, not the input, so a mismatch" + echo "would upload the WRONG version and then fail the downstream wait-for-crate" + echo "on the target version ~15 min later. Fail fast here instead." 
+ echo "" + echo "Recovery:" + echo " scripts/bump-version.sh $PACKAGE --set $VERSION" + echo " git commit -am 'chore(release): bump $PACKAGE to $VERSION'" + exit 1 fi echo "" @@ -99,7 +98,6 @@ runs: cargo tree -p "$PACKAGE" --depth 1 | head -20 - name: Build package - if: inputs.dry_run != 'true' env: PACKAGE: ${{ inputs.package }} shell: bash @@ -112,7 +110,6 @@ runs: echo "✅ Package built successfully" - name: Verify package contents - if: inputs.dry_run != 'true' env: PACKAGE: ${{ inputs.package }} shell: bash @@ -129,28 +126,47 @@ runs: cargo package -p "$PACKAGE" --list | wc -l echo "files would be included" - # Idempotency pre-check: ask the crates.io sparse index (same data the - # publish wait gate uses) whether this exact version is already live. - # If it is, we skip `cargo publish` cleanly instead of hard-failing on - # "crate version already uploaded", which is the failure mode that blocks - # reruns after a transient post-publish issue (e.g. tag push failure). + # Idempotency pre-check: ask the crates.io sparse index whether this + # exact version is already live. A success here is a warm-cache fast + # path that skips `cargo publish` entirely (used by reruns after a + # transient post-publish issue like a tag push failure). A failure + # here does NOT prove the crate is absent - the CDN can serve a + # stale 404 or 5xx - so a failure flips through to the publish path, + # which is guarded by the post-publish CAS verify below. # - # continue-on-error: true so an exit 1 ("not there") flips through to - # steps.already_published.outcome == 'failure' and gates the publish - # step below, instead of failing the job. + # max_attempts=5 with initial_sleep=1 closes the common cold-cache + # race (CDN not yet caught up from a prior successful publish: sleeps + # 1+2+4+8=15s worst case per crate). continue-on-error: true so an + # exit 1 surfaces as steps.already_published.outcome == 'failure' + # instead of failing the job. 
- name: Check if crate is already on crates.io - if: inputs.dry_run != 'true' id: already_published continue-on-error: true uses: ./.github/actions/utils/wait-for-crate with: package: ${{ inputs.package }} version: ${{ inputs.version }} - max_attempts: "1" + max_attempts: "5" initial_sleep_seconds: "1" + # Publish runs WITHOUT continue-on-error so any failure that is NOT the + # "already uploaded" class (invalid CARGO_REGISTRY_TOKEN, 401/403, 429 + # rate limit, 5xx, Cargo.toml validation error, dependency resolution) + # surfaces loudly with its actual error instead of getting swallowed + # into a misleading "not on sparse index" CAS timeout ~3 min later. + # + # The only expected benign failure is the race where a prior run + # already uploaded this exact version between our pre-check and our + # publish attempt; cargo emits that as "crate version X.Y.Z is + # already uploaded", which the stderr-grep below classifies as a + # benign skip and translates into exit 0. The CAS verify step + # immediately below is then the authoritative state oracle: if the + # sparse index serves this version after the publish path ran, the + # crate is live regardless of whether THIS run uploaded it or an + # earlier one did. - name: Publish to crates.io - if: inputs.dry_run != 'true' && steps.already_published.outcome == 'failure' + if: steps.already_published.outcome == 'failure' + id: publish shell: bash env: CARGO_REGISTRY_TOKEN: ${{ env.CARGO_REGISTRY_TOKEN }} @@ -167,14 +183,77 @@ runs: echo "📦 Publishing $PACKAGE v$VERSION to crates.io..." echo "" - cargo publish -p "$PACKAGE" + # Capture cargo publish stderr to a tempfile so we can classify the + # "already uploaded" benign class after the fact. A previous version + # used `2> >(tee ...)` to also stream stderr live to the job log, but + # bash does not wait on process-substitution children before the + # grep classifier runs, producing a measured 2-3% race where the + # classifier misses the benign-rerun signature. 
GitHub Actions step + # logs are line-buffered via the agent regardless, so `cat` after + # cargo exits gives the operator the same experience without the + # race. + publish_stderr="$(mktemp)" + trap 'rm -f "${publish_stderr}"' EXIT + + rc=0 + cargo publish -p "$PACKAGE" 2>"${publish_stderr}" || rc=$? + cat "${publish_stderr}" >&2 + + if [ "${rc}" -eq 0 ]; then + echo "" + echo "✅ cargo publish reports success" + echo "View on crates.io: https://crates.io/crates/$PACKAGE/$VERSION" + exit 0 + fi + + # Narrow benign class: race where a prior run already uploaded + # this exact version. cargo's message shape has changed across + # releases: + # * cargo <1.75 (server-side error passed through): + # error: failed to publish to registry at https://... + # caused by: the remote server responded with an error + # (status 200 OK): crate version `X.Y.Z` is already uploaded + # * cargo ≥1.75 (local sparse-index pre-check): + # error: crate NAME@X.Y.Z already exists on crates.io index + # We match both by disjunction: "is already uploaded" OR "already + # exists on ... index". Both substrings are specific to this + # class of failure and would not appear in token/network/5xx/ + # validation errors. + if grep -qE "(is already uploaded|already exists on .*index)" "${publish_stderr}"; then + echo "" + echo "ℹ️ $PACKAGE v$VERSION is already uploaded (race with prior run)" + echo " Continuing to CAS verify to confirm the crate is live on the sparse index." + exit 0 + fi echo "" - echo "✅ Successfully published to crates.io" - echo "View on crates.io: https://crates.io/crates/$PACKAGE/$VERSION" + echo "❌ cargo publish failed with rc=${rc} and no 'already uploaded' signature" + echo " The actual error is in the stderr above.
Common causes:" + echo " - invalid or expired CARGO_REGISTRY_TOKEN (401/403)" + echo " - crates.io rate limit (429)" + echo " - crates.io 5xx (transient, rerun should recover)" + echo " - Cargo.toml validation error or dependency resolution failure" + exit "${rc}" + + # CAS verify: authoritative post-publish state check. Runs whenever + # the pre-check fell through to the publish path, regardless of + # whether cargo publish itself succeeded. Success = crate is live on + # the sparse index, which is the same contract the top-level wait + # gates in _publish_rust_crates.yml use before tagging. Failure of + # this step fails the job (no continue-on-error), so a genuine + # upload failure still surfaces loudly - we only swallow the + # "already uploaded" false-negative class. + - name: Verify crate landed on crates.io (CAS) + if: steps.already_published.outcome == 'failure' + uses: ./.github/actions/utils/wait-for-crate + with: + package: ${{ inputs.package }} + version: ${{ inputs.version }} + max_attempts: "10" + initial_sleep_seconds: "2" - name: Publish skipped (crate already on crates.io) - if: inputs.dry_run != 'true' && steps.already_published.outcome == 'success' + if: steps.already_published.outcome == 'success' shell: bash env: PACKAGE: ${{ inputs.package }} diff --git a/.github/actions/utils/create-git-tag/action.yml b/.github/actions/utils/create-git-tag/action.yml index cec8117bc8..7c9aea411e 100644 --- a/.github/actions/utils/create-git-tag/action.yml +++ b/.github/actions/utils/create-git-tag/action.yml @@ -57,19 +57,32 @@ runs: exit 1 fi - # Reject inputs that could mangle git invocation. Tag and commit are - # both derived from trusted sources today (extract-version.sh outputs - # and pre-validated SHAs), but the composite has no caller context, so - # validate defensively. - # - # `+` is allowed in the tag alphabet because every tag_pattern in - # .github/config/publish.yml already permits the semver build - # metadata suffix `(?:\+[0-9A-Za-z.-]+)?`. 
Rejecting `+` here would - # hard-fail the entire chain after a successful publish the first - # time a release uses a `X.Y.Z+build.N` version - the exact rc1 - # failure shape this PR is trying to eliminate. - if ! [[ "${TAG}" =~ ^[A-Za-z0-9._/+-]+$ ]]; then - echo "❌ create-git-tag: tag '${TAG}' contains characters outside [A-Za-z0-9._/+-]" + # Validate the tag name with two layers of defense: + # 1. Shell-option injection: reject anything not starting with + # alphanumeric / `_` / `/` so the composite cannot be coerced + # into parsing the tag as a git or shell short-option (a + # leading `-` would be the classic attack shape). A leading + # `.` is also rejected because git's own check_refname_format + # would reject it later and we prefer a fast, actionable + # failure here. + # 2. Git refname format: delegate to `git check-ref-format`, which + # enforces the full refs/tags/ restrictions (no `..`, no + # `.lock`, no trailing slash, no control chars, etc.). + # Strictly stronger than a hand-rolled alphabet rule, and + # stays in sync with git's own receive-pack check instead of + # drifting from it. In particular this accepts every real + # tag_pattern in .github/config/publish.yml, including the + # semver build metadata suffix `X.Y.Z+build.N` that rc1 + # choked on, and the `foreign/go/v0.7.0` slash-containing + # Go module tag shape. + if ! [[ "${TAG}" =~ ^[A-Za-z0-9_/] ]]; then + echo "❌ create-git-tag: tag '${TAG}' must start with [A-Za-z0-9_/]" + exit 1 + fi + if ! git check-ref-format "refs/tags/${TAG}" 2>/dev/null; then + echo "❌ create-git-tag: tag '${TAG}' is not a valid git ref name" + echo " git check-ref-format rejected it. See" + echo " https://git-scm.com/docs/git-check-ref-format for the rules." exit 1 fi if ! [[ "${COMMIT}" =~ ^[0-9a-f]{40}$ ]]; then @@ -83,14 +96,16 @@ runs: # Ensure the commit object exists locally; required by `git tag -a`. # If the workflow used a shallow checkout, fetch just the one commit. 
+ # GitHub enables allowReachableSHA1InWant=true, so single-commit + # fetches work even when the caller's checkout used fetch-depth:1. if ! git cat-file -e "${COMMIT}^{commit}" 2>/dev/null; then echo "ℹ️ Commit ${COMMIT} not in local clone, fetching..." if ! git fetch --no-tags --depth=1 origin "${COMMIT}" 2>/dev/null; then echo "❌ Failed to fetch commit ${COMMIT} from origin" echo " Recovery:" - echo " - verify the caller's checkout step uses fetch-depth: 0" echo " - verify the commit still exists on origin (was it force-pushed away?)" echo " - verify the commit is reachable from a branch on origin (not only from a PR ref)" + echo " - if the caller's network restricts single-commit fetches, increase fetch-depth on the calling checkout step" exit 1 fi fi @@ -163,7 +178,12 @@ runs: # always 0. push_rc=0 push_stderr_file="$(mktemp)" - trap 'rm -f "${push_stderr_file}"' EXIT + # Cleanup wrapped in a named function so future traps can append + # to it instead of overwriting (trap 'foo' EXIT replaces any + # earlier EXIT trap). No earlier EXIT trap exists today, so this + # is purely refactor-defensive. + _create_git_tag_cleanup() { rm -f "${push_stderr_file}"; } + trap _create_git_tag_cleanup EXIT git push origin "${TAG}" 2>"${push_stderr_file}" || push_rc=$? if [ "${push_rc}" -eq 0 ]; then echo "✅ Created and pushed tag: ${TAG}" @@ -174,8 +194,12 @@ runs: echo " push stderr:" sed 's/^/ /' "${push_stderr_file}" - REMOTE_RAW=$(git ls-remote --tags origin "refs/tags/${TAG}" | awk '{print $1}') - if [ -z "${REMOTE_RAW}" ]; then + # Use the same peeled-then-raw resolver as the early-skip and + # post-push branches so all three agree on what the tag points at. + # The previous inline `git ls-remote ... | awk` only read the raw + # line, which would miss an annotated-tag same-commit race. 
+ REMOTE_SHA="$(remote_tag_commit)" + if [ -z "${REMOTE_SHA}" ]; then echo "❌ Push failed and tag ${TAG} is not on remote - propagating failure" echo " The push stderr above should explain why (permission denied, protected" echo " ref, missing upstream, etc.). If this is a token/permissions issue," @@ -183,13 +207,12 @@ runs: exit "${push_rc}" fi - TARGET_SHA="$(remote_tag_commit)" - if [ "${TARGET_SHA}" = "${COMMIT}" ]; then + if [ "${REMOTE_SHA}" = "${COMMIT}" ]; then echo "⏭️ Tag ${TAG} was created concurrently at the same commit, treating as skip" exit 0 fi - echo "❌ Tag ${TAG} exists on remote at ${TARGET_SHA} but this run wanted ${COMMIT}" + echo "❌ Tag ${TAG} exists on remote at ${REMOTE_SHA} but this run wanted ${COMMIT}" echo " This is the 'rc1 failure shape': the tag points at the wrong commit." echo " Recovery (verify the intended release commit first):" echo " - delete the wrong tag: git push --delete origin ${TAG}" diff --git a/.github/actions/utils/wait-for-crate/action.yml b/.github/actions/utils/wait-for-crate/action.yml index 6a5d994d8d..c65695c6de 100644 --- a/.github/actions/utils/wait-for-crate/action.yml +++ b/.github/actions/utils/wait-for-crate/action.yml @@ -42,9 +42,13 @@ inputs: required: false default: "30" initial_sleep_seconds: - description: "Sleep between the first two attempts in seconds. Doubles each attempt, capped at 30." + description: "Sleep between the first two attempts in seconds. Doubles each attempt, capped at max_sleep_seconds." required: false default: "3" + max_sleep_seconds: + description: "Upper bound on per-attempt sleep, in seconds." 
+ required: false + default: "30" runs: using: composite @@ -56,6 +60,7 @@ runs: VERSION: ${{ inputs.version }} MAX_ATTEMPTS: ${{ inputs.max_attempts }} INITIAL_SLEEP_SECONDS: ${{ inputs.initial_sleep_seconds }} + MAX_SLEEP_SECONDS: ${{ inputs.max_sleep_seconds }} run: | set -euo pipefail @@ -85,6 +90,10 @@ runs: echo "❌ wait-for-crate: initial_sleep_seconds '${INITIAL_SLEEP_SECONDS}' must be a non-negative integer" exit 1 fi + if ! [[ "${MAX_SLEEP_SECONDS}" =~ ^[0-9]+$ ]] || [ "${MAX_SLEEP_SECONDS}" -lt 1 ]; then + echo "❌ wait-for-crate: max_sleep_seconds '${MAX_SLEEP_SECONDS}' must be a positive integer" + exit 1 + fi # Compute the sparse-index prefix path from the leading characters # of the lowercased crate name. Layout is documented at @@ -106,22 +115,50 @@ runs: echo "🎯 Target version: ${VERSION}" echo "" + # Capture curl stderr AND body to tempfiles. Stderr preserves hard + # curl errors (DNS, TLS, connection refused, --max-time timeouts) so + # they can be surfaced on the final failure branch instead of being + # silenced by 2>/dev/null. Body goes to a file so we can distinguish + # "HTTP 200 but empty body" from "HTTP 4xx" from "network failure", + # and only parse JSON when the HTTP code says it's worth parsing. + curl_stderr="$(mktemp)" + body_file="$(mktemp)" + trap 'rm -f "${curl_stderr}" "${body_file}"' EXIT + sleep_s="${INITIAL_SLEEP_SECONDS}" + http_code="000" for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do - # -f: fail on HTTP >= 400, so a 404 ("crate not yet on index") - # surfaces as a non-zero exit with an empty body, which the - # `|| true` swallows so the loop can keep going. - # -sS: quiet but still show hard curl errors on stderr (network - # failure, DNS, TLS) so operators see them in the step log. + # -sS: silent but still show hard curl errors on stderr (network, + # DNS, TLS, --max-time timeout) — captured to ${curl_stderr}. # -L: follow redirects (the CDN may redirect to a mirror). 
- body=$(curl -fsSL "${URL}" 2>/dev/null || true) + # -w '%{http_code}': capture the HTTP status code so we can tell + # 404 ("not yet on index") from 5xx ("transient") from 200 + # ("live") in the per-attempt log and the final-failure + # branch. Symmetric with wait-for-url. + # --max-time 30: per-request wall-clock cap so a wedged TCP + # connection does not burn the whole retry budget on one + # attempt. Without this, kernel TCP keepalive can hold a + # dead connection for ~5 min silently — the exact tail + # latency bug we are trying to avoid. Symmetric with + # wait-for-url:115. + # No -f: we inspect the HTTP code ourselves, so 4xx surfaces as + # a code we can log instead of an empty body + non-zero exit. + : >"${curl_stderr}" + : >"${body_file}" + http_code=$(curl -sSL -o "${body_file}" -w '%{http_code}' \ + --max-time 30 "${URL}" 2>"${curl_stderr}" || echo "000") # The sparse-index body is newline-delimited JSON: one record - # per published version. `jq -r '.vers'` emits one version per - # line and `grep -Fxq` does an exact literal full-line match, - # so `0.10.0+build.5` is matched as itself, not as a regex - # where `.` and `+` would mean something else. - if [ -n "${body}" ] && echo "${body}" | jq -r '.vers' 2>/dev/null | grep -Fxq "${VERSION}"; then + # per published version. `jq -R 'fromjson?'` reads each line as + # a raw string and silently drops lines that fail to parse, so + # a malformed or partially-truncated response (e.g., CDN + # serving an HTML error page or an incomplete body) does not + # kill the whole pipeline under `set -o pipefail`. `grep -Fxq` + # does an exact literal full-line match, so `0.10.0+build.5` + # is matched as itself, not as a regex where `.` and `+` would + # mean something else. + if [ "${http_code}" = "200" ] && [ -s "${body_file}" ] \ + && jq -Rr 'fromjson? 
| .vers // empty' "${body_file}" 2>/dev/null | grep -Fxq "${VERSION}"; then echo "✅ ${PACKAGE} v${VERSION} is on the sparse index" exit 0 fi @@ -129,11 +166,11 @@ runs: if [ "${attempt}" -eq "${MAX_ATTEMPTS}" ]; then break fi - echo "⏳ ${PACKAGE} v${VERSION} not yet visible (attempt ${attempt}/${MAX_ATTEMPTS}, sleep ${sleep_s}s)" + echo "⏳ HTTP ${http_code} - ${PACKAGE} v${VERSION} not yet visible (attempt ${attempt}/${MAX_ATTEMPTS}, sleep ${sleep_s}s)" sleep "${sleep_s}" sleep_s=$(( sleep_s * 2 )) - if [ "${sleep_s}" -gt 30 ]; then - sleep_s=30 + if [ "${sleep_s}" -gt "${MAX_SLEEP_SECONDS}" ]; then + sleep_s="${MAX_SLEEP_SECONDS}" fi done @@ -142,6 +179,11 @@ runs: # * max_attempts>1 → "never appeared inside the budget" echo "❌ ${PACKAGE} v${VERSION} is not on the sparse index after ${MAX_ATTEMPTS} attempt(s)" echo " URL: ${URL}" + echo " Last HTTP code: ${http_code}" + if [ -s "${curl_stderr}" ]; then + echo " last curl stderr:" + sed 's/^/ /' "${curl_stderr}" + fi echo " Common causes:" echo " - cargo publish did not actually land the upload (inspect the preceding publish step)" echo " - the crates.io sparse-index CDN is lagging (usually seconds, rarely >1 min)" diff --git a/.github/actions/utils/wait-for-url/action.yml b/.github/actions/utils/wait-for-url/action.yml index dce8af29a7..bb7977c492 100644 --- a/.github/actions/utils/wait-for-url/action.yml +++ b/.github/actions/utils/wait-for-url/action.yml @@ -92,16 +92,27 @@ runs: echo "📊 Budget: up to ${MAX_ATTEMPTS} attempts" echo "" + # Capture curl stderr to a tempfile so hard errors (DNS, TLS, + # connection refused, --max-time timeouts) are preserved across + # retries and surfaced on the final failure branch. The previous + # code redirected stderr to /dev/null, which contradicted the -sS + # flag's intent of keeping hard errors visible. 
+ curl_stderr="$(mktemp)" + trap 'rm -f "${curl_stderr}"' EXIT + sleep_s="${INITIAL_SLEEP_SECONDS}" http_code="000" for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do - # -sS: silent but still report hard errors. -L: follow redirects - # (registries often sit behind a CDN that issues a 301/302). + # -sS: silent but still report hard errors on stderr (captured + # to ${curl_stderr} for the final-failure dump below). + # -L: follow redirects (registries sit behind CDNs that issue + # 301/302). # No -f: we want the HTTP code on 4xx so we can distinguish 404 # ("not yet there") from 5xx ("transient") in the log. # --max-time: cap per-request so a dead TCP connection does not # burn the whole budget on one attempt. - http_code=$(curl -sSL -o /dev/null -w '%{http_code}' --max-time 30 "${URL}" 2>/dev/null || echo "000") + : >"${curl_stderr}" + http_code=$(curl -sSL -o /dev/null -w '%{http_code}' --max-time 30 "${URL}" 2>"${curl_stderr}" || echo "000") if [ "${http_code}" = "200" ]; then echo "✅ ${DESCRIPTION} is available (HTTP 200 from ${URL})" @@ -122,6 +133,10 @@ runs: echo "❌ Timed out waiting for ${DESCRIPTION} after ${MAX_ATTEMPTS} attempts" echo " URL: ${URL}" echo " Last HTTP code: ${http_code}" + if [ -s "${curl_stderr}" ]; then + echo " last curl stderr:" + sed 's/^/ /' "${curl_stderr}" + fi echo " Common causes:" echo " - registry propagation is slow (Maven Central especially can take >10 minutes)" echo " - the publish step did not actually land the artifact (inspect the preceding publish step)" diff --git a/.github/workflows/_publish_rust_crates.yml b/.github/workflows/_publish_rust_crates.yml index 613ab05b44..c962c131cc 100644 --- a/.github/workflows/_publish_rust_crates.yml +++ b/.github/workflows/_publish_rust_crates.yml @@ -30,14 +30,22 @@ on: description: "Dry run mode - validate without publishing" commit: type: string - required: false - default: "" - description: "Specific commit to checkout (defaults to github.sha)" + required: true + description: | + 
Full 40-char commit SHA to publish from. The caller is responsible + for resolving and validating this (including master-ancestry) before + invoking. This workflow re-verifies master-ancestry defensively in + the `Resolve commit` step below. use_latest_ci: type: boolean required: false default: false description: "Use latest CI configuration from master branch" + skip_tag_creation: + type: boolean + required: false + default: false + description: "Skip git tag creation after successful publish (useful for re-publishing)" secrets: CARGO_REGISTRY_TOKEN: required: true @@ -52,6 +60,19 @@ permissions: env: IGGY_CI_BUILD: true +# Child-level concurrency group as defense-in-depth. The in-tree caller +# (publish.yml) already holds `publish-release` via its own concurrency +# block, and reusable workflow runs are nested under the parent's group, +# so this is a no-op for the current topology. But `workflow_call` is a +# publicly-reachable entry point: any future direct caller (auto-publish +# runbook, vendored fork, parallel workflow) would bypass the parent +# group and race on the inline tag steps below. A distinct group name +# (`publish-release-rust` vs parent `publish-release`) avoids self-block +# when this workflow is called from publish.yml. +concurrency: + group: publish-release-rust + cancel-in-progress: false + jobs: publish: name: Publish Rust crates @@ -72,7 +93,13 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - ref: ${{ inputs.commit || github.sha }} + # No `|| github.sha` fallback: the `Resolve commit` step below + # requires an explicit, non-empty `inputs.commit` so the tag step + # downstream points at the exact reviewed commit. A silent fall + # back to github.sha would bypass publish.yml's master-ancestry + # check for direct workflow_call callers. NOTE: an empty `ref` makes + # actions/checkout use its default ref, so the empty-input rejection has to come from `Resolve commit`, not from this step.
+ ref: ${{ inputs.commit }} fetch-depth: 0 - name: Save and apply latest CI from master @@ -118,14 +145,49 @@ jobs: echo "❌ Could not resolve commit '${INPUT_COMMIT}' to a full SHA" exit 1 fi + + # Defensive re-check of the master-ancestry invariant that publish.yml's + # validate job enforces. This workflow is a publicly-reachable + # workflow_call entry point; a direct caller (runbook, vendored fork) + # that bypasses publish.yml would otherwise skip the ancestry check + # entirely. Re-running it here is cheap (one shallow fetch + one + # merge-base call) and closes the I2 invariant hole. + git fetch origin master --depth=1000 --quiet + if ! git merge-base --is-ancestor "$FULL_SHA" origin/master; then + echo "❌ Commit $FULL_SHA is not an ancestor of origin/master" + echo "" + echo "_publish_rust_crates.yml refuses to publish from a non-master commit." + echo "If you reached this error via a direct workflow_call, route through" + echo "publish.yml instead (which does the same check in its validate job)" + echo "or rebase your branch onto master before invoking." + exit 1 + fi + echo "commit=$FULL_SHA" >> "$GITHUB_OUTPUT" - echo "✅ Resolved commit: $FULL_SHA" + echo "✅ Resolved commit: $FULL_SHA (on origin/master)" - name: Extract versions and tags id: versions + shell: bash run: | + set -euo pipefail chmod +x scripts/extract-version.sh + # Cache cargo metadata once across ALL extract-version.sh and + # composite post-merge.yml invocations in this job. The 8 + # extract-version calls in this step plus the 4 per-crate + # composite `Validate package` steps all hit the same workspace + # snapshot, and cargo metadata on this 36-crate workspace costs + # ~8s cold per fork. File-based rather than env-var inlined + # because the JSON is ~220 KB, exceeding Linux MAX_ARG_STRLEN + # (128 KB). Propagated via $GITHUB_ENV so it also reaches the + # downstream post-merge composite steps; a plain `export` would + # only affect THIS step's shell. 
+ IGGY_CARGO_METADATA_FILE="${RUNNER_TEMP:-/tmp}/iggy-cargo-metadata.json" + cargo metadata --format-version 1 --no-deps > "${IGGY_CARGO_METADATA_FILE}" + echo "IGGY_CARGO_METADATA_FILE=${IGGY_CARGO_METADATA_FILE}" >> "${GITHUB_ENV}" + export IGGY_CARGO_METADATA_FILE # also make it available to this step's extract-version calls + common=$(scripts/extract-version.sh rust-common) protocol=$(scripts/extract-version.sh rust-binary-protocol) sdk=$(scripts/extract-version.sh rust-sdk) @@ -169,6 +231,31 @@ jobs: # the upload landed. If the wait times out, the tag is never pushed and # a rerun can safely re-enter the chain: the publish step is idempotent # via a sparse-index pre-check inside actions/rust/post-merge. + # + # Two-layer wait (defense-in-depth budget cascading - NOT independent + # signals; both layers poll the same sparse index URL): + # + # * Inner CAS verify inside actions/rust/post-merge on a tight + # ~8 min worst-case budget (max_attempts=10, initial_sleep=2, + # cap=30; sleeps 2+4+8+16+30*5=180s + 10 * 30s curl = ~480s). + # Runs only when the idempotency pre-check fell through to the + # publish path, and acts as an authoritative per-step state + # check: "did my cargo publish land on the sparse index in time + # to continue cleanly?" + # * Outer wait-for-crate below on a larger ~28 min worst-case + # budget (default max_attempts=30, initial_sleep=3, cap=30; + # sleeps 3+6+12+24+30*25=795s + 30 * 30s curl = ~1695s). Runs + # on every crate in the chain and acts as the operational + # safety net before the NEXT downstream crate's `cargo publish` + # resolves its path deps from crates.io. Separate budget so a + # slow CDN does not starve the tight inner check. + # + # 4-crate chain aggregate worst case: ~144 min (4 * (8+28)). In + # practice the happy path short-circuits on the first 200 response + # and total wait per crate is single-digit seconds.
Chain halts on + # the first failing wait, so the 144 min figure is a ceiling, not + # an expected duration. Operators should only intervene after ~60 + # min of sustained CDN 404s on a single crate. # Step 1: Publish iggy_binary_protocol (depends on nothing in-tree) - name: Publish iggy_binary_protocol @@ -186,7 +273,7 @@ jobs: version: ${{ steps.versions.outputs.protocol }} - name: Tag iggy_binary_protocol - if: inputs.dry_run == false && contains(inputs.crates, 'rust-binary-protocol') + if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-binary-protocol') uses: ./.github/actions/utils/create-git-tag with: tag: ${{ steps.versions.outputs.protocol_tag }} @@ -215,7 +302,7 @@ jobs: version: ${{ steps.versions.outputs.common }} - name: Tag iggy_common - if: inputs.dry_run == false && contains(inputs.crates, 'rust-common') + if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-common') uses: ./.github/actions/utils/create-git-tag with: tag: ${{ steps.versions.outputs.common_tag }} @@ -244,7 +331,7 @@ jobs: version: ${{ steps.versions.outputs.sdk }} - name: Tag iggy SDK - if: inputs.dry_run == false && contains(inputs.crates, 'rust-sdk') + if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-sdk') uses: ./.github/actions/utils/create-git-tag with: tag: ${{ steps.versions.outputs.sdk_tag }} @@ -273,7 +360,7 @@ jobs: version: ${{ steps.versions.outputs.cli }} - name: Tag iggy-cli - if: inputs.dry_run == false && contains(inputs.crates, 'rust-cli') + if: inputs.dry_run == false && inputs.skip_tag_creation == false && contains(inputs.crates, 'rust-cli') uses: ./.github/actions/utils/create-git-tag with: tag: ${{ steps.versions.outputs.cli_tag }} diff --git a/.github/workflows/post-merge.yml b/.github/workflows/post-merge.yml index 9f49334f01..bc085ecbbb 100644 --- a/.github/workflows/post-merge.yml +++ b/.github/workflows/post-merge.yml @@ 
-79,7 +79,7 @@ jobs: echo "Checking $crate: version=$VERSION, tag=$TAG" - if [[ ! "$VERSION" =~ -(edge|rc) ]]; then + if [ "$(scripts/extract-version.sh "$crate" --is-pre-release)" != "true" ]; then echo " ⏭️ Stable version - skipping" continue fi @@ -117,7 +117,7 @@ jobs: continue fi - if [[ ! "$VERSION" =~ -(edge|rc) ]] && [[ ! "$VERSION" =~ (\.dev|rc)[0-9]+$ ]]; then + if [ "$(scripts/extract-version.sh "$sdk" --is-pre-release)" != "true" ]; then echo " ⏭️ Stable version - skipping" continue fi diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 766ab892cd..4302e0bd7d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -140,6 +140,17 @@ jobs: with: fetch-depth: 0 + # Detect whether we are invoked via workflow_call (reusable) or + # workflow_dispatch (manual). Trust chain: when called via + # workflow_call, the caller is responsible for having validated its + # own trigger — the only in-tree caller today is + # .github/workflows/post-merge.yml, which is branch-filtered to + # master, so the SHA passed in is by construction a commit on + # master. Direct workflow_dispatch callers have no such guarantee + # and must pass through the master-ancestry check in Resolve + # commit below. Any future direct workflow_call caller added to + # this repo must preserve the master-ancestry property or the + # branch gate below will not apply to them. - name: Detect trigger type id: detect run: | @@ -419,7 +430,9 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ needs.validate.outputs.commit }} - fetch-depth: 0 + # check-tags only runs `git ls-remote --tags` against origin, + # which does not need local history. fetch-depth:1 is enough. 
+ fetch-depth: 1 - name: Save and apply latest CI from master if: inputs.use_latest_ci @@ -462,11 +475,16 @@ jobs: EXISTING_TAGS=() NEW_TAGS=() + WRONG_TARGET_TAGS=() # populated when a tag exists at a commit != this run's target echo "| Component | Version | Tag | Status |" >> $GITHUB_STEP_SUMMARY echo "|-----------|---------|-----|--------|" >> $GITHUB_STEP_SUMMARY - echo "$TARGETS_JSON" | jq -r '.include[] | select(.key!="noop") | @base64' | while read -r row; do + # Process substitution (not pipe) so EXISTING_TAGS / NEW_TAGS + # mutations survive the loop: `jq | while ...` puts the loop + # body in a pipe subshell and discards the arrays on exit, + # which silently killed the aggregate summary block below. + while IFS= read -r row; do _jq() { echo "$row" | base64 -d | jq -r "$1"; } KEY=$(_jq '.key') @@ -485,9 +503,17 @@ jobs: VERSION=$(scripts/extract-version.sh "$KEY" 2>/dev/null || echo "ERROR") TAG=$(scripts/extract-version.sh "$KEY" --tag 2>/dev/null || echo "ERROR") - if [[ "$VERSION" =~ -SNAPSHOT$ ]]; then - echo "ℹ️ $NAME: SNAPSHOT version, no tag will be created" - echo "| $NAME | $VERSION | _(none)_ | ℹ️ SNAPSHOT (no tag) |" >> $GITHUB_STEP_SUMMARY + # Consult --should-tag as the single source of truth for + # taggability. It handles SNAPSHOT and "no tag_pattern" in + # one place (extract-version.sh:348-360), which keeps this + # check in sync with the SDK matrix's own should_tag gate. + # A previous inline `[[ $VERSION =~ -SNAPSHOT$ ]]` check + # duplicated the rule and would silently drift when a new + # SDK added another SNAPSHOT-style pre-release marker. 
+ SHOULD_TAG=$(scripts/extract-version.sh "$KEY" --should-tag 2>/dev/null || echo "false") + if [ "$SHOULD_TAG" = "false" ]; then + echo "ℹ️ $NAME: no tag will be created (SNAPSHOT or no tag_pattern)" + echo "| $NAME | $VERSION | _(none)_ | ℹ️ No tag (SNAPSHOT / no pattern) |" >> $GITHUB_STEP_SUMMARY continue fi @@ -521,14 +547,41 @@ jobs: REMOTE_RAW=$(echo "$REMOTE_LINE" | awk '{print $1}') EXISTING_SHA="${REMOTE_PEELED:-${REMOTE_RAW}}" SHORT_SHA=$(echo "$EXISTING_SHA" | head -c 8) - echo "⚠️ Tag exists on remote: $TAG (points to $SHORT_SHA)" - echo "| $NAME | $VERSION | $TAG | ⚠️ Exists at $SHORT_SHA |" >> $GITHUB_STEP_SUMMARY + + # Fail-fast on wrong-target. A wrong-target tag means + # create-git-tag would hard-fail 20-40 minutes later after + # publishing artifacts to crates.io / PyPI / npm / Maven / + # NuGet / DockerHub. Catching it at check-tags converts that + # into a fast, cheap failure at the top of the run. + # Same-target is still benign (rerun convergence). + # + # skip_tag_creation=true accepts the invariant hole (the + # operator opted out of tag writes) but the wrong-target + # state is still rendered loudly under a DISTINCT cell label, + # so an operator cannot mistake it for a benign same-target + # skip. Operators must reconcile the tag/registry divergence + # manually or run a follow-up without skip_tag_creation to + # converge. The fail-fast exit 1 below is gated on + # skip_tag_creation != true. 
+ if [ "$EXISTING_SHA" != "${{ needs.validate.outputs.commit }}" ]; then + WRONG_TARGET_TAGS+=("$TAG|$SHORT_SHA") + if [ "${{ inputs.skip_tag_creation }}" = "true" ]; then + echo "⚠️ Tag exists on remote at WRONG target: $TAG (points to $SHORT_SHA, not enforced: skip_tag_creation=true)" + echo "| $NAME | $VERSION | $TAG | ⚠️ Wrong target at $SHORT_SHA (NOT enforced) |" >> $GITHUB_STEP_SUMMARY + else + echo "❌ Tag exists on remote at wrong target: $TAG (points to $SHORT_SHA)" + echo "| $NAME | $VERSION | $TAG | ❌ Wrong target at $SHORT_SHA |" >> $GITHUB_STEP_SUMMARY + fi + else + echo "⚠️ Tag exists on remote at same target: $TAG (points to $SHORT_SHA)" + echo "| $NAME | $VERSION | $TAG | ⚠️ Exists at $SHORT_SHA (benign) |" >> $GITHUB_STEP_SUMMARY + fi else NEW_TAGS+=("$TAG") echo "✅ Tag will be created: $TAG" echo "| $NAME | $VERSION | $TAG | ✅ Will create |" >> $GITHUB_STEP_SUMMARY fi - done + done < <(echo "$TARGETS_JSON" | jq -r '.include[] | select(.key!="noop") | @base64') echo "" >> $GITHUB_STEP_SUMMARY @@ -537,7 +590,7 @@ jobs: echo "" >> $GITHUB_STEP_SUMMARY echo "### ⚠️ Warning: Existing Tags Detected" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "The following tags already exist and will be skipped:" >> $GITHUB_STEP_SUMMARY + echo "The following tags already exist on the remote:" >> $GITHUB_STEP_SUMMARY for tag in "${EXISTING_TAGS[@]}"; do echo "- $tag" >> $GITHUB_STEP_SUMMARY done @@ -546,23 +599,26 @@ jobs: if [ "${{ inputs.dry_run }}" = "false" ]; then if [ "${{ inputs.skip_tag_creation }}" = "true" ]; then echo "**Note:** Tag creation is disabled for this run." >> $GITHUB_STEP_SUMMARY - echo "Components will be published/republished without updating git tags." >> $GITHUB_STEP_SUMMARY + echo "Components will be (re)published, but no git tags will be pushed." >> $GITHUB_STEP_SUMMARY else - echo "**These components will NOT be republished.** Tags are immutable in git." 
>> $GITHUB_STEP_SUMMARY + echo "**Tag behavior with \`create-git-tag\` (SHA-match invariant):**" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "If you need to republish:" >> $GITHUB_STEP_SUMMARY - echo "1. Delete the existing tag: \`git push --delete origin <tag>\`" >> $GITHUB_STEP_SUMMARY - echo "2. Bump the version in the source file" >> $GITHUB_STEP_SUMMARY - echo "3. Run the publish workflow again" >> $GITHUB_STEP_SUMMARY + echo "- If a pre-existing tag points at the SAME commit this run is publishing, the tag step is a no-op (benign skip). The artifact is (re)published; the tag stays." >> $GITHUB_STEP_SUMMARY + echo "- If a pre-existing tag points at a DIFFERENT commit, the tag step hard-fails with recovery instructions. The artifact publish still runs (registries are idempotent) but no tag is pushed." >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "Alternatively, use \`skip_tag_creation: true\` to republish without tags." >> $GITHUB_STEP_SUMMARY + echo "To recover from a wrong-target tag:" >> $GITHUB_STEP_SUMMARY + echo "1. Verify the intended release commit." >> $GITHUB_STEP_SUMMARY + echo "2. Delete the existing tag on origin: \`git push --delete origin <tag>\`" >> $GITHUB_STEP_SUMMARY + echo "3. Rerun this workflow." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "To republish without touching tags, set \`skip_tag_creation: true\` on the workflow dispatch." >> $GITHUB_STEP_SUMMARY fi fi fi if [ ${#NEW_TAGS[@]} -eq 0 ] && [ ${#EXISTING_TAGS[@]} -gt 0 ]; then echo "### ℹ️ No New Tags to Create" >> $GITHUB_STEP_SUMMARY - echo "All specified components have already been tagged. Consider bumping versions if you need to publish new releases." >> $GITHUB_STEP_SUMMARY + echo "All specified components are already tagged at some commit. If tags match this run's commit, the rerun converges cleanly. If not, see the wrong-target recovery above or bump versions."
>> $GITHUB_STEP_SUMMARY elif [ ${#NEW_TAGS[@]} -gt 0 ]; then if [ "${{ inputs.skip_tag_creation }}" = "true" ]; then echo "### ℹ️ Tags That Would Be Created (Skipped)" >> $GITHUB_STEP_SUMMARY @@ -577,6 +633,42 @@ jobs: done fi + # Fail-fast on wrong-target tags. If any tag_pattern resolved to a + # name that already exists on origin at a commit different from + # this run's target, create-git-tag would hard-fail at tag push + # time anyway - but only after spending 20-40 min publishing + # artifacts. Catching it here converts that waste into a fast + # diagnostic at the top of the run. Kept as the LAST thing this + # step does so the operator summary block above is already + # populated before we exit. + if [ "${#WRONG_TARGET_TAGS[@]}" -gt 0 ]; then + { + echo "" + echo "### ❌ Wrong-target tags detected" + echo "" + echo "One or more tags already exist on the remote at a commit DIFFERENT from this run's target (\`${{ needs.validate.outputs.commit }}\`):" + echo "" + for entry in "${WRONG_TARGET_TAGS[@]}"; do + tag="${entry%|*}" + sha="${entry#*|}" + echo "- \`$tag\` currently points at \`$sha\`" + done + echo "" + echo "Failing fast to save 20-40 minutes of wasted publishing work; \`create-git-tag\` would hard-fail at tag push time anyway." + echo "" + echo "Recovery (verify the intended release commit first):" + echo "1. Delete the wrong tag(s) on origin: \`git push --delete origin <tag>\`" + echo "2. Or bump the version(s) and rerun the workflow" + echo "3. Or rerun with \`skip_tag_creation: true\` to republish artifacts only" + } >> $GITHUB_STEP_SUMMARY + echo "❌ Wrong-target tags detected: ${WRONG_TARGET_TAGS[*]}" + echo "See the Wrong-target tags block in the step summary for recovery steps." + if [ "${{ inputs.skip_tag_creation }}" != "true" ]; then + exit 1 + fi + echo "ℹ️ skip_tag_creation=true, continuing despite wrong-target tags (operator opt-out)."
+ fi + build-python-wheels: name: Build Python wheels needs: [validate, plan, check-tags] @@ -602,6 +694,7 @@ jobs: dry_run: ${{ inputs.dry_run }} commit: ${{ needs.validate.outputs.commit }} use_latest_ci: ${{ inputs.use_latest_ci }} + skip_tag_creation: ${{ inputs.skip_tag_creation }} secrets: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} @@ -731,9 +824,12 @@ jobs: - uses: actions/checkout@v4 with: ref: ${{ needs.validate.outputs.commit }} - # Full history so create-git-tag can `git tag -a ` against - # any historical SHA the operator passed in, not just HEAD. - fetch-depth: 0 + # create-git-tag's shallow-safe fallback at action.yml:86-96 + # will `git fetch --no-tags --depth=1 origin ` if the + # commit is not in the local clone. GitHub allows single-commit + # fetches via allowReachableSHA1InWant=true, so fetch-depth:1 + # here is enough - no need to pay for full history. + fetch-depth: 1 - name: Ensure version extractor is executable run: | @@ -763,7 +859,7 @@ jobs: SHOULD_TAG=$(scripts/extract-version.sh "$MATRIX_KEY" --should-tag) if [ "$SHOULD_TAG" = "true" ] \ && [ "$CREATE_EDGE_DOCKER_TAG" = "true" ] \ - && [[ ! "$VERSION" =~ -(edge|rc) ]]; then + && [ "$(scripts/extract-version.sh "$MATRIX_KEY" --is-pre-release)" != "true" ]; then SHOULD_TAG=false fi @@ -933,7 +1029,9 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ needs.validate.outputs.commit }} - fetch-depth: 0 + # create-git-tag falls back to a shallow fetch when the commit + # is missing locally, so fetch-depth:1 is sufficient. + fetch-depth: 1 - name: Save and apply latest CI from master if: inputs.use_latest_ci @@ -953,8 +1051,13 @@ jobs: run: | test -x scripts/extract-version.sh || chmod +x scripts/extract-version.sh + # matrix.type == 'rust' is dead: Rust crates are routed to the + # dedicated publish-rust-crates reusable workflow and never reach + # this SDK matrix (see plan job's hasRustCrates split at L349-354). 
+ # Python uses maturin which depends on Rust, so python rows still + # need the toolchain. - name: Setup Rust toolchain (if needed) - if: matrix.type == 'rust' || matrix.type == 'python' + if: matrix.type == 'python' uses: ./.github/actions/utils/setup-rust-with-cache - name: Debug matrix @@ -966,6 +1069,7 @@ jobs: env: MATRIX_KEY: ${{ matrix.key }} MATRIX_TAG_PATTERN: ${{ matrix.tag_pattern }} + CREATE_EDGE_DOCKER_TAG: ${{ inputs.create_edge_docker_tag }} run: | set -euo pipefail VERSION=$(scripts/extract-version.sh "$MATRIX_KEY") @@ -977,6 +1081,22 @@ jobs: fi # Single source of truth for the SNAPSHOT/no-tag-pattern skip rule. SHOULD_TAG=$(scripts/extract-version.sh "$MATRIX_KEY" --should-tag) + + # Symmetric auto-publish stable-skip override matching + # docker-manifests at L786-791. In auto-publish mode + # (create_edge_docker_tag=true), stable versions never get + # versioned git tags, only the rolling :edge Docker tag. Latent + # today because post-merge.yml filters stable SDK versions out + # of publish_other before calling publish.yml, but the symmetry + # with docker-manifests protects against future auto-publish + # callers that do not pre-filter and keeps the two matrices + # aligned on taggability rules. + if [ "$SHOULD_TAG" = "true" ] \ + && [ "$CREATE_EDGE_DOCKER_TAG" = "true" ] \ + && [ "$(scripts/extract-version.sh "$MATRIX_KEY" --is-pre-release)" != "true" ]; then + SHOULD_TAG=false + fi + { echo "version=$VERSION" echo "tag=$TAG" @@ -1071,16 +1191,51 @@ jobs: max_attempts: "15" initial_sleep_seconds: "3" - # Maven Central propagation via the Central Portal is usually minutes - # but has a long tail; budget is ~25 minutes of real wall time (cap 30s - # sleep × ~50 attempts). `iggy-.pom` is the lightest per-version - # URL that only appears once the artifact is fully indexed. + # Java publishes to ASF Nexus staging via `./gradlew publish` + # (repository.apache.org/service/local/staging/deploy/maven2). 
The + # staging -> Maven Central handoff requires a Nexus Close+Release + # action which in the Apache governance model is operator-driven + # (often behind a dev@ release vote). No in-tree automation performs + # that handoff today. wait-for-url here polls the downstream Maven + # Central mirror (repo1.maven.org), which will not serve the + # artifact until Close+Release completes. + # + # CRITICAL DESIGN CARVE-OUT — DO NOT REMOVE `continue-on-error: true` + # BELOW without reading this: + # + # This wait step is the ONLY wait gate in the publish chain that uses + # `continue-on-error: true`, and the `Tag SDK release` step further + # down uses `if: success()`. GitHub Actions step semantics: + # continue-on-error=true on a failing step + # -> step outcome = failure + # -> step conclusion = success + # -> downstream `if: success()` STILL evaluates true + # So when Maven Central has not caught up within the ~24-minute + # budget (Central propagation is frequently slow, rarely >45 min), + # the Java git tag is STILL pushed after `gradle publish` returned + # 0 at the staging step above. This matches pre-PR behavior (the + # Java tag was always pushed after `gradle publish` returned 0) + # and is INTENTIONAL: without the carve-out, every Java release + # would time-out at this wait and the operator would have to push + # the tag manually — a worse UX than the current best-effort + # shape, and still subject to the same invariant hole anyway. + # + # Follow-up to remove the carve-out: automate staging -> Central + # via `io.github.gradle-nexus.publish-plugin` with + # `closeAndReleaseStagingRepositories`, then drop the + # continue-on-error below and the Java path will match the + # PyPI/npm/NuGet "wait then tag" shape. This is how Kafka / Camel + # / Pulsar / Beam handle it. Blocked on confirming ASF governance + # allows automated promotion for iggy. 
+ # TODO(#NNNN): track the gradle-nexus-publish-plugin adoption and + # drop `continue-on-error` from the step below once it lands. - name: Wait for Maven Central availability if: | success() && inputs.dry_run == false && matrix.type == 'java' && steps.ver.outputs.should_tag == 'true' + continue-on-error: true uses: ./.github/actions/utils/wait-for-url with: url: https://repo1.maven.org/maven2/org/apache/iggy/iggy/${{ steps.ver.outputs.version }}/iggy-${{ steps.ver.outputs.version }}.pom diff --git a/scripts/extract-version.sh b/scripts/extract-version.sh index 8b747800ec..47f92f9d79 100755 --- a/scripts/extract-version.sh +++ b/scripts/extract-version.sh @@ -115,6 +115,27 @@ extract_cargo_version() { cd "$REPO_ROOT" + # Caller-provided cache: if IGGY_CARGO_METADATA_FILE points at a + # readable file containing `cargo metadata --no-deps --format-version=1` + # JSON, use it instead of re-forking cargo. This is the fast path used + # by .github/workflows/_publish_rust_crates.yml's Extract versions and + # tags step, which needs 8 version lookups against the same workspace + # snapshot and would otherwise pay the cargo metadata cost 8 times. + # File-based (not env-var-based) because cargo metadata for a 36-crate + # workspace is ~220 KB, which exceeds Linux's per-env-var limit + # MAX_ARG_STRLEN (128 KB) and would fail with E2BIG on exec(). 
+ if [[ -n "${IGGY_CARGO_METADATA_FILE:-}" ]] && [[ -r "${IGGY_CARGO_METADATA_FILE}" ]] \ + && command -v jq &> /dev/null; then + local version + version=$(jq -r --arg pkg "$package" \ + '.packages[] | select(.name == $pkg) | .version' \ + "${IGGY_CARGO_METADATA_FILE}" | head -1) + if [[ -n "$version" ]]; then + echo "$version" + return 0 + fi + fi + if command -v cargo &> /dev/null && command -v jq &> /dev/null; then local version version=$(cargo metadata --no-deps --format-version=1 2>/dev/null | \ @@ -253,6 +274,7 @@ handle_check() { COMPONENT="" RETURN_TAG=false RETURN_SHOULD_TAG=false +RETURN_IS_PRE_RELEASE=false # Detect mode flags as first argument only case "${1:-}" in @@ -274,6 +296,10 @@ while [[ $# -gt 0 ]]; do RETURN_SHOULD_TAG=true shift ;; + --is-pre-release) + RETURN_IS_PRE_RELEASE=true + shift + ;; *) echo "Unknown option: $1" >&2 exit 1 @@ -281,21 +307,31 @@ while [[ $# -gt 0 ]]; do esac done -if [[ "$RETURN_TAG" == "true" && "$RETURN_SHOULD_TAG" == "true" ]]; then - echo "Error: --tag and --should-tag are mutually exclusive" >&2 +mutex_count=0 +[[ "$RETURN_TAG" == "true" ]] && mutex_count=$((mutex_count + 1)) +[[ "$RETURN_SHOULD_TAG" == "true" ]] && mutex_count=$((mutex_count + 1)) +[[ "$RETURN_IS_PRE_RELEASE" == "true" ]] && mutex_count=$((mutex_count + 1)) +if [[ $mutex_count -gt 1 ]]; then + echo "Error: --tag, --should-tag, and --is-pre-release are mutually exclusive" >&2 exit 1 fi if [[ -z "$COMPONENT" ]]; then - echo "Usage: $0 <component> [--tag|--should-tag]" >&2 + echo "Usage: $0 <component> [--tag|--should-tag|--is-pre-release]" >&2 echo " $0 --all" >&2 echo " $0 --check" >&2 echo "" >&2 - echo " --tag Print the git tag this component would use for its current version." >&2 - echo " --should-tag Print 'true' if the current version should produce a git tag, 'false'" >&2 - echo " otherwise (SNAPSHOT or missing tag_pattern). This is the SINGLE" >&2 - echo " source of truth for taggability; publish.yml consults it for every" >&2 - echo " SDK matrix row."
>&2 + echo " --tag Print the git tag this component would use for its current version." >&2 + echo " --should-tag Print 'true' if the current version should produce a git tag, 'false'" >&2 + echo " otherwise (SNAPSHOT or missing tag_pattern). This is the SINGLE" >&2 + echo " source of truth for taggability; publish.yml consults it for every" >&2 + echo " SDK matrix row." >&2 + echo " --is-pre-release Print 'true' if the current version is a pre-release/pre-stable" >&2 + echo " marker across ANY SDK version scheme (-edge, -rc, .devN, bare rcN)," >&2 + echo " 'false' otherwise. SINGLE source of truth for the auto-publish and" >&2 + echo " stable-Docker skip rules in post-merge.yml and publish.yml." >&2 + echo "" >&2 + echo " --tag, --should-tag, and --is-pre-release are mutually exclusive." >&2 echo "" >&2 echo "Available components:" >&2 yq eval '.components | keys | .[]' "$CONFIG_FILE" | sed 's/^/ - /' >&2 @@ -357,6 +393,32 @@ if [[ "$RETURN_SHOULD_TAG" == "true" ]]; then exit 0 fi +# --is-pre-release: returns "true" for versions that are pre-release/ +# pre-stable markers across ALL SDK version schemes we publish. This is +# THE SINGLE SOURCE OF TRUTH for the "is this a pre-release" rule. +# post-merge.yml uses it to decide whether to auto-publish; publish.yml +# uses it for the auto-publish stable-Docker skip rule. Keeping one +# regex here prevents the two call sites from drifting (which they +# previously did - post-merge.yml accepted `.devN` and bare `rcN` while +# publish.yml only accepted `-edge`/`-rc`, so a Python SDK `.devN` +# version would be auto-published to PyPI but never git-tagged). 
+# +# Matches (any of): +# -edge[.N] (rust crates, docker, node SDK) +# -rc[.N] (all SDKs) +# .devN (Python SDK PEP 440 development markers) +# rcN$ (legacy bare rcN, retained for compatibility) +if [[ "$RETURN_IS_PRE_RELEASE" == "true" ]]; then + if [[ "$VERSION" =~ -(edge|rc) ]] \ + || [[ "$VERSION" =~ \.dev[0-9]+$ ]] \ + || [[ "$VERSION" =~ rc[0-9]+$ ]]; then + echo "true" + else + echo "false" + fi + exit 0 +fi + # Return tag or version based on flag if [[ "$RETURN_TAG" == "true" ]]; then TAG_PATTERN=$(get_config "$COMPONENT" "tag_pattern")