From f3238fca1baba1ad32aff28a68f9871c1887bd7a Mon Sep 17 00:00:00 2001
From: teor
Date: Fri, 6 Oct 2023 15:11:15 +1000
Subject: [PATCH 1/8] fix(ci): Replace busybox with ubuntu to avoid "device or
 resource busy" failures (#7686)

---
 .github/workflows/deploy-gcp-tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 65cd9c93bf8..905a8a3d148 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -171,7 +171,7 @@ jobs:
         --image-project=cos-cloud \
         --image-family=cos-stable \
         --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \
-        --container-image=gcr.io/google-containers/busybox \
+        --container-image=gcr.io/google-containers/ubuntu \
         --machine-type ${{ vars.GCP_LARGE_MACHINE }} \
         --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
         --scopes cloud-platform \
@@ -373,7 +373,7 @@ jobs:
         --image-project=cos-cloud \
         --image-family=cos-stable \
         --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \
-        --container-image=gcr.io/google-containers/busybox \
+        --container-image=gcr.io/google-containers/ubuntu \
         --machine-type ${{ vars.GCP_LARGE_MACHINE }} \
         --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
         --scopes cloud-platform \

From 5c3a02a1d0d5b8225110de6264c98f7144a911fe Mon Sep 17 00:00:00 2001
From: Gustavo Valverde
Date: Fri, 6 Oct 2023 14:00:57 +0100
Subject: [PATCH 2/8] fix(ci): disk validation for docker volume mount (#7665)

* fix(ci): disk validation for docker volume mount

* Use a symlink for lightwalletd cached state rather than mounting the same volume twice

* Avoid "sdb seems to be busy" errors from docker by adding extra sleeps

* Add a missing backslash

* Remove symlink from workflow

* Symlink lightwalletd path in entrypoint.sh

* Retry on failure and check Docker logs

* End ssh shell lines with explicit terminators

* Delete Docker containers if Docker mount fails

* Revert symlink changes in entrypoint.sh

* Debug using lsof

* Use correct lsof commands

* Use correct syntax for lsof +D

* fix(ci): make multiple validations before mounting

The new check loops until three conditions hold: the device `/dev/sdb`
exists, no process is using the device `/dev/sdb`, and no process is
using the Docker volume directory.

* fix: do not pre-mount docker volume

The Docker version available with the newer `cos-stable` OS
(https://cloud.google.com/release-notes#cos-109-17800-0-45) allows the
disk to be mounted when the container is run. Mounting it beforehand
makes the disk unavailable.
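As an illustration, the approach now looks roughly like this (a simplified
sketch; the device, destination path, and image name are placeholders here,
where the real workflow uses the `${{ inputs.test_id }}` naming shown in the
diff below):

    # wait until nothing is holding the raw device, then format it
    while sudo lsof /dev/sdb; do
      echo 'Waiting for /dev/sdb to be free...'
      sleep 10
    done
    sudo mkfs.ext4 -v /dev/sdb

    # let docker create and mount the volume at run time,
    # instead of pre-creating it with `docker volume create`
    sudo docker run \
      --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=/var/cache/zebrad-cache \
      zebrad-test-image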
* fix: remove extra `;` * fix: just confirm with `lsof` and show it's output * chore: reduce diff --------- Co-authored-by: teor --- .github/workflows/deploy-gcp-tests.yml | 55 ++++++++------------------ 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 905a8a3d148..b1242ce4c4d 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -171,7 +171,7 @@ jobs: --image-project=cos-cloud \ --image-family=cos-stable \ --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image=gcr.io/google-containers/ubuntu \ + --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ --scopes cloud-platform \ @@ -180,12 +180,9 @@ jobs: --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \ --tags ${{ inputs.app_name }} \ --zone ${{ vars.GCP_ZONE }} - sleep 60 - # Create a docker volume with the new disk we just created. - # - # SSH into the just created VM, and create a docker volume with the newly created disk. - - name: Create ${{ inputs.test_id }} Docker volume + # Format the mounted disk if the test doesn't use a cached state. + - name: Format ${{ inputs.test_id }} volume run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -194,10 +191,11 @@ jobs: --ssh-flag="-o ConnectTimeout=5" \ --command \ "\ + while sudo lsof /dev/sdb; do \ + echo 'Waiting for /dev/sdb to be free...'; \ + sleep 10; \ + done; \ sudo mkfs.ext4 -v /dev/sdb \ - && \ - sudo docker volume create --driver local --opt type=ext4 --opt device=/dev/sdb \ - ${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \ " # Launch the test without any cached state @@ -215,7 +213,7 @@ jobs: --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ " @@ -296,6 +294,7 @@ jobs: - name: Find ${{ inputs.test_id }} cached state disk id: get-disk-name run: | + set -x LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "$GITHUB_WORKSPACE/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1) echo "STATE_VERSION: $LOCAL_STATE_VERSION" @@ -373,7 +372,7 @@ jobs: --image-project=cos-cloud \ --image-family=cos-stable \ --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image=gcr.io/google-containers/ubuntu \ + --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ --scopes cloud-platform \ @@ -384,24 +383,6 @@ jobs: --zone ${{ vars.GCP_ZONE }} sleep 60 - # Create a docker volume with the selected cached state. - # - # SSH into the just created VM and create a docker volume with the recently attached disk. 
- # (The cached state and disk are usually the same size, - # but the cached state can be smaller if we just increased the disk size.) - - name: Create ${{ inputs.test_id }} Docker volume - run: | - gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ - --zone ${{ vars.GCP_ZONE }} \ - --ssh-flag="-o ServerAliveInterval=5" \ - --ssh-flag="-o ConnectionAttempts=20" \ - --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ - sudo docker volume create --driver local --opt type=ext4 --opt device=/dev/sdb \ - ${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \ - " - # Launch the test with the previously created Zebra-only cached state. # Each test runs one of the "Launch test" steps, and skips the other. # @@ -432,14 +413,12 @@ jobs: --ssh-flag="-o ConnectTimeout=5" \ --command \ "\ - # Wait for the disk to be attached - while [[ ! -e /dev/sdb ]]; do sleep 1; done && \ sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ " @@ -453,11 +432,13 @@ jobs: # VM and to the container might require more steps in this workflow, and additional # considerations. # - # The disk mounted in the VM is located at /dev/sdb, we mount the root `/` of this disk to the docker - # container in two different paths: + # The disk mounted in the VM is located at /dev/sdb, we want the root `/` of this disk to be + # available in the docker container at two different paths: # - /var/cache/zebrad-cache -> ${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} -> $ZEBRA_CACHED_STATE_DIR # - /var/cache/lwd-cache -> ${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} -> $LIGHTWALLETD_DATA_DIR # + # Currently we do this by mounting the same disk at both paths. + # # This doesn't cause any path conflicts, because Zebra and lightwalletd create different # subdirectories for their data. (But Zebra, lightwalletd, and the test harness must not # delete the whole cache directory.) @@ -482,15 +463,13 @@ jobs: --ssh-flag="-o ConnectTimeout=5" \ --command \ "\ - # Wait for the disk to be attached - while [[ ! 
-e /dev/sdb ]]; do sleep 1; done && \ sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ " From f6346eb889202223f65c97f4fe1235f3430468cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 8 Oct 2023 23:10:18 +0000 Subject: [PATCH 3/8] build(deps): bump tj-actions/changed-files from 39.2.0 to 39.2.1 (#7668) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 39.2.0 to 39.2.1. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v39.2.0...v39.2.1) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/lint.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7373290d9a8..53faebe7ca3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -37,7 +37,7 @@ jobs: - name: Rust files id: changed-files-rust - uses: tj-actions/changed-files@v39.2.0 + uses: tj-actions/changed-files@v39.2.1 with: files: | **/*.rs @@ -49,7 +49,7 @@ jobs: - name: Workflow files id: changed-files-workflows - uses: tj-actions/changed-files@v39.2.0 + uses: tj-actions/changed-files@v39.2.1 with: files: | .github/workflows/*.yml From f6bba8c88e63dec1597dbd46855fbb2cfa74e521 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 8 Oct 2023 23:26:10 +0000 Subject: [PATCH 4/8] build(deps): bump reviewdog/action-actionlint from 1.39.0 to 1.39.1 (#7669) Bumps [reviewdog/action-actionlint](https://github.com/reviewdog/action-actionlint) from 1.39.0 to 1.39.1. - [Release notes](https://github.com/reviewdog/action-actionlint/releases) - [Commits](https://github.com/reviewdog/action-actionlint/compare/v1.39.0...v1.39.1) --- updated-dependencies: - dependency-name: reviewdog/action-actionlint dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 53faebe7ca3..80ee6d8e181 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -144,7 +144,7 @@ jobs: steps: - uses: actions/checkout@v4.0.0 - name: actionlint - uses: reviewdog/action-actionlint@v1.39.0 + uses: reviewdog/action-actionlint@v1.39.1 with: level: warning fail_on_error: false From c498eee67f18addf316e92f4f69ff5d26429966b Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Sun, 8 Oct 2023 23:01:51 -0300 Subject: [PATCH 5/8] change(ci): Create automatic tickets on CI failure for more workflows (#7620) * add buld failures ticket generation to most important workflows * add missing newline at the end of the file * doc fixes * add missing testnet build check Co-authored-by: teor --------- Co-authored-by: teor --- .../workflows/build-crates-individually.yml | 18 +++++++++++++++++ .github/workflows/continous-delivery.yml | 18 +++++++++++++++++ .../continous-integration-docker.yml | 8 +++----- .../workflows/continous-integration-os.yml | 18 +++++++++++++++++ .github/workflows/release-binaries.yml | 17 ++++++++++++++++ .github/workflows/release-crates-io.yml | 20 ++++++++++++++++++- 6 files changed, 93 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-crates-individually.yml b/.github/workflows/build-crates-individually.yml index 505d9796921..8bbf4765c7d 100644 --- a/.github/workflows/build-crates-individually.yml +++ b/.github/workflows/build-crates-individually.yml @@ -138,3 +138,21 @@ jobs: - name: Build ${{ matrix.crate }} crate with all features run: | cargo build --package ${{ matrix.crate }} --all-features + + failure-issue: + name: Open or update issues for building crates individually failures + # When a new job is added to this workflow, add it to this list. + needs: [ matrix, build ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) + if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-build-crates-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/continous-delivery.yml b/.github/workflows/continous-delivery.yml index 1d1c6efba11..3bc314beb97 100644 --- a/.github/workflows/continous-delivery.yml +++ b/.github/workflows/continous-delivery.yml @@ -382,3 +382,21 @@ jobs: --labels=app=zebrad,environment=qa,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }} \ --tags zebrad \ --zone ${{ vars.GCP_ZONE }} + + failure-issue: + name: Open or update issues for release failures + # When a new job is added to this workflow, add it to this list. + needs: [ versioning, build, test-configuration-file, deploy-nodes, deploy-instance ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) 
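+ # `github.event.pull_request` is only set on pull request events, so runs triggered by PRs never match the condition below.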
+ if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-release-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/continous-integration-docker.yml b/.github/workflows/continous-integration-docker.yml index 492c9556c86..aeafb9a0bea 100644 --- a/.github/workflows/continous-integration-docker.yml +++ b/.github/workflows/continous-integration-docker.yml @@ -790,18 +790,16 @@ jobs: # This list is for reliable tests that are run on the `main` branch. # Testnet jobs are not in this list, because we expect testnet to fail occasionally. needs: [ regenerate-stateful-disks, test-full-sync, lightwalletd-full-sync, test-all, test-all-getblocktemplate-rpcs, test-fake-activation-heights, test-empty-sync, test-lightwalletd-integration, test-configuration-file, test-zebra-conf-path, test-stateful-sync, test-update-sync, generate-checkpoints-mainnet, lightwalletd-update-sync, lightwalletd-rpc-test, lightwalletd-transactions-test, lightwalletd-grpc-test, get-block-template-test, submit-block-test ] - # Only open tickets for failed scheduled jobs, manual workflow runs, or `main` branch merges. + # Only open tickets for failed or cancelled jobs that are not coming from PRs. # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) - # TODO: if a job times out, we want to create a ticket. Does failure() do that? Or do we need cancelled()? - if: failure() && github.event.pull_request == null + if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) runs-on: ubuntu-latest steps: - uses: jayqi/failed-build-issue-action@v1 with: title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" # New failures open an issue with this label. - # TODO: do we want a different label for each workflow, or each kind of workflow? - label-name: S-ci-fail-auto-issue + label-name: S-ci-fail-main-branch-auto-issue # If there is already an open issue with this label, any failures become comments on that issue. always-create-new-issue: false github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/continous-integration-os.yml b/.github/workflows/continous-integration-os.yml index 3266f24e561..4f33c0206d6 100644 --- a/.github/workflows/continous-integration-os.yml +++ b/.github/workflows/continous-integration-os.yml @@ -319,3 +319,21 @@ jobs: else echo "No unused dependencies found." fi + + failure-issue: + name: Open or update issues for OS integration failures + # When a new job is added to this workflow, add it to this list. + needs: [ test, install-from-lockfile-no-cache, check-cargo-lock, cargo-deny, unused-deps ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) 
+ if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-os-integration-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index a96c15c2867..2fe2243e0b5 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -43,3 +43,20 @@ jobs: rust_log: info # This step needs access to Docker Hub secrets to run successfully secrets: inherit + + failure-issue: + name: Open or update issues for release binaries failures + # When a new job is added to this workflow, add it to this list. + needs: [ build, build-mining-testnet ] + # Open tickets for any failed build in this workflow. + if: failure() || cancelled() + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-binaries-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-crates-io.yml b/.github/workflows/release-crates-io.yml index e4651556300..94535fccba6 100644 --- a/.github/workflows/release-crates-io.yml +++ b/.github/workflows/release-crates-io.yml @@ -114,7 +114,6 @@ jobs: # TODO: check all crates after fixing these errors cargo release publish --verbose --dry-run --allow-branch '*' --workspace --exclude zebra-consensus --exclude zebra-utils --exclude zebrad - # TODO: actually do the release here #release-crates: # name: Release Zebra Crates @@ -123,3 +122,22 @@ jobs: # timeout-minutes: 30 # if: ${{ !cancelled() && !failure() && github.event_name == 'release' }} # steps: + # ... + + failure-issue: + name: Open or update issues for release crates failures + # When a new job is added to this workflow, add it to this list. + needs: [ check-release ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) + if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-release-crates-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. 
+ always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} From 1d45938e0f41d0fd82eb5efe10bee8e1a043abdc Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Sun, 8 Oct 2023 23:02:04 -0300 Subject: [PATCH 6/8] fix(note-commitment-trees): Populate subtrees (#7636) * add `sapling_subtree_for_tip` and `orchard_subtree_for_tip` methods to `ZebraDb` * add methods for non finalized state, move functions * call `zs_last_key_value` the right way * fix and simplify `*_subtree_for_tip` methods Co-authored-by: Arya * apply filter * rename all tree and subtree methods that use tip * rename tip tree and subtree methods in non finalized chain * apply simplify suggestions Co-authored-by: teor --------- Co-authored-by: Arya Co-authored-by: teor --- zebra-state/src/service/finalized_state.rs | 4 +- .../disk_format/upgrade/add_subtrees.rs | 18 ++--- .../zebra_db/block/tests/snapshot.rs | 6 +- .../finalized_state/zebra_db/shielded.rs | 72 +++++++++++++++---- .../src/service/non_finalized_state.rs | 6 +- .../src/service/non_finalized_state/chain.rs | 40 ++++++++--- .../service/non_finalized_state/tests/prop.rs | 12 ++-- .../non_finalized_state/tests/vectors.rs | 8 +-- 8 files changed, 116 insertions(+), 50 deletions(-) diff --git a/zebra-state/src/service/finalized_state.rs b/zebra-state/src/service/finalized_state.rs index 3f4e0d4dd0f..acafda7b2f8 100644 --- a/zebra-state/src/service/finalized_state.rs +++ b/zebra-state/src/service/finalized_state.rs @@ -242,8 +242,8 @@ impl FinalizedState { let block = checkpoint_verified.block.clone(); let mut history_tree = self.db.history_tree(); - let prev_note_commitment_trees = - prev_note_commitment_trees.unwrap_or_else(|| self.db.note_commitment_trees()); + let prev_note_commitment_trees = prev_note_commitment_trees + .unwrap_or_else(|| self.db.note_commitment_trees_for_tip()); // Update the note commitment trees. 
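        // (This clone is cheap: the trees inside `NoteCommitmentTrees` are behind `Arc`s, so only the pointers are copied.)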
let mut note_commitment_trees = prev_note_commitment_trees.clone(); diff --git a/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs b/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs index 35209b463d2..72dc4d55c90 100644 --- a/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs +++ b/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs @@ -207,12 +207,13 @@ fn quick_check_sapling_subtrees(db: &ZebraDb) -> Result<(), &'static str> { return Ok(()); } - let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = db.sapling_tree().subtree_index() + let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = + db.sapling_tree_for_tip().subtree_index() else { return Ok(()); }; - if tip_subtree_index == 0 && !db.sapling_tree().is_complete_subtree() { + if tip_subtree_index == 0 && !db.sapling_tree_for_tip().is_complete_subtree() { return Ok(()); } @@ -260,12 +261,13 @@ fn quick_check_orchard_subtrees(db: &ZebraDb) -> Result<(), &'static str> { return Ok(()); } - let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = db.orchard_tree().subtree_index() + let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = + db.orchard_tree_for_tip().subtree_index() else { return Ok(()); }; - if tip_subtree_index == 0 && !db.orchard_tree().is_complete_subtree() { + if tip_subtree_index == 0 && !db.orchard_tree_for_tip().is_complete_subtree() { return Ok(()); } @@ -333,13 +335,13 @@ fn check_sapling_subtrees( cancel_receiver: &mpsc::Receiver, ) -> Result, CancelFormatChange> { let Some(NoteCommitmentSubtreeIndex(mut first_incomplete_subtree_index)) = - db.sapling_tree().subtree_index() + db.sapling_tree_for_tip().subtree_index() else { return Ok(Ok(())); }; // If there are no incomplete subtrees in the tree, also expect a subtree for the final index. - if db.sapling_tree().is_complete_subtree() { + if db.sapling_tree_for_tip().is_complete_subtree() { first_incomplete_subtree_index += 1; } @@ -463,13 +465,13 @@ fn check_orchard_subtrees( cancel_receiver: &mpsc::Receiver, ) -> Result, CancelFormatChange> { let Some(NoteCommitmentSubtreeIndex(mut first_incomplete_subtree_index)) = - db.orchard_tree().subtree_index() + db.orchard_tree_for_tip().subtree_index() else { return Ok(Ok(())); }; // If there are no incomplete subtrees in the tree, also expect a subtree for the final index. - if db.orchard_tree().is_complete_subtree() { + if db.orchard_tree_for_tip().is_complete_subtree() { first_incomplete_subtree_index += 1; } diff --git a/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs b/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs index 2754cd69c3a..6fc96f8dff2 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs @@ -247,9 +247,9 @@ fn snapshot_block_and_transaction_data(state: &FinalizedState) { let mut stored_sapling_trees = Vec::new(); let mut stored_orchard_trees = Vec::new(); - let sprout_tree_at_tip = state.sprout_tree(); - let sapling_tree_at_tip = state.sapling_tree(); - let orchard_tree_at_tip = state.orchard_tree(); + let sprout_tree_at_tip = state.sprout_tree_for_tip(); + let sapling_tree_at_tip = state.sapling_tree_for_tip(); + let orchard_tree_at_tip = state.orchard_tree_for_tip(); // Test the history tree. 
// diff --git a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs index fc0cca9d5a4..75b5db8da64 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs @@ -83,7 +83,7 @@ impl ZebraDb { /// Returns the Sprout note commitment tree of the finalized tip /// or the empty tree if the state is empty. - pub fn sprout_tree(&self) -> Arc { + pub fn sprout_tree_for_tip(&self) -> Arc { if self.is_empty() { return Arc::::default(); } @@ -161,7 +161,7 @@ impl ZebraDb { /// Returns the Sapling note commitment tree of the finalized tip or the empty tree if the state /// is empty. - pub fn sapling_tree(&self) -> Arc { + pub fn sapling_tree_for_tip(&self) -> Arc { let height = match self.finalized_tip_height() { Some(h) => h, None => return Default::default(), @@ -303,11 +303,32 @@ impl ZebraDb { } } + /// Get the sapling note commitment subtress for the finalized tip. + #[allow(clippy::unwrap_in_result)] + fn sapling_subtree_for_tip(&self) -> Option> { + let sapling_subtrees = self + .db + .cf_handle("sapling_note_commitment_subtree") + .unwrap(); + + let (index, subtree_data): ( + NoteCommitmentSubtreeIndex, + NoteCommitmentSubtreeData, + ) = self.db.zs_last_key_value(&sapling_subtrees)?; + + let tip_height = self.finalized_tip_height()?; + if subtree_data.end != tip_height { + return None; + } + + Some(subtree_data.with_index(index)) + } + // Orchard trees /// Returns the Orchard note commitment tree of the finalized tip or the empty tree if the state /// is empty. - pub fn orchard_tree(&self) -> Arc { + pub fn orchard_tree_for_tip(&self) -> Arc { let height = match self.finalized_tip_height() { Some(h) => h, None => return Default::default(), @@ -449,15 +470,38 @@ impl ZebraDb { } } + /// Get the orchard note commitment subtress for the finalized tip. + #[allow(clippy::unwrap_in_result)] + fn orchard_subtree_for_tip(&self) -> Option> { + let orchard_subtrees = self + .db + .cf_handle("orchard_note_commitment_subtree") + .unwrap(); + + let (index, subtree_data): ( + NoteCommitmentSubtreeIndex, + NoteCommitmentSubtreeData, + ) = self.db.zs_last_key_value(&orchard_subtrees)?; + + let tip_height = self.finalized_tip_height()?; + if subtree_data.end != tip_height { + return None; + } + + Some(subtree_data.with_index(index)) + } + /// Returns the shielded note commitment trees of the finalized tip /// or the empty trees if the state is empty. - pub fn note_commitment_trees(&self) -> NoteCommitmentTrees { + /// Additionally, returns the sapling and orchard subtrees for the finalized tip if + /// the current subtree is finalizing in the tip, None otherwise. + pub fn note_commitment_trees_for_tip(&self) -> NoteCommitmentTrees { NoteCommitmentTrees { - sprout: self.sprout_tree(), - sapling: self.sapling_tree(), - sapling_subtree: None, - orchard: self.orchard_tree(), - orchard_subtree: None, + sprout: self.sprout_tree_for_tip(), + sapling: self.sapling_tree_for_tip(), + sapling_subtree: self.sapling_subtree_for_tip(), + orchard: self.orchard_tree_for_tip(), + orchard_subtree: self.orchard_subtree_for_tip(), } } } @@ -571,10 +615,10 @@ impl DiskWriteBatch { // Store the Sapling tree only if it is not already present at the previous height. 
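        // (The `!=` below compares the trees behind the `Arc`s via `PartialEq`, not the pointer values.)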
if height.is_min() - || prev_note_commitment_trees - .as_ref() - .map_or_else(|| zebra_db.sapling_tree(), |trees| trees.sapling.clone()) - != trees.sapling + || prev_note_commitment_trees.as_ref().map_or_else( + || zebra_db.sapling_tree_for_tip(), + |trees| trees.sapling.clone(), + ) != trees.sapling { self.zs_insert(&sapling_tree_cf, height, trees.sapling); } @@ -582,7 +626,7 @@ impl DiskWriteBatch { // Store the Orchard tree only if it is not already present at the previous height. if height.is_min() || prev_note_commitment_trees - .map_or_else(|| zebra_db.orchard_tree(), |trees| trees.orchard) + .map_or_else(|| zebra_db.orchard_tree_for_tip(), |trees| trees.orchard) != trees.orchard { self.zs_insert(&orchard_tree_cf, height, trees.orchard); diff --git a/zebra-state/src/service/non_finalized_state.rs b/zebra-state/src/service/non_finalized_state.rs index 6b303360b6f..bc342e5be9a 100644 --- a/zebra-state/src/service/non_finalized_state.rs +++ b/zebra-state/src/service/non_finalized_state.rs @@ -284,9 +284,9 @@ impl NonFinalizedState { let chain = Chain::new( self.network, finalized_tip_height, - finalized_state.sprout_tree(), - finalized_state.sapling_tree(), - finalized_state.orchard_tree(), + finalized_state.sprout_tree_for_tip(), + finalized_state.sapling_tree_for_tip(), + finalized_state.orchard_tree_for_tip(), finalized_state.history_tree(), finalized_state.finalized_value_pool(), ); diff --git a/zebra-state/src/service/non_finalized_state/chain.rs b/zebra-state/src/service/non_finalized_state/chain.rs index 670901ba9af..0297c93f67d 100644 --- a/zebra-state/src/service/non_finalized_state/chain.rs +++ b/zebra-state/src/service/non_finalized_state/chain.rs @@ -504,7 +504,7 @@ impl Chain { /// # Panics /// /// If this chain has no sprout trees. (This should be impossible.) - pub fn sprout_note_commitment_tree(&self) -> Arc { + pub fn sprout_note_commitment_tree_for_tip(&self) -> Arc { self.sprout_trees_by_height .last_key_value() .expect("only called while sprout_trees_by_height is populated") @@ -668,7 +668,7 @@ impl Chain { /// # Panics /// /// If this chain has no sapling trees. (This should be impossible.) - pub fn sapling_note_commitment_tree(&self) -> Arc { + pub fn sapling_note_commitment_tree_for_tip(&self) -> Arc { self.sapling_trees_by_height .last_key_value() .expect("only called while sapling_trees_by_height is populated") @@ -737,6 +737,16 @@ impl Chain { .collect() } + /// Returns the Sapling [`NoteCommitmentSubtree`] if it was completed at the tip height. + pub fn sapling_subtree_for_tip(&self) -> Option> { + if !self.is_empty() { + let tip = self.non_finalized_tip_height(); + self.sapling_subtree(tip.into()) + } else { + None + } + } + /// Adds the Sapling `tree` to the tree and anchor indexes at `height`. /// /// `height` can be either: @@ -869,7 +879,7 @@ impl Chain { /// # Panics /// /// If this chain has no orchard trees. (This should be impossible.) - pub fn orchard_note_commitment_tree(&self) -> Arc { + pub fn orchard_note_commitment_tree_for_tip(&self) -> Arc { self.orchard_trees_by_height .last_key_value() .expect("only called while orchard_trees_by_height is populated") @@ -939,6 +949,16 @@ impl Chain { .collect() } + /// Returns the Orchard [`NoteCommitmentSubtree`] if it was completed at the tip height. + pub fn orchard_subtree_for_tip(&self) -> Option> { + if !self.is_empty() { + let tip = self.non_finalized_tip_height(); + self.orchard_subtree(tip.into()) + } else { + None + } + } + /// Adds the Orchard `tree` to the tree and anchor indexes at `height`. 
/// /// `height` can be either: @@ -1387,11 +1407,11 @@ impl Chain { // Prepare data for parallel execution let mut nct = NoteCommitmentTrees { - sprout: self.sprout_note_commitment_tree(), - sapling: self.sapling_note_commitment_tree(), - sapling_subtree: None, - orchard: self.orchard_note_commitment_tree(), - orchard_subtree: None, + sprout: self.sprout_note_commitment_tree_for_tip(), + sapling: self.sapling_note_commitment_tree_for_tip(), + sapling_subtree: self.sapling_subtree_for_tip(), + orchard: self.orchard_note_commitment_tree_for_tip(), + orchard_subtree: self.orchard_subtree_for_tip(), }; let mut tree_result = None; @@ -1427,8 +1447,8 @@ impl Chain { .insert(subtree.index, subtree.into_data()); } - let sapling_root = self.sapling_note_commitment_tree().root(); - let orchard_root = self.orchard_note_commitment_tree().root(); + let sapling_root = self.sapling_note_commitment_tree_for_tip().root(); + let orchard_root = self.orchard_note_commitment_tree_for_tip().root(); // TODO: update the history trees in a rayon thread, if they show up in CPU profiles let mut history_tree = self.history_block_commitment_tree(); diff --git a/zebra-state/src/service/non_finalized_state/tests/prop.rs b/zebra-state/src/service/non_finalized_state/tests/prop.rs index 76ebd3770cd..b7a3e5b00d4 100644 --- a/zebra-state/src/service/non_finalized_state/tests/prop.rs +++ b/zebra-state/src/service/non_finalized_state/tests/prop.rs @@ -328,9 +328,9 @@ fn finalized_equals_pushed_genesis() -> Result<()> { let mut partial_chain = Chain::new( network, full_chain.non_finalized_tip_height(), - full_chain.sprout_note_commitment_tree(), - full_chain.sapling_note_commitment_tree(), - full_chain.orchard_note_commitment_tree(), + full_chain.sprout_note_commitment_tree_for_tip(), + full_chain.sapling_note_commitment_tree_for_tip(), + full_chain.orchard_note_commitment_tree_for_tip(), full_chain.history_block_commitment_tree(), full_chain.chain_value_pools, ); @@ -406,9 +406,9 @@ fn finalized_equals_pushed_history_tree() -> Result<()> { let mut partial_chain = Chain::new( network, Height(finalized_count.try_into().unwrap()), - full_chain.sprout_note_commitment_tree(), - full_chain.sapling_note_commitment_tree(), - full_chain.orchard_note_commitment_tree(), + full_chain.sprout_note_commitment_tree_for_tip(), + full_chain.sapling_note_commitment_tree_for_tip(), + full_chain.orchard_note_commitment_tree_for_tip(), full_chain.history_block_commitment_tree(), full_chain.chain_value_pools, ); diff --git a/zebra-state/src/service/non_finalized_state/tests/vectors.rs b/zebra-state/src/service/non_finalized_state/tests/vectors.rs index 34242be752a..78c5de9d84a 100644 --- a/zebra-state/src/service/non_finalized_state/tests/vectors.rs +++ b/zebra-state/src/service/non_finalized_state/tests/vectors.rs @@ -513,8 +513,8 @@ fn history_tree_is_updated_for_network_upgrade( let tree = NonEmptyHistoryTree::from_block( Network::Mainnet, activation_block.clone(), - &chain.sapling_note_commitment_tree().root(), - &chain.orchard_note_commitment_tree().root(), + &chain.sapling_note_commitment_tree_for_tip().root(), + &chain.orchard_note_commitment_tree_for_tip().root(), ) .unwrap(); @@ -598,8 +598,8 @@ fn commitment_is_validated_for_network_upgrade(network: Network, network_upgrade let tree = NonEmptyHistoryTree::from_block( Network::Mainnet, activation_block.clone(), - &chain.sapling_note_commitment_tree().root(), - &chain.orchard_note_commitment_tree().root(), + &chain.sapling_note_commitment_tree_for_tip().root(), + 
&chain.orchard_note_commitment_tree_for_tip().root(), ) .unwrap(); From a2b7859e8edc0b4d9ce42ac1da3e25c7e1cc751f Mon Sep 17 00:00:00 2001 From: teor Date: Mon, 9 Oct 2023 13:10:08 +1000 Subject: [PATCH 7/8] Try larger cached state disk sizes (#7684) --- .github/workflows/deploy-gcp-tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index b1242ce4c4d..8751698783c 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -166,11 +166,11 @@ jobs: id: create-instance run: | gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \ - --boot-disk-size 300GB \ + --boot-disk-size 50GB \ --boot-disk-type pd-ssd \ --image-project=cos-cloud \ --image-family=cos-stable \ - --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ + --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=400GB,type=pd-ssd \ --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ @@ -367,11 +367,11 @@ jobs: id: create-instance run: | gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \ - --boot-disk-size 300GB \ + --boot-disk-size 50GB \ --boot-disk-type pd-ssd \ --image-project=cos-cloud \ --image-family=cos-stable \ - --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ + --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=400GB,type=pd-ssd \ --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ From 8d0a17ee1ca174de65b1994e6da729ad3fa94758 Mon Sep 17 00:00:00 2001 From: Gustavo Valverde Date: Mon, 9 Oct 2023 18:59:59 +0100 Subject: [PATCH 8/8] fix(ci): handle disk mounting and logs reading edge-cases (#7690) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: use `exit-nopipe` with consistent `shell` usage Temporarily disabled the `set -e` option around the docker logs command to handle the broken pipe error gracefully. 
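A simplified sketch of the resulting pattern (the container name `my-test` is
a placeholder; the real steps below parametrise it with
`${{ inputs.test_id }}`):

    # Follow the container logs until the success line appears, without
    # letting the resulting broken pipe kill the whole script.
    (
      trap '' PIPE;
      sudo docker logs --tail all --follow my-test | \
      tee --output-error=exit-nopipe /dev/stderr | \
      grep --max-count=1 --extended-regexp "test result: .*ok.* [1-9][0-9]* passed"
    )
    LOGS_EXIT_STATUS=$?

    # The container's own exit code is still the source of truth.
    EXIT_STATUS=$(sudo docker wait my-test)
    if [ "$LOGS_EXIT_STATUS" -eq 0 ]; then exit "$EXIT_STATUS"; fi
    exit 1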
Handle more complex scenarios in our `Result of ${{ inputs.test_id }} test` job * fix: Use single quotes for the outer command * fix: use same approach for CD * test: check launch failure logs * fix: revert CD changes * fix: do not try to increase the disk size and wait mounting * fix: increase GB a bit more * fix: do not fail on pipe failure * fix: use plain `tee /dev/stderr` If this does not work try `(tee … || true)` * fix: `tee` not stoping on cd config tests * fix: match logic with GCP tests * fix(cd): handle pipe and other errors correctly * try `tee --output-error=exit-nopipe` * fix: TRAP without pipefail * test: pipefail with exit and trap * fix: use a subshell * fix(ci): wait for mounting and show system logs if fail * fix(ci): GCP is not always mounting disks in the same order * fix: use `grep` instead of `awk` * fix: typo * fix: use simpler `grep` command * fix: do not sleep if not require * chore: reduce diff --- .github/workflows/continous-delivery.yml | 91 ++++++++------ .github/workflows/deploy-gcp-tests.yml | 152 ++++++++++++++++------- 2 files changed, 162 insertions(+), 81 deletions(-) diff --git a/.github/workflows/continous-delivery.yml b/.github/workflows/continous-delivery.yml index 3bc314beb97..7fc5ec8d180 100644 --- a/.github/workflows/continous-delivery.yml +++ b/.github/workflows/continous-delivery.yml @@ -29,7 +29,7 @@ on: type: boolean default: false - # Temporarily disabled to reduce network load, see #6894. + # TODO: Temporarily disabled to reduce network load, see #6894. #push: # branches: # - main @@ -132,29 +132,37 @@ jobs: # Make sure Zebra can sync at least one full checkpoint on mainnet - name: Run tests using the default config + shell: /usr/bin/bash -exo pipefail {0} run: | - set -ex docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker run --detach --name default-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} - # show the logs, even if the job times out - docker logs --tail all --follow default-conf-tests | \ - tee --output-error=exit /dev/stderr | \ - grep --max-count=1 --extended-regexp --color=always \ - 'net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter' + + # Use a subshell to handle the broken pipe error gracefully + ( + trap "" PIPE; + docker logs \ + --tail all \ + --follow \ + default-conf-tests | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + -e "net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter" + ) || true + LOGS_EXIT_STATUS=$? + docker stop default-conf-tests - # get the exit status from docker - EXIT_STATUS=$( \ - docker wait default-conf-tests || \ - docker inspect --format "{{.State.ExitCode}}" default-conf-tests || \ - echo "missing container, or missing exit status for container" \ - ) - docker logs default-conf-tests - echo "docker exit status: $EXIT_STATUS" - if [[ "$EXIT_STATUS" = "137" ]]; then - echo "ignoring expected signal status" - exit 0 + + EXIT_STATUS=$(docker wait default-conf-tests || echo "Error retrieving exit status"); + echo "docker exit status: $EXIT_STATUS"; + + # If grep found the pattern, exit with the Docker container exit status + if [ $LOGS_EXIT_STATUS -eq 0 ]; then + exit $EXIT_STATUS; fi - exit "$EXIT_STATUS" + + # Handle other potential errors here + echo "An error occurred while processing the logs."; + exit 1; # Test reconfiguring the docker image for testnet. 
test-configuration-file-testnet: @@ -172,30 +180,37 @@ jobs: # Make sure Zebra can sync the genesis block on testnet - name: Run tests using a testnet config + shell: /usr/bin/bash -exo pipefail {0} run: | - set -ex docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker run --env "NETWORK=Testnet" --detach --name testnet-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} - # show the logs, even if the job times out - docker logs --tail all --follow testnet-conf-tests | \ - tee --output-error=exit /dev/stderr | \ - grep --max-count=1 --extended-regexp --color=always \ - -e 'net.*=.*Test.*estimated progress to chain tip.*Genesis' \ - -e 'net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter' + # Use a subshell to handle the broken pipe error gracefully + ( + trap "" PIPE; + docker logs \ + --tail all \ + --follow \ + testnet-conf-tests | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + -e "net.*=.*Test.*estimated progress to chain tip.*Genesis" \ + -e "net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter"; + ) || true + LOGS_EXIT_STATUS=$? + docker stop testnet-conf-tests - # get the exit status from docker - EXIT_STATUS=$( \ - docker wait testnet-conf-tests || \ - docker inspect --format "{{.State.ExitCode}}" testnet-conf-tests || \ - echo "missing container, or missing exit status for container" \ - ) - docker logs testnet-conf-tests - echo "docker exit status: $EXIT_STATUS" - if [[ "$EXIT_STATUS" = "137" ]]; then - echo "ignoring expected signal status" - exit 0 + + EXIT_STATUS=$(docker wait testnet-conf-tests || echo "Error retrieving exit status"); + echo "docker exit status: $EXIT_STATUS"; + + # If grep found the pattern, exit with the Docker container exit status + if [ $LOGS_EXIT_STATUS -eq 0 ]; then + exit $EXIT_STATUS; fi - exit "$EXIT_STATUS" + + # Handle other potential errors here + echo "An error occurred while processing the logs."; + exit 1; # Deploy Managed Instance Groups (MiGs) for Mainnet and Testnet, # with one node in the configured GCP region. diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 8751698783c..d6820b9a311 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -183,39 +183,56 @@ jobs: # Format the mounted disk if the test doesn't use a cached state. 
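      # GCP does not always attach disks in the same order, so the step below
      # resolves the actual device name from its stable /dev/disk/by-id symlink
      # instead of assuming it is /dev/sdb.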
- name: Format ${{ inputs.test_id }} volume + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ - while sudo lsof /dev/sdb; do \ - echo 'Waiting for /dev/sdb to be free...'; \ - sleep 10; \ - done; \ - sudo mkfs.ext4 -v /dev/sdb \ - " + --command=' \ + set -ex; + # Extract the correct disk name based on the device-name + export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ + sudo mkfs.ext4 -v /dev/$DISK_NAME \ + ' # Launch the test without any cached state - name: Launch ${{ inputs.test_id }} test + id: launch-test + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - " + ' + + # Show debug logs if previous job failed + - name: Show debug logs if previous job failed + if: ${{ failure() }} + shell: /usr/bin/bash -exo pipefail {0} + run: | + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ vars.GCP_ZONE }} \ + --ssh-flag="-o ServerAliveInterval=5" \ + --ssh-flag="-o ConnectionAttempts=20" \ + --ssh-flag="-o ConnectTimeout=5" \ + --command=' \ + lsblk; + sudo lsof /dev/sdb; + sudo dmesg; + sudo journalctl -b \ + ' # set up and launch the test, if it uses cached state # each test runs one of the *-with/without-cached-state job series, and skips the other @@ -381,7 +398,6 @@ jobs: --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \ --tags ${{ inputs.app_name }} \ --zone ${{ vars.GCP_ZONE }} - sleep 60 # Launch the test with the previously created Zebra-only cached state. # Each test runs one of the "Launch test" steps, and skips the other. @@ -405,22 +421,43 @@ jobs: # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. 
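      # Each step runs in a fresh SSH session, so the launch command below
      # re-resolves the device name from /dev/disk/by-id before mounting it.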
# TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ + set -ex; + # Extract the correct disk name based on the device-name + export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ + sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - " + ' + + # Show debug logs if previous job failed + - name: Show debug logs if previous job failed + if: ${{ failure() && (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} + run: | + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ vars.GCP_ZONE }} \ + --ssh-flag="-o ServerAliveInterval=5" \ + --ssh-flag="-o ConnectionAttempts=20" \ + --ssh-flag="-o ConnectTimeout=5" \ + --command=' \ + lsblk; + sudo lsof /dev/$DISK_NAME; + sudo dmesg; + sudo journalctl -b \ + ' # Launch the test with the previously created Lightwalletd and Zebra cached state. # Each test runs one of the "Launch test" steps, and skips the other. @@ -455,23 +492,44 @@ jobs: # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. 
# TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ + set -ex; + # Extract the correct disk name based on the device-name + export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ + sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - " + ' + + # Show debug logs if previous job failed + - name: Show debug logs if previous job failed + if: ${{ failure() && (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} + run: | + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ vars.GCP_ZONE }} \ + --ssh-flag="-o ServerAliveInterval=5" \ + --ssh-flag="-o ConnectionAttempts=20" \ + --ssh-flag="-o ConnectTimeout=5" \ + --command=' \ + lsblk; + sudo lsof /dev/$DISK_NAME; + sudo dmesg; + sudo journalctl -b \ + ' # Show all the test logs, then follow the logs of the test we just launched, until it finishes. # Then check the result of the test. @@ -538,23 +596,23 @@ jobs: # # Errors in the tests are caught by the final test status job. - name: Check startup logs for ${{ inputs.test_id }} + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ sudo docker logs \ --tail all \ --follow \ ${{ inputs.test_id }} | \ head -700 | \ - tee --output-error=exit /dev/stderr | \ + tee --output-error=exit-nopipe /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - -e 'Zcash network: ${{ inputs.network }}' \ - " + -e "Zcash network: ${{ inputs.network }}" \ + ' # Check that the container executed at least 1 Rust test harness test, and that all tests passed. # Then wait for the container to finish, and exit with the test's exit status. @@ -567,6 +625,7 @@ jobs: # with that status. # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) 
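      # For example, `EXIT_STATUS=$(sudo docker wait my-container)` blocks until
      # the container stops, then prints its exit code on stdout.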
- name: Result of ${{ inputs.test_id }} test + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -574,26 +633,31 @@ jobs: --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ --command=' \ - set -e; - set -o pipefail; - trap '' PIPE; + trap "" PIPE; + # Temporarily disable "set -e" to handle the broken pipe error gracefully + set +e; sudo docker logs \ --tail all \ --follow \ ${{ inputs.test_id }} | \ - tee --output-error=exit /dev/stderr | \ + tee --output-error=exit-nopipe /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - "test result: .*ok.* [1-9][0-9]* passed.*finished in"; \ + "test result: .*ok.* [1-9][0-9]* passed.*finished in"; + LOGS_EXIT_STATUS=$?; + set -e; + + EXIT_STATUS=$(sudo docker wait ${{ inputs.test_id }} || echo "Error retrieving exit status"); + echo "sudo docker exit status: $EXIT_STATUS"; - EXIT_STATUS=$( \ - sudo docker wait ${{ inputs.test_id }} || \ - sudo docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \ - echo "missing container, or missing exit status for container" \ - ); \ + # If grep found the pattern, exit with the Docker container"s exit status + if [ $LOGS_EXIT_STATUS -eq 0 ]; then + exit $EXIT_STATUS; + fi - echo "sudo docker exit status: $EXIT_STATUS"; \ - exit "$EXIT_STATUS" \ + # Handle other potential errors here + echo "An error occurred while processing the logs."; + exit 1; \ ' # create a state image from the instance's state disk, if requested by the caller @@ -707,6 +771,7 @@ jobs: # Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION, # $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables. - name: Get database versions from logs + shell: /usr/bin/bash -exo pipefail {0} run: | INITIAL_DISK_DB_VERSION="" RUNNING_DB_VERSION="" @@ -718,9 +783,9 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command=" \ + --command=' \ sudo docker logs ${{ inputs.test_id }} | head -1000 \ - ") + ') # either a semantic version or "creating new database" INITIAL_DISK_DB_VERSION=$( \ @@ -796,6 +861,7 @@ jobs: # # Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable. - name: Get sync height from logs + shell: /usr/bin/bash -exo pipefail {0} run: | SYNC_HEIGHT="" @@ -805,9 +871,9 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command=" \ + --command=' \ sudo docker logs ${{ inputs.test_id }} --tail 200 \ - ") + ') SYNC_HEIGHT=$( \ echo "$DOCKER_LOGS" | \