From f3238fca1baba1ad32aff28a68f9871c1887bd7a Mon Sep 17 00:00:00 2001
From: teor
Date: Fri, 6 Oct 2023 15:11:15 +1000
Subject: [PATCH 1/8] fix(ci): Replace busybox with ubuntu to avoid "device or
 resource busy" failures (#7686)

---
 .github/workflows/deploy-gcp-tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 65cd9c93bf8..905a8a3d148 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -171,7 +171,7 @@ jobs:
         --image-project=cos-cloud \
         --image-family=cos-stable \
         --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \
-        --container-image=gcr.io/google-containers/busybox \
+        --container-image=gcr.io/google-containers/ubuntu \
         --machine-type ${{ vars.GCP_LARGE_MACHINE }} \
         --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
         --scopes cloud-platform \
@@ -373,7 +373,7 @@ jobs:
         --image-project=cos-cloud \
         --image-family=cos-stable \
         --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \
-        --container-image=gcr.io/google-containers/busybox \
+        --container-image=gcr.io/google-containers/ubuntu \
         --machine-type ${{ vars.GCP_LARGE_MACHINE }} \
         --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
         --scopes cloud-platform \

From 5c3a02a1d0d5b8225110de6264c98f7144a911fe Mon Sep 17 00:00:00 2001
From: Gustavo Valverde
Date: Fri, 6 Oct 2023 14:00:57 +0100
Subject: [PATCH 2/8] fix(ci): disk validation for docker volume mount (#7665)

* fix(ci): disk validation for docker volume mount

* Use a symlink for lightwalletd cached state rather than mounting the same volume twice

* Avoid "sdb seems to be busy" errors from docker by adding extra sleeps

* Add a missing backslash

* Remove symlink from workflow

* Symlink lightwalletd path in entrypoint.sh

* Retry on failure and check Docker logs

* End ssh shell lines with explicit terminators

* Delete Docker containers if Docker mount fails

* Revert symlink changes in entrypoint.sh

* Debug using lsof

* Use correct lsof commands

* Use correct syntax for lsof +D

* fix(ci): make multiple validations before mounting

The new check loops until three conditions hold: the device `/dev/sdb`
exists, no process is using the device `/dev/sdb`, and no process is
using the Docker volume directory.

* fix: do not pre-mount docker volume

The Docker version available with the newer `cos-stable` OS
(https://cloud.google.com/release-notes#cos-109-17800-0-45) allows the
disk to be mounted when the container is run. Mounting it beforehand
makes the disk unavailable.
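As an illustration, the approach now looks roughly like this (a simplified
sketch; the device, destination path, and image name are placeholders here,
where the real workflow uses the `${{ inputs.test_id }}` naming shown in the
diff below):

    # wait until nothing is holding the raw device, then format it
    while sudo lsof /dev/sdb; do
      echo 'Waiting for /dev/sdb to be free...'
      sleep 10
    done
    sudo mkfs.ext4 -v /dev/sdb

    # let docker create and mount the volume at run time,
    # instead of pre-creating it with `docker volume create`
    sudo docker run \
      --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=/var/cache/zebrad-cache \
      zebrad-test-image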
* fix: remove extra `;` * fix: just confirm with `lsof` and show it's output * chore: reduce diff --------- Co-authored-by: teor --- .github/workflows/deploy-gcp-tests.yml | 55 ++++++++------------------ 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 905a8a3d148..b1242ce4c4d 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -171,7 +171,7 @@ jobs: --image-project=cos-cloud \ --image-family=cos-stable \ --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image=gcr.io/google-containers/ubuntu \ + --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ --scopes cloud-platform \ @@ -180,12 +180,9 @@ jobs: --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \ --tags ${{ inputs.app_name }} \ --zone ${{ vars.GCP_ZONE }} - sleep 60 - # Create a docker volume with the new disk we just created. - # - # SSH into the just created VM, and create a docker volume with the newly created disk. - - name: Create ${{ inputs.test_id }} Docker volume + # Format the mounted disk if the test doesn't use a cached state. + - name: Format ${{ inputs.test_id }} volume run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -194,10 +191,11 @@ jobs: --ssh-flag="-o ConnectTimeout=5" \ --command \ "\ + while sudo lsof /dev/sdb; do \ + echo 'Waiting for /dev/sdb to be free...'; \ + sleep 10; \ + done; \ sudo mkfs.ext4 -v /dev/sdb \ - && \ - sudo docker volume create --driver local --opt type=ext4 --opt device=/dev/sdb \ - ${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \ " # Launch the test without any cached state @@ -215,7 +213,7 @@ jobs: --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ " @@ -296,6 +294,7 @@ jobs: - name: Find ${{ inputs.test_id }} cached state disk id: get-disk-name run: | + set -x LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "$GITHUB_WORKSPACE/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1) echo "STATE_VERSION: $LOCAL_STATE_VERSION" @@ -373,7 +372,7 @@ jobs: --image-project=cos-cloud \ --image-family=cos-stable \ --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ - --container-image=gcr.io/google-containers/ubuntu \ + --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ --scopes cloud-platform \ @@ -384,24 +383,6 @@ jobs: --zone ${{ vars.GCP_ZONE }} sleep 60 - # Create a docker volume with the selected cached state. - # - # SSH into the just created VM and create a docker volume with the recently attached disk. 
- # (The cached state and disk are usually the same size, - # but the cached state can be smaller if we just increased the disk size.) - - name: Create ${{ inputs.test_id }} Docker volume - run: | - gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ - --zone ${{ vars.GCP_ZONE }} \ - --ssh-flag="-o ServerAliveInterval=5" \ - --ssh-flag="-o ConnectionAttempts=20" \ - --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ - sudo docker volume create --driver local --opt type=ext4 --opt device=/dev/sdb \ - ${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \ - " - # Launch the test with the previously created Zebra-only cached state. # Each test runs one of the "Launch test" steps, and skips the other. # @@ -432,14 +413,12 @@ jobs: --ssh-flag="-o ConnectTimeout=5" \ --command \ "\ - # Wait for the disk to be attached - while [[ ! -e /dev/sdb ]]; do sleep 1; done && \ sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ " @@ -453,11 +432,13 @@ jobs: # VM and to the container might require more steps in this workflow, and additional # considerations. # - # The disk mounted in the VM is located at /dev/sdb, we mount the root `/` of this disk to the docker - # container in two different paths: + # The disk mounted in the VM is located at /dev/sdb, we want the root `/` of this disk to be + # available in the docker container at two different paths: # - /var/cache/zebrad-cache -> ${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} -> $ZEBRA_CACHED_STATE_DIR # - /var/cache/lwd-cache -> ${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} -> $LIGHTWALLETD_DATA_DIR # + # Currently we do this by mounting the same disk at both paths. + # # This doesn't cause any path conflicts, because Zebra and lightwalletd create different # subdirectories for their data. (But Zebra, lightwalletd, and the test harness must not # delete the whole cache directory.) @@ -482,15 +463,13 @@ jobs: --ssh-flag="-o ConnectTimeout=5" \ --command \ "\ - # Wait for the disk to be attached - while [[ ! 
-e /dev/sdb ]]; do sleep 1; done && \ sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ - --mount type=volume,src=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }},dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ " From f6346eb889202223f65c97f4fe1235f3430468cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 8 Oct 2023 23:10:18 +0000 Subject: [PATCH 3/8] build(deps): bump tj-actions/changed-files from 39.2.0 to 39.2.1 (#7668) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 39.2.0 to 39.2.1. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v39.2.0...v39.2.1) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/lint.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7373290d9a8..53faebe7ca3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -37,7 +37,7 @@ jobs: - name: Rust files id: changed-files-rust - uses: tj-actions/changed-files@v39.2.0 + uses: tj-actions/changed-files@v39.2.1 with: files: | **/*.rs @@ -49,7 +49,7 @@ jobs: - name: Workflow files id: changed-files-workflows - uses: tj-actions/changed-files@v39.2.0 + uses: tj-actions/changed-files@v39.2.1 with: files: | .github/workflows/*.yml From f6bba8c88e63dec1597dbd46855fbb2cfa74e521 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 8 Oct 2023 23:26:10 +0000 Subject: [PATCH 4/8] build(deps): bump reviewdog/action-actionlint from 1.39.0 to 1.39.1 (#7669) Bumps [reviewdog/action-actionlint](https://github.com/reviewdog/action-actionlint) from 1.39.0 to 1.39.1. - [Release notes](https://github.com/reviewdog/action-actionlint/releases) - [Commits](https://github.com/reviewdog/action-actionlint/compare/v1.39.0...v1.39.1) --- updated-dependencies: - dependency-name: reviewdog/action-actionlint dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 53faebe7ca3..80ee6d8e181 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -144,7 +144,7 @@ jobs: steps: - uses: actions/checkout@v4.0.0 - name: actionlint - uses: reviewdog/action-actionlint@v1.39.0 + uses: reviewdog/action-actionlint@v1.39.1 with: level: warning fail_on_error: false From c498eee67f18addf316e92f4f69ff5d26429966b Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Sun, 8 Oct 2023 23:01:51 -0300 Subject: [PATCH 5/8] change(ci): Create automatic tickets on CI failure for more workflows (#7620) * add buld failures ticket generation to most important workflows * add missing newline at the end of the file * doc fixes * add missing testnet build check Co-authored-by: teor --------- Co-authored-by: teor --- .../workflows/build-crates-individually.yml | 18 +++++++++++++++++ .github/workflows/continous-delivery.yml | 18 +++++++++++++++++ .../continous-integration-docker.yml | 8 +++----- .../workflows/continous-integration-os.yml | 18 +++++++++++++++++ .github/workflows/release-binaries.yml | 17 ++++++++++++++++ .github/workflows/release-crates-io.yml | 20 ++++++++++++++++++- 6 files changed, 93 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-crates-individually.yml b/.github/workflows/build-crates-individually.yml index 505d9796921..8bbf4765c7d 100644 --- a/.github/workflows/build-crates-individually.yml +++ b/.github/workflows/build-crates-individually.yml @@ -138,3 +138,21 @@ jobs: - name: Build ${{ matrix.crate }} crate with all features run: | cargo build --package ${{ matrix.crate }} --all-features + + failure-issue: + name: Open or update issues for building crates individually failures + # When a new job is added to this workflow, add it to this list. + needs: [ matrix, build ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) + if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-build-crates-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/continous-delivery.yml b/.github/workflows/continous-delivery.yml index 1d1c6efba11..3bc314beb97 100644 --- a/.github/workflows/continous-delivery.yml +++ b/.github/workflows/continous-delivery.yml @@ -382,3 +382,21 @@ jobs: --labels=app=zebrad,environment=qa,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }} \ --tags zebrad \ --zone ${{ vars.GCP_ZONE }} + + failure-issue: + name: Open or update issues for release failures + # When a new job is added to this workflow, add it to this list. + needs: [ versioning, build, test-configuration-file, deploy-nodes, deploy-instance ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) 
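+ # `github.event.pull_request` is only set on pull request events, so runs triggered by PRs never match the condition below.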
+ if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-release-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/continous-integration-docker.yml b/.github/workflows/continous-integration-docker.yml index 492c9556c86..aeafb9a0bea 100644 --- a/.github/workflows/continous-integration-docker.yml +++ b/.github/workflows/continous-integration-docker.yml @@ -790,18 +790,16 @@ jobs: # This list is for reliable tests that are run on the `main` branch. # Testnet jobs are not in this list, because we expect testnet to fail occasionally. needs: [ regenerate-stateful-disks, test-full-sync, lightwalletd-full-sync, test-all, test-all-getblocktemplate-rpcs, test-fake-activation-heights, test-empty-sync, test-lightwalletd-integration, test-configuration-file, test-zebra-conf-path, test-stateful-sync, test-update-sync, generate-checkpoints-mainnet, lightwalletd-update-sync, lightwalletd-rpc-test, lightwalletd-transactions-test, lightwalletd-grpc-test, get-block-template-test, submit-block-test ] - # Only open tickets for failed scheduled jobs, manual workflow runs, or `main` branch merges. + # Only open tickets for failed or cancelled jobs that are not coming from PRs. # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) - # TODO: if a job times out, we want to create a ticket. Does failure() do that? Or do we need cancelled()? - if: failure() && github.event.pull_request == null + if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) runs-on: ubuntu-latest steps: - uses: jayqi/failed-build-issue-action@v1 with: title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" # New failures open an issue with this label. - # TODO: do we want a different label for each workflow, or each kind of workflow? - label-name: S-ci-fail-auto-issue + label-name: S-ci-fail-main-branch-auto-issue # If there is already an open issue with this label, any failures become comments on that issue. always-create-new-issue: false github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/continous-integration-os.yml b/.github/workflows/continous-integration-os.yml index 3266f24e561..4f33c0206d6 100644 --- a/.github/workflows/continous-integration-os.yml +++ b/.github/workflows/continous-integration-os.yml @@ -319,3 +319,21 @@ jobs: else echo "No unused dependencies found." fi + + failure-issue: + name: Open or update issues for OS integration failures + # When a new job is added to this workflow, add it to this list. + needs: [ test, install-from-lockfile-no-cache, check-cargo-lock, cargo-deny, unused-deps ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) 
+ if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-os-integration-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index a96c15c2867..2fe2243e0b5 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -43,3 +43,20 @@ jobs: rust_log: info # This step needs access to Docker Hub secrets to run successfully secrets: inherit + + failure-issue: + name: Open or update issues for release binaries failures + # When a new job is added to this workflow, add it to this list. + needs: [ build, build-mining-testnet ] + # Open tickets for any failed build in this workflow. + if: failure() || cancelled() + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-binaries-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. + always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-crates-io.yml b/.github/workflows/release-crates-io.yml index e4651556300..94535fccba6 100644 --- a/.github/workflows/release-crates-io.yml +++ b/.github/workflows/release-crates-io.yml @@ -114,7 +114,6 @@ jobs: # TODO: check all crates after fixing these errors cargo release publish --verbose --dry-run --allow-branch '*' --workspace --exclude zebra-consensus --exclude zebra-utils --exclude zebrad - # TODO: actually do the release here #release-crates: # name: Release Zebra Crates @@ -123,3 +122,22 @@ jobs: # timeout-minutes: 30 # if: ${{ !cancelled() && !failure() && github.event_name == 'release' }} # steps: + # ... + + failure-issue: + name: Open or update issues for release crates failures + # When a new job is added to this workflow, add it to this list. + needs: [ check-release ] + # Only open tickets for failed or cancelled jobs that are not coming from PRs. + # (PR statuses are already reported in the PR jobs list, and checked by Mergify.) + if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null) + runs-on: ubuntu-latest + steps: + - uses: jayqi/failed-build-issue-action@v1 + with: + title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}" + # New failures open an issue with this label. + label-name: S-ci-fail-release-crates-auto-issue + # If there is already an open issue with this label, any failures become comments on that issue. 
+ always-create-new-issue: false + github-token: ${{ secrets.GITHUB_TOKEN }} From 1d45938e0f41d0fd82eb5efe10bee8e1a043abdc Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Sun, 8 Oct 2023 23:02:04 -0300 Subject: [PATCH 6/8] fix(note-commitment-trees): Populate subtrees (#7636) * add `sapling_subtree_for_tip` and `orchard_subtree_for_tip` methods to `ZebraDb` * add methods for non finalized state, move functions * call `zs_last_key_value` the right way * fix and simplify `*_subtree_for_tip` methods Co-authored-by: Arya * apply filter * rename all tree and subtree methods that use tip * rename tip tree and subtree methods in non finalized chain * apply simplify suggestions Co-authored-by: teor --------- Co-authored-by: Arya Co-authored-by: teor --- zebra-state/src/service/finalized_state.rs | 4 +- .../disk_format/upgrade/add_subtrees.rs | 18 ++--- .../zebra_db/block/tests/snapshot.rs | 6 +- .../finalized_state/zebra_db/shielded.rs | 72 +++++++++++++++---- .../src/service/non_finalized_state.rs | 6 +- .../src/service/non_finalized_state/chain.rs | 40 ++++++++--- .../service/non_finalized_state/tests/prop.rs | 12 ++-- .../non_finalized_state/tests/vectors.rs | 8 +-- 8 files changed, 116 insertions(+), 50 deletions(-) diff --git a/zebra-state/src/service/finalized_state.rs b/zebra-state/src/service/finalized_state.rs index 3f4e0d4dd0f..acafda7b2f8 100644 --- a/zebra-state/src/service/finalized_state.rs +++ b/zebra-state/src/service/finalized_state.rs @@ -242,8 +242,8 @@ impl FinalizedState { let block = checkpoint_verified.block.clone(); let mut history_tree = self.db.history_tree(); - let prev_note_commitment_trees = - prev_note_commitment_trees.unwrap_or_else(|| self.db.note_commitment_trees()); + let prev_note_commitment_trees = prev_note_commitment_trees + .unwrap_or_else(|| self.db.note_commitment_trees_for_tip()); // Update the note commitment trees. 
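        // (This clone is cheap: the trees inside `NoteCommitmentTrees` are behind `Arc`s, so only the pointers are copied.)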
let mut note_commitment_trees = prev_note_commitment_trees.clone(); diff --git a/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs b/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs index 35209b463d2..72dc4d55c90 100644 --- a/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs +++ b/zebra-state/src/service/finalized_state/disk_format/upgrade/add_subtrees.rs @@ -207,12 +207,13 @@ fn quick_check_sapling_subtrees(db: &ZebraDb) -> Result<(), &'static str> { return Ok(()); } - let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = db.sapling_tree().subtree_index() + let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = + db.sapling_tree_for_tip().subtree_index() else { return Ok(()); }; - if tip_subtree_index == 0 && !db.sapling_tree().is_complete_subtree() { + if tip_subtree_index == 0 && !db.sapling_tree_for_tip().is_complete_subtree() { return Ok(()); } @@ -260,12 +261,13 @@ fn quick_check_orchard_subtrees(db: &ZebraDb) -> Result<(), &'static str> { return Ok(()); } - let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = db.orchard_tree().subtree_index() + let Some(NoteCommitmentSubtreeIndex(tip_subtree_index)) = + db.orchard_tree_for_tip().subtree_index() else { return Ok(()); }; - if tip_subtree_index == 0 && !db.orchard_tree().is_complete_subtree() { + if tip_subtree_index == 0 && !db.orchard_tree_for_tip().is_complete_subtree() { return Ok(()); } @@ -333,13 +335,13 @@ fn check_sapling_subtrees( cancel_receiver: &mpsc::Receiver, ) -> Result, CancelFormatChange> { let Some(NoteCommitmentSubtreeIndex(mut first_incomplete_subtree_index)) = - db.sapling_tree().subtree_index() + db.sapling_tree_for_tip().subtree_index() else { return Ok(Ok(())); }; // If there are no incomplete subtrees in the tree, also expect a subtree for the final index. - if db.sapling_tree().is_complete_subtree() { + if db.sapling_tree_for_tip().is_complete_subtree() { first_incomplete_subtree_index += 1; } @@ -463,13 +465,13 @@ fn check_orchard_subtrees( cancel_receiver: &mpsc::Receiver, ) -> Result, CancelFormatChange> { let Some(NoteCommitmentSubtreeIndex(mut first_incomplete_subtree_index)) = - db.orchard_tree().subtree_index() + db.orchard_tree_for_tip().subtree_index() else { return Ok(Ok(())); }; // If there are no incomplete subtrees in the tree, also expect a subtree for the final index. - if db.orchard_tree().is_complete_subtree() { + if db.orchard_tree_for_tip().is_complete_subtree() { first_incomplete_subtree_index += 1; } diff --git a/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs b/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs index 2754cd69c3a..6fc96f8dff2 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs @@ -247,9 +247,9 @@ fn snapshot_block_and_transaction_data(state: &FinalizedState) { let mut stored_sapling_trees = Vec::new(); let mut stored_orchard_trees = Vec::new(); - let sprout_tree_at_tip = state.sprout_tree(); - let sapling_tree_at_tip = state.sapling_tree(); - let orchard_tree_at_tip = state.orchard_tree(); + let sprout_tree_at_tip = state.sprout_tree_for_tip(); + let sapling_tree_at_tip = state.sapling_tree_for_tip(); + let orchard_tree_at_tip = state.orchard_tree_for_tip(); // Test the history tree. 
// diff --git a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs index fc0cca9d5a4..75b5db8da64 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs @@ -83,7 +83,7 @@ impl ZebraDb { /// Returns the Sprout note commitment tree of the finalized tip /// or the empty tree if the state is empty. - pub fn sprout_tree(&self) -> Arc { + pub fn sprout_tree_for_tip(&self) -> Arc { if self.is_empty() { return Arc::::default(); } @@ -161,7 +161,7 @@ impl ZebraDb { /// Returns the Sapling note commitment tree of the finalized tip or the empty tree if the state /// is empty. - pub fn sapling_tree(&self) -> Arc { + pub fn sapling_tree_for_tip(&self) -> Arc { let height = match self.finalized_tip_height() { Some(h) => h, None => return Default::default(), @@ -303,11 +303,32 @@ impl ZebraDb { } } + /// Get the sapling note commitment subtress for the finalized tip. + #[allow(clippy::unwrap_in_result)] + fn sapling_subtree_for_tip(&self) -> Option> { + let sapling_subtrees = self + .db + .cf_handle("sapling_note_commitment_subtree") + .unwrap(); + + let (index, subtree_data): ( + NoteCommitmentSubtreeIndex, + NoteCommitmentSubtreeData, + ) = self.db.zs_last_key_value(&sapling_subtrees)?; + + let tip_height = self.finalized_tip_height()?; + if subtree_data.end != tip_height { + return None; + } + + Some(subtree_data.with_index(index)) + } + // Orchard trees /// Returns the Orchard note commitment tree of the finalized tip or the empty tree if the state /// is empty. - pub fn orchard_tree(&self) -> Arc { + pub fn orchard_tree_for_tip(&self) -> Arc { let height = match self.finalized_tip_height() { Some(h) => h, None => return Default::default(), @@ -449,15 +470,38 @@ impl ZebraDb { } } + /// Get the orchard note commitment subtress for the finalized tip. + #[allow(clippy::unwrap_in_result)] + fn orchard_subtree_for_tip(&self) -> Option> { + let orchard_subtrees = self + .db + .cf_handle("orchard_note_commitment_subtree") + .unwrap(); + + let (index, subtree_data): ( + NoteCommitmentSubtreeIndex, + NoteCommitmentSubtreeData, + ) = self.db.zs_last_key_value(&orchard_subtrees)?; + + let tip_height = self.finalized_tip_height()?; + if subtree_data.end != tip_height { + return None; + } + + Some(subtree_data.with_index(index)) + } + /// Returns the shielded note commitment trees of the finalized tip /// or the empty trees if the state is empty. - pub fn note_commitment_trees(&self) -> NoteCommitmentTrees { + /// Additionally, returns the sapling and orchard subtrees for the finalized tip if + /// the current subtree is finalizing in the tip, None otherwise. + pub fn note_commitment_trees_for_tip(&self) -> NoteCommitmentTrees { NoteCommitmentTrees { - sprout: self.sprout_tree(), - sapling: self.sapling_tree(), - sapling_subtree: None, - orchard: self.orchard_tree(), - orchard_subtree: None, + sprout: self.sprout_tree_for_tip(), + sapling: self.sapling_tree_for_tip(), + sapling_subtree: self.sapling_subtree_for_tip(), + orchard: self.orchard_tree_for_tip(), + orchard_subtree: self.orchard_subtree_for_tip(), } } } @@ -571,10 +615,10 @@ impl DiskWriteBatch { // Store the Sapling tree only if it is not already present at the previous height. 
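        // (The `!=` below compares the trees behind the `Arc`s via `PartialEq`, not the pointer values.)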
if height.is_min() - || prev_note_commitment_trees - .as_ref() - .map_or_else(|| zebra_db.sapling_tree(), |trees| trees.sapling.clone()) - != trees.sapling + || prev_note_commitment_trees.as_ref().map_or_else( + || zebra_db.sapling_tree_for_tip(), + |trees| trees.sapling.clone(), + ) != trees.sapling { self.zs_insert(&sapling_tree_cf, height, trees.sapling); } @@ -582,7 +626,7 @@ impl DiskWriteBatch { // Store the Orchard tree only if it is not already present at the previous height. if height.is_min() || prev_note_commitment_trees - .map_or_else(|| zebra_db.orchard_tree(), |trees| trees.orchard) + .map_or_else(|| zebra_db.orchard_tree_for_tip(), |trees| trees.orchard) != trees.orchard { self.zs_insert(&orchard_tree_cf, height, trees.orchard); diff --git a/zebra-state/src/service/non_finalized_state.rs b/zebra-state/src/service/non_finalized_state.rs index 6b303360b6f..bc342e5be9a 100644 --- a/zebra-state/src/service/non_finalized_state.rs +++ b/zebra-state/src/service/non_finalized_state.rs @@ -284,9 +284,9 @@ impl NonFinalizedState { let chain = Chain::new( self.network, finalized_tip_height, - finalized_state.sprout_tree(), - finalized_state.sapling_tree(), - finalized_state.orchard_tree(), + finalized_state.sprout_tree_for_tip(), + finalized_state.sapling_tree_for_tip(), + finalized_state.orchard_tree_for_tip(), finalized_state.history_tree(), finalized_state.finalized_value_pool(), ); diff --git a/zebra-state/src/service/non_finalized_state/chain.rs b/zebra-state/src/service/non_finalized_state/chain.rs index 670901ba9af..0297c93f67d 100644 --- a/zebra-state/src/service/non_finalized_state/chain.rs +++ b/zebra-state/src/service/non_finalized_state/chain.rs @@ -504,7 +504,7 @@ impl Chain { /// # Panics /// /// If this chain has no sprout trees. (This should be impossible.) - pub fn sprout_note_commitment_tree(&self) -> Arc { + pub fn sprout_note_commitment_tree_for_tip(&self) -> Arc { self.sprout_trees_by_height .last_key_value() .expect("only called while sprout_trees_by_height is populated") @@ -668,7 +668,7 @@ impl Chain { /// # Panics /// /// If this chain has no sapling trees. (This should be impossible.) - pub fn sapling_note_commitment_tree(&self) -> Arc { + pub fn sapling_note_commitment_tree_for_tip(&self) -> Arc { self.sapling_trees_by_height .last_key_value() .expect("only called while sapling_trees_by_height is populated") @@ -737,6 +737,16 @@ impl Chain { .collect() } + /// Returns the Sapling [`NoteCommitmentSubtree`] if it was completed at the tip height. + pub fn sapling_subtree_for_tip(&self) -> Option> { + if !self.is_empty() { + let tip = self.non_finalized_tip_height(); + self.sapling_subtree(tip.into()) + } else { + None + } + } + /// Adds the Sapling `tree` to the tree and anchor indexes at `height`. /// /// `height` can be either: @@ -869,7 +879,7 @@ impl Chain { /// # Panics /// /// If this chain has no orchard trees. (This should be impossible.) - pub fn orchard_note_commitment_tree(&self) -> Arc { + pub fn orchard_note_commitment_tree_for_tip(&self) -> Arc { self.orchard_trees_by_height .last_key_value() .expect("only called while orchard_trees_by_height is populated") @@ -939,6 +949,16 @@ impl Chain { .collect() } + /// Returns the Orchard [`NoteCommitmentSubtree`] if it was completed at the tip height. + pub fn orchard_subtree_for_tip(&self) -> Option> { + if !self.is_empty() { + let tip = self.non_finalized_tip_height(); + self.orchard_subtree(tip.into()) + } else { + None + } + } + /// Adds the Orchard `tree` to the tree and anchor indexes at `height`. 
/// /// `height` can be either: @@ -1387,11 +1407,11 @@ impl Chain { // Prepare data for parallel execution let mut nct = NoteCommitmentTrees { - sprout: self.sprout_note_commitment_tree(), - sapling: self.sapling_note_commitment_tree(), - sapling_subtree: None, - orchard: self.orchard_note_commitment_tree(), - orchard_subtree: None, + sprout: self.sprout_note_commitment_tree_for_tip(), + sapling: self.sapling_note_commitment_tree_for_tip(), + sapling_subtree: self.sapling_subtree_for_tip(), + orchard: self.orchard_note_commitment_tree_for_tip(), + orchard_subtree: self.orchard_subtree_for_tip(), }; let mut tree_result = None; @@ -1427,8 +1447,8 @@ impl Chain { .insert(subtree.index, subtree.into_data()); } - let sapling_root = self.sapling_note_commitment_tree().root(); - let orchard_root = self.orchard_note_commitment_tree().root(); + let sapling_root = self.sapling_note_commitment_tree_for_tip().root(); + let orchard_root = self.orchard_note_commitment_tree_for_tip().root(); // TODO: update the history trees in a rayon thread, if they show up in CPU profiles let mut history_tree = self.history_block_commitment_tree(); diff --git a/zebra-state/src/service/non_finalized_state/tests/prop.rs b/zebra-state/src/service/non_finalized_state/tests/prop.rs index 76ebd3770cd..b7a3e5b00d4 100644 --- a/zebra-state/src/service/non_finalized_state/tests/prop.rs +++ b/zebra-state/src/service/non_finalized_state/tests/prop.rs @@ -328,9 +328,9 @@ fn finalized_equals_pushed_genesis() -> Result<()> { let mut partial_chain = Chain::new( network, full_chain.non_finalized_tip_height(), - full_chain.sprout_note_commitment_tree(), - full_chain.sapling_note_commitment_tree(), - full_chain.orchard_note_commitment_tree(), + full_chain.sprout_note_commitment_tree_for_tip(), + full_chain.sapling_note_commitment_tree_for_tip(), + full_chain.orchard_note_commitment_tree_for_tip(), full_chain.history_block_commitment_tree(), full_chain.chain_value_pools, ); @@ -406,9 +406,9 @@ fn finalized_equals_pushed_history_tree() -> Result<()> { let mut partial_chain = Chain::new( network, Height(finalized_count.try_into().unwrap()), - full_chain.sprout_note_commitment_tree(), - full_chain.sapling_note_commitment_tree(), - full_chain.orchard_note_commitment_tree(), + full_chain.sprout_note_commitment_tree_for_tip(), + full_chain.sapling_note_commitment_tree_for_tip(), + full_chain.orchard_note_commitment_tree_for_tip(), full_chain.history_block_commitment_tree(), full_chain.chain_value_pools, ); diff --git a/zebra-state/src/service/non_finalized_state/tests/vectors.rs b/zebra-state/src/service/non_finalized_state/tests/vectors.rs index 34242be752a..78c5de9d84a 100644 --- a/zebra-state/src/service/non_finalized_state/tests/vectors.rs +++ b/zebra-state/src/service/non_finalized_state/tests/vectors.rs @@ -513,8 +513,8 @@ fn history_tree_is_updated_for_network_upgrade( let tree = NonEmptyHistoryTree::from_block( Network::Mainnet, activation_block.clone(), - &chain.sapling_note_commitment_tree().root(), - &chain.orchard_note_commitment_tree().root(), + &chain.sapling_note_commitment_tree_for_tip().root(), + &chain.orchard_note_commitment_tree_for_tip().root(), ) .unwrap(); @@ -598,8 +598,8 @@ fn commitment_is_validated_for_network_upgrade(network: Network, network_upgrade let tree = NonEmptyHistoryTree::from_block( Network::Mainnet, activation_block.clone(), - &chain.sapling_note_commitment_tree().root(), - &chain.orchard_note_commitment_tree().root(), + &chain.sapling_note_commitment_tree_for_tip().root(), + 
&chain.orchard_note_commitment_tree_for_tip().root(), ) .unwrap(); From a2b7859e8edc0b4d9ce42ac1da3e25c7e1cc751f Mon Sep 17 00:00:00 2001 From: teor Date: Mon, 9 Oct 2023 13:10:08 +1000 Subject: [PATCH 7/8] Try larger cached state disk sizes (#7684) --- .github/workflows/deploy-gcp-tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index b1242ce4c4d..8751698783c 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -166,11 +166,11 @@ jobs: id: create-instance run: | gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \ - --boot-disk-size 300GB \ + --boot-disk-size 50GB \ --boot-disk-type pd-ssd \ --image-project=cos-cloud \ --image-family=cos-stable \ - --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ + --create-disk=name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=400GB,type=pd-ssd \ --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ @@ -367,11 +367,11 @@ jobs: id: create-instance run: | gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \ - --boot-disk-size 300GB \ + --boot-disk-size 50GB \ --boot-disk-type pd-ssd \ --image-project=cos-cloud \ --image-family=cos-stable \ - --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=300GB,type=pd-ssd \ + --create-disk=image=${{ env.CACHED_DISK_NAME }},name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",device-name="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}",size=400GB,type=pd-ssd \ --container-image=gcr.io/google-containers/busybox \ --machine-type ${{ vars.GCP_LARGE_MACHINE }} \ --network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \ From 8d0a17ee1ca174de65b1994e6da729ad3fa94758 Mon Sep 17 00:00:00 2001 From: Gustavo Valverde Date: Mon, 9 Oct 2023 18:59:59 +0100 Subject: [PATCH 8/8] fix(ci): handle disk mounting and logs reading edge-cases (#7690) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: use `exit-nopipe` with consistent `shell` usage Temporarily disabled the `set -e` option around the docker logs command to handle the broken pipe error gracefully. 
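A simplified sketch of the resulting pattern (the container name `my-test` is
a placeholder; the real steps below parametrise it with
`${{ inputs.test_id }}`):

    # Follow the container logs until the success line appears, without
    # letting the resulting broken pipe kill the whole script.
    (
      trap '' PIPE;
      sudo docker logs --tail all --follow my-test | \
      tee --output-error=exit-nopipe /dev/stderr | \
      grep --max-count=1 --extended-regexp "test result: .*ok.* [1-9][0-9]* passed"
    )
    LOGS_EXIT_STATUS=$?

    # The container's own exit code is still the source of truth.
    EXIT_STATUS=$(sudo docker wait my-test)
    if [ "$LOGS_EXIT_STATUS" -eq 0 ]; then exit "$EXIT_STATUS"; fi
    exit 1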
Handle more complex scenarios in our `Result of ${{ inputs.test_id }} test` job * fix: Use single quotes for the outer command * fix: use same approach for CD * test: check launch failure logs * fix: revert CD changes * fix: do not try to increase the disk size and wait mounting * fix: increase GB a bit more * fix: do not fail on pipe failure * fix: use plain `tee /dev/stderr` If this does not work try `(tee … || true)` * fix: `tee` not stoping on cd config tests * fix: match logic with GCP tests * fix(cd): handle pipe and other errors correctly * try `tee --output-error=exit-nopipe` * fix: TRAP without pipefail * test: pipefail with exit and trap * fix: use a subshell * fix(ci): wait for mounting and show system logs if fail * fix(ci): GCP is not always mounting disks in the same order * fix: use `grep` instead of `awk` * fix: typo * fix: use simpler `grep` command * fix: do not sleep if not require * chore: reduce diff --- .github/workflows/continous-delivery.yml | 91 ++++++++------ .github/workflows/deploy-gcp-tests.yml | 152 ++++++++++++++++------- 2 files changed, 162 insertions(+), 81 deletions(-) diff --git a/.github/workflows/continous-delivery.yml b/.github/workflows/continous-delivery.yml index 3bc314beb97..7fc5ec8d180 100644 --- a/.github/workflows/continous-delivery.yml +++ b/.github/workflows/continous-delivery.yml @@ -29,7 +29,7 @@ on: type: boolean default: false - # Temporarily disabled to reduce network load, see #6894. + # TODO: Temporarily disabled to reduce network load, see #6894. #push: # branches: # - main @@ -132,29 +132,37 @@ jobs: # Make sure Zebra can sync at least one full checkpoint on mainnet - name: Run tests using the default config + shell: /usr/bin/bash -exo pipefail {0} run: | - set -ex docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker run --detach --name default-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} - # show the logs, even if the job times out - docker logs --tail all --follow default-conf-tests | \ - tee --output-error=exit /dev/stderr | \ - grep --max-count=1 --extended-regexp --color=always \ - 'net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter' + + # Use a subshell to handle the broken pipe error gracefully + ( + trap "" PIPE; + docker logs \ + --tail all \ + --follow \ + default-conf-tests | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + -e "net.*=.*Main.*estimated progress to chain tip.*BeforeOverwinter" + ) || true + LOGS_EXIT_STATUS=$? + docker stop default-conf-tests - # get the exit status from docker - EXIT_STATUS=$( \ - docker wait default-conf-tests || \ - docker inspect --format "{{.State.ExitCode}}" default-conf-tests || \ - echo "missing container, or missing exit status for container" \ - ) - docker logs default-conf-tests - echo "docker exit status: $EXIT_STATUS" - if [[ "$EXIT_STATUS" = "137" ]]; then - echo "ignoring expected signal status" - exit 0 + + EXIT_STATUS=$(docker wait default-conf-tests || echo "Error retrieving exit status"); + echo "docker exit status: $EXIT_STATUS"; + + # If grep found the pattern, exit with the Docker container exit status + if [ $LOGS_EXIT_STATUS -eq 0 ]; then + exit $EXIT_STATUS; fi - exit "$EXIT_STATUS" + + # Handle other potential errors here + echo "An error occurred while processing the logs."; + exit 1; # Test reconfiguring the docker image for testnet. 
test-configuration-file-testnet: @@ -172,30 +180,37 @@ jobs: # Make sure Zebra can sync the genesis block on testnet - name: Run tests using a testnet config + shell: /usr/bin/bash -exo pipefail {0} run: | - set -ex docker pull ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} docker run --env "NETWORK=Testnet" --detach --name testnet-conf-tests -t ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} - # show the logs, even if the job times out - docker logs --tail all --follow testnet-conf-tests | \ - tee --output-error=exit /dev/stderr | \ - grep --max-count=1 --extended-regexp --color=always \ - -e 'net.*=.*Test.*estimated progress to chain tip.*Genesis' \ - -e 'net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter' + # Use a subshell to handle the broken pipe error gracefully + ( + trap "" PIPE; + docker logs \ + --tail all \ + --follow \ + testnet-conf-tests | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + -e "net.*=.*Test.*estimated progress to chain tip.*Genesis" \ + -e "net.*=.*Test.*estimated progress to chain tip.*BeforeOverwinter"; + ) || true + LOGS_EXIT_STATUS=$? + docker stop testnet-conf-tests - # get the exit status from docker - EXIT_STATUS=$( \ - docker wait testnet-conf-tests || \ - docker inspect --format "{{.State.ExitCode}}" testnet-conf-tests || \ - echo "missing container, or missing exit status for container" \ - ) - docker logs testnet-conf-tests - echo "docker exit status: $EXIT_STATUS" - if [[ "$EXIT_STATUS" = "137" ]]; then - echo "ignoring expected signal status" - exit 0 + + EXIT_STATUS=$(docker wait testnet-conf-tests || echo "Error retrieving exit status"); + echo "docker exit status: $EXIT_STATUS"; + + # If grep found the pattern, exit with the Docker container exit status + if [ $LOGS_EXIT_STATUS -eq 0 ]; then + exit $EXIT_STATUS; fi - exit "$EXIT_STATUS" + + # Handle other potential errors here + echo "An error occurred while processing the logs."; + exit 1; # Deploy Managed Instance Groups (MiGs) for Mainnet and Testnet, # with one node in the configured GCP region. diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 8751698783c..d6820b9a311 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -183,39 +183,56 @@ jobs: # Format the mounted disk if the test doesn't use a cached state. 
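      # GCP does not always attach disks in the same order, so the step below
      # resolves the actual device name from its stable /dev/disk/by-id symlink
      # instead of assuming it is /dev/sdb.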
- name: Format ${{ inputs.test_id }} volume + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ - while sudo lsof /dev/sdb; do \ - echo 'Waiting for /dev/sdb to be free...'; \ - sleep 10; \ - done; \ - sudo mkfs.ext4 -v /dev/sdb \ - " + --command=' \ + set -ex; + # Extract the correct disk name based on the device-name + export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ + sudo mkfs.ext4 -v /dev/$DISK_NAME \ + ' # Launch the test without any cached state - name: Launch ${{ inputs.test_id }} test + id: launch-test + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - " + ' + + # Show debug logs if previous job failed + - name: Show debug logs if previous job failed + if: ${{ failure() }} + shell: /usr/bin/bash -exo pipefail {0} + run: | + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ vars.GCP_ZONE }} \ + --ssh-flag="-o ServerAliveInterval=5" \ + --ssh-flag="-o ConnectionAttempts=20" \ + --ssh-flag="-o ConnectTimeout=5" \ + --command=' \ + lsblk; + sudo lsof /dev/sdb; + sudo dmesg; + sudo journalctl -b \ + ' # set up and launch the test, if it uses cached state # each test runs one of the *-with/without-cached-state job series, and skips the other @@ -381,7 +398,6 @@ jobs: --labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \ --tags ${{ inputs.app_name }} \ --zone ${{ vars.GCP_ZONE }} - sleep 60 # Launch the test with the previously created Zebra-only cached state. # Each test runs one of the "Launch test" steps, and skips the other. @@ -405,22 +421,43 @@ jobs: # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. 
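      # Each step runs in a fresh SSH session, so the launch command below
      # re-resolves the device name from /dev/disk/by-id before mounting it.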
# TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ + set -ex; + # Extract the correct disk name based on the device-name + export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ + sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - " + ' + + # Show debug logs if previous job failed + - name: Show debug logs if previous job failed + if: ${{ failure() && (inputs.needs_zebra_state && !inputs.needs_lwd_state) && inputs.test_id != 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} + run: | + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ vars.GCP_ZONE }} \ + --ssh-flag="-o ServerAliveInterval=5" \ + --ssh-flag="-o ConnectionAttempts=20" \ + --ssh-flag="-o ConnectTimeout=5" \ + --command=' \ + lsblk; + sudo lsof /dev/$DISK_NAME; + sudo dmesg; + sudo journalctl -b \ + ' # Launch the test with the previously created Lightwalletd and Zebra cached state. # Each test runs one of the "Launch test" steps, and skips the other. @@ -455,23 +492,44 @@ jobs: # lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially. 
# TODO: we should find a better logic for this use cases if: ${{ (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ + set -ex; + # Extract the correct disk name based on the device-name + export DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-); \ + sudo docker run \ --name ${{ inputs.test_id }} \ --tty \ --detach \ ${{ inputs.test_variables }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ - --mount type=volume,volume-driver=local,volume-opt=device=/dev/sdb,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.zebra_state_dir }} \ + --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.root_state_path }}/${{ inputs.lwd_state_dir }} \ ${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \ - " + ' + + # Show debug logs if previous job failed + - name: Show debug logs if previous job failed + if: ${{ failure() && (inputs.needs_zebra_state && inputs.needs_lwd_state) || inputs.test_id == 'lwd-full-sync' }} + shell: /usr/bin/bash -exo pipefail {0} + run: | + gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ vars.GCP_ZONE }} \ + --ssh-flag="-o ServerAliveInterval=5" \ + --ssh-flag="-o ConnectionAttempts=20" \ + --ssh-flag="-o ConnectTimeout=5" \ + --command=' \ + lsblk; + sudo lsof /dev/$DISK_NAME; + sudo dmesg; + sudo journalctl -b \ + ' # Show all the test logs, then follow the logs of the test we just launched, until it finishes. # Then check the result of the test. @@ -538,23 +596,23 @@ jobs: # # Errors in the tests are caught by the final test status job. - name: Check startup logs for ${{ inputs.test_id }} + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command \ - "\ + --command=' \ sudo docker logs \ --tail all \ --follow \ ${{ inputs.test_id }} | \ head -700 | \ - tee --output-error=exit /dev/stderr | \ + tee --output-error=exit-nopipe /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - -e 'Zcash network: ${{ inputs.network }}' \ - " + -e "Zcash network: ${{ inputs.network }}" \ + ' # Check that the container executed at least 1 Rust test harness test, and that all tests passed. # Then wait for the container to finish, and exit with the test's exit status. @@ -567,6 +625,7 @@ jobs: # with that status. # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) 
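      # For example, `EXIT_STATUS=$(sudo docker wait my-container)` blocks until
      # the container stops, then prints its exit code on stdout.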
- name: Result of ${{ inputs.test_id }} test + shell: /usr/bin/bash -exo pipefail {0} run: | gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ vars.GCP_ZONE }} \ @@ -574,26 +633,31 @@ jobs: --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ --command=' \ - set -e; - set -o pipefail; - trap '' PIPE; + trap "" PIPE; + # Temporarily disable "set -e" to handle the broken pipe error gracefully + set +e; sudo docker logs \ --tail all \ --follow \ ${{ inputs.test_id }} | \ - tee --output-error=exit /dev/stderr | \ + tee --output-error=exit-nopipe /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ - "test result: .*ok.* [1-9][0-9]* passed.*finished in"; \ + "test result: .*ok.* [1-9][0-9]* passed.*finished in"; + LOGS_EXIT_STATUS=$?; + set -e; + + EXIT_STATUS=$(sudo docker wait ${{ inputs.test_id }} || echo "Error retrieving exit status"); + echo "sudo docker exit status: $EXIT_STATUS"; - EXIT_STATUS=$( \ - sudo docker wait ${{ inputs.test_id }} || \ - sudo docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \ - echo "missing container, or missing exit status for container" \ - ); \ + # If grep found the pattern, exit with the Docker container"s exit status + if [ $LOGS_EXIT_STATUS -eq 0 ]; then + exit $EXIT_STATUS; + fi - echo "sudo docker exit status: $EXIT_STATUS"; \ - exit "$EXIT_STATUS" \ + # Handle other potential errors here + echo "An error occurred while processing the logs."; + exit 1; \ ' # create a state image from the instance's state disk, if requested by the caller @@ -707,6 +771,7 @@ jobs: # Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION, # $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables. - name: Get database versions from logs + shell: /usr/bin/bash -exo pipefail {0} run: | INITIAL_DISK_DB_VERSION="" RUNNING_DB_VERSION="" @@ -718,9 +783,9 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command=" \ + --command=' \ sudo docker logs ${{ inputs.test_id }} | head -1000 \ - ") + ') # either a semantic version or "creating new database" INITIAL_DISK_DB_VERSION=$( \ @@ -796,6 +861,7 @@ jobs: # # Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable. - name: Get sync height from logs + shell: /usr/bin/bash -exo pipefail {0} run: | SYNC_HEIGHT="" @@ -805,9 +871,9 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --ssh-flag="-o ConnectionAttempts=20" \ --ssh-flag="-o ConnectTimeout=5" \ - --command=" \ + --command=' \ sudo docker logs ${{ inputs.test_id }} --tail 200 \ - ") + ') SYNC_HEIGHT=$( \ echo "$DOCKER_LOGS" | \