From fec991c88fc92659f3d7ba987377407dcfe64655 Mon Sep 17 00:00:00 2001
From: David Boreham <david@bozemanpass.com>
Date: Wed, 1 Jul 2026 10:33:59 -0600
Subject: [PATCH] Cap backup-test retry loop to stop multi-hour hangs

The test wrapped each backup.sh call in a 50x readiness loop written for
the old single-shot ensure_repo. Since ensure_repo now retries internally
(30x/5s) to ride out S3 warmup, the two loops multiplied: a broken-S3 run
ground on for ~2.5h (up to CI's 6h cap) instead of failing in ~5min.

ensure_repo already owns the warmup wait, so drop the outer loop to 3
attempts - enough to absorb a transient post-readiness hiccup without
re-nesting the retries. Refs #155.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tests/backup/run-test.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/backup/run-test.sh b/tests/backup/run-test.sh
index f9042f8..f7030c4 100755
--- a/tests/backup/run-test.sh
+++ b/tests/backup/run-test.sh
@@ -125,12 +125,15 @@ wait_for_pods_started
 dexec app "echo ${payload} > /data/payload.txt"
 echo "wrote payload: ${payload}"
 
-# 2. Take a backup, retrying until the S3 store has finished starting up. backup.sh creates
-#    the restic repository on first use (restic auto-creates the bucket on SeaweedFS).
+# 2. Take a backup. backup.sh's ensure_repo already waits for the S3 store to finish
+#    warming up (and creates the restic repository on first use), so we do NOT wrap this in
+#    a long readiness loop here - doing so would multiply with ensure_repo's own retry and
+#    stretch a failure into hours. Three attempts cover a transient post-readiness
+#    hiccup; genuine unavailability fails promptly.
 backed_up=
-for i in {1..50}; do
+for i in {1..3}; do
     if dexec backup "/scripts/backup.sh"; then backed_up=1; break; fi
-    echo "waiting for backup to succeed (s3 warming up): ${i}"
+    echo "backup attempt ${i} failed, retrying"
     sleep 5
 done
 if [ -z "$backed_up" ]; then