fix(daser): persist retry count after restart (#2185)
If a job was being handled by a retry worker at node shutdown, its retry backoff counter would be lost after restart.
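To picture the fix: attempt counts live in the checkpoint's Failed map, so counts held by in-flight retry workers must be folded back into that map on shutdown and restored without a backoff delay on restart. A minimal self-contained sketch of that round trip (simplified stand-in types, not the actual das package API):

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins; names mirror the ones in this commit, shapes are illustrative only.
type retryAttempt struct {
	count int       // how many times sampling of this height has failed
	after time.Time // earliest time the next attempt may run
}

type checkpoint struct {
	Failed map[uint64]int // height -> failed attempt count, persisted on shutdown
}

// snapshot folds the counts of retries currently in flight (inRetry) back
// into the persisted Failed map, so no attempt history is lost on shutdown.
func snapshot(failed, inRetry map[uint64]retryAttempt) checkpoint {
	out := checkpoint{Failed: make(map[uint64]int)}
	for h, r := range failed {
		out.Failed[h] += r.count
	}
	for h, r := range inRetry {
		out.Failed[h] += r.count
	}
	return out
}

// resume restores persisted counts with no backoff delay, so resumed
// retries are eligible to run immediately.
func resume(c checkpoint) map[uint64]retryAttempt {
	failed := make(map[uint64]retryAttempt, len(c.Failed))
	for h, count := range c.Failed {
		failed[h] = retryAttempt{count: count, after: time.Now()}
	}
	return failed
}

func main() {
	failed := map[uint64]retryAttempt{1: {count: 1}}
	inRetry := map[uint64]retryAttempt{2: {count: 3}} // job in flight at shutdown
	cp := snapshot(failed, inRetry)
	fmt.Println(resume(cp)[2].count) // 3: the in-flight retry count survived
}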
walldiss committed May 11, 2023
1 parent 687704e commit 876c6d0
Showing 4 changed files with 84 additions and 13 deletions.
4 changes: 4 additions & 0 deletions das/checkpoint.go
@@ -23,6 +23,10 @@ type workerCheckpoint struct {
func newCheckpoint(stats SamplingStats) checkpoint {
workers := make([]workerCheckpoint, 0, len(stats.Workers))
for _, w := range stats.Workers {
// no need to store retry jobs, since they will resume from the failed heights map
if w.JobType == retryJob {
continue
}
workers = append(workers, workerCheckpoint{
From: w.Curr,
To: w.To,
46 changes: 46 additions & 0 deletions das/coordinator_test.go
@@ -9,6 +9,7 @@ import (
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/tendermint/tendermint/types"

"github.com/celestiaorg/celestia-node/header"
@@ -252,6 +253,51 @@ func TestCoordinator(t *testing.T) {
expectedState.Failed = make(map[uint64]int)
assert.Equal(t, expectedState, newCheckpoint(coordinator.state.unsafeStats()))
})

t.Run("persist retry count after on restart", func(t *testing.T) {
testParams := defaultTestParams()
testParams.dasParams.ConcurrencyLimit = 5
ctx, cancel := context.WithTimeout(context.Background(), testParams.timeoutDelay)

ch := checkpoint{
SampleFrom: testParams.sampleFrom,
NetworkHead: testParams.networkHead,
Failed: map[uint64]int{1: 1, 2: 2, 3: 3, 4: 4, 5: 5},
Workers: []workerCheckpoint{},
}

waitCh := make(chan struct{})
var wg sync.WaitGroup
wg.Add(testParams.dasParams.ConcurrencyLimit)
sampleFn := func(ctx context.Context, h *header.ExtendedHeader) error {
wg.Done()
select {
case <-ctx.Done():
return ctx.Err()
case <-waitCh:
return nil
}
}

coordinator := newSamplingCoordinator(
testParams.dasParams,
getterStub{},
sampleFn,
newBroadcastMock(1),
)

go coordinator.run(ctx, ch)
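// cancel immediately to simulate a node shutdown while retry workers
// are still holding the previously failed heights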
cancel()
wg.Wait()
close(waitCh)

stopCtx, cancel := context.WithTimeout(context.Background(), testParams.timeoutDelay)
defer cancel()
assert.NoError(t, coordinator.wait(stopCtx))

st := coordinator.state.unsafeStats()
require.Equal(t, ch, newCheckpoint(st))
})
}

func BenchmarkCoordinator(b *testing.B) {
42 changes: 32 additions & 10 deletions das/state.go
@@ -71,15 +71,28 @@ func (s *coordinatorState) resumeFromCheckpoint(c checkpoint) {
s.networkHead = c.NetworkHead

for h, count := range c.Failed {
// resumed retries should start without backoff delay
s.failed[h] = retryAttempt{
count: count,
after: time.Now(),
}
}
}

func (s *coordinatorState) handleResult(res result) {
delete(s.inProgress, res.id)

switch res.jobType {
case recentJob, catchupJob:
s.handleRecentOrCatchupResult(res)
case retryJob:
s.handleRetryResult(res)
}

s.checkDone()
}

func (s *coordinatorState) handleRecentOrCatchupResult(res result) {
// check if the worker retried any of the previously failed heights
for h := range s.failed {
if h < res.from || h > res.to {
continue
}

if res.failed[h] == 0 {
delete(s.failed, h)
}
}

// update failed heights
for h := range res.failed {
nextRetry, _ := s.retryStrategy.nextRetry(retryAttempt{}, time.Now())
s.failed[h] = nextRetry
}
}

func (s *coordinatorState) handleRetryResult(res result) {
// move heights that have failed again back to the failed map, keeping their
// retry count; they will be picked up by retry workers later
for h := range res.failed {
lastRetry := s.inRetry[h]
// height will be retried after backoff
nextRetry, retryExceeded := s.retryStrategy.nextRetry(lastRetry, time.Now())
if retryExceeded {
log.Warnw("header exceeded maximum amount of sampling attempts",
"height", h,
"attempts", nextRetry.count)
}
s.failed[h] = nextRetry
}

// processed heights are either already moved to the failed map or succeeded, so clean up inRetry
for h := res.from; h <= res.to; h++ {
delete(s.inRetry, h)
}
}

func (s *coordinatorState) isNewHead(newHead int64) bool {
@@ -249,6 +267,10 @@ func (s *coordinatorState) unsafeStats() SamplingStats {
}
}

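// fold attempt counts of in-flight retries into the failed totals so
// they survive into the checkpoint on shutdown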
for h, retry := range s.inRetry {
failed[h] += retry.count
}

return SamplingStats{
SampledChainHead: lowestFailedOrInProgress - 1,
CatchupHead: s.next - 1,
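The retryStrategy used above is outside this diff. For context, a minimal sketch of the shape such a strategy could take, assuming a hypothetical capped per-attempt backoff schedule (illustrative only, not the actual das implementation):

package main

import (
	"fmt"
	"time"
)

type retryAttempt struct {
	count int       // failed attempts so far
	after time.Time // when the next attempt becomes eligible
}

// retryStrategy is a hypothetical stand-in: nextRetry bumps the attempt
// count, schedules the next attempt after a per-attempt delay, and reports
// whether the retry limit was exceeded.
type retryStrategy struct {
	delays     []time.Duration // delay per attempt; the last value repeats
	maxRetries int
}

func (s retryStrategy) nextRetry(last retryAttempt, now time.Time) (retryAttempt, bool) {
	count := last.count + 1
	idx := count - 1
	if idx >= len(s.delays) {
		idx = len(s.delays) - 1
	}
	return retryAttempt{count: count, after: now.Add(s.delays[idx])}, count > s.maxRetries
}

func main() {
	s := retryStrategy{delays: []time.Duration{time.Second, 5 * time.Second, time.Minute}, maxRetries: 10}
	attempt := retryAttempt{}
	for i := 0; i < 3; i++ {
		attempt, _ = s.nextRetry(attempt, time.Now())
		fmt.Printf("attempt %d eligible at %s\n", attempt.count, attempt.after.Format(time.Kitchen))
	}
}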
5 changes: 2 additions & 3 deletions das/worker.go
@@ -77,14 +77,13 @@ func (w *worker) run(ctx context.Context, timeout time.Duration, resultCh chan<-

for curr := w.state.from; curr <= w.state.to; curr++ {
err := w.sample(ctx, timeout, curr)
if errors.Is(err, context.Canceled) {
// sampling worker will resume upon restart
return
}
w.setResult(curr, err)
}

log.Infow(
"finished sampling headers",
"from", w.state.from,
