Skip to content

Commit 0fb7db7

Browse files
authored
test(redis): fix flaky TestRedis_LuaRPopLPushBullMQLikeLists (#596)
## Summary - `TestRedis_LuaRPopLPushBullMQLikeLists` flaked on CI with `etcd raft engine is not leader` on the first `RPush` right after `createNode(t, 3)` returned. The freshly-elected raft leader briefly stepped down ("stepped down to follower since quorum is not active") before the test's first write reached it, so the write raced a re-election. - Root cause is benign startup leader churn on slow GitHub Actions runners under `-race` — not a product bug. `waitForRaftReadiness` only confirms that a leader exists at one instant, which is not enough to guarantee stability for the very next write. - Add small `rpushEventually` / `lpushEventually` helpers (backed by a shared `doEventually` that retries only on `strings.Contains(err, "not leader")` for 5s at 50ms intervals) and use them for the first two writes in the flaky test. `waitForRaftReadiness` is intentionally left untouched because other tests rely on its exact semantics. Failing CI run: https://github.com/bootjp/elastickv/actions/runs/24842581631/job/72720509753 ## Test plan - [x] `go test -race -run TestRedis_LuaRPopLPushBullMQLikeLists ./adapter/ -count=5` — 5/5 pass locally. - [x] `go test -race ./adapter/... -timeout 900s` — only unrelated pre-existing flake `Test_grpc_transaction` fires (nil-pointer panic in teardown); explicitly out of scope per the fix request. - [x] `golangci-lint run ./adapter/...` — 0 issues. - [ ] Re-run CI job linked above to confirm the flake is gone.
2 parents ea8fe40 + 801742a commit 0fb7db7

2 files changed

Lines changed: 64 additions & 2 deletions

File tree

adapter/redis_lua_compat_test.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,12 @@ func TestRedis_LuaRPopLPushBullMQLikeLists(t *testing.T) {
121121
rdb := redis.NewClient(&redis.Options{Addr: nodes[0].redisAddress})
122122
defer func() { _ = rdb.Close() }()
123123

124-
require.NoError(t, rdb.RPush(ctx, "bull:test:wait", "job-1", "job-2", "job-3").Err())
125-
require.NoError(t, rdb.LPush(ctx, "bull:test:active", "job-0").Err())
124+
// Raft leadership can briefly churn right after createNode returns — the
125+
// freshly-elected leader can step down if the initial heartbeats miss
126+
// quorum on a slow CI runner (see waitForRaftReadiness). Retry the first
127+
// writes so they ride out that re-election instead of failing the test.
128+
rpushEventually(t, ctx, rdb, "bull:test:wait", "job-1", "job-2", "job-3")
129+
lpushEventually(t, ctx, rdb, "bull:test:active", "job-0")
126130

127131
result, err := rdb.Eval(ctx, `
128132
local moved = redis.call("RPOPLPUSH", KEYS[1], KEYS[2])

adapter/test_util.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"log"
66
"net"
77
"strconv"
8+
"strings"
89
"sync"
910
"sync/atomic"
1011
"testing"
@@ -18,6 +19,7 @@ import (
1819
pb "github.com/bootjp/elastickv/proto"
1920
"github.com/bootjp/elastickv/store"
2021
"github.com/cockroachdb/errors"
22+
"github.com/redis/go-redis/v9"
2123
"github.com/stretchr/testify/assert"
2224
"github.com/stretchr/testify/require"
2325
"golang.org/x/sys/unix"
@@ -30,6 +32,12 @@ const (
3032
testEngineElectionTick = 10
3133
testEngineMaxSizePerMsg = 1 << 20
3234
testEngineMaxInflight = 256
35+
36+
// leaderChurnRetryTimeout bounds how long doEventually keeps retrying a
37+
// write that fails with "not leader" right after createNode returns.
38+
leaderChurnRetryTimeout = 5 * time.Second
39+
// leaderChurnRetryInterval is the poll interval between retries.
40+
leaderChurnRetryInterval = 50 * time.Millisecond
3341
)
3442

3543
func newTestFactory() raftengine.Factory {
@@ -442,3 +450,53 @@ func setupNodes(t *testing.T, ctx context.Context, n int, ports []portsAdress) (
442450

443451
return nodes, grpcAdders, redisAdders, peers
444452
}
453+
454+
// isTransientNotLeaderErr reports whether err is a transient "not leader"
455+
// error that can happen right after createNode returns if the newly elected
456+
// leader briefly steps down due to a missed heartbeat quorum (common on slow
457+
// CI runners under -race). Callers should retry the write in that case.
458+
//
459+
// The match is case-insensitive because Redis protocol error bodies and
460+
// other layers may capitalise the phrase differently (e.g. "ERR Not Leader").
461+
func isTransientNotLeaderErr(err error) bool {
462+
if err == nil {
463+
return false
464+
}
465+
return strings.Contains(strings.ToLower(err.Error()), "not leader")
466+
}
467+
468+
// doEventually retries do() while it returns a transient "not leader" error,
469+
// giving the cluster a few seconds to re-settle leadership after startup.
470+
// Non-"not leader" errors fail the test immediately.
471+
//
472+
// Note: we use assert.Eventually (not require.Eventually) so we can capture
473+
// the last observed error in lastErr and surface it via require.NoError after
474+
// the loop. Otherwise a timeout would only report "condition never met in 5s"
475+
// and swallow the underlying error.
476+
func doEventually(t *testing.T, do func() error) {
477+
t.Helper()
478+
var lastErr error
479+
_ = assert.Eventually(t, func() bool {
480+
lastErr = do()
481+
return lastErr == nil || !isTransientNotLeaderErr(lastErr)
482+
}, leaderChurnRetryTimeout, leaderChurnRetryInterval)
483+
require.NoError(t, lastErr)
484+
}
485+
486+
// rpushEventually wraps RPUSH in doEventually so transient leader churn
487+
// immediately after createNode doesn't fail the test.
488+
func rpushEventually(t *testing.T, ctx context.Context, rdb *redis.Client, key string, vals ...any) {
489+
t.Helper()
490+
doEventually(t, func() error {
491+
return rdb.RPush(ctx, key, vals...).Err()
492+
})
493+
}
494+
495+
// lpushEventually wraps LPUSH in doEventually so transient leader churn
496+
// immediately after createNode doesn't fail the test.
497+
func lpushEventually(t *testing.T, ctx context.Context, rdb *redis.Client, key string, vals ...any) {
498+
t.Helper()
499+
doEventually(t, func() error {
500+
return rdb.LPush(ctx, key, vals...).Err()
501+
})
502+
}

0 commit comments

Comments
 (0)