Skip to content

Commit 2b6e8f2

Browse files
committed
test(adapter): widen raft election budget + retry gRPC on leader churn
PR #602 (leader-stability window) fixed setup-side flake in the redis tests but left Test_consistency_satisfy_write_after_read_sequence, Test_consistency_satisfy_write_after_read_for_parallel, and Test_grpc_transaction flaky on CI. Those tests drive 9,999 (or 1,000× parallel) raw/txn RPCs against the leader in a loop; if the leader loses quorum mid-run -- easy under `-race` on a loaded GHA runner -- the first Put/Get after step-down returns "etcd raft engine is not leader" or "leader not found" and the test fails (with a bonus nil panic when assert.NoError let a nil response flow into resp.Value). Root cause of the churn: the test raft config used ElectionTick=10 (100 ms election timeout) with HeartbeatTick=1. etcd/raft's CheckQuorum only holds ~1 ElectionTick of heartbeat-response history, so any goroutine-scheduler pause > 100 ms under -race made the leader conclude quorum was lost and step down. Fix: 1. Raise testEngineHeartbeatTick / testEngineElectionTick to 5/50 (50 ms heartbeat, 500-1000 ms randomised election timeout). Still well within the etcd/raft recommended 10x ratio, fast enough for tests, and generous enough to survive -race scheduling jitter. As a bonus, LeaseDuration is now 200 ms (500-300 margin) instead of 0, so the lease-read fast path is actually exercised. 2. Extend isTransientNotLeaderErr to also match "leader not found" (ErrLeaderNotFound) -- the coordinator emits that variant during the re-election window, distinct from "not leader". 3. Add rawPut/rawGet/txnPut/txnGet/txnDeleteEventually wrappers that reuse doEventually to retry on transient leader-unavailable errors, mirroring rpushEventually/lpushEventually from #596 for the gRPC paths. 4. Rewrite the three 9999- / 1000-iteration consistency tests to use the *Eventually helpers. This also removes the nil-pointer crash: rawGetEventually / txnGetEventually only return a non-nil response. Verified: all four originally-flaky tests run clean 3 out of 3 with go test -race -count=1, and the full adapter package passes with go test -race ./adapter/... -count=1.
1 parent 0dc74d4 commit 2b6e8f2

2 files changed

Lines changed: 118 additions & 44 deletions

File tree

adapter/grpc_test.go

Lines changed: 26 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -151,24 +151,23 @@ func Test_consistency_satisfy_write_after_read_for_parallel(t *testing.T) {
151151
nodes, adders, _ := createNode(t, 3)
152152
c := rawKVClient(t, adders)
153153

154+
// 1000 concurrent clients × 3 RPCs saturates the single raft leader
155+
// hard enough to provoke brief quorum checks to fail on CI, so use
156+
// the *Eventually helpers that retry transient leader-unavailable
157+
// errors. See Test_consistency_satisfy_write_after_read_sequence.
158+
ctx := context.Background()
154159
wg := sync.WaitGroup{}
155160
wg.Add(1000)
156161
for i := range 1000 {
157162
go func(i int) {
163+
defer wg.Done()
158164
key := []byte("test-key-parallel" + strconv.Itoa(i))
159165
want := []byte(strconv.Itoa(i))
160-
_, err := c.RawPut(
161-
context.Background(),
162-
&pb.RawPutRequest{Key: key, Value: want},
163-
)
164-
assert.NoError(t, err, "Put RPC failed")
165-
_, err = c.RawPut(context.TODO(), &pb.RawPutRequest{Key: key, Value: want})
166-
assert.NoError(t, err, "Put RPC failed")
167-
168-
resp, err := c.RawGet(context.TODO(), &pb.RawGetRequest{Key: key})
169-
assert.NoError(t, err, "Get RPC failed")
166+
rawPutEventually(t, ctx, c, &pb.RawPutRequest{Key: key, Value: want})
167+
rawPutEventually(t, ctx, c, &pb.RawPutRequest{Key: key, Value: want})
168+
169+
resp := rawGetEventually(t, ctx, c, &pb.RawGetRequest{Key: key})
170170
assert.Equal(t, want, resp.Value, "consistency check failed")
171-
wg.Done()
172171
}(i)
173172
}
174173
wg.Wait()
@@ -183,20 +182,18 @@ func Test_consistency_satisfy_write_after_read_sequence(t *testing.T) {
183182

184183
key := []byte("test-key-sequence")
185184

185+
// Use *Eventually helpers because a 9999-iteration loop across three
186+
// t.Parallel adapter tests loads CI enough that the raft leader can
187+
// briefly lose quorum and step down mid-run, surfacing as transient
188+
// "not leader" / "leader not found" RPC errors. The helpers retry
189+
// only those transient errors; any other error still fails the test.
190+
ctx := context.Background()
186191
for i := range 9999 {
187192
want := []byte("sequence" + strconv.Itoa(i))
188-
_, err := c.RawPut(
189-
context.Background(),
190-
&pb.RawPutRequest{Key: key, Value: want},
191-
)
192-
assert.NoError(t, err, "Put RPC failed")
193-
194-
_, err = c.RawPut(context.TODO(), &pb.RawPutRequest{Key: key, Value: want})
195-
assert.NoError(t, err, "Put RPC failed")
196-
197-
resp, err := c.RawGet(context.TODO(), &pb.RawGetRequest{Key: key})
198-
assert.NoError(t, err, "Get RPC failed")
193+
rawPutEventually(t, ctx, c, &pb.RawPutRequest{Key: key, Value: want})
194+
rawPutEventually(t, ctx, c, &pb.RawPutRequest{Key: key, Value: want})
199195

196+
resp := rawGetEventually(t, ctx, c, &pb.RawGetRequest{Key: key})
200197
assert.Equal(t, want, resp.Value, "consistency check failed")
201198
}
202199
}
@@ -209,22 +206,18 @@ func Test_grpc_transaction(t *testing.T) {
209206

210207
key := []byte("test-key-sequence")
211208

209+
// See Test_consistency_satisfy_write_after_read_sequence for why the
210+
// *Eventually helpers are necessary here.
211+
ctx := context.Background()
212212
for i := range 9999 {
213213
want := []byte("sequence" + strconv.Itoa(i))
214-
_, err := c.Put(
215-
context.Background(),
216-
&pb.PutRequest{Key: key, Value: want},
217-
)
218-
assert.NoError(t, err, "Put RPC failed")
219-
resp, err := c.Get(context.TODO(), &pb.GetRequest{Key: key})
220-
assert.NoError(t, err, "Get RPC failed")
214+
txnPutEventually(t, ctx, c, &pb.PutRequest{Key: key, Value: want})
215+
resp := txnGetEventually(t, ctx, c, &pb.GetRequest{Key: key})
221216
assert.Equal(t, want, resp.Value, "consistency check failed")
222217

223-
_, err = c.Delete(context.TODO(), &pb.DeleteRequest{Key: key})
224-
assert.NoError(t, err, "Delete RPC failed")
218+
txnDeleteEventually(t, ctx, c, &pb.DeleteRequest{Key: key})
225219

226-
resp, err = c.Get(context.TODO(), &pb.GetRequest{Key: key})
227-
assert.NoError(t, err, "Get RPC failed")
220+
resp = txnGetEventually(t, ctx, c, &pb.GetRequest{Key: key})
228221
assert.Nil(t, resp.Value, "consistency check failed")
229222
}
230223
}

adapter/test_util.go

Lines changed: 92 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,29 @@ import (
2727
)
2828

2929
const (
30-
testEngineTickInterval = 10 * time.Millisecond
31-
testEngineHeartbeatTick = 1
32-
testEngineElectionTick = 10
30+
testEngineTickInterval = 10 * time.Millisecond
31+
// testEngineHeartbeatTick / testEngineElectionTick preserve etcd/raft's
32+
// recommended 10x ratio (ElectionTick = 10 × HeartbeatTick). Previous
33+
// values (1 / 10 → 100 ms election timeout) were too tight for `-race`
34+
// on CI: CheckQuorum only holds ~100 ms of heartbeat-response history,
35+
// so any scheduler pause on a loaded runner drops the leader's view of
36+
// quorum, and it steps down with "quorum is not active", bouncing
37+
// writes to "etcd raft engine is not leader" / "leader not found" in
38+
// the middle of tests like Test_consistency_satisfy_write_after_read_sequence.
39+
// 5 / 50 gives a 500 ms election timeout (500-1000 ms randomised) and
40+
// a 50 ms heartbeat, absorbing goroutine-scheduler jitter while still
41+
// keeping tests fast. Combined with leaseSafetyMargin = 300 ms, this
42+
// also yields a non-zero 200 ms LeaseDuration so the lease-read fast
43+
// path gets exercised instead of always falling through to ReadIndex.
44+
testEngineHeartbeatTick = 5
45+
testEngineElectionTick = 50
3346
testEngineMaxSizePerMsg = 1 << 20
3447
testEngineMaxInflight = 256
3548

3649
// leaderChurnRetryTimeout bounds how long doEventually keeps retrying a
37-
// write that fails with "not leader" right after createNode returns.
50+
// write that fails with a transient leader-unavailable error. It covers
51+
// both startup churn right after createNode returns and mid-test churn
52+
// when the leader briefly steps down under CI load.
3853
leaderChurnRetryTimeout = 5 * time.Second
3954
// leaderChurnRetryInterval is the poll interval between retries.
4055
leaderChurnRetryInterval = 50 * time.Millisecond
@@ -519,18 +534,23 @@ func setupNodes(t *testing.T, ctx context.Context, n int, ports []portsAdress) (
519534
return nodes, grpcAdders, redisAdders, peers
520535
}
521536

522-
// isTransientNotLeaderErr reports whether err is a transient "not leader"
523-
// error that can happen right after createNode returns if the newly elected
524-
// leader briefly steps down due to a missed heartbeat quorum (common on slow
525-
// CI runners under -race). Callers should retry the write in that case.
537+
// isTransientNotLeaderErr reports whether err is a transient
538+
// leader-unavailable error that can happen right after createNode returns
539+
// (freshly-elected leader briefly steps down due to a missed heartbeat
540+
// quorum) or in the middle of a long-running test when CI load causes the
541+
// leader to miss quorum momentarily.
526542
//
527-
// The match is case-insensitive because Redis protocol error bodies and
528-
// other layers may capitalise the phrase differently (e.g. "ERR Not Leader").
543+
// Both "not leader" (ErrNotLeader, etcd/raft step errors) and
544+
// "leader not found" (ErrLeaderNotFound, emitted while the cluster is
545+
// re-electing) are treated as retryable. The match is case-insensitive
546+
// because Redis protocol error bodies and other layers may capitalise the
547+
// phrase differently (e.g. "ERR Not Leader").
529548
func isTransientNotLeaderErr(err error) bool {
530549
if err == nil {
531550
return false
532551
}
533-
return strings.Contains(strings.ToLower(err.Error()), "not leader")
552+
s := strings.ToLower(err.Error())
553+
return strings.Contains(s, "not leader") || strings.Contains(s, "leader not found")
534554
}
535555

536556
// doEventually retries do() while it returns a transient "not leader" error,
@@ -568,3 +588,64 @@ func lpushEventually(t *testing.T, ctx context.Context, rdb *redis.Client, key s
568588
return rdb.LPush(ctx, key, vals...).Err()
569589
})
570590
}
591+
592+
// rawPutEventually wraps RawKV.RawPut in doEventually so transient leader
593+
// churn (either at startup or in the middle of a long-running loop) does
594+
// not fail the test with "not leader" / "leader not found".
595+
func rawPutEventually(t *testing.T, ctx context.Context, c pb.RawKVClient, req *pb.RawPutRequest) {
596+
t.Helper()
597+
doEventually(t, func() error {
598+
_, err := c.RawPut(ctx, req)
599+
return err
600+
})
601+
}
602+
603+
// rawGetEventually wraps RawKV.RawGet in doEventually and returns the
604+
// response only after a successful (non-"not leader") call.
605+
func rawGetEventually(t *testing.T, ctx context.Context, c pb.RawKVClient, req *pb.RawGetRequest) *pb.RawGetResponse {
606+
t.Helper()
607+
var resp *pb.RawGetResponse
608+
doEventually(t, func() error {
609+
r, err := c.RawGet(ctx, req)
610+
if err != nil {
611+
return err
612+
}
613+
resp = r
614+
return nil
615+
})
616+
return resp
617+
}
618+
619+
// txnPutEventually wraps TransactionalKV.Put in doEventually.
620+
func txnPutEventually(t *testing.T, ctx context.Context, c pb.TransactionalKVClient, req *pb.PutRequest) {
621+
t.Helper()
622+
doEventually(t, func() error {
623+
_, err := c.Put(ctx, req)
624+
return err
625+
})
626+
}
627+
628+
// txnGetEventually wraps TransactionalKV.Get in doEventually and returns the
629+
// response only after a successful (non-"not leader") call.
630+
func txnGetEventually(t *testing.T, ctx context.Context, c pb.TransactionalKVClient, req *pb.GetRequest) *pb.GetResponse {
631+
t.Helper()
632+
var resp *pb.GetResponse
633+
doEventually(t, func() error {
634+
r, err := c.Get(ctx, req)
635+
if err != nil {
636+
return err
637+
}
638+
resp = r
639+
return nil
640+
})
641+
return resp
642+
}
643+
644+
// txnDeleteEventually wraps TransactionalKV.Delete in doEventually.
645+
func txnDeleteEventually(t *testing.T, ctx context.Context, c pb.TransactionalKVClient, req *pb.DeleteRequest) {
646+
t.Helper()
647+
doEventually(t, func() error {
648+
_, err := c.Delete(ctx, req)
649+
return err
650+
})
651+
}

0 commit comments

Comments
 (0)