From b4e02401e66d1965d17eca96736a19f5dbeb66fe Mon Sep 17 00:00:00 2001
From: Ben Darnell
Date: Tue, 17 Nov 2015 23:09:37 -0500
Subject: [PATCH] Trigger a raft election after splits.

This minimizes the window of unavailability following a split.

Fixes #1384.
---
 multiraft/multiraft.go       | 21 +++++++++++++++++++++
 multiraft/multiraft_test.go  |  3 +++
 storage/client_split_test.go | 34 ++++++++++++++++++++++++++++++++++
 storage/replica_command.go   | 12 ++++++++++++
 4 files changed, 70 insertions(+)

diff --git a/multiraft/multiraft.go b/multiraft/multiraft.go
index 23fc910af59e..58b1aeb6c58e 100644
--- a/multiraft/multiraft.go
+++ b/multiraft/multiraft.go
@@ -107,6 +107,7 @@ type MultiRaft struct {
 	createGroupChan chan createGroupOp
 	removeGroupChan chan removeGroupOp
 	proposalChan    chan *proposal
+	campaignChan    chan roachpb.RangeID
 	// callbackChan is a generic hook to run a callback in the raft thread.
 	callbackChan chan func()
 }
@@ -160,6 +161,7 @@ func NewMultiRaft(nodeID roachpb.NodeID, storeID roachpb.StoreID, config *Config
 		createGroupChan: make(chan createGroupOp),
 		removeGroupChan: make(chan removeGroupOp),
 		proposalChan:    make(chan *proposal),
+		campaignChan:    make(chan roachpb.RangeID),
 		callbackChan:    make(chan func()),
 	}
 
@@ -422,6 +424,13 @@ func (m *MultiRaft) Status(groupID roachpb.RangeID) *raft.Status {
 	return m.multiNode.Status(uint64(groupID))
 }
 
+// Campaign causes this node to start an election. Use with caution as
+// contested elections may cause periods of unavailability. Only use
+// Campaign() when you can be sure that only one replica will call it.
+func (m *MultiRaft) Campaign(groupID roachpb.RangeID) {
+	m.campaignChan <- groupID
+}
+
 type proposal struct {
 	groupID   roachpb.RangeID
 	commandID string
@@ -626,6 +635,18 @@ func (s *state) start() {
 			case prop := <-s.proposalChan:
 				s.propose(prop)
 
+			case groupID := <-s.campaignChan:
+				if _, ok := s.groups[groupID]; !ok {
+					if err := s.createGroup(groupID, 0); err != nil {
+						log.Warningf("node %s failed to create group %s during MultiRaft.Campaign: %s",
+							s.nodeID, groupID, err)
+						continue
+					}
+					if err := s.multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
+						log.Warningf("node %s failed to campaign for group %s: %s", s.nodeID, groupID, err)
+					}
+				}
+
 			case s.readyGroups = <-raftReady:
 				// readyGroups are saved in a local variable until they can be sent to
 				// the write task (and then the real work happens after the write is
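The new Campaign method above never touches raft state directly: like proposals and group creation, the request is funneled over campaignChan into the single goroutine running state.start(), which alone owns the per-group raft state. Below is a minimal, self-contained Go sketch of that thread-confinement pattern, for illustration only; it is not code from this patch, and eventLoop, rangeID, and run are invented names.

package main

import "fmt"

type rangeID int64

// eventLoop owns all "raft" state; other goroutines never touch groups
// directly and instead send requests over channels, mirroring how
// MultiRaft.Campaign forwards to campaignChan.
type eventLoop struct {
	groups       map[rangeID]bool // stand-in for per-group raft state
	campaignChan chan rangeID
	stopChan     chan struct{}
}

func newEventLoop() *eventLoop {
	return &eventLoop{
		groups:       map[rangeID]bool{},
		campaignChan: make(chan rangeID),
		stopChan:     make(chan struct{}),
	}
}

// Campaign may be called from any goroutine; it blocks until the loop
// goroutine has accepted the request.
func (e *eventLoop) Campaign(id rangeID) { e.campaignChan <- id }

func (e *eventLoop) run() {
	for {
		select {
		case id := <-e.campaignChan:
			if !e.groups[id] {
				// Stand-in for createGroup followed by multiNode.Campaign.
				e.groups[id] = true
				fmt.Printf("group %d created, starting election\n", id)
			}
		case <-e.stopChan:
			return
		}
	}
}

func main() {
	e := newEventLoop()
	done := make(chan struct{})
	go func() { e.run(); close(done) }()
	e.Campaign(2) // safe to call from any goroutine
	close(e.stopChan)
	<-done
}

An unbuffered channel preserves the blocking behavior of the patch: a caller of Campaign waits until the event-loop goroutine has accepted the request.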
diff --git a/multiraft/multiraft_test.go b/multiraft/multiraft_test.go
index c8814b275b61..c67ca167d5cc 100644
--- a/multiraft/multiraft_test.go
+++ b/multiraft/multiraft_test.go
@@ -143,6 +143,9 @@ func (c *testCluster) createGroup(groupID roachpb.RangeID, firstNode, numReplica
 // the given node will win the election. Unlike elect(), triggerElection() does not
 // wait for the election to resolve.
 func (c *testCluster) triggerElection(nodeIndex int, groupID roachpb.RangeID) {
+	// TODO(bdarnell): call MultiRaft.Campaign instead of
+	// multiNode.Campaign. Doing so is non-trivial because
+	// heartbeat_test.go is sensitive to minor reorderings of events.
 	if err := c.nodes[nodeIndex].multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
 		c.t.Fatal(err)
 	}
diff --git a/storage/client_split_test.go b/storage/client_split_test.go
index 6db83306d282..3b57a4f13124 100644
--- a/storage/client_split_test.go
+++ b/storage/client_split_test.go
@@ -869,3 +869,37 @@ func TestStoreSplitReadRace(t *testing.T) {
 		}
 	}
 }
+
+// TestLeaderAfterSplit verifies that a raft group created by a split
+// elects a leader without waiting for an election timeout.
+func TestLeaderAfterSplit(t *testing.T) {
+	defer leaktest.AfterTest(t)
+	storeContext := storage.TestStoreContext
+	storeContext.RaftElectionTimeoutTicks = 1000000
+	mtc := &multiTestContext{
+		storeContext: &storeContext,
+	}
+	mtc.Start(t, 3)
+	defer mtc.Stop()
+
+	mtc.replicateRange(1, 0, 1, 2)
+
+	leftKey := roachpb.Key("a")
+	splitKey := roachpb.Key("m")
+	rightKey := roachpb.Key("z")
+
+	splitArgs := adminSplitArgs(roachpb.KeyMin, splitKey)
+	if _, err := client.SendWrapped(mtc.distSender, nil, &splitArgs); err != nil {
+		t.Fatal(err)
+	}
+
+	incArgs := incrementArgs(leftKey, 1)
+	if _, err := client.SendWrapped(mtc.distSender, nil, &incArgs); err != nil {
+		t.Fatal(err)
+	}
+
+	incArgs = incrementArgs(rightKey, 2)
+	if _, err := client.SendWrapped(mtc.distSender, nil, &incArgs); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/storage/replica_command.go b/storage/replica_command.go
index c21ae896613a..5c1f2058841e 100644
--- a/storage/replica_command.go
+++ b/storage/replica_command.go
@@ -1414,6 +1414,18 @@ func (r *Replica) splitTrigger(batch engine.Engine, split *roachpb.SplitTrigger)
 			// Our in-memory state has diverged from the on-disk state.
 			log.Fatalf("failed to update Store after split: %s", err)
 		}
+
+		// To avoid leaving the new range unavailable as it waits to elect
+		// its leader, one (and only one) of the nodes should start an
+		// election as soon as the split is processed. For simplicity, we
+		// choose the first node in the replica list. If this node is
+		// unavailable, the group will have to wait for an election
+		// timeout, just as with any other leader failure. (We could
+		// improve this by e.g. choosing the node that had the leader
+		// lease before the split and is therefore known to be up.)
+		if r.store.StoreID() == split.NewDesc.Replicas[0].StoreID {
+			r.store.multiraft.Campaign(split.NewDesc.RangeID)
+		}
 	})
 
 	return nil
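The new test sets RaftElectionTimeoutTicks high enough that timeout-driven elections effectively never fire, so the increment against the new right-hand range can only complete if the explicit Campaign call in splitTrigger produced a leader. As a hedged illustration (not part of the patch), leadership could also be observed directly through the Status accessor shown earlier; waitForLeader is a hypothetical helper, and the import paths are assumed from the 2015-era repository layout.

package storage_test

import (
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/multiraft"
	"github.com/cockroachdb/cockroach/roachpb"
)

// waitForLeader polls MultiRaft.Status until the given group reports a raft
// leader, or gives up after the deadline. Status returns etcd/raft's Status;
// its embedded SoftState.Lead stays zero until a leader is known to this node.
func waitForLeader(mr *multiraft.MultiRaft, rangeID roachpb.RangeID, deadline time.Duration) error {
	start := time.Now()
	for time.Since(start) < deadline {
		if status := mr.Status(rangeID); status != nil && status.Lead != 0 {
			return nil
		}
		time.Sleep(time.Millisecond)
	}
	return fmt.Errorf("range %d has no leader after %s", rangeID, deadline)
}

The test itself does not need such a helper: the increments only complete once each post-split range has a leader, which is exactly what the Campaign call is meant to guarantee promptly.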