Trigger a raft election after splits.
This minimizes the window of unavailability following a split.

Fixes cockroachdb#1384.
bdarnell committed Nov 24, 2015
1 parent cf42a63 commit b4e0240
Showing 4 changed files with 70 additions and 0 deletions.
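For orientation before the per-file hunks: the change adds a MultiRaft.Campaign entry point fed by a new campaignChan, and the split trigger uses it so that exactly one replica (the one whose store holds the first replica in the new range's descriptor) starts an election for the freshly created range. The toy program below models that flow with simplified stand-in types; it is a sketch of the idea, not the real CockroachDB code, and every type and helper in it is invented for illustration.

package main

import "fmt"

// Stand-in identifiers; the real code uses roachpb.RangeID / roachpb.StoreID.
type rangeID int64
type storeID int64

// multiRaft models the relevant slice of MultiRaft: a channel that hands
// campaign requests to the raft processing loop.
type multiRaft struct {
	campaignChan chan rangeID
}

// Campaign mirrors MultiRaft.Campaign: it only enqueues the group ID; the
// raft loop does the actual work.
func (m *multiRaft) Campaign(id rangeID) { m.campaignChan <- id }

// raftLoop stands in for state.start(): on receipt it would create the group
// if necessary and call multiNode.Campaign for it.
func (m *multiRaft) raftLoop() {
	for id := range m.campaignChan {
		fmt.Printf("campaigning for group %d\n", id)
	}
}

// onSplit models the splitTrigger change: only the store holding the first
// replica of the new descriptor triggers an election, so at most one replica
// campaigns and contested elections are avoided.
func onSplit(m *multiRaft, self storeID, newRange rangeID, replicas []storeID) {
	if self == replicas[0] {
		m.Campaign(newRange)
	}
}

func main() {
	m := &multiRaft{campaignChan: make(chan rangeID, 1)}
	onSplit(m, 1, 2, []storeID{1, 2, 3}) // first replica lives on store 1: campaigns
	onSplit(m, 3, 2, []storeID{1, 2, 3}) // store 3 is not first: no-op
	close(m.campaignChan)
	m.raftLoop() // drains the single queued request
}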
21 changes: 21 additions & 0 deletions multiraft/multiraft.go
@@ -107,6 +107,7 @@ type MultiRaft struct {
	createGroupChan chan createGroupOp
	removeGroupChan chan removeGroupOp
	proposalChan chan *proposal
	campaignChan chan roachpb.RangeID
	// callbackChan is a generic hook to run a callback in the raft thread.
	callbackChan chan func()
}
@@ -160,6 +161,7 @@ func NewMultiRaft(nodeID roachpb.NodeID, storeID roachpb.StoreID, config *Config
		createGroupChan: make(chan createGroupOp),
		removeGroupChan: make(chan removeGroupOp),
		proposalChan: make(chan *proposal),
		campaignChan: make(chan roachpb.RangeID),
		callbackChan: make(chan func()),
	}

@@ -422,6 +424,13 @@ func (m *MultiRaft) Status(groupID roachpb.RangeID) *raft.Status {
	return m.multiNode.Status(uint64(groupID))
}

// Campaign causes this node to start an election. Use with caution as
// contested elections may cause periods of unavailability. Only use
// Campaign() when you can be sure that only one replica will call it.
func (m *MultiRaft) Campaign(groupID roachpb.RangeID) {
	m.campaignChan <- groupID
}

type proposal struct {
	groupID roachpb.RangeID
	commandID string
@@ -626,6 +635,18 @@ func (s *state) start() {
		case prop := <-s.proposalChan:
			s.propose(prop)

		case groupID := <-s.campaignChan:
			if _, ok := s.groups[groupID]; !ok {
				if err := s.createGroup(groupID, 0); err != nil {
					log.Warningf("node %s failed to create group %s during MultiRaft.Campaign: %s",
						s.nodeID, groupID, err)
					continue
				}
				if err := s.multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
					log.Warningf("node %s failed to campaign for group %s: %s", s.nodeID, groupID, err)
				}
			}

		case s.readyGroups = <-raftReady:
			// readyGroups are saved in a local variable until they can be sent to
			// the write task (and then the real work happens after the write is
3 changes: 3 additions & 0 deletions multiraft/multiraft_test.go
@@ -143,6 +143,9 @@ func (c *testCluster) createGroup(groupID roachpb.RangeID, firstNode, numReplica
// the given node will win the election. Unlike elect(), triggerElection() does not
// wait for the election to resolve.
func (c *testCluster) triggerElection(nodeIndex int, groupID roachpb.RangeID) {
	// TODO(bdarnell): call MultiRaft.Campaign instead of
	// multiNode.Campaign. Doing so is non-trivial because
	// heartbeat_test.go is sensitive to minor reorderings of events.
	if err := c.nodes[nodeIndex].multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
		c.t.Fatal(err)
	}
34 changes: 34 additions & 0 deletions storage/client_split_test.go
@@ -869,3 +869,37 @@ func TestStoreSplitReadRace(t *testing.T) {
		}
	}
}

// TestLeaderAfterSplit verifies that a raft group created by a split
// elects a leader without waiting for an election timeout.
func TestLeaderAfterSplit(t *testing.T) {
	defer leaktest.AfterTest(t)
	storeContext := storage.TestStoreContext
	storeContext.RaftElectionTimeoutTicks = 1000000
	mtc := &multiTestContext{
		storeContext: &storeContext,
	}
	mtc.Start(t, 3)
	defer mtc.Stop()

	mtc.replicateRange(1, 0, 1, 2)

	leftKey := roachpb.Key("a")
	splitKey := roachpb.Key("m")
	rightKey := roachpb.Key("z")

	splitArgs := adminSplitArgs(roachpb.KeyMin, splitKey)
	if _, err := client.SendWrapped(mtc.distSender, nil, &splitArgs); err != nil {
		t.Fatal(err)
	}

	incArgs := incrementArgs(leftKey, 1)
	if _, err := client.SendWrapped(mtc.distSender, nil, &incArgs); err != nil {
		t.Fatal(err)
	}

	incArgs = incrementArgs(rightKey, 2)
	if _, err := client.SendWrapped(mtc.distSender, nil, &incArgs); err != nil {
		t.Fatal(err)
	}
}
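A note on the test's construction: RaftElectionTimeoutTicks is set to an effectively unreachable value, so the two increments can only succeed if the campaign triggered by the split elected a leader on each side of the split; left to the normal election timeout, the new right-hand range would stay leaderless for the duration of the test. For a rough sense of how long that window would be, the snippet below assumes a 100ms raft tick interval, which is an illustrative assumption rather than a value taken from this commit.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Back-of-the-envelope only: the 100ms tick interval is an assumption
	// for illustration, not a value from the commit above.
	const assumedTickInterval = 100 * time.Millisecond
	const electionTimeoutTicks = 1000000 // as set by TestLeaderAfterSplit
	fmt.Println(electionTimeoutTicks * assumedTickInterval) // 27h46m40s
}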
12 changes: 12 additions & 0 deletions storage/replica_command.go
Expand Up @@ -1414,6 +1414,18 @@ func (r *Replica) splitTrigger(batch engine.Engine, split *roachpb.SplitTrigger)
// Our in-memory state has diverged from the on-disk state.
log.Fatalf("failed to update Store after split: %s", err)
}

// To avoid leaving the new range unavailable as it waits to elect
// its leader, one (and only one) of the nodes should start an
// election as soon as the split is processed. For simplicity, we
// choose the first node in the replica list. If this node is
// unavailable, the group will have to wait for an election
// timeout, just as with any other leader failure. (we could
// improve this by e.g. choosing the node that had the leader
// lease before the split and is therefore known to be up)
if r.store.StoreID() == split.NewDesc.Replicas[0].StoreID {
r.store.multiraft.Campaign(split.NewDesc.RangeID)
}
})

return nil
