From b4e02401e66d1965d17eca96736a19f5dbeb66fe Mon Sep 17 00:00:00 2001
From: Ben Darnell
Date: Tue, 17 Nov 2015 23:09:37 -0500
Subject: [PATCH] Trigger a raft election after splits.

This minimizes the window of unavailability following a split.

Fixes #1384.
---
 multiraft/multiraft.go       | 21 +++++++++++++++++++++
 multiraft/multiraft_test.go  |  3 +++
 storage/client_split_test.go | 34 ++++++++++++++++++++++++++++++++++
 storage/replica_command.go   | 12 ++++++++++++
 4 files changed, 70 insertions(+)

diff --git a/multiraft/multiraft.go b/multiraft/multiraft.go
index 23fc910af59e..58b1aeb6c58e 100644
--- a/multiraft/multiraft.go
+++ b/multiraft/multiraft.go
@@ -107,6 +107,7 @@ type MultiRaft struct {
 	createGroupChan chan createGroupOp
 	removeGroupChan chan removeGroupOp
 	proposalChan    chan *proposal
+	campaignChan    chan roachpb.RangeID
 	// callbackChan is a generic hook to run a callback in the raft thread.
 	callbackChan chan func()
 }
@@ -160,6 +161,7 @@ func NewMultiRaft(nodeID roachpb.NodeID, storeID roachpb.StoreID, config *Config
 		createGroupChan: make(chan createGroupOp),
 		removeGroupChan: make(chan removeGroupOp),
 		proposalChan:    make(chan *proposal),
+		campaignChan:    make(chan roachpb.RangeID),
 		callbackChan:    make(chan func()),
 	}
 
@@ -422,6 +424,13 @@ func (m *MultiRaft) Status(groupID roachpb.RangeID) *raft.Status {
 	return m.multiNode.Status(uint64(groupID))
 }
 
+// Campaign causes this node to start an election. Use with caution as
+// contested elections may cause periods of unavailability. Only use
+// Campaign() when you can be sure that only one replica will call it.
+func (m *MultiRaft) Campaign(groupID roachpb.RangeID) {
+	m.campaignChan <- groupID
+}
+
 type proposal struct {
 	groupID   roachpb.RangeID
 	commandID string
@@ -626,6 +635,18 @@ func (s *state) start() {
 			case prop := <-s.proposalChan:
 				s.propose(prop)
 
+			case groupID := <-s.campaignChan:
+				if _, ok := s.groups[groupID]; !ok {
+					if err := s.createGroup(groupID, 0); err != nil {
+						log.Warningf("node %s failed to create group %s during MultiRaft.Campaign: %s",
+							s.nodeID, groupID, err)
+						continue
+					}
+					if err := s.multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
+						log.Warningf("node %s failed to campaign for group %s: %s", s.nodeID, groupID, err)
+					}
+				}
+
 			case s.readyGroups = <-raftReady:
 				// readyGroups are saved in a local variable until they can be sent to
 				// the write task (and then the real work happens after the write is
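The new Campaign method above never touches raft state directly: like proposals and group creation, the request is funneled over campaignChan into the single goroutine running state.start(), which alone owns the per-group raft state. Below is a minimal, self-contained Go sketch of that thread-confinement pattern, for illustration only; it is not code from this patch, and eventLoop, rangeID, and run are invented names.

package main

import "fmt"

type rangeID int64

// eventLoop owns all "raft" state; other goroutines never touch groups
// directly and instead send requests over channels, mirroring how
// MultiRaft.Campaign forwards to campaignChan.
type eventLoop struct {
	groups       map[rangeID]bool // stand-in for per-group raft state
	campaignChan chan rangeID
	stopChan     chan struct{}
}

func newEventLoop() *eventLoop {
	return &eventLoop{
		groups:       map[rangeID]bool{},
		campaignChan: make(chan rangeID),
		stopChan:     make(chan struct{}),
	}
}

// Campaign may be called from any goroutine; it blocks until the loop
// goroutine has accepted the request.
func (e *eventLoop) Campaign(id rangeID) { e.campaignChan <- id }

func (e *eventLoop) run() {
	for {
		select {
		case id := <-e.campaignChan:
			if !e.groups[id] {
				// Stand-in for createGroup followed by multiNode.Campaign.
				e.groups[id] = true
				fmt.Printf("group %d created, starting election\n", id)
			}
		case <-e.stopChan:
			return
		}
	}
}

func main() {
	e := newEventLoop()
	done := make(chan struct{})
	go func() { e.run(); close(done) }()
	e.Campaign(2) // safe to call from any goroutine
	close(e.stopChan)
	<-done
}

An unbuffered channel preserves the blocking behavior of the patch: a caller of Campaign waits until the event-loop goroutine has accepted the request.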
diff --git a/multiraft/multiraft_test.go b/multiraft/multiraft_test.go
index c8814b275b61..c67ca167d5cc 100644
--- a/multiraft/multiraft_test.go
+++ b/multiraft/multiraft_test.go
@@ -143,6 +143,9 @@ func (c *testCluster) createGroup(groupID roachpb.RangeID, firstNode, numReplica
 // the given node will win the election. Unlike elect(), triggerElection() does not
 // wait for the election to resolve.
 func (c *testCluster) triggerElection(nodeIndex int, groupID roachpb.RangeID) {
+	// TODO(bdarnell): call MultiRaft.Campaign instead of
+	// multiNode.Campaign. Doing so is non-trivial because
+	// heartbeat_test.go is sensitive to minor reorderings of events.
 	if err := c.nodes[nodeIndex].multiNode.Campaign(context.Background(), uint64(groupID)); err != nil {
 		c.t.Fatal(err)
 	}
diff --git a/storage/client_split_test.go b/storage/client_split_test.go
index 6db83306d282..3b57a4f13124 100644
--- a/storage/client_split_test.go
+++ b/storage/client_split_test.go
@@ -869,3 +869,37 @@ func TestStoreSplitReadRace(t *testing.T) {
 		}
 	}
 }
+
+// TestLeaderAfterSplit verifies that a raft group created by a split
+// elects a leader without waiting for an election timeout.
+func TestLeaderAfterSplit(t *testing.T) {
+	defer leaktest.AfterTest(t)
+	storeContext := storage.TestStoreContext
+	storeContext.RaftElectionTimeoutTicks = 1000000
+	mtc := &multiTestContext{
+		storeContext: &storeContext,
+	}
+	mtc.Start(t, 3)
+	defer mtc.Stop()
+
+	mtc.replicateRange(1, 0, 1, 2)
+
+	leftKey := roachpb.Key("a")
+	splitKey := roachpb.Key("m")
+	rightKey := roachpb.Key("z")
+
+	splitArgs := adminSplitArgs(roachpb.KeyMin, splitKey)
+	if _, err := client.SendWrapped(mtc.distSender, nil, &splitArgs); err != nil {
+		t.Fatal(err)
+	}
+
+	incArgs := incrementArgs(leftKey, 1)
+	if _, err := client.SendWrapped(mtc.distSender, nil, &incArgs); err != nil {
+		t.Fatal(err)
+	}
+
+	incArgs = incrementArgs(rightKey, 2)
+	if _, err := client.SendWrapped(mtc.distSender, nil, &incArgs); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/storage/replica_command.go b/storage/replica_command.go
index c21ae896613a..5c1f2058841e 100644
--- a/storage/replica_command.go
+++ b/storage/replica_command.go
@@ -1414,6 +1414,18 @@ func (r *Replica) splitTrigger(batch engine.Engine, split *roachpb.SplitTrigger)
 			// Our in-memory state has diverged from the on-disk state.
 			log.Fatalf("failed to update Store after split: %s", err)
 		}
+
+		// To avoid leaving the new range unavailable as it waits to elect
+		// its leader, one (and only one) of the nodes should start an
+		// election as soon as the split is processed. For simplicity, we
+		// choose the first node in the replica list. If this node is
+		// unavailable, the group will have to wait for an election
+		// timeout, just as with any other leader failure. (We could
+		// improve this by e.g. choosing the node that had the leader
+		// lease before the split and is therefore known to be up.)
+		if r.store.StoreID() == split.NewDesc.Replicas[0].StoreID {
+			r.store.multiraft.Campaign(split.NewDesc.RangeID)
+		}
 	})
 
 	return nil
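The new test sets RaftElectionTimeoutTicks high enough that timeout-driven elections effectively never fire, so the increment against the new right-hand range can only complete if the explicit Campaign call in splitTrigger produced a leader. As a hedged illustration (not part of the patch), leadership could also be observed directly through the Status accessor shown earlier; waitForLeader is a hypothetical helper, and the import paths are assumed from the 2015-era repository layout.

package storage_test

import (
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/multiraft"
	"github.com/cockroachdb/cockroach/roachpb"
)

// waitForLeader polls MultiRaft.Status until the given group reports a raft
// leader, or gives up after the deadline. Status returns etcd/raft's Status;
// its embedded SoftState.Lead stays zero until a leader is known to this node.
func waitForLeader(mr *multiraft.MultiRaft, rangeID roachpb.RangeID, deadline time.Duration) error {
	start := time.Now()
	for time.Since(start) < deadline {
		if status := mr.Status(rangeID); status != nil && status.Lead != 0 {
			return nil
		}
		time.Sleep(time.Millisecond)
	}
	return fmt.Errorf("range %d has no leader after %s", rangeID, deadline)
}

The test itself does not need such a helper: the increments only complete once each post-split range has a leader, which is exactly what the Campaign call is meant to guarantee promptly.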