Skip to content

Commit

Permalink
feat: default to a better raft request timeout
Browse files Browse the repository at this point in the history
Using the old default values of:

```yaml
zeebe.broker:
  cluster:
    electionTimeout: 2.5s
  raft:
    enablePriorityElection: true
  experimental:
    maxAppendsPerFollower: 2
    raft:
      requestTimeout: 5s
```

the loss of 2 requests between primary (leader) and secondary (follower)
could trigger an unnecessary re-election, because the secondary would not
receive any requests from the primary for at least 5 seconds, which
exceeds the election timeout.

This changes the default request timeout to always match the default
election timeout. Using all default values, we get at least one more
request attempt between primary and secondary before re-election and
probably more, depending on the exact timing when requests are sent.
  • Loading branch information
lenaschoenburg committed Apr 14, 2023
1 parent e4e552a commit 25a334d
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ public final class ClusterCfg implements ConfigurationEntry {
public static final int DEFAULT_REPLICATION_FACTOR = 1;
public static final int DEFAULT_CLUSTER_SIZE = 1;
public static final String DEFAULT_CLUSTER_NAME = "zeebe-cluster";
public static final Duration DEFAULT_ELECTION_TIMEOUT = Duration.ofMillis(2500);

private static final String NODE_ID_ERROR_MSG =
"Node id %s needs to be non negative and smaller then cluster size %s.";
private static final String REPLICATION_FACTOR_ERROR_MSG =
Expand All @@ -38,7 +40,6 @@ public final class ClusterCfg implements ConfigurationEntry {
+ " quorum = {}. If you want to ensure high fault-tolerance and availability,"
+ " make sure to use an odd replication factor.";
private static final Duration DEFAULT_HEARTBEAT_INTERVAL = Duration.ofMillis(250);
private static final Duration DEFAULT_ELECTION_TIMEOUT = Duration.ofMillis(2500);

private List<String> initialContactPoints = DEFAULT_CONTACT_POINTS;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@
*/
package io.camunda.zeebe.broker.system.configuration;

import static io.camunda.zeebe.broker.system.configuration.ClusterCfg.DEFAULT_ELECTION_TIMEOUT;

import java.time.Duration;

public final class ExperimentalRaftCfg implements ConfigurationEntry {

private static final Duration DEFAULT_REQUEST_TIMEOUT = Duration.ofSeconds(5);
// Requests must not take longer to time out than the election timeout. Otherwise a few lost
// requests can keep followers without contact long enough to trigger an unnecessary re-election.
private static final Duration DEFAULT_REQUEST_TIMEOUT = DEFAULT_ELECTION_TIMEOUT;
private static final Duration DEFAULT_MAX_QUORUM_RESPONSE_TIMEOUT = Duration.ofSeconds(0);
private static final int DEFAULT_MIN_STEP_DOWN_FAILURE_COUNT = 3;
private static final int DEFAULT_PREFER_SNAPSHOT_REPLICATION_THRESHOLD = 100;
Expand Down
2 changes: 1 addition & 1 deletion dist/src/main/config/broker.standalone.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -896,7 +896,7 @@
# raft:
# Sets the timeout for all requests sent by raft leaders and followers.
# This setting can also be overridden using the environment variable ZEEBE_BROKER_EXPERIMENTAL_RAFT_REQUESTTIMEOUT
# requestTimeout: 5s
# requestTimeout: 2500ms

# If the leader is not able to reach the quorum, the leader may step down.
# This is triggered after a number of requests, to a quorum of followers, has failed, and the number of failures
Expand Down
2 changes: 1 addition & 1 deletion dist/src/main/config/broker.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@
# raft:
# Sets the timeout for all requests sent by raft leaders and followers.
# This setting can also be overridden using the environment variable ZEEBE_BROKER_EXPERIMENTAL_RAFT_REQUESTTIMEOUT
# requestTimeout: 5s
# requestTimeout: 2500ms

# If the leader is not able to reach the quorum, the leader may step down.
# This is triggered after a number of requests, to a quorum of followers, has failed, and the number of failures
Expand Down

0 comments on commit 25a334d

Please sign in to comment.