Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[redis] allow the client to reconnect on redis exceptions #1306

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public enum BACKPLANE_TYPE {
private int timeout = 10000;
private String[] redisNodes = {};
private int maxAttempts = 20;
private boolean reconnectClient = true;
private boolean cacheCas = false;

public String getRedisUri() {
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/build/buildfarm/common/redis/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ java_library(
"//third_party/jedis",
"@maven//:com_google_guava_guava",
"@maven//:io_grpc_grpc_api",
"@maven//:io_prometheus_simpleclient",
"@maven//:org_projectlombok_lombok",
"@maven//:org_redisson_redisson",
],
)
82 changes: 78 additions & 4 deletions src/main/java/build/buildfarm/common/redis/RedisClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,37 @@

import io.grpc.Status;
import io.grpc.Status.Code;
import io.prometheus.client.Counter;
import java.io.Closeable;
import java.io.IOException;
import java.net.ConnectException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Supplier;
import redis.clients.jedis.JedisCluster;
import redis.clients.jedis.exceptions.JedisConnectionException;
import redis.clients.jedis.exceptions.JedisDataException;
import redis.clients.jedis.exceptions.JedisException;
import redis.clients.jedis.exceptions.JedisNoReachableClusterNodeException;

/**
* @class RedisClient
* @brief Responsible for making calls to redis.
*/
public class RedisClient implements Closeable {
// Metrics to detect any kind of redis failures.
// Often due to network issues or the redis cluster going down.
private static final Counter redisErrorCounter =
Counter.build().name("redis_client_error").help("Count of redis client failures").register();
private static final Counter redisClientRebuildErrorCounter =
Counter.build()
.name("redis_client_rebuild_error")
.help("Count of failures rebuilding redis client")
.register();
// When true, failed calls rebuild the jedis client and retry instead of
// propagating the error to the caller.
// NOTE(review): "restablish" is a typo of "reestablish" — rename in a follow-up.
private boolean restablishClientOnFailures = false;

// Prefix of the redis error reply used to recognize a MISCONF condition.
private static final String MISCONF_RESPONSE = "MISCONF";

@FunctionalInterface
Expand All @@ -56,14 +73,27 @@ public JedisMisconfigurationException(final String message, final Throwable caus
}
}

// Factory kept so the jedis client can be re-created on failure.
// May be null when the single-argument constructor was used; only
// consulted when restablishClientOnFailures is enabled.
private Supplier<JedisCluster> jedisClusterFactory;

// The jedis client. Non-final because it is replaced on rebuild
// when reconnect-on-failure is enabled.
private JedisCluster jedis;

// Set by close(); guarded by throwIfClosed() before each call.
private boolean closed = false;

/**
 * @brief Create a client around an existing jedis cluster connection.
 * @details No factory is stored, so this client cannot rebuild its
 * connection on failures.
 * @param jedis The jedis client used for all redis calls.
 */
public RedisClient(JedisCluster jedis) {
this.jedis = jedis;
}

/**
 * @brief Create a client that can optionally survive redis failures.
 * @param jedis The initial jedis client used for redis calls.
 * @param jedisClusterFactory Factory used to rebuild the client on failure.
 * @param restablishClientOnFailures When true, failed calls rebuild the
 *     client via the factory and retry instead of propagating the error.
 */
public RedisClient(
JedisCluster jedis,
Supplier<JedisCluster> jedisClusterFactory,
boolean restablishClientOnFailures) {
this.jedis = jedis;
this.jedisClusterFactory = jedisClusterFactory;
this.restablishClientOnFailures = restablishClientOnFailures;
}

@Override
public synchronized void close() {
closed = true;
Expand All @@ -81,7 +111,7 @@ private synchronized void throwIfClosed() throws IOException {
}

public void run(Consumer<JedisCluster> withJedis) throws IOException {
call(
callImpl(
(JedisContext<Void>)
jedis -> {
withJedis.accept(jedis);
Expand All @@ -91,9 +121,14 @@ public void run(Consumer<JedisCluster> withJedis) throws IOException {

/**
 * @brief Run an interruptible operation against redis.
 * @details Delegates to defaultBlockingCall, which surfaces thread
 * interruption as InterruptedException rather than an IOException.
 * @param withJedis The operation to perform with the jedis client.
 * @return The operation's result.
 */
public <T> T blockingCall(JedisInterruptibleContext<T> withJedis)
throws IOException, InterruptedException {
return defaultBlockingCall(withJedis);
}

private <T> T defaultBlockingCall(JedisInterruptibleContext<T> withJedis)
throws IOException, InterruptedException {
AtomicReference<InterruptedException> interruption = new AtomicReference<>(null);
T result =
call(
callImpl(
jedis -> {
try {
return withJedis.run(jedis);
Expand All @@ -109,8 +144,47 @@ public <T> T blockingCall(JedisInterruptibleContext<T> withJedis)
return result;
}

@SuppressWarnings("ConstantConditions")
public <T> T call(JedisContext<T> withJedis) throws IOException {
return callImpl(withJedis);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: do we need to make a separate method for this still?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed. callImpl folded into call

}

private <T> T callImpl(JedisContext<T> withJedis) throws IOException {
// Typical configuration that does not handle exceptions
// or try to restablish the jedis client on failures.
if (!restablishClientOnFailures) {
return defaultCall(withJedis);
}

// Alternatively,
// Capture all redis problems at the client level.
// Try to re-establish the client and log all issues.
// This will block the overall thread until redis can be connected to.
// It may be a useful strategy for gaining stability on a poorly performing network,
// or a redis cluster that goes down.
while (true) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer configuration to be a number of a reconnects. If it's 0 or null then we don't retry, otherwise we retry up to that number of times. Bonus if there is some backoff here where we don't spam retries continuously.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. Switched to retry amount + duration between retries. We have a Retrier.java class in this repo, but its too specific to grpc so I didn't use it. Might be nice to use something like https://resilience4j.readme.io/docs/retry in the future.

try {
return defaultCall(withJedis);
} catch (Exception e) {
redisErrorCounter.inc();
System.out.println("Failure in RedisClient::call");
System.out.println(e.toString());
rebuildJedisCluser();
}
}
}

private void rebuildJedisCluser() {
try {
System.out.println("Rebuilding redis client");
jedis = jedisClusterFactory.get();
} catch (Exception e) {
redisClientRebuildErrorCounter.inc();
System.out.println("Failed to rebuild redis client");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this plumb in log?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

System.out.println(e.toString());
}
}

private <T> T defaultCall(JedisContext<T> withJedis) throws IOException {
throwIfClosed();
try {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,11 @@ public void start(String clientPublicName) throws IOException {
// Construct a single redis client to be used throughout the entire backplane.
// We wish to avoid various synchronous and error handling issues that could occur when using
// multiple clients.
client = new RedisClient(jedisClusterFactory.get());
client =
new RedisClient(
jedisClusterFactory.get(),
jedisClusterFactory,
configs.getBackplane().isReconnectClient());
// Create containers that make up the backplane
state = DistributedStateCreator.create(client);

Expand Down