Fix issue caused by sole server marked as failed under load
If health checks are failing for all servers, make a second pass through the server list with health checks ignored before returning failure.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
brandond committed May 29, 2024
1 parent ed23a2b commit ca39614
Showing 3 changed files with 16 additions and 5 deletions.
pkg/agent/loadbalancer/loadbalancer.go (7 additions, 2 deletions)
@@ -158,14 +158,15 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 	lb.mutex.RLock()
 	defer lb.mutex.RUnlock()
 
+	var allChecksFailed bool
 	startIndex := lb.nextServerIndex
 	for {
 		targetServer := lb.currentServerAddress
 
 		server := lb.servers[targetServer]
 		if server == nil || targetServer == "" {
 			logrus.Debugf("Nil server for load balancer %s: %s", lb.serviceName, targetServer)
-		} else if server.healthCheck() {
+		} else if allChecksFailed || server.healthCheck() {
 			conn, err := server.dialContext(ctx, network, targetServer)
 			if err == nil {
 				return conn, nil
@@ -189,7 +190,11 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 			startIndex = maxIndex
 		}
 		if lb.nextServerIndex == startIndex {
-			return nil, errors.New("all servers failed")
+			if allChecksFailed {
+				return nil, errors.New("all servers failed")
+			}
+			logrus.Debugf("Health checks for all servers in load balancer %s have failed: retrying with health checks ignored", lb.serviceName)
+			allChecksFailed = true
 		}
 	}
 }
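For context, here is a minimal standalone sketch of the two-pass dial behavior these hunks introduce. The Server type, dialAny, and the fake backends in main are illustrative stand-ins for this sketch, not the actual k3s types:

// Sketch only: a dial loop that prefers healthy servers but makes a second
// pass with health checks ignored before reporting total failure.
package main

import (
	"context"
	"errors"
	"fmt"
	"net"
)

// Server is a simplified stand-in for a load balancer backend.
type Server struct {
	Address     string
	HealthCheck func() bool
	Dial        func(ctx context.Context) (net.Conn, error)
}

// dialAny tries healthy servers first; if every health check is failing,
// it retries all servers with health checks ignored before giving up.
func dialAny(ctx context.Context, servers []Server) (net.Conn, error) {
	for _, ignoreChecks := range []bool{false, true} {
		for _, s := range servers {
			if !ignoreChecks && !s.HealthCheck() {
				continue // skip unhealthy servers on the first pass only
			}
			if conn, err := s.Dial(ctx); err == nil {
				return conn, nil
			}
		}
	}
	return nil, errors.New("all servers failed")
}

func main() {
	// Both backends fail their health checks (e.g. the checker is wedged
	// under load), but the second pass dials them anyway and "b" succeeds.
	servers := []Server{
		{Address: "a", HealthCheck: func() bool { return false },
			Dial: func(ctx context.Context) (net.Conn, error) { return nil, errors.New("connection refused") }},
		{Address: "b", HealthCheck: func() bool { return false },
			Dial: func(ctx context.Context) (net.Conn, error) { c, _ := net.Pipe(); return c, nil }},
	}
	conn, err := dialAny(context.Background(), servers)
	fmt.Println(conn != nil, err) // true <nil>
}

In the committed change the second pass is driven by the allChecksFailed flag inside the existing retry loop rather than an outer loop, but the effect is the same: a sole server whose health check is failing under load can still be dialed before the load balancer gives up.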
pkg/agent/loadbalancer/servers.go (8 additions, 2 deletions)
@@ -227,13 +227,19 @@ func (lb *LoadBalancer) SetHealthCheck(address string, healthCheck func() bool)
 // runHealthChecks periodically health-checks all servers. Any servers that fail the health-check will have their
 // connections closed, to force clients to switch over to a healthy server.
 func (lb *LoadBalancer) runHealthChecks(ctx context.Context) {
+	previousStatus := map[string]bool{}
 	wait.Until(func() {
 		lb.mutex.RLock()
 		defer lb.mutex.RUnlock()
-		for _, server := range lb.servers {
-			if !server.healthCheck() {
+		for address, server := range lb.servers {
+			status := server.healthCheck()
+			if status == false && previousStatus[address] == true {
+				// Only close connections when the server transitions from healthy to unhealthy;
+				// we don't want to re-close all the connections every time as we might be ignoring
+				// health checks due to all servers being marked unhealthy.
 				defer server.closeAll()
 			}
+			previousStatus[address] = status
 		}
 	}, time.Second, ctx.Done())
 	logrus.Debugf("Stopped health checking for load balancer %s", lb.serviceName)
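The check above is edge-triggered: connections are closed only when a server goes from healthy to unhealthy, tracked in a previousStatus map that persists across the once-per-second wait.Until passes. A minimal sketch of that pattern, with an illustrative server type and closeAll callback that are not the k3s API:

// Sketch only: close a backend's connections once per healthy-to-unhealthy
// transition, tracked with a previous-status map keyed by address.
package main

import "fmt"

type server struct {
	healthy  bool   // result of the latest health check
	closeAll func() // closes all connections pinned to this server
}

// sweep runs one health-check pass, closing connections only for servers
// that just went from healthy to unhealthy.
func sweep(servers map[string]*server, previousStatus map[string]bool) {
	for address, s := range servers {
		status := s.healthy
		if !status && previousStatus[address] {
			s.closeAll()
		}
		previousStatus[address] = status
	}
}

func main() {
	previous := map[string]bool{}
	srv := &server{healthy: true, closeAll: func() { fmt.Println("closing connections") }}
	servers := map[string]*server{"10.0.0.1:6443": srv}

	sweep(servers, previous) // healthy: nothing closed
	srv.healthy = false
	sweep(servers, previous) // transition: connections closed once
	sweep(servers, previous) // still unhealthy: not closed again
}

Without the transition check, the fallback in dialContext would be undone every second: connections dialed with health checks ignored would be closed again on the next pass while the checks were still failing.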
pkg/etcd/etcdproxy.go (1 addition, 1 deletion)
@@ -130,7 +130,7 @@ func (e etcdproxy) createHealthCheck(ctx context.Context, address string) func()
 			statusCode = resp.StatusCode
 		}
 		if err != nil || statusCode != http.StatusOK {
-			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", url, err, statusCode)
+			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", address, err, statusCode)
 			connected = false
 		} else {
 			connected = true
