Fix issue caused by sole server marked as failed under load
If health checks are failing for all servers, make a second pass through the server list with health checks ignored before returning failure.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
brandond committed May 29, 2024
1 parent ed23a2b commit ca39614
Showing 3 changed files with 16 additions and 5 deletions.
pkg/agent/loadbalancer/loadbalancer.go (7 additions, 2 deletions)
@@ -158,14 +158,15 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 	lb.mutex.RLock()
 	defer lb.mutex.RUnlock()
 
+	var allChecksFailed bool
 	startIndex := lb.nextServerIndex
 	for {
 		targetServer := lb.currentServerAddress
 
 		server := lb.servers[targetServer]
 		if server == nil || targetServer == "" {
 			logrus.Debugf("Nil server for load balancer %s: %s", lb.serviceName, targetServer)
-		} else if server.healthCheck() {
+		} else if allChecksFailed || server.healthCheck() {
 			conn, err := server.dialContext(ctx, network, targetServer)
 			if err == nil {
 				return conn, nil
@@ -189,7 +190,11 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 			startIndex = maxIndex
 		}
 		if lb.nextServerIndex == startIndex {
-			return nil, errors.New("all servers failed")
+			if allChecksFailed {
+				return nil, errors.New("all servers failed")
+			}
+			logrus.Debugf("Health checks for all servers in load balancer %s have failed: retrying with health checks ignored", lb.serviceName)
+			allChecksFailed = true
 		}
 	}
 }
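For context, here is a minimal standalone sketch of the two-pass dial behavior these hunks introduce. The Server type, dialAny, and the fake backends in main are illustrative stand-ins for this sketch, not the actual k3s types:

// Sketch only: a dial loop that prefers healthy servers but makes a second
// pass with health checks ignored before reporting total failure.
package main

import (
	"context"
	"errors"
	"fmt"
	"net"
)

// Server is a simplified stand-in for a load balancer backend.
type Server struct {
	Address     string
	HealthCheck func() bool
	Dial        func(ctx context.Context) (net.Conn, error)
}

// dialAny tries healthy servers first; if every health check is failing,
// it retries all servers with health checks ignored before giving up.
func dialAny(ctx context.Context, servers []Server) (net.Conn, error) {
	for _, ignoreChecks := range []bool{false, true} {
		for _, s := range servers {
			if !ignoreChecks && !s.HealthCheck() {
				continue // skip unhealthy servers on the first pass only
			}
			if conn, err := s.Dial(ctx); err == nil {
				return conn, nil
			}
		}
	}
	return nil, errors.New("all servers failed")
}

func main() {
	// Both backends fail their health checks (e.g. the checker is wedged
	// under load), but the second pass dials them anyway and "b" succeeds.
	servers := []Server{
		{Address: "a", HealthCheck: func() bool { return false },
			Dial: func(ctx context.Context) (net.Conn, error) { return nil, errors.New("connection refused") }},
		{Address: "b", HealthCheck: func() bool { return false },
			Dial: func(ctx context.Context) (net.Conn, error) { c, _ := net.Pipe(); return c, nil }},
	}
	conn, err := dialAny(context.Background(), servers)
	fmt.Println(conn != nil, err) // true <nil>
}

In the committed change the second pass is driven by the allChecksFailed flag inside the existing retry loop rather than an outer loop, but the effect is the same: a sole server whose health check is failing under load can still be dialed before the load balancer gives up.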
pkg/agent/loadbalancer/servers.go (8 additions, 2 deletions)
@@ -227,13 +227,19 @@ func (lb *LoadBalancer) SetHealthCheck(address string, healthCheck func() bool)
 // runHealthChecks periodically health-checks all servers. Any servers that fail the health-check will have their
 // connections closed, to force clients to switch over to a healthy server.
 func (lb *LoadBalancer) runHealthChecks(ctx context.Context) {
+	previousStatus := map[string]bool{}
 	wait.Until(func() {
 		lb.mutex.RLock()
 		defer lb.mutex.RUnlock()
-		for _, server := range lb.servers {
-			if !server.healthCheck() {
+		for address, server := range lb.servers {
+			status := server.healthCheck()
+			if status == false && previousStatus[address] == true {
+				// Only close connections when the server transitions from healthy to unhealthy;
+				// we don't want to re-close all the connections every time as we might be ignoring
+				// health checks due to all servers being marked unhealthy.
 				defer server.closeAll()
 			}
+			previousStatus[address] = status
 		}
 	}, time.Second, ctx.Done())
 	logrus.Debugf("Stopped health checking for load balancer %s", lb.serviceName)
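The check above is edge-triggered: connections are closed only when a server goes from healthy to unhealthy, tracked in a previousStatus map that persists across the once-per-second wait.Until passes. A minimal sketch of that pattern, with an illustrative server type and closeAll callback that are not the k3s API:

// Sketch only: close a backend's connections once per healthy-to-unhealthy
// transition, tracked with a previous-status map keyed by address.
package main

import "fmt"

type server struct {
	healthy  bool   // result of the latest health check
	closeAll func() // closes all connections pinned to this server
}

// sweep runs one health-check pass, closing connections only for servers
// that just went from healthy to unhealthy.
func sweep(servers map[string]*server, previousStatus map[string]bool) {
	for address, s := range servers {
		status := s.healthy
		if !status && previousStatus[address] {
			s.closeAll()
		}
		previousStatus[address] = status
	}
}

func main() {
	previous := map[string]bool{}
	srv := &server{healthy: true, closeAll: func() { fmt.Println("closing connections") }}
	servers := map[string]*server{"10.0.0.1:6443": srv}

	sweep(servers, previous) // healthy: nothing closed
	srv.healthy = false
	sweep(servers, previous) // transition: connections closed once
	sweep(servers, previous) // still unhealthy: not closed again
}

Without the transition check, the fallback in dialContext would be undone every second: connections dialed with health checks ignored would be closed again on the next pass while the checks were still failing.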
pkg/etcd/etcdproxy.go (1 addition, 1 deletion)
@@ -130,7 +130,7 @@ func (e etcdproxy) createHealthCheck(ctx context.Context, address string) func()
 			statusCode = resp.StatusCode
 		}
 		if err != nil || statusCode != http.StatusOK {
-			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", url, err, statusCode)
+			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", address, err, statusCode)
 			connected = false
 		} else {
 			connected = true
