Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Be less agressive about marking home servers as zombie.

Instead of marking them as zombie as soon as ONE packet doesn't
receive a response, mark then as zombie when we have received NO
responses for (zombie_period / 4)
  • Loading branch information...
commit f8bcc0fe9eee3edd8669b1eccf19ede596175fa5 1 parent 3d7e1b2
@alandekok authored
Showing with 54 additions and 22 deletions.
  1. +1 −0  src/include/realms.h
  2. +46 −22 src/main/event.c
  3. +7 −0 src/main/realms.c
View
1  src/include/realms.h
@@ -53,6 +53,7 @@ typedef struct home_server {
int currently_outstanding;
int message_authenticator;
+ time_t last_packet;
struct timeval revive_time;
struct timeval zombie_period_start;
int zombie_period; /* unresponsive for T, mark it dead */
View
68 src/main/event.c
@@ -1083,33 +1083,54 @@ static void no_response_to_proxied_request(void *ctx)
/*
* Don't touch request due to race conditions
*/
- if (home->state == HOME_STATE_IS_DEAD) {
- rad_assert(home->ev != NULL); /* or it will never wake up */
- return;
- }
/*
- * Enable the zombie period when we notice that the home
- * server hasn't responded. We do NOT back-date the start
- * of the zombie period.
- */
- if (home->state == HOME_STATE_ALIVE) {
- home->state = HOME_STATE_ZOMBIE;
- home->zombie_period_start = now;
- fr_event_delete(el, &home->ev);
- home->currently_outstanding = 0;
- home->num_received_pings = 0;
-
- radlog(L_PROXY, "Marking home server %s port %d as zombie (it looks like it is dead).",
- inet_ntop(home->ipaddr.af, &home->ipaddr.ipaddr,
- buffer, sizeof(buffer)),
- home->port);
-
+ * If it's not alive, don't try to make it a zombie.
+ */
+ if (home->state != HOME_STATE_ALIVE) {
/*
- * Start pinging the home server.
+ * Don't check home->ev due to race conditions.
*/
- ping_home_server(home);
+ return;
}
+
+ /*
+ * We've received a real packet recently. Don't mark the
+ * server as zombie until we've received NO packets for a
+ * while. The "1/4" of zombie period was chosen rather
+ * arbitrarily. It's a balance between too short, which
+ * gives quick fail-over and fail-back, or too long,
+ * where the proxy still sends packets to an unresponsive
+ * home server.
+ */
+ if ((home->last_packet + ((home->zombie_period + 3) / 4)) >= now.tv_sec) {
+ return;
+ }
+
+ /*
+ * Enable the zombie period when we notice that the home
+ * server hasn't responded for a while. We back-date the
+ * zombie period to when we last received a response from
+ * the home server.
+ */
+ home->state = HOME_STATE_ZOMBIE;
+
+ home->zombie_period_start.tv_sec = home->last_packet;
+ home->zombie_period_start.tv_sec = USEC / 2;
+
+ fr_event_delete(el, &home->ev);
+ home->currently_outstanding = 0;
+ home->num_received_pings = 0;
+
+ radlog(L_PROXY, "Marking home server %s port %d as zombie (it looks like it is dead).",
+ inet_ntop(home->ipaddr.af, &home->ipaddr.ipaddr,
+ buffer, sizeof(buffer)),
+ home->port);
+
+ /*
+ * Start pinging the home server.
+ */
+ ping_home_server(home);
}
#endif
@@ -3064,8 +3085,11 @@ REQUEST *received_proxy_response(RADIUS_PACKET *packet)
* receive a packet? Setting this here means that we
* mark it alive on *any* packet, even if it's lost all
* of the *other* packets in the last 10s.
+ *
+ * This behavior could be configurable.
*/
request->home_server->state = HOME_STATE_ALIVE;
+ request->home_server->last_packet = now.tv_sec;
#ifdef WITH_COA
/*
View
7 src/main/realms.c
@@ -385,6 +385,13 @@ static int home_server_add(realm_config_t *rc, CONF_SECTION *cs, int pool_type)
home->name = name2;
home->cs = cs;
+ /*
+ * For zombie period calculations. We want to count
+ * zombies from the time when the server starts, instead
+ * of from 1970.
+ */
+ home->last_packet = time(NULL);
+
/*
* Authentication servers have a default "no_response_fail = 0".
* Accounting servers have a default "no_response_fail = 1".
Please sign in to comment.
Something went wrong with that request. Please try again.