From 43048cbf07325475e7a8d96ddaf13f872942b3e4 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 16 Nov 2016 14:21:27 -0600 Subject: [PATCH 1/2] TS-5056 Implement nonrecoverable error mechanism Change `Emergency()` to terminate the current process with status code UNRECOVERABLE_EXIT. Also change traffic_manager to listen for the UNRECOVERABLE_EXIT status code. If heard, then TM will not try to restart TS from that point forward. This was designed so that traffic_server could call Emergency(..) in the event of a nonrecoverable error such as a bad config file. No amount of TS rebooting will fix a bad config, so we might as well have TM wait for human intervention. Note that if traffic_cop or traffic_manager calls Emergency(), nothing totally unexpected will happen since the only visible change from this patch is the status code. --- cmd/traffic_manager/traffic_manager.cc | 6 ++++- lib/ts/Diags.cc | 7 +++++- lib/ts/ink_error.cc | 31 +++++++++++++++++++++----- lib/ts/ink_error.h | 10 +++++++++ mgmt/LocalManager.cc | 11 +++++++++ mgmt/LocalManager.h | 1 + 6 files changed, 58 insertions(+), 8 deletions(-) diff --git a/cmd/traffic_manager/traffic_manager.cc b/cmd/traffic_manager/traffic_manager.cc index cf1e3861e1e..ca9928095ee 100644 --- a/cmd/traffic_manager/traffic_manager.cc +++ b/cmd/traffic_manager/traffic_manager.cc @@ -804,7 +804,7 @@ main(int argc, const char **argv) break; } - if (lmgmt->run_proxy && !lmgmt->processRunning()) { /* Make sure we still have a proxy up */ + if (lmgmt->run_proxy && !lmgmt->processRunning() && lmgmt->proxy_recoverable) { /* Make sure we still have a proxy up */ if (sleep_time) { mgmt_log("Relaunching proxy after %d sec...", sleep_time); millisleep(1000 * sleep_time); // we use millisleep instead of sleep because it doesnt interfere with signals @@ -819,6 +819,10 @@ main(int argc, const char **argv) just_started++; } } else { /* Give the proxy a chance to fire up */ + if (!lmgmt->proxy_recoverable) { + mgmt_log("[main] Proxy is un-recoverable. Proxy will not be relaunched.\n"); + } + just_started++; } diff --git a/lib/ts/Diags.cc b/lib/ts/Diags.cc index 85b3465876e..da8c5f2ebff 100644 --- a/lib/ts/Diags.cc +++ b/lib/ts/Diags.cc @@ -552,7 +552,12 @@ Diags::error_va(DiagsLevel level, const SourceLocation *loc, const char *format_ if (cleanup_func) { cleanup_func(); } - ink_fatal_va(format_string, ap2); + + // DL_Emergency means the process cannot recover from a reboot + if (level == DL_Emergency) + ink_emergency_va(format_string, ap2); + else + ink_fatal_va(format_string, ap2); } va_end(ap2); diff --git a/lib/ts/ink_error.cc b/lib/ts/ink_error.cc index 7d7bf1e4d5c..d2464013fd2 100644 --- a/lib/ts/ink_error.cc +++ b/lib/ts/ink_error.cc @@ -35,12 +35,12 @@ */ static void -fatal_va(const char *fmt, va_list ap) +fatal_va(const char *hdr, const char *fmt, va_list ap) { char msg[1024]; - const size_t len = sizeof("FATAL: ") - 1; + const size_t len = strlen(hdr); - strncpy(msg, "FATAL: ", sizeof(msg)); + strncpy(msg, hdr, sizeof(msg)); vsnprintf(msg + len, sizeof(msg) - len, fmt, ap); msg[sizeof(msg) - 1] = 0; @@ -51,7 +51,7 @@ fatal_va(const char *fmt, va_list ap) void ink_fatal_va(const char *fmt, va_list ap) { - fatal_va(fmt, ap); + fatal_va("Fatal: ", fmt, ap); ::exit(70); // 70 corresponds to EX_SOFTWARE in BSD's sysexits. As good a status as any. } @@ -61,19 +61,38 @@ ink_fatal(const char *message_format, ...) va_list ap; va_start(ap, message_format); - fatal_va(message_format, ap); + fatal_va("Fatal: ", message_format, ap); va_end(ap); ::exit(70); // 70 corresponds to EX_SOFTWARE in BSD's sysexits. As good a status as any. } +void +ink_emergency_va(const char *fmt, va_list ap) +{ + fatal_va("Emergency: ", fmt, ap); + ::exit(UNRECOVERABLE_EXIT); +} + +void +ink_emergency(const char *message_format, ...) +{ + va_list ap; + + va_start(ap, message_format); + ink_emergency_va(message_format, ap); + va_end(ap); + + ::exit(UNRECOVERABLE_EXIT); +} + void ink_abort(const char *message_format, ...) { va_list ap; va_start(ap, message_format); - fatal_va(message_format, ap); + fatal_va("Fatal: ", message_format, ap); va_end(ap); abort(); diff --git a/lib/ts/ink_error.h b/lib/ts/ink_error.h index d0b76515676..34309f9dec7 100644 --- a/lib/ts/ink_error.h +++ b/lib/ts/ink_error.h @@ -36,6 +36,16 @@ #include "ts/ink_platform.h" #include "ts/ink_apidefs.h" +// This magic exit code is used to signal that the crashing process cannot +// be recovered from a restart of said process +// +// Originally, this was intended to be used as a backchannel mechanism whereby +// traffic_server can tell traffic_manager via an exit code to stop trying to restart +// traffic_server b/c (for example) traffic_server has a bad config file +#define UNRECOVERABLE_EXIT 33 + +void ink_emergency_va(const char *fmt, va_list ap) TS_NORETURN; +void ink_emergency(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN; void ink_fatal_va(const char *message_format, va_list ap) TS_NORETURN; void ink_fatal(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN; void ink_abort(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN; diff --git a/mgmt/LocalManager.cc b/mgmt/LocalManager.cc index e28abc233a9..85839a09bfa 100644 --- a/mgmt/LocalManager.cc +++ b/mgmt/LocalManager.cc @@ -24,6 +24,7 @@ #include "ts/ink_platform.h" #include "ts/ink_sock.h" #include "ts/ink_file.h" +#include "ts/ink_error.h" #include "MgmtUtils.h" #include "ts/I_Layout.h" #include "LocalManager.h" @@ -185,6 +186,7 @@ LocalManager::LocalManager(bool proxy_on) : BaseManager(), run_proxy(proxy_on), syslog_facility = 0; ccom = nullptr; + proxy_recoverable = true; proxy_started_at = -1; proxy_launch_count = 0; manager_started_at = time(nullptr); @@ -493,6 +495,15 @@ LocalManager::pollMgmtProcessServer() if (WIFSIGNALED(estatus)) { int sig = WTERMSIG(estatus); mgmt_log("[LocalManager::pollMgmtProcessServer] Server Process terminated due to Sig %d: %s\n", sig, strsignal(sig)); + } else if (WIFEXITED(estatus)) { + int return_code = WEXITSTATUS(estatus); + + // traffic_server's exit code will be UNRECOVERABLE_EXIT if it calls + // ink_emergency() or ink_emergency_va(). The call signals that traffic_server + // cannot be recovered with a reboot. In other words, catastrophic failure. + if (return_code == UNRECOVERABLE_EXIT) { + proxy_recoverable = false; + } } if (lmgmt->run_proxy) { diff --git a/mgmt/LocalManager.h b/mgmt/LocalManager.h index c9e99ded321..d142324ca7b 100644 --- a/mgmt/LocalManager.h +++ b/mgmt/LocalManager.h @@ -91,6 +91,7 @@ class LocalManager : public BaseManager bool clusterOk(); volatile bool run_proxy; + volatile bool proxy_recoverable; // false if traffic_server cannot recover with a reboot volatile time_t manager_started_at; volatile time_t proxy_started_at; volatile int proxy_launch_count; From 40f071d8221fffff1282784279147997760764f1 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 28 Nov 2016 15:17:47 -0600 Subject: [PATCH 2/2] Add comment to clarify exit conditions --- lib/ts/ink_error.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/ts/ink_error.cc b/lib/ts/ink_error.cc index d2464013fd2..d38bafb2f71 100644 --- a/lib/ts/ink_error.cc +++ b/lib/ts/ink_error.cc @@ -81,6 +81,7 @@ ink_emergency(const char *message_format, ...) va_start(ap, message_format); ink_emergency_va(message_format, ap); + // Should never reach here since ink_emergency_va calls exit() va_end(ap); ::exit(UNRECOVERABLE_EXIT);