Skip to content

Commit

Permalink
Diagnostic info and metrics for Docker command failures and timeouts (#…
Browse files Browse the repository at this point in the history
…4070)

This change improves diagnostic information for failing Docker commands as well as timed out Docker commands:

* For all failures (including timeouts), a textual representation for exit status values is logged.
* Timeouts are explicitly detected and reported using a specialized exception allowing for a better timeout handling on higher layers of the Docker container implementation.
* Emit counter metric on Docker command timeout

This change introduces a set of new counter metrics that are emitted if a Docker command is terminated because of a timeout. A high number of such timeout occurrences is usually an indication for highly loaded invokers. The new metrics help to identify such invokers.
  • Loading branch information
sven-lange-last authored and chetanmeh committed Oct 26, 2018
1 parent a8addf6 commit 996d001
Show file tree
Hide file tree
Showing 8 changed files with 174 additions and 39 deletions.
3 changes: 3 additions & 0 deletions common/scala/src/main/scala/whisk/common/Logging.scala
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ object LoggingMarkers {
val finish = "finish"
val error = "error"
val count = "count"
val timeout = "timeout"

private val controller = "controller"
private val invoker = "invoker"
Expand Down Expand Up @@ -296,6 +297,8 @@ object LoggingMarkers {
// Time in invoker
val INVOKER_ACTIVATION = LogMarkerToken(invoker, activation, start)
def INVOKER_DOCKER_CMD(cmd: String) = LogMarkerToken(invoker, "docker", start, Some(cmd), Map("cmd" -> cmd))
def INVOKER_DOCKER_CMD_TIMEOUT(cmd: String) =
LogMarkerToken(invoker, "docker", timeout, Some(cmd), Map("cmd" -> cmd))
def INVOKER_RUNC_CMD(cmd: String) = LogMarkerToken(invoker, "runc", start, Some(cmd), Map("cmd" -> cmd))
def INVOKER_KUBECTL_CMD(cmd: String) = LogMarkerToken(invoker, "kubectl", start, Some(cmd), Map("cmd" -> cmd))
def INVOKER_CONTAINER_START(containerState: String) =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ import scala.util.Success
import scala.util.Try
import akka.event.Logging.{ErrorLevel, InfoLevel}
import pureconfig.loadConfigOrThrow
import whisk.common.Logging
import whisk.common.LoggingMarkers
import whisk.common.TransactionId
import whisk.common.{Logging, LoggingMarkers, MetricEmitter, TransactionId}
import whisk.core.ConfigKeys
import whisk.core.containerpool.ContainerId
import whisk.core.containerpool.ContainerAddress
Expand Down Expand Up @@ -127,11 +125,11 @@ class DockerClient(dockerHost: Option[String] = None,
.map(ContainerId.apply)
.recoverWith {
// https://docs.docker.com/v1.12/engine/reference/run/#/exit-status
// Exit code 125 means an error reported by the Docker daemon.
// Exit status 125 means an error reported by the Docker daemon.
// Examples:
// - Unrecognized option specified
// - Not enough disk space
case pre: ProcessRunningException if pre.exitCode == 125 =>
case pre: ProcessUnsuccessfulException if pre.exitStatus == ExitStatus(125) =>
Future.failed(
DockerContainerId
.parse(pre.stdout)
Expand Down Expand Up @@ -189,6 +187,9 @@ class DockerClient(dockerHost: Option[String] = None,
logLevel = InfoLevel)
executeProcess(cmd, timeout).andThen {
case Success(_) => transid.finished(this, start)
case Failure(pte: ProcessTimeoutException) =>
transid.failed(this, start, pte.getMessage, ErrorLevel)
MetricEmitter.emitCounterMetric(LoggingMarkers.INVOKER_DOCKER_CMD_TIMEOUT(args.head))
case Failure(t) => transid.failed(this, start, t.getMessage, ErrorLevel)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ trait ProcessRunner {
* Runs the specified command with arguments asynchronously and
* capture stdout as well as stderr.
*
* If not set to infinite, after timeout is reached the process is killed.
* If not set to infinite, after timeout is reached the process is terminated.
*
* Be cautious with the execution context you pass because the command
* is blocking.
Expand All @@ -52,17 +52,87 @@ trait ProcessRunner {
case _ => None
}

(process.exitValue(), out.mkString("\n"), err.mkString("\n"), scheduled)
(ExitStatus(process.exitValue()), out.mkString("\n"), err.mkString("\n"), scheduled)
}).flatMap {
case (0, stdout, _, scheduled) =>
case (ExitStatus(0), stdout, _, scheduled) =>
scheduled.foreach(_.cancel())
Future.successful(stdout)
case (code, stdout, stderr, scheduled) =>
case (exitStatus, stdout, stderr, scheduled) =>
scheduled.foreach(_.cancel())
Future.failed(ProcessRunningException(code, stdout, stderr))
timeout match {
case _: FiniteDuration if exitStatus.terminatedBySIGTERM =>
Future.failed(ProcessTimeoutException(timeout, exitStatus, stdout, stderr))
case _ => Future.failed(ProcessUnsuccessfulException(exitStatus, stdout, stderr))
}
}
}

object ExitStatus {
// Based on The Open Group Base Specifications Issue 7, 2018 edition:
// Shell & Utilities - Shell Command Language - 2.8.2 Exit Status for Commands
// http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_08_02
val STATUS_SUCCESSFUL = 0
val STATUS_NOT_EXECUTABLE = 126
val STATUS_NOT_FOUND = 127
// When a command is stopped by a signal, the exit status is 128 + signal numer
val STATUS_SIGNAL = 128

// Based on The Open Group Base Specifications Issue 7, 2018 edition:
// Shell & Utilities - Utilities - kill
// http://pubs.opengroup.org/onlinepubs/9699919799/utilities/kill.html
val SIGHUP = 1
val SIGINT = 2
val SIGQUIT = 3
val SIGABRT = 6
val SIGKILL = 9
val SIGALRM = 14
val SIGTERM = 15
}

case class ExitStatus(statusValue: Int) {

import ExitStatus._

override def toString(): String = {
def signalAsString(signal: Int): String = {
signal match {
case SIGHUP => "SIGHUP"
case SIGINT => "SIGINT"
case SIGQUIT => "SIGQUIT"
case SIGABRT => "SIGABRT"
case SIGKILL => "SIGKILL"
case SIGALRM => "SIGALRM"
case SIGTERM => "SIGTERM"
case _ => signal.toString
}
}

val detail = statusValue match {
case STATUS_SUCCESSFUL => "successful"
case STATUS_NOT_EXECUTABLE => "not executable"
case STATUS_NOT_FOUND => "not found"
case _ if statusValue >= ExitStatus.STATUS_SIGNAL =>
"terminated by signal " + signalAsString(statusValue - ExitStatus.STATUS_SIGNAL)
case _ => "unsuccessful"
}

s"$statusValue ($detail)"
}

val successful = statusValue == ExitStatus.STATUS_SUCCESSFUL
val terminatedBySIGTERM = (statusValue - ExitStatus.STATUS_SIGNAL) == ExitStatus.SIGTERM
}

case class ProcessRunningException(exitCode: Int, stdout: String, stderr: String)
extends Exception(s"code: $exitCode ${if (exitCode == 143) "(killed)" else ""}, stdout: $stdout, stderr: $stderr")
abstract class ProcessRunningException(info: String, val exitStatus: ExitStatus, val stdout: String, val stderr: String)
extends Exception(s"info: $info, code: $exitStatus, stdout: $stdout, stderr: $stderr")

case class ProcessUnsuccessfulException(override val exitStatus: ExitStatus,
override val stdout: String,
override val stderr: String)
extends ProcessRunningException("command was unsuccessful", exitStatus, stdout, stderr)

case class ProcessTimeoutException(timeout: Duration,
override val exitStatus: ExitStatus,
override val stdout: String,
override val stderr: String)
extends ProcessRunningException(s"command was terminated, took longer than $timeout", exitStatus, stdout, stderr)
10 changes: 8 additions & 2 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,34 +182,40 @@ Metrics below are for invoker state as recorded within load balancer monitoring.

Following metrics capture stats around various docker command executions.

* Pause
* pause
* `openwhisk.counter.invoker_docker.pause_start`
* `openwhisk.counter.invoker_docker.pause_error`
* `openwhisk.counter.invoker_docker.pause_timeout`
* `openwhisk.histogram.invoker_docker.pause_finish`
* `openwhisk.histogram.invoker_docker.pause_error`
* Ps
* ps
* `openwhisk.counter.invoker_docker.ps_start`
* `openwhisk.counter.invoker_docker.ps_error`
* `openwhisk.counter.invoker_docker.ps_timeout`
* `openwhisk.histogram.invoker_docker.ps_finish`
* `openwhisk.histogram.invoker_docker.ps_error`
* pull
* `openwhisk.counter.invoker_docker.pull_start`
* `openwhisk.counter.invoker_docker.pull_error`
* `openwhisk.counter.invoker_docker.pull_timeout`
* `openwhisk.histogram.invoker_docker.pull_finish`
* `openwhisk.histogram.invoker_docker.pull_error`
* rm
* `openwhisk.counter.invoker_docker.rm_start`
* `openwhisk.counter.invoker_docker.rm_error`
* `openwhisk.counter.invoker_docker.rm_timeout`
* `openwhisk.histogram.invoker_docker.rm_finish`
* `openwhisk.histogram.invoker_docker.rm_error`
* run
* `openwhisk.counter.invoker_docker.run_start`
* `openwhisk.counter.invoker_docker.run_error`
* `openwhisk.counter.invoker_docker.run_timeout`
* `openwhisk.histogram.invoker_docker.run_finish`
* `openwhisk.histogram.invoker_docker.run_error`
* unpause
* `openwhisk.counter.invoker_docker.unpause_start`
* `openwhisk.counter.invoker_docker.unpause_error`
* `openwhisk.counter.invoker_docker.unpause_timeout`
* `openwhisk.histogram.invoker_docker.unpause_finish`
* `openwhisk.histogram.invoker_docker.unpause_error`

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package whisk.core.containerpool.docker.test

import akka.actor.ActorSystem

import java.util.concurrent.Semaphore

import scala.concurrent.Await
Expand All @@ -41,10 +40,7 @@ import whisk.common.LoggingMarkers.INVOKER_DOCKER_CMD
import whisk.common.TransactionId
import whisk.core.containerpool.ContainerAddress
import whisk.core.containerpool.ContainerId
import whisk.core.containerpool.docker.BrokenDockerContainer
import whisk.core.containerpool.docker.DockerClient
import whisk.core.containerpool.docker.DockerContainerId
import whisk.core.containerpool.docker.ProcessRunningException
import whisk.core.containerpool.docker._
import whisk.utils.retry

@RunWith(classOf[JUnitRunner])
Expand Down Expand Up @@ -246,7 +242,7 @@ class DockerClientTests
runCmdCount += 1
println(s"runCmdCount=${runCmdCount}, args.last=${args.last}")
runCmdCount match {
case 1 => Future.failed(ProcessRunningException(1, "", ""))
case 1 => Future.failed(ProcessUnsuccessfulException(ExitStatus(1), "", ""))
case 2 => Future.successful(secondContainerId.asString)
case _ => Future.failed(new Throwable())
}
Expand Down Expand Up @@ -337,11 +333,11 @@ class DockerClientTests
runAndVerify(dc.pull("image"), "pull")
}

it should "fail with BrokenDockerContainer when run returns with exit code 125 and a container ID" in {
it should "fail with BrokenDockerContainer when run returns with exit status 125 and a container ID" in {
val dc = dockerClient {
Future.failed(
ProcessRunningException(
exitCode = 125,
ProcessUnsuccessfulException(
exitStatus = ExitStatus(125),
stdout = id.asString,
stderr =
"""/usr/bin/docker: Error response from daemon: mkdir /var/run/docker.1.1/libcontainerd.1.1/55db56ee082239428b27d3728b4dd324c09068458aad9825727d5bfc1bba6d52: no space left on device."""))
Expand All @@ -350,19 +346,44 @@ class DockerClientTests
bdc.id shouldBe id
}

it should "fail with ProcessRunningException when run returns with exit code !=125 or no container ID" in {
it should "fail with ProcessRunningException when run returns with exit code !=125, no container ID or timeout" in {
def runAndVerify(pre: ProcessRunningException, clue: String) = {
val dc = dockerClient { Future.failed(pre) }
withClue(s"${clue} - exitCode = ${pre.exitCode}, stdout = '${pre.stdout}', stderr = '${pre.stderr}': ") {
withClue(s"${clue} - exitStatus = ${pre.exitStatus}, stdout = '${pre.stdout}', stderr = '${pre.stderr}': ") {
the[ProcessRunningException] thrownBy await(dc.run("image", Seq.empty)) shouldBe pre
}
}

Seq[(ProcessRunningException, String)](
(ProcessRunningException(126, id.asString, "Unknown command"), "Exit code not 125"),
(ProcessRunningException(125, "", "Unknown flag: --foo"), "No container ID"),
(ProcessRunningException(1, "", ""), "Exit code not 125 and no container ID")).foreach {
case (pre, clue) => runAndVerify(pre, clue)
(ProcessUnsuccessfulException(ExitStatus(127), id.asString, "Unknown command"), "Exit code not 125"),
(ProcessUnsuccessfulException(ExitStatus(125), "", "Unknown flag: --foo"), "No container ID"),
(ProcessUnsuccessfulException(ExitStatus(1), "", ""), "Exit code not 125 and no container ID"),
(ProcessTimeoutException(1.second, ExitStatus(125), id.asString, ""), "Timeout instead of unsuccessful command"))
.foreach {
case (pre, clue) => runAndVerify(pre, clue)
}
}

it should "fail with ProcessTimeoutException when command times out" in {
val expectedPTE =
ProcessTimeoutException(timeout = 10.seconds, exitStatus = ExitStatus(143), stdout = "stdout", stderr = "stderr")
val dc = dockerClient {
Future.failed(expectedPTE)
}
Seq[(Future[_], String)](
(dc.run("image", Seq.empty), "run"),
(dc.inspectIPAddress(id, "network"), "inspectIPAddress"),
(dc.pause(id), "pause"),
(dc.unpause(id), "unpause"),
(dc.rm(id), "rm"),
(dc.ps(), "ps"),
(dc.pull("image"), "pull"),
(dc.isOomKilled(id), "isOomKilled"))
.foreach {
case (cmd, clue) =>
withClue(s"command '$clue' - ") {
the[ProcessTimeoutException] thrownBy await(cmd) shouldBe expectedPTE
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ class DockerContainerTests
override def run(image: String,
args: Seq[String] = Seq.empty[String])(implicit transid: TransactionId): Future[ContainerId] = {
runs += ((image, args))
Future.failed(ProcessRunningException(1, "", ""))
Future.failed(ProcessUnsuccessfulException(ExitStatus(1), "", ""))
}
}
implicit val runc = stub[RuncApi]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,12 @@ import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

import scala.concurrent.ExecutionContext.Implicits.global
import whisk.core.containerpool.docker.ProcessRunner
import whisk.core.containerpool.docker._

import scala.concurrent.ExecutionContext
import scala.concurrent.duration._
import scala.concurrent.Await
import org.scalatest.Matchers
import whisk.core.containerpool.docker.ProcessRunningException

import scala.language.reflectiveCalls // Needed to invoke run() method of structural ProcessRunner extension

Expand All @@ -55,18 +54,53 @@ class ProcessRunnerTests extends FlatSpec with Matchers with WskActorSystem {
}

it should "run an external command unsuccessfully and capture its output" in {
val exitCode = 1
val exitStatus = ExitStatus(1)
val stdout = "Output"
val stderr = "Error"

val future = processRunner.run(Seq("/bin/sh", "-c", s"echo ${stdout}; echo ${stderr} 1>&2; exit ${exitCode}"))
val future =
processRunner.run(Seq("/bin/sh", "-c", s"echo ${stdout}; echo ${stderr} 1>&2; exit ${exitStatus.statusValue}"))

the[ProcessRunningException] thrownBy await(future) shouldBe ProcessRunningException(exitCode, stdout, stderr)
val exception = the[ProcessRunningException] thrownBy await(future)
exception shouldBe ProcessUnsuccessfulException(exitStatus, stdout, stderr)
exception.getMessage should startWith("info: command was unsuccessful")
}

it should "terminate an external command after the specified timeout is reached" in {
val future = processRunner.run(Seq("sleep", "1"), 100.milliseconds)
val exception = the[ProcessRunningException] thrownBy await(future)
exception.exitCode shouldBe 143
val timeout = 100.milliseconds
// Run "sleep" command for 1 second and make sure that stdout and stderr are dropped
val future = processRunner.run(Seq("/bin/sh", "-c", "sleep 1 1>/dev/null 2>/dev/null"), timeout)
val exception = the[ProcessTimeoutException] thrownBy await(future)
exception shouldBe ProcessTimeoutException(timeout, ExitStatus(143), "", "")
exception.getMessage should startWith(s"info: command was terminated, took longer than $timeout")
}

behavior of "ExitStatus"

it should "provide a proper textual representation" in {
Seq[(Int, String)](
(0, "successful"),
(1, "unsuccessful"),
(125, "unsuccessful"),
(126, "not executable"),
(127, "not found"),
(128, "terminated by signal 0"),
(129, "terminated by signal SIGHUP"),
(130, "terminated by signal SIGINT"),
(131, "terminated by signal SIGQUIT"),
(134, "terminated by signal SIGABRT"),
(137, "terminated by signal SIGKILL"),
(142, "terminated by signal SIGALRM"),
(143, "terminated by signal SIGTERM"),
(144, "terminated by signal 16")).foreach {
case (statusValue, detailText) =>
ExitStatus(statusValue).toString shouldBe s"$statusValue ($detailText)"
}
}

it should "properly classify exit status" in {
withClue("Exit status 0 is successful - ") { ExitStatus(0).successful shouldBe true }
withClue("Exit status 1 is not successful - ") { ExitStatus(1).successful shouldBe false }
withClue("Exit status 143 means terminated by SIGTERM - ") { ExitStatus(143).terminatedBySIGTERM shouldBe true }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ class KubernetesContainerTests
env: Map[String, String] = Map.empty,
labels: Map[String, String] = Map.empty)(implicit transid: TransactionId): Future[KubernetesContainer] = {
runs += ((name, image, env, labels))
Future.failed(ProcessRunningException(1, "", ""))
Future.failed(ProcessUnsuccessfulException(ExitStatus(1), "", ""))
}
}

Expand Down

0 comments on commit 996d001

Please sign in to comment.