diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
index be6d7bc2320e1..37d56a06ded7f 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
@@ -128,6 +128,7 @@ <h4>Executors</h4>
           <th>Shuffle Write</th>
           <th>Logs</th>
           <th>Thread Dump</th>
+          <th>Exec Loss Reason</th>
         </tr>
         </thead>
         <tbody>
diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
index 2055c8ff11882..5cc2868c62534 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
@@ -31,6 +31,14 @@ function getThreadDumpEnabled() {
   return threadDumpEnabled;
 }
 
+function formatLossReason(removeReason, type, row) {
+  if (removeReason) {
+    return removeReason
+  } else {
+    return ""
+  }
+}
+
 function formatStatus(status, type, row) {
   if (row.isExcluded) {
     return "Excluded";
@@ -132,7 +140,7 @@ function totalDurationColor(totalGCTime, totalDuration) {
 }
 
 var sumOptionalColumns = [3, 4];
-var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14];
+var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14, 15];
 var execDataTable;
 var sumDataTable;
 
@@ -543,6 +551,10 @@ $(document).ready(function () {
           data: 'id', render: function (data, type) {
             return type === 'display' ? ("<a href='threadDump/?executorId=" + data + "'>Thread Dump</a>" ) : data;
           }
+        },
+        {
+          data: 'removeReason',
+          render: formatLossReason
         }
       ],
       "order": [[0, "asc"]],
@@ -709,6 +721,7 @@ $(document).ready(function () {
       "<div><input type='checkbox' class='toggle-vis' data-exec-col-idx='10'> Peak Pool Memory Direct / Mapped</div>" +
       "<div><input type='checkbox' class='toggle-vis' data-exec-col-idx='13'> Resources</div>" +
       "<div><input type='checkbox' class='toggle-vis' data-exec-col-idx='14'> Resource Profile Id</div>" +
+      "<div><input type='checkbox' class='toggle-vis' data-exec-col-idx='15'> Exec Loss Reason</div>" +
       "</div>");
 
     reselectCheckboxesBasedOnTaskTableState();
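Review note: the UI side is driven by the `removeReason` field that the executor REST payload carries; the new column, `formatLossReason` renderer, and toggle checkbox just surface it. A quick way to confirm the field is flowing end-to-end before debugging the DataTables wiring is to hit the status API directly. A minimal sketch, assuming a driver UI on localhost:4040 and a placeholder application id (both assumptions, not part of this patch):

```scala
import scala.io.Source

// Fetches the executors payload and reports whether removeReason is present.
// "allexecutors" also lists executors that have already been removed, which
// is where a loss reason would show up.
object RemoveReasonCheck {
  def main(args: Array[String]): Unit = {
    val appId = args.headOption.getOrElse("app-00000000000000-0000") // placeholder
    val url = s"http://localhost:4040/api/v1/applications/$appId/allexecutors"
    val body = Source.fromURL(url).mkString
    val present = body.contains("\"removeReason\"")
    println(s"removeReason present: $present")
  }
}
```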
" + ""); reselectCheckboxesBasedOnTaskTableState(); diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala index 67e8f7ea4c880..e255de4d2dd9e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala @@ -221,10 +221,17 @@ private[spark] class ExecutorPodsLifecycleManager( val pod = podState.pod val reason = Option(pod.getStatus.getReason) val message = Option(pod.getStatus.getMessage) + val explained = describeExitCode(exitCode) + val exitMsg = s"The executor with id $execId exited with exit code $explained." + val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}") + val msgStr = message.map(m => s"The API gave the following message: ${m}") + + s""" - |The executor with id $execId exited with exit code $exitCode. - |The API gave the following brief reason: ${reason.getOrElse("N/A")} - |The API gave the following message: ${message.getOrElse("N/A")} + |${exitMsg} + |${reasonStr.getOrElse("")} + |${msgStr.getOrElse("")} + | |The API gave the following container statuses: | |${containersDescription(pod)} @@ -246,4 +253,25 @@ private[spark] class ExecutorPodsLifecycleManager( private object ExecutorPodsLifecycleManager { val UNKNOWN_EXIT_CODE = -1 + + // A utility function to try and help people figure out whats gone wrong faster. + def describeExitCode(code: Int): String = { + val humanStr = code match { + case 0 => "(success)" + case 1 => "(generic, look at logs to clarify)" + case 42 => "(Douglas Adams fan)" + // Spark specific + case 10 | 50 => "(Uncaught exception)" + case 52 => "(JVM OOM)" + case 53 => "(DiskStore failed to create temp dir)" + // K8s & JVM specific exit codes + case 126 => "(not executable - possibly perm or arch)" + case 137 => "(SIGKILL, possible container OOM)" + case 139 => "(SIGSEGV: that's unexpected)" + case 255 => "(exit-1, your guess is as good as mine)" + case _ => "(unexpected)" + } + s"${code}${humanStr}" + } + } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index d5a4856d3793e..5dad6a3b7622b 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -66,9 +66,7 @@ private[spark] class KubernetesClusterSchedulerBackend( // Allow removeExecutor to be accessible by ExecutorPodsLifecycleEventHandler private[k8s] def doRemoveExecutor(executorId: String, reason: ExecutorLossReason): Unit = { - if (isExecutorActive(executorId)) { - removeExecutor(executorId, reason) - } + removeExecutor(executorId, reason) } private def setUpExecutorConfigMap(driverPod: Option[Pod]): Unit = { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala 
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
index d84d28f98c63c..e3ec53adef6ab 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
@@ -69,7 +69,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     val failedPod = failedExecutorWithoutDeletion(1)
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName)).delete()
@@ -81,7 +81,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     snapshotsStore.notifySubscribers()
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend, times(1)).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName), times(2)).delete()
@@ -114,7 +114,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     eventHandlerUnderTest.conf.set(Config.KUBERNETES_DELETE_EXECUTORS, false)
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName), never()).delete()
@@ -126,13 +126,20 @@
     assert(pod.getMetadata().getLabels().get(SPARK_EXECUTOR_INACTIVE_LABEL) === "true")
   }
 
-  private def exitReasonMessage(failedExecutorId: Int, failedPod: Pod): String = {
+  private def exitReasonMessage(execId: Int, failedPod: Pod, exitCode: Int): String = {
     val reason = Option(failedPod.getStatus.getReason)
     val message = Option(failedPod.getStatus.getMessage)
+    val explained = ExecutorPodsLifecycleManager.describeExitCode(exitCode)
+    val exitMsg = s"The executor with id $execId exited with exit code $explained."
+    val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
+    val msgStr = message.map(m => s"The API gave the following message: ${m}")
+
+
     s"""
-       |The executor with id $failedExecutorId exited with exit code 1.
-       |The API gave the following brief reason: ${reason.getOrElse("N/A")}
-       |The API gave the following message: ${message.getOrElse("N/A")}
+       |${exitMsg}
+       |${reasonStr.getOrElse("")}
+       |${msgStr.getOrElse("")}
+       |
        |The API gave the following container statuses:
        |
        |${containersDescription(failedPod)}
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
index c4b878df8be49..5dd84e853620e 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
@@ -158,10 +158,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn
     backend.start()
 
     backend.doRemoveExecutor("1", ExecutorKilled)
-    verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+    verify(driverEndpointRef).send(RemoveExecutor("1", ExecutorKilled))
 
     backend.doRemoveExecutor("2", ExecutorKilled)
-    verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+    verify(driverEndpointRef).send(RemoveExecutor("2", ExecutorKilled))
   }
 
   test("Kill executors") {
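Review note: one behavioral nuance of the message change above is that when the Kubernetes API supplies no status reason or message, the old format printed literal "N/A" lines, while the `Option`-based assembly now omits them, leaving blank lines. A standalone sketch of the new layout with hypothetical values (the `explained` string is hard-coded to what `describeExitCode(137)` returns; `.stripMargin` is assumed to be applied outside the hunk shown):

```scala
// Standalone mock-up of the message assembly in the ExecutorPodsLifecycleManager
// hunk above; all values are hypothetical.
object ExitMessageSketch extends App {
  val execId = "1"
  val reason: Option[String] = None                       // API supplied no brief reason
  val message: Option[String] = Some("Pod was OOMKilled") // but did supply a message

  val explained = "137(SIGKILL, possible container OOM)"  // describeExitCode(137)
  val exitMsg = s"The executor with id $execId exited with exit code $explained."
  val reasonStr = reason.map(r => s"The API gave the following brief reason: $r")
  val msgStr = message.map(m => s"The API gave the following message: $m")

  println(
    s"""
       |$exitMsg
       |${reasonStr.getOrElse("")}
       |${msgStr.getOrElse("")}
       |""".stripMargin)
}
```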