" +
"");
reselectCheckboxesBasedOnTaskTableState();
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala
index 67e8f7ea4c880..e255de4d2dd9e 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala
@@ -221,10 +221,17 @@ private[spark] class ExecutorPodsLifecycleManager(
val pod = podState.pod
val reason = Option(pod.getStatus.getReason)
val message = Option(pod.getStatus.getMessage)
+ val explained = describeExitCode(exitCode)
+ val exitMsg = s"The executor with id $execId exited with exit code $explained."
+ val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
+ val msgStr = message.map(m => s"The API gave the following message: ${m}")
+
+
s"""
- |The executor with id $execId exited with exit code $exitCode.
- |The API gave the following brief reason: ${reason.getOrElse("N/A")}
- |The API gave the following message: ${message.getOrElse("N/A")}
+ |${exitMsg}
+ |${reasonStr.getOrElse("")}
+ |${msgStr.getOrElse("")}
+ |
|The API gave the following container statuses:
|
|${containersDescription(pod)}
@@ -246,4 +253,25 @@ private[spark] class ExecutorPodsLifecycleManager(
private object ExecutorPodsLifecycleManager {
val UNKNOWN_EXIT_CODE = -1
+
+  // A utility function to try and help people figure out what's gone wrong faster.
+ def describeExitCode(code: Int): String = {
+ val humanStr = code match {
+ case 0 => "(success)"
+ case 1 => "(generic, look at logs to clarify)"
+ case 42 => "(Douglas Adams fan)"
+ // Spark specific
+ case 10 | 50 => "(Uncaught exception)"
+ case 52 => "(JVM OOM)"
+ case 53 => "(DiskStore failed to create temp dir)"
+ // K8s & JVM specific exit codes
+ case 126 => "(not executable - possibly perm or arch)"
+ case 137 => "(SIGKILL, possible container OOM)"
+ case 139 => "(SIGSEGV: that's unexpected)"
+ case 255 => "(exit-1, your guess is as good as mine)"
+ case _ => "(unexpected)"
+ }
+ s"${code}${humanStr}"
+ }
+
}
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
index d5a4856d3793e..5dad6a3b7622b 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
@@ -66,9 +66,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
// Allow removeExecutor to be accessible by ExecutorPodsLifecycleEventHandler
private[k8s] def doRemoveExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
- if (isExecutorActive(executorId)) {
- removeExecutor(executorId, reason)
- }
+ removeExecutor(executorId, reason)
}
private def setUpExecutorConfigMap(driverPod: Option[Pod]): Unit = {
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
index d84d28f98c63c..e3ec53adef6ab 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
@@ -69,7 +69,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
val failedPod = failedExecutorWithoutDeletion(1)
snapshotsStore.updatePod(failedPod)
snapshotsStore.notifySubscribers()
- val msg = exitReasonMessage(1, failedPod)
+ val msg = exitReasonMessage(1, failedPod, 1)
val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
verify(namedExecutorPods(failedPod.getMetadata.getName)).delete()
@@ -81,7 +81,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
snapshotsStore.notifySubscribers()
snapshotsStore.updatePod(failedPod)
snapshotsStore.notifySubscribers()
- val msg = exitReasonMessage(1, failedPod)
+ val msg = exitReasonMessage(1, failedPod, 1)
val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
verify(schedulerBackend, times(1)).doRemoveExecutor("1", expectedLossReason)
verify(namedExecutorPods(failedPod.getMetadata.getName), times(2)).delete()
@@ -114,7 +114,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
eventHandlerUnderTest.conf.set(Config.KUBERNETES_DELETE_EXECUTORS, false)
snapshotsStore.updatePod(failedPod)
snapshotsStore.notifySubscribers()
- val msg = exitReasonMessage(1, failedPod)
+ val msg = exitReasonMessage(1, failedPod, 1)
val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
verify(namedExecutorPods(failedPod.getMetadata.getName), never()).delete()
@@ -126,13 +126,20 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
assert(pod.getMetadata().getLabels().get(SPARK_EXECUTOR_INACTIVE_LABEL) === "true")
}
- private def exitReasonMessage(failedExecutorId: Int, failedPod: Pod): String = {
+ private def exitReasonMessage(execId: Int, failedPod: Pod, exitCode: Int): String = {
val reason = Option(failedPod.getStatus.getReason)
val message = Option(failedPod.getStatus.getMessage)
+ val explained = ExecutorPodsLifecycleManager.describeExitCode(exitCode)
+ val exitMsg = s"The executor with id $execId exited with exit code $explained."
+ val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
+ val msgStr = message.map(m => s"The API gave the following message: ${m}")
+
+
s"""
- |The executor with id $failedExecutorId exited with exit code 1.
- |The API gave the following brief reason: ${reason.getOrElse("N/A")}
- |The API gave the following message: ${message.getOrElse("N/A")}
+ |${exitMsg}
+ |${reasonStr.getOrElse("")}
+ |${msgStr.getOrElse("")}
+ |
|The API gave the following container statuses:
|
|${containersDescription(failedPod)}
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
index c4b878df8be49..5dd84e853620e 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
@@ -158,10 +158,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn
backend.start()
backend.doRemoveExecutor("1", ExecutorKilled)
- verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+ verify(driverEndpointRef).send(RemoveExecutor("1", ExecutorKilled))
backend.doRemoveExecutor("2", ExecutorKilled)
- verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+ verify(driverEndpointRef).send(RemoveExecutor("2", ExecutorKilled))
}
test("Kill executors") {