-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[WIP][SPARK-32913][CORE][K8S] Improve ExecutorDecommissionInfo and ExecutorDecommissionState for different use cases #29788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,7 @@ | |
|
||
package org.apache.spark | ||
|
||
import org.apache.spark.scheduler.ExecutorDecommissionInfo | ||
import org.apache.spark.scheduler.ExecutorDecommissionReason | ||
|
||
/** | ||
* A client that communicates with the cluster manager to request or kill executors. | ||
|
@@ -88,44 +88,35 @@ private[spark] trait ExecutorAllocationClient { | |
* Default implementation delegates to kill, scheduler must override | ||
* if it supports graceful decommissioning. | ||
* | ||
* @param executorsAndDecomInfo identifiers of executors & decom info. | ||
* @param executorsAndDecomReason identifiers of executors & decom reason. | ||
* @param adjustTargetNumExecutors whether the target number of executors will be adjusted down | ||
* after these executors have been decommissioned. | ||
* @param triggeredByExecutor whether the decommission is triggered at executor. | ||
* @return the ids of the executors acknowledged by the cluster manager to be removed. | ||
*/ | ||
def decommissionExecutors( | ||
executorsAndDecomInfo: Array[(String, ExecutorDecommissionInfo)], | ||
adjustTargetNumExecutors: Boolean, | ||
triggeredByExecutor: Boolean): Seq[String] = { | ||
killExecutors(executorsAndDecomInfo.map(_._1), | ||
executorsAndDecomReason: Array[(String, ExecutorDecommissionReason)], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible that different executors have different There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is how it was earlier -- so we aren't changing the semantics save the renaming :-) And plus yes this can happen: Different executors on different hosts would have different ExecutorDecommissionReason/Info with different hosts potentially in them. This is simply a bulk api : Instead of making n calls we are folding them into one. |
||
adjustTargetNumExecutors: Boolean): Seq[String] = { | ||
killExecutors(executorsAndDecomReason.map(_._1), | ||
adjustTargetNumExecutors, | ||
countFailures = false) | ||
} | ||
|
||
|
||
/** | ||
* Request that the cluster manager decommission the specified executor. | ||
* Delegates to decommissionExecutors. | ||
* | ||
* @param executorId identifiers of executor to decommission | ||
* @param decommissionInfo information about the decommission (reason, host loss) | ||
* @param decomReason the decommission reason of the executor | ||
* @param adjustTargetNumExecutors if we should adjust the target number of executors. | ||
* @param triggeredByExecutor whether the decommission is triggered at executor. | ||
* (TODO: add a new type like `ExecutorDecommissionInfo` for the | ||
* case where executor is decommissioned at executor first, so we | ||
* don't need this extra parameter.) | ||
* @return whether the request is acknowledged by the cluster manager. | ||
*/ | ||
final def decommissionExecutor( | ||
executorId: String, | ||
decommissionInfo: ExecutorDecommissionInfo, | ||
adjustTargetNumExecutors: Boolean, | ||
triggeredByExecutor: Boolean = false): Boolean = { | ||
decomReason: ExecutorDecommissionReason, | ||
adjustTargetNumExecutors: Boolean): Boolean = { | ||
val decommissionedExecutors = decommissionExecutors( | ||
Array((executorId, decommissionInfo)), | ||
adjustTargetNumExecutors = adjustTargetNumExecutors, | ||
triggeredByExecutor = triggeredByExecutor) | ||
Array((executorId, decomReason)), | ||
adjustTargetNumExecutors = adjustTargetNumExecutors) | ||
decommissionedExecutors.nonEmpty && decommissionedExecutors(0).equals(executorId) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1826,7 +1826,7 @@ private[spark] class DAGScheduler( | |
val externalShuffleServiceEnabled = env.blockManager.externalShuffleServiceEnabled | ||
val isHostDecommissioned = taskScheduler | ||
.getExecutorDecommissionState(bmAddress.executorId) | ||
.exists(_.workerHost.isDefined) | ||
.exists(_.isHostDecommissioned) | ||
|
||
// Shuffle output of all executors on host `bmAddress.host` may be lost if: | ||
// - External shuffle service is enabled, so we assume that all shuffle data on node is | ||
|
@@ -2368,7 +2368,7 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler | |
case ExecutorLost(execId, reason) => | ||
val workerHost = reason match { | ||
case ExecutorProcessLost(_, workerHost, _) => workerHost | ||
case ExecutorDecommission(workerHost) => workerHost | ||
case ExecutorDecommission(_, host) => host | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See comment below for
You don't need to add an extra '_' then. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this is actually a good point! This is actually a rule of Databricks' Scala style guide. But I just followed the style of the above |
||
case _ => None | ||
} | ||
dagScheduler.handleExecutorLost(execId, workerHost) | ||
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.scheduler | ||
|
||
private[spark] sealed trait ExecutorDecommissionReason { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get why this is a sealed trait, namely we're pattern matching against it. But this seems to remove flexibility for anyone working on scheduler backends There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @holdenk, can you please provide an example of how having this as a sealed trait would limit the flexibility ? It is marked as a private[spark], so the resource manager specific scheduler backends, should be able to extend it ... no ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duh. Sorry for my n00bness. I can totally see why this shouldn't be a sealed trait: For example it is forcing the TestExecutorDecommissionInfo to be in this file. @Ngone51 is there a strong reason for making this be a sealed trait ? Is that required by the RPC framework for example ? If not, I don't think its worth it. |
||
val reason: String = "decommissioned" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We also indirectly use the reason for |
||
override def toString: String = reason | ||
} | ||
|
||
/** | ||
 * For the case where decommission is triggered because of executor dynamic allocation
*/ | ||
case class DynamicAllocationDecommission() extends ExecutorDecommissionReason { | ||
override val reason: String = "decommissioned by dynamic allocation" | ||
} | ||
|
||
/** | ||
 * For the case where decommission is triggered at executor first. | ||
*/ | ||
class ExecutorTriggeredDecommission extends ExecutorDecommissionReason | ||
|
||
/** | ||
* For the Kubernetes workloads | ||
*/ | ||
case class K8SDecommission() extends ExecutorTriggeredDecommission | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think maybe we could have a better level here, there isn't really anything K8s specific about this kind of message. Rather all external cluster manager decommissions could be the same perhaps? |
||
|
||
/** | ||
* For the Standalone workloads. | ||
* @param workerHost When workerHost is defined, it means the worker has been decommissioned too. | ||
* Used to infer if the shuffle data might be lost even if the external shuffle | ||
* service is enabled. | ||
*/ | ||
case class StandaloneDecommission(workerHost: Option[String] = None) | ||
extends ExecutorDecommissionReason { | ||
override val reason: String = if (workerHost.isDefined) { | ||
s"Worker ${workerHost.get} decommissioned" | ||
} else { | ||
"decommissioned" | ||
} | ||
} | ||
|
||
/** | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move this test only class somewhere in the test only package ? See TestResourceIDs as an example. |
||
* For test only. | ||
*/ | ||
case class TestExecutorDecommission(host: Option[String] = None) | ||
extends ExecutorDecommissionReason { | ||
override val reason: String = if (host.isDefined) { | ||
s"Host ${host.get} decommissioned(test)" | ||
} else { | ||
"decommissioned(test)" | ||
} | ||
} | ||
|
||
/** | ||
* State related to decommissioning that is kept by the TaskSchedulerImpl. This state is derived | ||
* from the ExecutorDecommissionReason above but it is kept distinct to allow the state to evolve | ||
* independently from the message. | ||
*/ | ||
case class ExecutorDecommissionState( | ||
// Timestamp the decommissioning commenced as per the Driver's clock, | ||
// to estimate when the executor might eventually be lost if EXECUTOR_DECOMMISSION_KILL_INTERVAL | ||
// is configured. | ||
startTime: Long, | ||
reason: ExecutorDecommissionReason) { | ||
|
||
def isHostDecommissioned: Boolean = reason match { | ||
case StandaloneDecommission(workerHost) => workerHost.isDefined | ||
case _ => false | ||
} | ||
|
||
def host: Option[String] = reason match { | ||
case StandaloneDecommission(workerHost) => workerHost | ||
case _ => None | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,7 +71,8 @@ case class ExecutorProcessLost( | |
* This is used by the task scheduler to remove state associated with the executor, but | ||
* not yet fail any tasks that were running in the executor before the executor is "fully" lost. | ||
* | ||
* @param workerHost it is defined when the worker is decommissioned too | ||
* @param reason the reason why the executor is decommissioned | ||
* @param host it is defined when the host where the executor located is decommissioned too | ||
*/ | ||
private [spark] case class ExecutorDecommission(workerHost: Option[String] = None) | ||
extends ExecutorLossReason("Executor decommission.") | ||
private [spark] case class ExecutorDecommission(reason: String, host: Option[String] = None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My scala knowledge is really really poor, but I would rather we make this be a non case class if you are planning to do this. Currently, I think the field "reason" is going to be duplicated in the base class ExecutorLossReason and the ExecutorDecommission. That's also the reason why you are pattern matching it above with an additional _ (for the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The case class is needed because we'd apply pattern matching on it. The "reason" is necessary because of class inheritance, no? Please see There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can write unapply methods if you need to do pattern matching with something other than a case class. |
||
extends ExecutorLossReason(reason) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -906,12 +906,12 @@ private[spark] class TaskSchedulerImpl( | |
} | ||
|
||
override def executorDecommission( | ||
executorId: String, decommissionInfo: ExecutorDecommissionInfo): Unit = { | ||
executorId: String, reason: ExecutorDecommissionReason): Unit = { | ||
synchronized { | ||
// Don't bother noting decommissioning for executors that we don't know about | ||
if (executorIdToHost.contains(executorId)) { | ||
executorsPendingDecommission(executorId) = | ||
ExecutorDecommissionState(clock.getTimeMillis(), decommissionInfo.workerHost) | ||
ExecutorDecommissionState(clock.getTimeMillis(), reason) | ||
} | ||
} | ||
rootPool.executorDecommission(executorId) | ||
|
@@ -970,6 +970,9 @@ private[spark] class TaskSchedulerImpl( | |
logDebug(s"Executor $executorId on $hostPort lost, but reason not yet known.") | ||
case ExecutorKilled => | ||
logInfo(s"Executor $executorId on $hostPort killed by driver.") | ||
case ExecutorDecommission(reason, _) => | ||
// use logInfo instead of logError as the loss of decommissioned executor is what we expect | ||
logInfo(s"Decommissioned executor $executorId on $hostPort shutdown: $reason") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of 'shutdown', should we say 'is finally lost' ? To be more accurate in this setting. +1 on this change to avoid log spam. |
||
case _ => | ||
logError(s"Lost executor $executorId on $hostPort: $reason") | ||
} | ||
|
@@ -1055,7 +1058,7 @@ private[spark] class TaskSchedulerImpl( | |
// exposed for test | ||
protected final def isHostDecommissioned(host: String): Boolean = { | ||
hostToExecutors.get(host).exists { executors => | ||
executors.exists(e => getExecutorDecommissionState(e).exists(_.workerHost.isDefined)) | ||
executors.exists(e => getExecutorDecommissionState(e).exists(_.isHostDecommissioned)) | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -991,7 +991,7 @@ private[spark] class TaskSetManager( | |
for ((tid, info) <- taskInfos if info.running && info.executorId == execId) { | ||
val exitCausedByApp: Boolean = reason match { | ||
case exited: ExecutorExited => exited.exitCausedByApp | ||
case ExecutorKilled | ExecutorDecommission(_) => false | ||
case ExecutorKilled | ExecutorDecommission(_, _) => false | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am wondering if we should instead pattern match in a separate arm like:
To avoid having to change the case arms when we make changes to the structure definitions. |
||
case ExecutorProcessLost(_, _, false) => false | ||
case _ => true | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are we renaming this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, I would argue against unnecessary renaming even if it seems a bit "unnatural". It creates unnecessary diff noise.
To me "Info" and "Reason" are both similar: They both portend "additional information".
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess @Ngone51 is trying to follow the style of
TaskEndReason
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @dongjoon-hyun for the clarification. In addition to following the style of
TaskEndReason
, I actually also want to handle the decommission info/reason in a similar way to TaskEndReason
.