[WIP][SPARK-31197][CORE] Exit the executor once all tasks and migrations are finished, built on top of spark20629 #28817

Closed
Changes from 4 commits (24 commits in this PR):
67dec3c
Add an option to migrate shuffle blocks as well as the current cache …
holdenk Jun 2, 2020
4f2e7ce
Update core/src/main/scala/org/apache/spark/storage/BlockManager.scala
holdenk Jun 3, 2020
ecd1a14
Update core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlock…
holdenk Jun 3, 2020
9c8836a
First pass at the feedback from @attilapiros, mostly minor re-arrange…
holdenk Jun 3, 2020
ccb8827
Saw a test failure which could come from us not having a reasonable t…
holdenk Jun 4, 2020
c4ed3bd
Improve error logging
holdenk Jun 4, 2020
ac510dc
cleanup
holdenk Jun 4, 2020
e13b070
cleanup
holdenk Jun 4, 2020
d3ecd8e
Add more info to debugging
holdenk Jun 4, 2020
6bdf0c2
logging string interpolation
holdenk Jun 4, 2020
f2eb6eb
logging string interpolation
holdenk Jun 4, 2020
81e29a8
logging string interpolation
holdenk Jun 4, 2020
6f0544a
[SPARK-28624][SQL][TESTS] Run date.sql via Thrift Server
MaxGekk Jun 4, 2020
41d5464
Generalize the decom put to check put as stream and shuffle blocks as…
holdenk Jun 4, 2020
a517d67
spacing
holdenk Jun 4, 2020
bce1613
Fix long line, make our shuffle block threads stop so we don't leak t…
holdenk Jun 4, 2020
6c1b364
Remove un-needed shuffleStatus.invalidateSerializedMapOutputStatusCac…
holdenk Jun 4, 2020
a904030
Always transfer shuffle blocks as put, take out the spark.network.max…
holdenk Jun 4, 2020
5838639
First pass at making the executor exit once no tasks are running and …
holdenk Jun 2, 2020
4b3fb27
Test that we exit the executor on our own.
holdenk Jun 4, 2020
6a940e6
Fix up how we check if we're done with migrations to avoid a race whe…
holdenk Jun 4, 2020
d591507
Also check inside of the CGEB where we know if tasks are running or not
holdenk Jun 4, 2020
ea8efc7
No need to wait, just had that bumped up for debugging
holdenk Jun 4, 2020
a2c0557
CR feedback (clarify comment and log statement)
holdenk Jun 14, 2020

@@ -158,8 +158,6 @@ private[deploy] object DeployMessages {

case object ReregisterWithMaster // used when a worker attempts to reconnect to a master

case object DecommissionSelf // Mark as decommissioned. May be Master to Worker in the future.

// AppClient to Master

case class RegisterApplication(appDescription: ApplicationDescription, driver: RpcEndpointRef)
@@ -668,7 +668,7 @@ private[deploy] class Worker(
finishedApps += id
maybeCleanupApplication(id)

case DecommissionSelf =>
case WorkerDecommission(_, _) =>
decommissionSelf()
}

@@ -210,6 +210,10 @@ private[spark] class CoarseGrainedExecutorBackend(
case UpdateDelegationTokens(tokenBytes) =>
logInfo(s"Received tokens of ${tokenBytes.length} bytes")
SparkHadoopUtil.get.addDelegationTokens(tokenBytes, env.conf)

case DecommissionSelf =>
logInfo("Received decommission self")
decommissionSelf()
}

override def onDisconnected(remoteAddress: RpcAddress): Unit = {
@@ -258,26 +262,65 @@
System.exit(code)
}

private def decommissionSelf(): Boolean = {
logInfo("Decommissioning self w/sync")
try {
decommissioned = true
// Tell master we are are decommissioned so it stops trying to schedule us
if (driver.nonEmpty) {
driver.get.askSync[Boolean](DecommissionExecutor(executorId))
private var previousAllBlocksMigrated = false
Contributor:

Should this variable be marked volatile ?

Contributor Author:

I don't think so, this will only be accessed in one thread.

private def shutdownIfDone(): Unit = {
val numRunningTasks = executor.numRunningTasks
logInfo(s"Checking to see if we can shutdown have ${numRunningTasks} running tasks.")
if (executor.numRunningTasks == 0) {
if (env.conf.get(STORAGE_DECOMMISSION_ENABLED)) {
val allBlocksMigrated = env.blockManager.decommissionManager match {
case Some(m) => m.allBlocksMigrated
case None => false // We haven't started migrations yet.
}
if (allBlocksMigrated && previousAllBlocksMigrated) {
logInfo("No running tasks, all blocks migrated, stopping.")
exitExecutor(0, "Finished decommissioning", notifyDriver = true)
Contributor:

exitExecutor asynchronously sends RemoveExecutor to the driver. Does that actually make it to the driver ? There is also the question of whether we should be using the same Shutdown/StopExecutor codepath for doing the stopping (although it does seem that we do want to intimate to the driver that the executor is being removed).

Interestingly, the driver does indeed respond back with a StopExecutor and does trigger the clean shutdown path in the executor, but again I wonder if it is too late for that. Perhaps we shouldn't be calling System.exit here ?

Also, as currently written, this exitExecutor could cause job failures, since the TaskSchedulerImpl will treat the ExecutorLossReason sent by the executor to the driver as an exitCausedByApp and thus penalize the task. Instead, I think we shouldn't penalize the running job on a planned executor decommission. One workaround might be to actually respond back to the driver with ExecutorDecommission (which is not used elsewhere currently) and then handle that specifically in the TaskSchedulerImpl's determination of exitCausedByApp.
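
For illustration, a minimal sketch of that workaround with hypothetical stand-in types (not the actual ExecutorLossReason/TaskSchedulerImpl API): a planned decommission gets its own loss reason so the resulting executor loss is not attributed to the application.

// Hypothetical sketch only: these types stand in for Spark's loss-reason
// hierarchy; the real classes and call sites differ.
object ExitCausedByAppSketch {
  sealed trait LossReason
  case object Decommissioned extends LossReason              // planned removal
  case class Exited(causedByApp: Boolean) extends LossReason
  case object HeartbeatTimeout extends LossReason

  // Decide whether a lost executor should count against the running job.
  def exitCausedByApp(reason: LossReason): Boolean = reason match {
    case Decommissioned   => false // don't penalize tasks for a planned decommission
    case Exited(byApp)    => byApp
    case HeartbeatTimeout => true  // default today: the loss is blamed on the app
  }

  def main(args: Array[String]): Unit = {
    println(exitCausedByApp(Decommissioned))   // false
    println(exitCausedByApp(HeartbeatTimeout)) // true
  }
}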

Contributor Author:

So it's my understanding that the TaskSchedulerImpl shouldn't have any job failures, because we've waited for all the tasks on the executor to finish before calling this code path. Unless there is something I've missed there?

I think swapping out exitExecutor for telling the driver to stop the executor, and avoiding the System.exit, makes sense either way though.

Contributor:

I was talking about the case where we get shot down before we had a chance to cleanly exit on line 276. Say, for example, some timeout expires and the executor/node is brought down.

Are decom.sh and decommission-slave.sh expected to wait until the executor/worker process has properly shut down ? I think they have some timeouts in them to kill the executor ? Or consider a spot-kill scenario where you get some warning (like 2 minutes) and then the machine is yanked out.

In this case, the executor will eventually be marked lost via a heartbeat/timeout. And that loss would be deemed the fault of the task, and could cause job failures. I am wondering if we can fix that scenario of an unclean exit ?

One workaround I suggested above was to send a message to the driver saying that the executor is going to go away soon. When that happens (in a clean or unclean way), that loss shouldn't be attributed to the task.

Perhaps this unclean executor loss/timeout handling is follow-up work ? We (or rather I) can create Jiras for this under the parent ticket :-).

Contributor Author:

Sure, although I think this behaviour is covered by the changes in https://github.com/apache/spark/pull/26440/files (we only increment failures if the executor's previous state was not decommissioning).

Contributor:

Can you please double check that ? I couldn't find this behavior when scouring TaskSchedulerImpl and TaskSetManager. The only place we check for an executor being decommissioned in that PR is when scheduling tasks (in CoarseGrainedSchedulerBackend#isExecutorActive). Thanks !

Contributor Author:

Can you point to where in TaskSchedulerImpl it's going to fail the job? core/src/main/scala/org/apache/spark/deploy/master/Master.scala is where the current code is, but there might be an additional case that needs to be covered.

Contributor:

for ((tid, info) <- taskInfos if info.running && info.executorId == execId) {

In this match block, we will hit the default case which will treat the failure as having been caused by the app and thus penalize it.

This routine is called from TaskScheduler.executorLost

Contributor Author:

Thanks :) If you want to make a PR for that I'd be happy to review/merge, since I think that would not depend on any of the in-flight PRs, just the current code in master.

Contributor:

Absolutely ! Thanks.

}
previousAllBlocksMigrated = allBlocksMigrated
} else {
logError("No driver to message decommissioning.")
logInfo("No running tasks, no block migration configured, stopping.")
exitExecutor(0, "Finished decommissioning", notifyDriver = true)
}
if (executor != null) {
executor.decommission()
} else {
// If there's a running task it could store blocks.
Contributor:

I think this logic of previousAllBlocksMigrated and allBlocksMigrated is a bit confusing. It's not clear why the previous state has to be considered. I wonder if the following code can make this "history" aspect a bit clearer:

val allBlocksMigrated = !env.conf.get(STORAGE_DECOMMISSION_ENABLED) ||
      env.blockManager.decommissionManager.map(_.allBlocksMigrated).getOrElse(false)
val exitCondition = allBlocksMigrated && numRunningTasks == 0
if (exitCondition) { exitExecutor(...) }

Also, should we really be checking for numRunningTasks here ? What if some race condition caused some tasks to be scheduled onto us while we were marked for decom ?

Finally, should there be a timeout for how long the executor will stay alive in the decommissioned state ?

Contributor Author:

If a task is scheduled before we are asked to decom, that case is covered. You can verify this by taking the logic out and watching the tests fail :) (there's an ugly thread sleep in the tests to make this possible).

Since the block migrations are not atomic, I unfortunately do think we need the 2x logic. Think of this situation:

  1. Task launches on executor
  2. Executor asked to decommission
  3. All blocks currently stored on executor are migrated
  4. Task stores a block
  5. We check numTasks & see all blocks are migrated, then exit without migrating the block stored by the task in step 4.

Now that being said, it's probably a corner case, and arguably not super important since we're really only doing best effort, but I think for the overhead of one extra boolean it's worth it to cover this corner case.
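
As a rough illustration of why two consecutive clean checks are needed, here is a standalone sketch with stand-in state (not the actual backend code): a block stored between two polls keeps the executor alive for at least one more migration round.

// Standalone sketch of the "2x" check; numRunningTasks/allBlocksMigrated are
// stand-ins for the executor and block manager state, not the real fields.
object DoubleCheckSketch {
  @volatile var numRunningTasks: Int = 0
  @volatile var allBlocksMigrated: Boolean = false

  private var previousAllBlocksMigrated = false

  // True only when two consecutive polls both saw zero tasks and all blocks migrated.
  def readyToExit(): Boolean = {
    val cleanNow = numRunningTasks == 0 && allBlocksMigrated
    val exit = cleanNow && previousAllBlocksMigrated
    previousAllBlocksMigrated = cleanNow
    exit
  }

  def main(args: Array[String]): Unit = {
    allBlocksMigrated = true
    println(readyToExit()) // false: the first clean poll only records the state
    allBlocksMigrated = false // a still-running task stored a new block between polls
    println(readyToExit()) // false: the late block forces another migration round
    allBlocksMigrated = true
    println(readyToExit()) // false: the previous poll was not clean
    println(readyToExit()) // true: two consecutive clean polls, safe to exit
  }
}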

previousAllBlocksMigrated = false
}
}

private def decommissionSelf(): Boolean = {
if (!decommissioned) {
logInfo("Decommissioning self w/sync")
Contributor:

Perhaps we should expand what 'w/sync' stands for in the log message ?

Contributor Author:

Sure

try {
decommissioned = true
// Tell master we are are decommissioned so it stops trying to schedule us
if (driver.nonEmpty) {
driver.get.askSync[Boolean](DecommissionExecutor(executorId))
} else {
logError("No driver to message decommissioning.")
}
if (executor != null) {
executor.decommission()
}
// Shutdown the executor once all tasks are gone :)
val shutdownThread = new Thread() {
while (true) {
shutdownIfDone()
Thread.sleep(1000) // 1s
}
}
shutdownThread.setDaemon(true)
shutdownThread.setName("decommission-shutdown-thread")
shutdownThread.start()
logInfo("Done decommissioning self.")
// Return true since we are handling a signal
true
} catch {
case e: Exception =>
logError(s"Error ${e} during attempt to decommission self")
false
}
logInfo("Done decommissioning self.")
// Return true since we are handling a signal
} else {
true
} catch {
case e: Exception =>
logError(s"Error ${e} during attempt to decommission self")
false
}
}
}
@@ -233,6 +233,7 @@ private[spark] class Executor(
* Mark an executor for decommissioning and avoid launching new tasks.
*/
private[spark] def decommission(): Unit = {
logInfo("Executor asked to decommission. Starting shutdown thread.")
Contributor:

I think this comment looks stale. It should probably be moved to the CoarseGrainedExecutorBackend. It's also not clear to me what the decommission flag does in the Executor besides just logging.

Contributor Author:

Just logging for now. The reason I propagate the message to the executor is that if we end up in a state where the executor believes it is decommissioned (say, from a local SIGPWR) but the driver doesn't, it could be weird, so having some logging is useful.

decommissioned = true
}

@@ -52,6 +52,8 @@ private[spark] object CoarseGrainedClusterMessages {
case class UpdateDelegationTokens(tokens: Array[Byte])
extends CoarseGrainedClusterMessage

case object DecommissionSelf extends CoarseGrainedClusterMessage // Mark as decommissioned.
Contributor:

IMHO, the DecommissionSelf naming is a bit ambiguous: who is "self" here ? The sender or the receiver ?

This message is now sent from the driver to the executor, so perhaps we should just repurpose DecommissionExecutor with a check for the executorId ?

Not a big deal, but I'm trying to reduce the number of message types introduced by this feature ;)

Contributor Author:

I think DecommissionSelf is pretty clearly telling the receiver to decommission itself. That being said, I'm open to renaming.

Contributor:

Sounds good ;-) I checked that DecommissionSelf is indeed not used anywhere else, so it should be unambiguous. Let's keep the name.


// Executors to driver
case class RegisterExecutor(
executorId: String,
@@ -27,7 +27,7 @@ import scala.concurrent.Future
import org.apache.hadoop.security.UserGroupInformation

import org.apache.spark.{ExecutorAllocationClient, SparkEnv, SparkException, TaskState}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.deploy.{DeployMessage, SparkHadoopUtil}
import org.apache.spark.deploy.security.HadoopDelegationTokenManager
import org.apache.spark.executor.ExecutorLogUrlHandler
import org.apache.spark.internal.Logging
@@ -432,7 +432,18 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
if (shouldDisable) {
logInfo(s"Starting decommissioning executor $executorId.")
try {
// Stop making offers on this executor
scheduler.executorDecommission(executorId)
// Send decommission message to the executor (it could have originated there but not
// necessarily).
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.executorEndpoint.send(
DecommissionSelf)
case None =>
// Ignoring the executor since it is not registered.
logWarning(s"Attempted to decommission unknown executor $executorId.")
}
} catch {
case e: Exception =>
logError(s"Unexpected error during decommissioning ${e.toString}", e)
42 changes: 31 additions & 11 deletions core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -245,7 +245,7 @@ private[spark] class BlockManager(
private var blockReplicationPolicy: BlockReplicationPolicy = _

private var blockManagerDecommissioning: Boolean = false
private var decommissionManager: Option[BlockManagerDecommissionManager] = None
private[spark] var decommissionManager: Option[BlockManagerDecommissionManager] = None

// A DownloadFileManager used to track all the files of remote blocks which are above the
// specified memory threshold. Files will be deleted automatically based on weak reference.
@@ -262,6 +262,8 @@

// Shuffles which are either in queue for migrations or migrated
private val migratingShuffles = mutable.HashSet[(Int, Long)]()
// Shuffles which have migrated
private val migratedShuffles = mutable.HashSet[(Int, Long)]()
// Shuffles which are queued for migration
private val shufflesToMigrate = new java.util.concurrent.ConcurrentLinkedQueue[(Int, Long)]()

@@ -1822,6 +1824,7 @@ private[spark] class BlockManager(
}
}


private class ShuffleMigrationRunnable(peer: BlockManagerId) extends Runnable {
@volatile var running = true
override def run(): Unit = {
@@ -1862,6 +1865,7 @@
logInfo(s"Migrated sub block ${blockId}")
}
logInfo(s"Migrated ${shuffleId},${mapId} to ${peer}")
migratedShuffles += ((shuffleId, mapId))
}
}
// This catch is intentionally outside of the while running block.
@@ -1887,7 +1891,7 @@
* but rather shadows them.
* Requires an Indexed based shuffle resolver.
Contributor:

I think that the comment needs to be updated to reflect what the return Boolean indicates.

Contributor Author:

Good catch

*/
def offloadShuffleBlocks(): Unit = {
def offloadShuffleBlocks(): Boolean = {
// Update the queue of shuffles to be migrated
logInfo("Offloading shuffle blocks")
val localShuffles = migratableResolver.getStoredShuffles()
@@ -1914,29 +1918,32 @@
deadPeers.foreach { peer =>
migrationPeers.get(peer).foreach(_.running = false)
}
// If we found any new shuffles to migrate or otherwise have not migrated everything.
return newShufflesToMigrate.nonEmpty || (migratingShuffles.&~(migratedShuffles)).nonEmpty
}


/**
* Stop migrating shuffle blocks.
*/
def stopOffloadingShuffleBlocks(): Unit = {
logInfo("Stopping offloading shuffle blocks")
migrationPeers.values.foreach(_.running = false)
}

/**
* Tries to offload all cached RDD blocks from this BlockManager to peer BlockManagers
* Visible for testing
*/
def decommissionRddCacheBlocks(): Unit = {
private[spark] def decommissionRddCacheBlocks(): Boolean = {
val replicateBlocksInfo = master.getReplicateInfoForRDDBlocks(blockManagerId)

if (replicateBlocksInfo.nonEmpty) {
logInfo(s"Need to replicate ${replicateBlocksInfo.size} RDD blocks " +
"for block manager decommissioning")
} else {
logWarning(s"Asked to decommission RDD cache blocks, but no blocks to migrate")
return
return false
}

// Maximum number of storage replication failure which replicateBlock can handle
Expand Down Expand Up @@ -1965,6 +1972,7 @@ private[spark] class BlockManager(
logWarning("Blocks failed replication in cache decommissioning " +
s"process: ${blocksFailedReplication.mkString(",")}")
}
return true
}

/**
@@ -2039,8 +2047,11 @@
* Class to handle block manager decommissioning retries
* It creates a Thread to retry offloading all RDD cache blocks
*/
private class BlockManagerDecommissionManager(conf: SparkConf) {
private[spark] class BlockManagerDecommissionManager(conf: SparkConf) {
@volatile private var stopped = false
// Since running tasks can add more blocks this can change.
Contributor:

Just to make sure I am totally understanding this: you mean the tasks that were already running when the decommissioning was started on the executor ? Because I think we refuse to launch new tasks once decommissioning has started, so the new blocks being written must be written by already-running tasks. Did I get this right ?

Also, just to confirm I am still following along: I don't see this case handled in the existing BlockManagerSuite; I believe we are not testing writing new blocks while the decom/offload is in progress.

Contributor Author:

It is covered; you can verify this by disabling this logic and seeing the test fail (albeit you'll have to run the test a few times, because it becomes a race condition). Look at the "migrateDuring" flag for details.
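
Roughly, the shape of the race the test relies on, as an illustrative local sketch (not the suite's actual code; the decommission trigger itself is elided): tasks are still materializing cached blocks while the offload loop would already be running.

import org.apache.spark.{SparkConf, SparkContext}

// Illustrative only: a slow job keeps writing cache blocks while a decommission
// (not triggered in this sketch) would be migrating blocks in the background.
object MigrateDuringSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("migrate-during-sketch"))
    try {
      val rdd = sc.parallelize(1 to 100, 10)
        .map { x => Thread.sleep(50); x } // slow tasks keep producing new blocks
        .cache()
      val job = new Thread(new Runnable {
        override def run(): Unit = { rdd.count() } // materializes the cached blocks
      })
      job.start()
      Thread.sleep(200) // in the real test, decommissioning is triggered around here
      job.join()
    } finally {
      sc.stop()
    }
  }
}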

@volatile var allBlocksMigrated = false
var previousBlocksLeft = true
private val blockMigrationThread = new Thread {
val sleepInterval = conf.get(
config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL)
@@ -2053,22 +2064,30 @@
&& failures < 20) {
logInfo("Iterating on migrating from the block manager.")
try {
var blocksLeft = false
// If enabled we migrate shuffle blocks first as they are more expensive.
if (conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
logDebug(s"Attempting to replicate all shuffle blocks")
offloadShuffleBlocks()
logInfo(s"Done starting workers to migrate shuffle blocks")
logDebug("Attempting to replicate all shuffle blocks")
blocksLeft = blocksLeft || offloadShuffleBlocks()
logInfo("Done starting workers to migrate shuffle blocks")
}
if (conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED)) {
logDebug(s"Attempting to replicate all cached RDD blocks")
decommissionRddCacheBlocks()
logInfo(s"Attempt to replicate all cached blocks done")
logDebug("Attempting to replicate all cached RDD blocks")
blocksLeft = blocksLeft || decommissionRddCacheBlocks()
logInfo("Attempt to replicate all cached blocks done")
blocksLeft
}
logInfo(s"We have blocksLeft: ${blocksLeft}")
// Avoid the situation where block was added during the loop
allBlocksMigrated = (! blocksLeft ) && ( ! previousBlocksLeft )
previousBlocksLeft = blocksLeft
if (!conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED) &&
!conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
logWarning("Decommissioning, but no task configured set one or both:\n" +
"spark.storage.decommission.shuffle_blocks\n" +
"spark.storage.decommission.rdd_blocks")
allBlocksMigrated = true
stopped = true
}
logInfo(s"Waiting for ${sleepInterval} before refreshing migrations.")
Thread.sleep(sleepInterval)
@@ -2103,6 +2122,7 @@
}

def stop(): Unit = {
logInfo("Stopping decommission manager")
decommissionManager.foreach(_.stop())
blockTransferService.close()
if (blockStoreClient ne blockTransferService) {
@@ -59,7 +59,7 @@ class BlockManagerDecommissionSuite extends SparkFunSuite with LocalSparkContext
.set(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED, shuffle)
// Just replicate blocks as fast as we can during testing, there isn't another
// workload we need to worry about.
.set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 1L)
.set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 1000L)

sc = new SparkContext(master, "test", conf)

@@ -223,10 +223,7 @@ class BlockManagerDecommissionSuite extends SparkFunSuite with LocalSparkContext
assert(execIdToBlocksMapping.values.flatMap(_.keys).count(_.isRDD) === numParts)
}

// Make the executor we decommissioned exit
sched.client.killExecutors(List(execToDecommission))

// Wait for the executor to be removed
// Wait for the executor to be removed after blocks are migrated.
executorRemovedSem.acquire(1)

// Since the RDD is cached or shuffled so further usage of same RDD should use the