-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[WIP][SPARK-32913][CORE][K8S] Improve ExecutorDecommissionInfo and ExecutorDecommissionState for different use cases #29788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,7 @@ | |
|
||
package org.apache.spark | ||
|
||
import org.apache.spark.scheduler.ExecutorDecommissionInfo | ||
import org.apache.spark.scheduler.ExecutorDecommissionReason | ||
|
||
/** | ||
* A client that communicates with the cluster manager to request or kill executors. | ||
|
@@ -88,44 +88,35 @@ private[spark] trait ExecutorAllocationClient { | |
* Default implementation delegates to kill, scheduler must override | ||
* if it supports graceful decommissioning. | ||
* | ||
* @param executorsAndDecomInfo identifiers of executors & decom info. | ||
* @param executorsAndDecomReason identifiers of executors & decom reason. | ||
* @param adjustTargetNumExecutors whether the target number of executors will be adjusted down | ||
* after these executors have been decommissioned. | ||
* @param triggeredByExecutor whether the decommission is triggered at executor. | ||
* @return the ids of the executors acknowledged by the cluster manager to be removed. | ||
*/ | ||
def decommissionExecutors( | ||
executorsAndDecomInfo: Array[(String, ExecutorDecommissionInfo)], | ||
adjustTargetNumExecutors: Boolean, | ||
triggeredByExecutor: Boolean): Seq[String] = { | ||
killExecutors(executorsAndDecomInfo.map(_._1), | ||
executorsAndDecomReason: Array[(String, ExecutorDecommissionReason)], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible that different executors have different There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is how it was earlier -- so we aren't changing the semantics save the renaming :-) And plus yes this can happen: Different executors on different hosts would have different ExecutorDecommissionReason/Info with different hosts potentially in them. This is simply a bulk api : Instead of making n calls we are folding them into one. |
||
adjustTargetNumExecutors: Boolean): Seq[String] = { | ||
killExecutors(executorsAndDecomReason.map(_._1), | ||
adjustTargetNumExecutors, | ||
countFailures = false) | ||
} | ||
|
||
|
||
/** | ||
* Request that the cluster manager decommission the specified executor. | ||
* Delegates to decommissionExecutors. | ||
* | ||
* @param executorId identifiers of executor to decommission | ||
* @param decommissionInfo information about the decommission (reason, host loss) | ||
* @param decomReason the decommission reason of the executor | ||
* @param adjustTargetNumExecutors if we should adjust the target number of executors. | ||
* @param triggeredByExecutor whether the decommission is triggered at executor. | ||
* (TODO: add a new type like `ExecutorDecommissionInfo` for the | ||
* case where executor is decommissioned at executor first, so we | ||
* don't need this extra parameter.) | ||
* @return whether the request is acknowledged by the cluster manager. | ||
*/ | ||
final def decommissionExecutor( | ||
executorId: String, | ||
decommissionInfo: ExecutorDecommissionInfo, | ||
adjustTargetNumExecutors: Boolean, | ||
triggeredByExecutor: Boolean = false): Boolean = { | ||
decomReason: ExecutorDecommissionReason, | ||
adjustTargetNumExecutors: Boolean): Boolean = { | ||
val decommissionedExecutors = decommissionExecutors( | ||
Array((executorId, decommissionInfo)), | ||
adjustTargetNumExecutors = adjustTargetNumExecutors, | ||
triggeredByExecutor = triggeredByExecutor) | ||
Array((executorId, decomReason)), | ||
adjustTargetNumExecutors = adjustTargetNumExecutors) | ||
decommissionedExecutors.nonEmpty && decommissionedExecutors(0).equals(executorId) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1826,7 +1826,7 @@ private[spark] class DAGScheduler( | |
val externalShuffleServiceEnabled = env.blockManager.externalShuffleServiceEnabled | ||
val isHostDecommissioned = taskScheduler | ||
.getExecutorDecommissionState(bmAddress.executorId) | ||
.exists(_.workerHost.isDefined) | ||
.exists(_.isHostDecommissioned) | ||
|
||
// Shuffle output of all executors on host `bmAddress.host` may be lost if: | ||
// - External shuffle service is enabled, so we assume that all shuffle data on node is | ||
|
@@ -2368,7 +2368,7 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler | |
case ExecutorLost(execId, reason) => | ||
val workerHost = reason match { | ||
case ExecutorProcessLost(_, workerHost, _) => workerHost | ||
case ExecutorDecommission(workerHost) => workerHost | ||
case ExecutorDecommission(_, host) => host | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See comment below for
You don't need to add an extra '_' then. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this is actually a good point! This is actually a rule of Databricks' Scala style guide. But I just followed the style of the above |
||
case _ => None | ||
} | ||
dagScheduler.handleExecutorLost(execId, workerHost) | ||
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.scheduler | ||
|
||
private[spark] sealed trait ExecutorDecommissionReason { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get why this is a sealed trait, namely we're pattern matching against it. But this seems to remove flexibility for anyone working on scheduler backends There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @holdenk, can you please provide an example of how having this as a sealed trait would limit the flexibility ? It is marked as a private[spark], so the resource manager specific scheduler backends, should be able to extend it ... no ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duh. Sorry for my n00bness. I can totally see why this shouldn't be a sealed trait: For example it is forcing the TestExecutorDecommissionInfo to be in this file. @Ngone51 is there a strong reason for making this be a sealed trait ? Is that required by the RPC framework for example ? If not, I don't think its worth it. |
||
val reason: String = "decommissioned" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We also indirectly use the reason for |
||
override def toString: String = reason | ||
} | ||
|
||
/** | ||
 * For the case where decommission is triggered because of executor dynamic allocation
*/ | ||
case class DynamicAllocationDecommission() extends ExecutorDecommissionReason { | ||
override val reason: String = "decommissioned by dynamic allocation" | ||
} | ||
|
||
/** | ||
 * For the case where decommission is triggered at executor first. | ||
*/ | ||
class ExecutorTriggeredDecommission extends ExecutorDecommissionReason | ||
|
||
/** | ||
* For the Kubernetes workloads | ||
*/ | ||
case class K8SDecommission() extends ExecutorTriggeredDecommission | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think maybe we could have a better level here, there isn't really anything K8s specific about this kind of message. Rather all external cluster manager decommissions could be the same perhaps? |
||
|
||
/** | ||
* For the Standalone workloads. | ||
* @param workerHost When workerHost is defined, it means the worker has been decommissioned too. | ||
* Used to infer if the shuffle data might be lost even if the external shuffle | ||
* service is enabled. | ||
*/ | ||
case class StandaloneDecommission(workerHost: Option[String] = None) | ||
extends ExecutorDecommissionReason { | ||
override val reason: String = if (workerHost.isDefined) { | ||
s"Worker ${workerHost.get} decommissioned" | ||
} else { | ||
"decommissioned" | ||
} | ||
} | ||
|
||
/** | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move this test only class somewhere in the test only package ? See TestResourceIDs as an example. |
||
* For test only. | ||
*/ | ||
case class TestExecutorDecommission(host: Option[String] = None) | ||
extends ExecutorDecommissionReason { | ||
override val reason: String = if (host.isDefined) { | ||
s"Host ${host.get} decommissioned(test)" | ||
} else { | ||
"decommissioned(test)" | ||
} | ||
} | ||
|
||
/** | ||
* State related to decommissioning that is kept by the TaskSchedulerImpl. This state is derived | ||
* from the ExecutorDecommissionReason above but it is kept distinct to allow the state to evolve | ||
* independently from the message. | ||
*/ | ||
case class ExecutorDecommissionState( | ||
// Timestamp the decommissioning commenced as per the Driver's clock, | ||
// to estimate when the executor might eventually be lost if EXECUTOR_DECOMMISSION_KILL_INTERVAL | ||
// is configured. | ||
startTime: Long, | ||
reason: ExecutorDecommissionReason) { | ||
|
||
def isHostDecommissioned: Boolean = reason match { | ||
case StandaloneDecommission(workerHost) => workerHost.isDefined | ||
case _ => false | ||
} | ||
|
||
def host: Option[String] = reason match { | ||
case StandaloneDecommission(workerHost) => workerHost | ||
case _ => None | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,7 +71,8 @@ case class ExecutorProcessLost( | |
* This is used by the task scheduler to remove state associated with the executor, but | ||
* not yet fail any tasks that were running in the executor before the executor is "fully" lost. | ||
* | ||
* @param workerHost it is defined when the worker is decommissioned too | ||
* @param reason the reason why the executor is decommissioned | ||
* @param host it is defined when the host where the executor located is decommissioned too | ||
*/ | ||
private [spark] case class ExecutorDecommission(workerHost: Option[String] = None) | ||
extends ExecutorLossReason("Executor decommission.") | ||
private [spark] case class ExecutorDecommission(reason: String, host: Option[String] = None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My scala knowledge is really really poor, but I would rather we make this be a non case class if you are planning to do this. Currently, I think the field "reason" is going to be duplicated in the base class ExecutorLossReason and the ExecutorDecommission. That's also the reason why you are pattern matching it above with an additional _ (for the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The case class is needed because we'd apply pattern matching on it. The "reason" is necessary because of class inheritance, no? Please see There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can write unapply methods if you need to do pattern matching with something other than a case class. |
||
extends ExecutorLossReason(reason) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -906,12 +906,12 @@ private[spark] class TaskSchedulerImpl( | |
} | ||
|
||
override def executorDecommission( | ||
executorId: String, decommissionInfo: ExecutorDecommissionInfo): Unit = { | ||
executorId: String, reason: ExecutorDecommissionReason): Unit = { | ||
synchronized { | ||
// Don't bother noting decommissioning for executors that we don't know about | ||
if (executorIdToHost.contains(executorId)) { | ||
executorsPendingDecommission(executorId) = | ||
ExecutorDecommissionState(clock.getTimeMillis(), decommissionInfo.workerHost) | ||
ExecutorDecommissionState(clock.getTimeMillis(), reason) | ||
} | ||
} | ||
rootPool.executorDecommission(executorId) | ||
|
@@ -970,6 +970,9 @@ private[spark] class TaskSchedulerImpl( | |
logDebug(s"Executor $executorId on $hostPort lost, but reason not yet known.") | ||
case ExecutorKilled => | ||
logInfo(s"Executor $executorId on $hostPort killed by driver.") | ||
case ExecutorDecommission(reason, _) => | ||
// use logInfo instead of logError as the loss of decommissioned executor is what we expect | ||
logInfo(s"Decommissioned executor $executorId on $hostPort shutdown: $reason") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of 'shutdown', should we say 'is finally lost' ? To be more accurate in this setting. +1 on this change to avoid log spam. |
||
case _ => | ||
logError(s"Lost executor $executorId on $hostPort: $reason") | ||
} | ||
|
@@ -1055,7 +1058,7 @@ private[spark] class TaskSchedulerImpl( | |
// exposed for test | ||
protected final def isHostDecommissioned(host: String): Boolean = { | ||
hostToExecutors.get(host).exists { executors => | ||
executors.exists(e => getExecutorDecommissionState(e).exists(_.workerHost.isDefined)) | ||
executors.exists(e => getExecutorDecommissionState(e).exists(_.isHostDecommissioned)) | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -991,7 +991,7 @@ private[spark] class TaskSetManager( | |
for ((tid, info) <- taskInfos if info.running && info.executorId == execId) { | ||
val exitCausedByApp: Boolean = reason match { | ||
case exited: ExecutorExited => exited.exitCausedByApp | ||
case ExecutorKilled | ExecutorDecommission(_) => false | ||
case ExecutorKilled | ExecutorDecommission(_, _) => false | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am wondering if we should instead pattern match in a separate arm like:
To avoid having to change the case arms when we make changes to the structure definitions. |
||
case ExecutorProcessLost(_, _, false) => false | ||
case _ => true | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are we renaming this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, I would argue against unnecessary renaming even if it seems a bit "unnatural". It creates unnecessary diff noise.
To me "Info" and "Reason" are both similar: They both portend "additional information".
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess @Ngone51 is trying to follow the style of
TaskEndReason
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @dongjoon-hyun for the clarification. In addition to following the style of
TaskEndReason
, I actually also want to handle the decommission info/reason in a similar way to TaskEndReason
.