From 4c19e53ef4a0d3f6db2b6b3eef946dd1f3406fb1 Mon Sep 17 00:00:00 2001
From: Kay Ousterhout
Date: Mon, 30 Nov 2015 11:50:55 -0800
Subject: [PATCH] [SPARK-4681] Enable executor blacklisting by default.

This commit sets the default value of
spark.scheduler.executorTaskBlacklistTime to 5 seconds, so that if a
task fails on a particular executor, it won't be re-launched on the
same executor until 5 seconds has elapsed, to allow time for the task
to (preferably) be launched on a different executor, where it may be
more likely to complete successfully.
---
 .../spark/scheduler/TaskSetManager.scala | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index a02f3017cb6e9..5e56fd62af973 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -58,14 +58,21 @@ private[spark] class TaskSetManager(
 
   val conf = sched.sc.conf
 
-  /*
-   * Sometimes if an executor is dead or in an otherwise invalid state, the driver
-   * does not realize right away leading to repeated task failures. If enabled,
-   * this temporarily prevents a task from re-launching on an executor where
-   * it just failed.
+  /**
+   * This timeout (specified in milliseconds) is used to prevent tasks from being immediately
+   * re-launched on an executor where the task has already failed. A task will not be re-launched
+   * on an executor where it has already failed until this amount of time has elapsed since the
+   * failure. One example of when this is useful is if an executor is dead and the driver
+   * hasn't realized it, leading to repeated task failures. Blacklisting can be disabled by
+   * setting this to 0.
+   *
+   * The motivation for the default value of 5 seconds is that it is longer than the default
+   * locality wait time (spark.locality.wait), so the task will be launched on an executor with
+   * worse locality (if one is available) before being re-launched on an executor where it failed
+   * (since running more slowly is preferable to failing).
    */
   private val EXECUTOR_TASK_BLACKLIST_TIMEOUT =
-    conf.getLong("spark.scheduler.executorTaskBlacklistTime", 0L)
+    conf.getLong("spark.scheduler.executorTaskBlacklistTime", 5000L)
 
   // Quantile of tasks at which to start speculation
   val SPECULATION_QUANTILE = conf.getDouble("spark.speculation.quantile", 0.75)
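
Since the new default changes behavior for existing jobs, a minimal sketch of
how a user could tune or disable the timeout through the standard SparkConf
string-keyed configuration API (the 10000 ms value below is an arbitrary
illustration, not a recommendation):

  import org.apache.spark.SparkConf

  // Restore the pre-patch behavior by disabling blacklisting entirely:
  val conf = new SparkConf()
    .set("spark.scheduler.executorTaskBlacklistTime", "0")

  // Or choose a window longer than the new 5000 ms default, e.g. when
  // spark.locality.wait has been raised above 5 seconds so that the
  // blacklist timeout still exceeds the locality wait:
  conf.set("spark.scheduler.executorTaskBlacklistTime", "10000")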