From 4c19e53ef4a0d3f6db2b6b3eef946dd1f3406fb1 Mon Sep 17 00:00:00 2001
From: Kay Ousterhout
Date: Mon, 30 Nov 2015 11:50:55 -0800
Subject: [PATCH] [SPARK-4681] Enable executor blacklisting by default.

This commit sets the default value of
spark.scheduler.executorTaskBlacklistTime to 5 seconds, so that if a
task fails on a particular executor, it won't be re-launched on the
same executor until 5 seconds has elapsed, to allow time for the task
to (preferably) be launched on a different executor, where it may be
more likely to complete successfully.
---
 .../spark/scheduler/TaskSetManager.scala | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index a02f3017cb6e9..5e56fd62af973 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -58,14 +58,21 @@ private[spark] class TaskSetManager(
 
   val conf = sched.sc.conf
 
-  /*
-   * Sometimes if an executor is dead or in an otherwise invalid state, the driver
-   * does not realize right away leading to repeated task failures. If enabled,
-   * this temporarily prevents a task from re-launching on an executor where
-   * it just failed.
+  /**
+   * This timeout (specified in milliseconds) is used to prevent tasks from being immediately
+   * re-launched on an executor where the task has already failed. A task will not be re-launched
+   * on an executor where it has already failed until this amount of time has elapsed since the
+   * failure. One example of when this is useful is if an executor is dead and the driver
+   * hasn't realized it, leading to repeated task failures. Blacklisting can be disabled by
+   * setting this to 0.
+   *
+   * The motivation for the default value of 5 seconds is that it is longer than the default
+   * locality wait time (spark.locality.wait), so the task will be launched on an executor with
+   * worse locality (if one is available) before being re-launched on an executor where it failed
+   * (since running more slowly is preferable to failing).
    */
   private val EXECUTOR_TASK_BLACKLIST_TIMEOUT =
-    conf.getLong("spark.scheduler.executorTaskBlacklistTime", 0L)
+    conf.getLong("spark.scheduler.executorTaskBlacklistTime", 5000L)
 
   // Quantile of tasks at which to start speculation
   val SPECULATION_QUANTILE = conf.getDouble("spark.speculation.quantile", 0.75)
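
Since the new default changes behavior for existing jobs, a minimal sketch of
how a user could tune or disable the timeout through the standard SparkConf
string-keyed configuration API (the 10000 ms value below is an arbitrary
illustration, not a recommendation):

  import org.apache.spark.SparkConf

  // Restore the pre-patch behavior by disabling blacklisting entirely:
  val conf = new SparkConf()
    .set("spark.scheduler.executorTaskBlacklistTime", "0")

  // Or choose a window longer than the new 5000 ms default, e.g. when
  // spark.locality.wait has been raised above 5 seconds so that the
  // blacklist timeout still exceeds the locality wait:
  conf.set("spark.scheduler.executorTaskBlacklistTime", "10000")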