@@ -331,6 +331,10 @@ public enum LogKeys implements LogKey {
   LABEL_COLUMN,
   LARGEST_CLUSTER_INDEX,
   LAST_ACCESS_TIME,
+  LAST_ATTEMPT_ACC_INVALIDATE,
+  LAST_ATTEMPT_ACC_SYSTEM_METRIC,
+  LAST_ATTEMPT_ACC_UNEXPECTED_REASON,
+  LAST_ATTEMPT_ACC_USER_METRIC,
   LAST_COMMITTED_CHECKPOINT_ID,
   LAST_COMMIT_BASED_CHECKPOINT_ID,
   LAST_SCAN_TIME,
@@ -95,6 +95,11 @@ class LogEntry(messageWithContext: => MessageWithContext) {
   def message: String = cachedMessageWithContext.message

   def context: java.util.Map[String, String] = cachedMessageWithContext.context
+
+  def +(other: LogEntry): LogEntry = {
+    val combined = cachedMessageWithContext + other.cachedMessageWithContext
+    new LogEntry(combined)
+  }
 }

 /**
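A minimal usage sketch of the new combinator, assuming Spark's structured-logging interpolator and the implicit MessageWithContext-to-LogEntry conversion; the call site and the chosen MDC keys are illustrative, not part of this diff:

    // Illustrative only: the new LogEntry.+ delegates to MessageWithContext.+,
    // concatenating the rendered messages and merging the MDC context maps.
    val head: LogEntry = log"last access at ${MDC(LogKeys.LAST_ACCESS_TIME, accessTime)}"
    val tail: LogEntry = log", last scan at ${MDC(LogKeys.LAST_SCAN_TIME, scanTime)}"
    logInfo(head + tail)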
core/src/main/scala/org/apache/spark/SparkContext.scala (2 additions, 0 deletions)
@@ -3152,6 +3152,8 @@ object SparkContext extends Logging {
   private[spark] val RDD_SCOPE_KEY = "spark.rdd.scope"
   private[spark] val RDD_SCOPE_NO_OVERRIDE_KEY = "spark.rdd.scope.noOverride"
   private[spark] val SQL_EXECUTION_ID_KEY = "spark.sql.execution.id"
+  private[spark] val DATASET_QUERY_EXECUTION_ID_KEY =
+    "spark.sql.dataset.queryExecution.id"

   /**
    * Executor id for the driver. In earlier versions of Spark, this was `<driver>`, but this was
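A hedged illustration of how a local-property key like this is typically consumed; the setter call site below is hypothetical, since the diff only defines the constant:

    // Hypothetical call site: tag jobs submitted from this thread with the id
    // of the Dataset query execution driving them (executionId is assumed).
    sc.setLocalProperty(SparkContext.DATASET_QUERY_EXECUTION_ID_KEY, executionId.toString)
    try {
      // run the action; jobs submitted here inherit the local property
    } finally {
      sc.setLocalProperty(SparkContext.DATASET_QUERY_EXECUTION_ID_KEY, null)
    }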
@@ -39,6 +39,13 @@ private[spark] object Tests {
     .booleanConf
     .createOptional

+  val INJECT_SHUFFLE_FETCH_FAILURES =
+    ConfigBuilder("spark.testing.injectShuffleFetchFailures")
+      .doc("Injecting fetch failures for shuffle stages by providing an invalid BlockManager " +
+        "location for the first stage attempt. Testing only flag!")
+      .booleanConf
+      .createWithDefault(false)
+
   val TEST_NO_STAGE_RETRY = ConfigBuilder("spark.test.noStageRetry")
     .version("1.2.0")
     .booleanConf
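A sketch of how a suite might exercise the new flag, assuming standard local-cluster test scaffolding; none of the setup below is part of the diff:

    // Assumes the JVM runs with -Dspark.testing=true so that Utils.isTesting holds.
    val conf = new SparkConf()
      .setMaster("local-cluster[2, 1, 1024]")
      .setAppName("inject-fetch-failures")
      .set("spark.testing.injectShuffleFetchFailures", "true")
    val sc = new SparkContext(conf)
    // First attempts of shuffle map stages register an invalid BlockManager
    // location, so reducers hit FetchFailed and the stage is retried.
    val sums = sc.parallelize(1 to 100, 4).map(i => (i % 10, i)).reduceByKey(_ + _).collect()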
core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala (20 additions, 2 deletions)
@@ -130,6 +130,22 @@ private[spark] object RDDOperationScope extends Logging {
       name: String,
       allowNesting: Boolean,
       ignoreParent: Boolean)(body: => T): T = {
+    withScope(sc, name, allowNesting, ignoreParent,
+      nextScopeId().toString)(body)
+  }
+
+  /**
+   * Execute the given body such that all RDDs created in this body
+   * will have the same scope, with an explicit scope ID.
+   *
+   * Note: Return statements are NOT allowed in body.
+   */
+  private[spark] def withScope[T](
+      sc: SparkContext,
+      name: String,
+      allowNesting: Boolean,
+      ignoreParent: Boolean,
+      rddScopeId: String)(body: => T): T = {
     // Save the old scope to restore it later
     val scopeKey = SparkContext.RDD_SCOPE_KEY
     val noOverrideKey = SparkContext.RDD_SCOPE_NO_OVERRIDE_KEY
@@ -139,10 +155,12 @@ private[spark] object RDDOperationScope extends Logging {
     try {
       if (ignoreParent) {
         // Ignore all parent settings and scopes and start afresh with our own root scope
-        sc.setLocalProperty(scopeKey, new RDDOperationScope(name).toJson)
+        sc.setLocalProperty(scopeKey,
+          new RDDOperationScope(name, None, rddScopeId).toJson)
       } else if (sc.getLocalProperty(noOverrideKey) == null) {
         // Otherwise, set the scope only if the higher level caller allows us to do so
-        sc.setLocalProperty(scopeKey, new RDDOperationScope(name, oldScope).toJson)
+        sc.setLocalProperty(scopeKey,
+          new RDDOperationScope(name, oldScope, rddScopeId).toJson)
       }
       // Optionally disallow the child body to override our scope
       if (!allowNesting) {
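A hedged sketch of a call site for the new overload; the scope name and the way the ID is obtained are illustrative (nextScopeId() is the same counter the old overload delegates to):

    // Pin a stable scope id and reuse it, so related executions of the body
    // are grouped under one RDD operation scope.
    val scopeId = RDDOperationScope.nextScopeId().toString
    RDDOperationScope.withScope(sc, "datasetAction",
      allowNesting = false, ignoreParent = false, scopeId) {
      sc.parallelize(1 to 10).count()
    }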
core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (18 additions, 0 deletions)
@@ -1858,6 +1858,11 @@ private[spark] class DAGScheduler(
             throw SparkCoreErrors.accessNonExistentAccumulatorError(id)
         }
         acc.merge(updates.asInstanceOf[AccumulatorV2[Any, Any]])
+        if (acc.isInstanceOf[LastAttemptAccumulator[_, _, _]]) {

Member:
This introduces coupling between DAGScheduler and the SLAM concept. An alternative: add an overridable method to AccumulatorV2 like def mergeWithTaskMetadata(other, rdd, taskInfo, stageId, stageAttemptId, props): Unit = {} that SLAM overrides. Then DAGScheduler calls it unconditionally (no-op for regular accumulators) without the instanceof check.
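A sketch of that alternative, with the signature hedged from the comment above; this is not what the PR implements:

    // Hypothetical no-op template method on the accumulator base class;
    // metadata-aware accumulators would override it, and DAGScheduler would
    // call it unconditionally instead of performing a type check.
    def mergeWithTaskMetadata(
        other: AccumulatorV2[_, _],
        rdd: RDD[_],
        taskInfo: TaskInfo,
        stageId: Int,
        stageAttemptId: Int,
        props: java.util.Properties): Unit = {
      // default: do nothing for ordinary accumulators
    }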

Contributor Author:
LastAttemptAccumulator follows a mixin pattern: the mixin adds the extra functionality to the accumulator. The code doesn't change AccumulatorV2 at all, only adds extra visibility to two fields in SQLMetrics, and the plug-in into existing production code is limited to these few lines in DAGScheduler, plus some testing utils and tiny fixes to RDD scoping in the collector and shuffle.
If we added this method to the base class, it would be a no-op for any metric other than a LastAttemptAccumulator.
This kind of mixin follows the same pattern as most of the DSv2 interfaces: instead of adding empty methods to many interfaces, we have mixins like SupportsDelta and RequiresDistributionAndOrdering, and the places that interact with them plug in the awareness based on a type check.
I think it's a good pattern.
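For reference, a sketch of the mixin shape described here: the trait name and the mergeLastAttempt arguments follow the diff below, while the self-type and the meaning of the type parameters are assumptions:

    // Assumed shape: a mixin stacked onto an AccumulatorV2 subclass.
    // DAGScheduler detects it with a type check and feeds it task metadata.
    trait LastAttemptAccumulator[IN, OUT, META] { self: AccumulatorV2[IN, OUT] =>
      def mergeLastAttempt(
          updates: AccumulatorV2[_, _],
          rdd: RDD[_],
          taskInfo: TaskInfo,
          stageId: Int,
          stageAttemptId: Int,
          properties: java.util.Properties): Unit
    }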

+          acc.asInstanceOf[LastAttemptAccumulator[_, _, _]].mergeLastAttempt(
+            updates, stage.rdd, event.taskInfo,
+            task.stageId, task.stageAttemptId, task.localProperties)
+        }
         // To avoid UI cruft, ignore cases where value wasn't updated
         if (acc.name.isDefined && !updates.isZero) {
           stage.latestInfo.accumulables(id) = acc.toInfo(None, Some(acc.value))

@@ -2333,6 +2338,19 @@ private[spark] class DAGScheduler(
           // The epoch of the task is acceptable (i.e., the task was launched after the most
           // recent failure we're aware of for the executor), so mark the task's output as
           // available.
+          // For testing purposes, inject fetch failures controlled from the driver-side by
+          // supplying an invalid location.
+          if (Utils.isTesting &&

Member:
Injecting invalid BlockManagerId locations directly in the DAGScheduler production path (even guarded by Utils.isTesting) is a pattern that could accumulate tech debt. Similarly, AQETestHelper.shouldForceCancellation() is checked in AdaptiveSparkPlanExec's main loop. Could these be implemented via a test-only pluggable hook (e.g., a MapOutputTrackerMaster wrapper or a test-only DAGSchedulerEventProcessLoop subclass) rather than inline conditionals in production code?
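A sketch of the kind of hook being suggested; every name below is invented for illustration, and nothing like it is added by the PR:

    // Hypothetical test-only interception point: identity in production,
    // swapped for a failure-injecting implementation by a test suite.
    trait MapStatusInterceptor {
      def onRegister(status: MapStatus, stageAttemptId: Int): MapStatus
    }
    object MapStatusInterceptor {
      @volatile var current: MapStatusInterceptor = new MapStatusInterceptor {
        override def onRegister(status: MapStatus, stageAttemptId: Int): MapStatus = status
      }
    }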

Contributor Author:
There are about 100 code paths in Spark that invoke extra checks or trigger extra failures when Utils.isTesting; here it's protected both by isTesting and an internal config. I think introducing extra pluggable hooks and the infrastructure to plug them in would add more tech debt and regression risk, because it would require more reshaping of the existing production code paths.

+            sc.conf.get(config.Tests.INJECT_SHUFFLE_FETCH_FAILURES) &&
+            task.stageAttemptId == 0) {
+            val currentLocation = status.location
+            val invalidLocation = BlockManagerId(
+              execId = BlockManagerId.INVALID_EXECUTOR_ID,
+              host = currentLocation.host,
+              port = currentLocation.port,
+              topologyInfo = currentLocation.topologyInfo)
+            status.updateLocation(invalidLocation)
+          }
           val isChecksumMismatched = mapOutputTracker.registerMapOutput(
             shuffleStage.shuffleDep.shuffleId, smt.partitionId, status)
           if (isChecksumMismatched) {