[SPARK-47518][CORE] Skip transfer the last spilled shuffle data #45661
core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java

```diff
@@ -21,6 +21,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.LinkedList;
+import java.util.Optional;
 import java.util.zip.Checksum;

 import org.apache.spark.SparkException;
```
```diff
@@ -153,8 +154,10 @@ public long[] getChecksums() {
    * @param isFinalFile if true, this indicates that we're writing the final output file and that
    *                    the bytes written should be counted towards shuffle write metrics rather
    *                    than shuffle spill metrics.
+   * @param finalDataFileDir if present, the directory to write the final output file to. If not
+   *                         present, the file will be written to a temporary directory.
    */
-  private void writeSortedFile(boolean isFinalFile) {
+  private void writeSortedFile(boolean isFinalFile, Optional<File> finalDataFileDir) {
     // Only emit the log if this is an actual spilling.
     if (!isFinalFile) {
       logger.info(
```
```diff
@@ -198,7 +201,8 @@ private void writeSortedFile(boolean isFinalFile) {
     // spark.shuffle.compress instead of spark.shuffle.spill.compress, so we need to use
     // createTempShuffleBlock here; see SPARK-3426 for more details.
     final Tuple2<TempShuffleBlockId, File> spilledFileInfo =
-      blockManager.diskBlockManager().createTempShuffleBlock();
+      finalDataFileDir.map(blockManager.diskBlockManager()::createTempShuffleBlockInDir)
+        .orElseGet(blockManager.diskBlockManager()::createTempShuffleBlock);
     final File file = spilledFileInfo._2();
     final TempShuffleBlockId blockId = spilledFileInfo._1();
     final SpillInfo spillInfo = new SpillInfo(numPartitions, file, blockId);
```

**Review comment** (posted as a suggested change): We need this only when there is a single output file, else we want to spread it.

**Reply:** Committed this change, thanks.
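The `Optional` chain above is the heart of the change: when a final output directory is known, the temp block is created inside it; otherwise the old scattered temp block is used. Below is a minimal standalone sketch of the same dispatch pattern, using hypothetical stand-in factories rather than Spark's `DiskBlockManager` API:

```java
import java.io.File;
import java.util.Optional;
import java.util.UUID;

public class DispatchSketch {
  // Hypothetical stand-in for createTempShuffleBlockInDir: place the file
  // inside the requested directory.
  static File tempBlockInDir(File dir) {
    return new File(dir, "temp_shuffle_" + UUID.randomUUID());
  }

  // Hypothetical stand-in for createTempShuffleBlock: fall back to a default
  // temp location (real Spark spreads these across its local dirs).
  static File tempBlock() {
    return new File(System.getProperty("java.io.tmpdir"),
      "temp_shuffle_" + UUID.randomUUID());
  }

  public static void main(String[] args) {
    // Present => the last spill lands next to the final output file;
    // empty => regular spills keep the old behaviour.
    Optional<File> finalDataFileDir = Optional.of(new File("/tmp/final-out"));
    File spillFile = finalDataFileDir
      .map(DispatchSketch::tempBlockInDir)    // directory known: co-locate
      .orElseGet(DispatchSketch::tempBlock);  // directory absent: default
    System.out.println(spillFile);
  }
}
```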
```diff
@@ -292,7 +296,7 @@ public long spill(long size, MemoryConsumer trigger) throws IOException {
       return 0L;
     }

-    writeSortedFile(false);
+    writeSortedFile(false, Optional.empty());
     final long spillSize = freeMemory();
     inMemSorter.reset();
     // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the
```
```diff
@@ -440,14 +444,16 @@ public void insertRecord(Object recordBase, long recordOffset, int length, int p
   /**
    * Close the sorter, causing any buffered data to be sorted and written out to disk.
    *
+   * @param finalDataFileDir if present, the directory to write the final output file to. If not
+   *                         present, the file will be written to a temporary directory.
    * @return metadata for the spill files written by this sorter. If no records were ever inserted
    *         into this sorter, then this will return an empty array.
    */
-  public SpillInfo[] closeAndGetSpills() throws IOException {
+  public SpillInfo[] closeAndGetSpills(Optional<File> finalDataFileDir) throws IOException {
     if (inMemSorter != null) {
       // Here we are spilling the remaining data in the buffer. If there is no spill before, this
       // final spill file will be the final shuffle output file.
-      writeSortedFile(/* isFinalFile = */spills.isEmpty());
+      writeSortedFile(/* isFinalFile = */spills.isEmpty(), finalDataFileDir);
       freeMemory();
       inMemSorter.free();
       inMemSorter = null;
```
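For orientation, here is a hedged sketch of how a caller might thread the directory through the new parameter. `ShuffleExternalSorter`, `SpillInfo`, and `closeAndGetSpills` come from the diff above, but this caller shape is an assumption for illustration, not the PR's actual writer change:

```java
// Hypothetical caller fragment: derive the final data file's parent directory
// so that a sole remaining spill is written in place; passing Optional.empty()
// instead would preserve the old temp-directory behaviour.
private SpillInfo[] closeSorter(ShuffleExternalSorter sorter, File finalDataFile)
    throws IOException {
  Optional<File> finalDataFileDir =
    Optional.ofNullable(finalDataFile).map(File::getParentFile);
  return sorter.closeAndGetSpills(finalDataFileDir);
}
```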
core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
```diff
@@ -226,6 +226,25 @@ private[spark] class DiskBlockManager(
     (blockId, getFile(blockId))
   }

+  /** Produces a unique block id and File suitable for storing shuffled intermediate results
+   * in the input directory.
+   */
+  def createTempShuffleBlockInDir(fileDir: File): (TempShuffleBlockId, File) = {
+    var blockId = TempShuffleBlockId(UUID.randomUUID())
+    var tmpFile = new File(fileDir, blockId.name)
+    while (tmpFile.exists()) {
+      blockId = TempShuffleBlockId(UUID.randomUUID())
+      tmpFile = new File(fileDir, blockId.name)
+    }
+    if (permissionChangingRequired) {
+      // SPARK-37618: we need to make the file world readable because the parent will
+      // lose the setgid bit when making it group writable. Without this the shuffle
+      // service can't read the shuffle files in a secure setup.
+      createWorldReadableFile(tmpFile)
+    }
+    (blockId, tmpFile)
+  }
+
   /** Produces a unique block id and File suitable for storing shuffled intermediate results. */
   def createTempShuffleBlock(): (TempShuffleBlockId, File) = {
     var blockId = TempShuffleBlockId(UUID.randomUUID())
```

**Review comment:** If …

**Reply:** Added UT, if …
**Review comment (@cloud-fan):** one idea: if `isFinalFile` is true, then we call a special version of `createTempShuffleBlock` that takes the shuffle & map id and returns a file path under the same directory as the final shuffle file. Then we don't need to change other places?

**Reply:** Do you mean changing the parameter of `createTempShuffleBlockInDir` from `finalDataFileDir` to a `Tuple2<ShuffleId, MapId>`?

**Reply:** That would assume the final output has to go to `blockResolver.getDataFile(shuffleId, mapId)`, right @cloud-fan? Currently at this layer we do not make that assumption. I was initially toying with the idea of passing `mapId` and `shuffleId` as constructor params and doing something similar, until I realized this would make assumptions that the code currently does not make, which is why the base directory is being passed around instead. (And then of course I thought we could solve it in `LocalDiskSingleSpillMapOutputWriter`, but I was completely wrong :-( )
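To make the trade-off concrete, here is a hedged sketch of the alternative floated in this thread (not adopted in the PR). It would hard-wire the assumption that the final output resolves via `blockResolver.getDataFile(shuffleId, mapId)`, which is exactly what the reply objects to; `blockResolver` and the method below are illustrative assumptions:

```java
// Hypothetical variant keyed by (shuffleId, mapId) instead of an explicit
// directory; this is sketch code, not part of the PR.
Tuple2<TempShuffleBlockId, File> createTempShuffleBlockFor(int shuffleId, long mapId) {
  // Bakes in the layout assumption that the current layer deliberately avoids.
  File finalDir = blockResolver.getDataFile(shuffleId, mapId).getParentFile();
  return diskBlockManager.createTempShuffleBlockInDir(finalDir);
}
```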