[SPARK-47518][CORE] Skip transfer the last spilled shuffle data #45661

Status: Open · wants to merge 9 commits into base: master · Changes from 3 commits
ShuffleExternalSorter.java

@@ -21,6 +21,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.LinkedList;
+import java.util.Optional;
 import java.util.zip.Checksum;

 import org.apache.spark.SparkException;
@@ -153,8 +154,10 @@ public long[] getChecksums() {
    * @param isFinalFile if true, this indicates that we're writing the final output file and that
    *                    the bytes written should be counted towards shuffle write metrics rather
    *                    than shuffle spill metrics.
+   * @param finalDataFileDir if present, the directory to write the final output file to. If not
+   *                         present, the file will be written to a temporary directory.
    */
-  private void writeSortedFile(boolean isFinalFile) {
+  private void writeSortedFile(boolean isFinalFile, Optional<File> finalDataFileDir) {
     // Only emit the log if this is an actual spilling.
     if (!isFinalFile) {
       logger.info(
@@ -198,7 +201,8 @@ private void writeSortedFile(boolean isFinalFile) {
     // spark.shuffle.compress instead of spark.shuffle.spill.compress, so we need to use
     // createTempShuffleBlock here; see SPARK-3426 for more details.
     final Tuple2<TempShuffleBlockId, File> spilledFileInfo =
-      blockManager.diskBlockManager().createTempShuffleBlock();
Contributor: One idea: if isFinalFile is true, we could call a special version of createTempShuffleBlock that takes the shuffle and map ids and returns a file path under the same directory as the final shuffle file. Then we wouldn't need to change other places?

Contributor Author: Do you mean changing the parameter of createTempShuffleBlockInDir from finalDataFileDir to Tuple2<ShuffleId, MapId>?

Contributor: That would assume the final output has to go to blockResolver.getDataFile(shuffleId, mapId), right @cloud-fan? Currently, at this layer, we do not make that assumption.

I was initially toying with the idea of passing mapId and shuffleId as constructor params and doing something similar, when I realized this would introduce assumptions that the code currently does not make, which is why the base directory is being passed around instead.

(And then of course I thought we could solve it in LocalDiskSingleSpillMapOutputWriter here, but was completely wrong :-( ).

+      finalDataFileDir.map(blockManager.diskBlockManager()::createTempShuffleBlockInDir)
+        .orElseGet(blockManager.diskBlockManager()::createTempShuffleBlock);
Contributor: Suggested change:

-      finalDataFileDir.map(blockManager.diskBlockManager()::createTempShuffleBlockInDir)
-        .orElseGet(blockManager.diskBlockManager()::createTempShuffleBlock);
+      finalDataFileDir.filter(v -> spills.isEmpty()).map(blockManager.diskBlockManager()::createTempShuffleBlockInDir)
+        .orElseGet(blockManager.diskBlockManager()::createTempShuffleBlock);

We need this only when there is a single output file; otherwise we want to spread the spills across directories.

Contributor Author: Committed this change, thanks.
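For readers following the thread, here is a minimal, self-contained sketch of how the accepted Optional chain resolves. The wrapper method is illustrative only, not the PR's code: noPriorSpills stands in for spills.isEmpty(), dbm for blockManager.diskBlockManager(), and createTempShuffleBlockInDir is the method this PR adds.

// Illustrative sketch, not the PR's exact code: how the suggested Optional
// chain picks the spill location.
import java.io.File;
import java.util.Optional;

import scala.Tuple2;

import org.apache.spark.storage.DiskBlockManager;
import org.apache.spark.storage.TempShuffleBlockId;

final class SpillFileSelection {
  static Tuple2<TempShuffleBlockId, File> pickSpillFile(
      Optional<File> finalDataFileDir,
      boolean noPriorSpills,
      DiskBlockManager dbm) {
    // Write next to the final data file only when this write will be the sole
    // output file; otherwise fall back to an ordinary temp shuffle block so
    // that multiple spills stay spread across the configured local dirs.
    return finalDataFileDir
        .filter(dir -> noPriorSpills)
        .map(dbm::createTempShuffleBlockInDir)
        .orElseGet(dbm::createTempShuffleBlock);
  }
}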

     final File file = spilledFileInfo._2();
     final TempShuffleBlockId blockId = spilledFileInfo._1();
     final SpillInfo spillInfo = new SpillInfo(numPartitions, file, blockId);
@@ -292,7 +296,7 @@ public long spill(long size, MemoryConsumer trigger) throws IOException {
       return 0L;
     }

-    writeSortedFile(false);
+    writeSortedFile(false, Optional.empty());
     final long spillSize = freeMemory();
     inMemSorter.reset();
     // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the
@@ -440,14 +444,16 @@ public void insertRecord(Object recordBase, long recordOffset, int length, int p
   /**
    * Close the sorter, causing any buffered data to be sorted and written out to disk.
    *
+   * @param finalDataFileDir if present, the directory to write the final output file to. If not
+   *                         present, the file will be written to a temporary directory.
    * @return metadata for the spill files written by this sorter. If no records were ever inserted
    *         into this sorter, then this will return an empty array.
    */
-  public SpillInfo[] closeAndGetSpills() throws IOException {
+  public SpillInfo[] closeAndGetSpills(Optional<File> finalDataFileDir) throws IOException {
     if (inMemSorter != null) {
       // Here we are spilling the remaining data in the buffer. If there is no spill before, this
       // final spill file will be the final shuffle output file.
-      writeSortedFile(/* isFinalFile = */spills.isEmpty());
+      writeSortedFile(/* isFinalFile = */ spills.isEmpty(), finalDataFileDir);
       freeMemory();
       inMemSorter.free();
       inMemSorter = null;
UnsafeShuffleWriter.java

@@ -52,12 +52,14 @@
 import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
 import org.apache.spark.serializer.SerializationStream;
 import org.apache.spark.serializer.SerializerInstance;
+import org.apache.spark.shuffle.IndexShuffleBlockResolver;
 import org.apache.spark.shuffle.ShuffleWriter;
 import org.apache.spark.shuffle.api.ShuffleExecutorComponents;
 import org.apache.spark.shuffle.api.ShuffleMapOutputWriter;
 import org.apache.spark.shuffle.api.ShufflePartitionWriter;
 import org.apache.spark.shuffle.api.SingleSpillShuffleMapOutputWriter;
 import org.apache.spark.shuffle.api.WritableByteChannelWrapper;
+import org.apache.spark.shuffle.sort.io.LocalDiskShuffleExecutorComponents;
 import org.apache.spark.storage.BlockManager;
 import org.apache.spark.storage.TimeTrackingOutputStream;
 import org.apache.spark.unsafe.Platform;
@@ -219,7 +221,15 @@ void closeAndWriteOutput() throws IOException {
     updatePeakMemoryUsed();
     serBuffer = null;
     serOutputStream = null;
-    final SpillInfo[] spills = sorter.closeAndGetSpills();
+    Optional<File> finalDataFileDir;
+    if (shuffleExecutorComponents instanceof LocalDiskShuffleExecutorComponents) {
Member: Hmm, it looks a bit hacky to handle local disk shuffle specially here.

Contributor Author: Sorry, I'm not familiar with the block storage in KubernetesLocalDiskShuffleExecutorComponents, so I only handled LocalDiskShuffleExecutorComponents here.

Or should I add a new method getDataFile() in the trait ShuffleExecutorComponents?
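As a rough sketch of the alternative the author floats here: such a hook might look like the following. This is purely hypothetical; no such method exists on Spark's ShuffleExecutorComponents, and the interface name, method name, and signature are invented for illustration.

// Hypothetical only: one possible shape for exposing the final data-file
// location through the components API instead of special-casing
// LocalDiskShuffleExecutorComponents in the writer. Not real Spark API.
import java.io.File;
import java.util.Optional;

public interface ShuffleExecutorComponentsDataFileHook {
  /**
   * Directory that will hold the final data file for the given shuffle map
   * task, if this implementation writes shuffle output to local disk; empty
   * otherwise (e.g. for remote-storage implementations).
   */
  default Optional<File> finalDataFileDir(int shuffleId, long mapId) {
    return Optional.empty();
  }
}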

+      File dataFile =
+        new IndexShuffleBlockResolver(sparkConf, blockManager).getDataFile(shuffleId, mapId);
Member: Is this only used to invoke getParentFile?

Contributor Author: Yes.

+      finalDataFileDir = Optional.of(dataFile.getParentFile());
+    } else {
+      finalDataFileDir = Optional.empty();
+    }
+    final SpillInfo[] spills = sorter.closeAndGetSpills(finalDataFileDir);
     try {
       partitionLengths = mergeSpills(spills);
     } finally {
DiskBlockManager.scala

@@ -226,6 +226,24 @@ private[spark] class DiskBlockManager(
     (blockId, getFile(blockId))
   }

+  /** Produces a unique block id and File suitable for storing shuffled intermediate results
+   * in the input directory. */
Member: Nit:

  /**
   * Produces a unique block id and File suitable for storing shuffled intermediate results
   * in the input directory.
   */

Contributor Author: Fixed.

+  def createTempShuffleBlockInDir(fileDir: File): (TempShuffleBlockId, File) = {
+    var blockId = TempShuffleBlockId(UUID.randomUUID())
+    var tmpFile = new File(fileDir, blockId.name)
+    while (tmpFile.exists()) {
+      blockId = TempShuffleBlockId(UUID.randomUUID())
+      tmpFile = new File(fileDir, blockId.name)
Member: If fileDir is invalid, we are in an infinite loop, aren't we? It seems that we need a safeguard to avoid the infinite loop, @wankunde. Also, please add a unit test case for an invalid fileDir (maybe one that does not exist).

Contributor Author: Added a UT. If fileDir is invalid, tmpFile will never exist, so we exit this loop immediately. A new block id is generated only when fileDir is valid and the file for the first TempShuffleBlockId was already created by some other task.
+    }
+    if (permissionChangingRequired) {
+      // SPARK-37618: we need to make the file world readable because the parent will
+      // lose the setgid bit when making it group writable. Without this the shuffle
+      // service can't read the shuffle files in a secure setup.
+      createWorldReadableFile(tmpFile)
+    }
+    (blockId, tmpFile)
+  }

   /** Produces a unique block id and File suitable for storing shuffled intermediate results. */
   def createTempShuffleBlock(): (TempShuffleBlockId, File) = {
     var blockId = TempShuffleBlockId(UUID.randomUUID())
UnsafeShuffleWriterSuite.java

@@ -170,6 +170,21 @@ public void setUp() throws Exception {
       spillFilesCreated.add(file);
       return Tuple2$.MODULE$.apply(blockId, file);
     });
+    when(diskBlockManager.createTempShuffleBlockInDir(any(File.class))).thenAnswer(invocationOnMock -> {
+      File fileDir = (File) invocationOnMock.getArguments()[0];
+      TempShuffleBlockId blockId = new TempShuffleBlockId(UUID.randomUUID());
+      File file = spy(new File(fileDir, blockId.name()));
+      when(file.delete()).thenAnswer(inv -> {
+        totalSpilledDiskBytes += file.length();
+        return inv.callRealMethod();
+      });
+      spillFilesCreated.add(file);
+      return new Tuple2<>(blockId, file);
+    });
+    when(diskBlockManager.getFile(any(BlockId.class))).thenAnswer(invocationOnMock -> {
+      BlockId blockId = (BlockId) invocationOnMock.getArguments()[0];
+      return new File(tempDir, blockId.name());
+    });
     when(diskBlockManager.createTempFileWith(any(File.class))).thenAnswer(invocationOnMock -> {
       File file = (File) invocationOnMock.getArguments()[0];
       return Utils.tempFileWith(file);
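A note on the stub above: the new createTempShuffleBlockInDir answer appears to mirror the existing createTempShuffleBlock one. The returned File is a Mockito spy whose delete() first adds the file's length to totalSpilledDiskBytes and then deletes for real, so the suite can still account spilled bytes for files created next to the final output file.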