Merge branch 'master' into decimal-parsing-locale
MaxGekk committed Nov 27, 2018
2 parents 5236336 + 6a064ba commit 0d1a4f0
Showing 148 changed files with 4,738 additions and 2,168 deletions.
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -169,6 +169,7 @@ exportMethods("arrange",
"toJSON",
"transform",
"union",
"unionAll",
"unionByName",
"unique",
"unpersist",
14 changes: 14 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -2732,6 +2732,20 @@ setMethod("union",
dataFrame(unioned)
})

#' Return a new SparkDataFrame containing the union of rows
#'
#' This is an alias for `union`.
#'
#' @rdname union
#' @name unionAll
#' @aliases unionAll,SparkDataFrame,SparkDataFrame-method
#' @note unionAll since 1.4.0
setMethod("unionAll",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
union(x, y)
})
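
For reference, the JVM-side Dataset API also exposes unionAll as an alias for union (deprecated in the 2.x line), which is the behaviour the SparkR method above mirrors. A minimal Java sketch of the equivalence, assuming a local SparkSession; this is illustrative and not part of the diff:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class UnionAllExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("UnionAllExample")
        .master("local[*]")
        .getOrCreate();

    Dataset<Row> df = spark.range(3).toDF("id");

    // Both calls resolve columns by position and keep duplicate rows.
    Dataset<Row> unioned = df.union(df);
    Dataset<Row> unionedAll = df.unionAll(df);  // alias of union()

    // Prints "6 == 6": duplicates are preserved by both methods.
    System.out.println(unioned.count() + " == " + unionedAll.count());

    spark.stop();
  }
}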

#' Return a new SparkDataFrame containing the union of rows, matched by column names
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
3 changes: 3 additions & 0 deletions R/pkg/R/generics.R
@@ -631,6 +631,9 @@ setGeneric("toRDD", function(x) { standardGeneric("toRDD") })
#' @rdname union
setGeneric("union", function(x, y) { standardGeneric("union") })

#' @rdname union
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })

4 changes: 2 additions & 2 deletions R/pkg/R/stats.R
@@ -109,7 +109,7 @@ setMethod("corr",
#'
#' Finding frequent items for columns, possibly with false positives.
#' Using the frequent element count algorithm described in
#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
#' \url{https://doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
#'
#' @param x A SparkDataFrame.
#' @param cols A vector of column names to search frequent items in.
@@ -143,7 +143,7 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
#' *exact* rank of x is close to (p * N). More precisely,
#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
#' optimizations). The algorithm was first present in [[https://doi.org/10.1145/375663.375670
#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
#' Note that NA values will be ignored in numerical columns before calculation. For
#' columns only containing NA values, an empty list is returned.
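
The guarantee quoted above can be exercised directly: for probability p, relative error err, and N rows, the rank of the returned value lies in [floor((p - err) * N), ceil((p + err) * N)]. A small Java sketch against a local SparkSession (illustrative only, not part of this diff):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ApproxQuantileExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("ApproxQuantileExample")
        .master("local[*]")
        .getOrCreate();

    // Values 0, 1, ..., 999 in a single numeric column "id" (N = 1000).
    Dataset<Row> df = spark.range(1000).toDF("id");

    // Median (p = 0.5) with relative error 0.01: the returned value's rank
    // lies between floor(0.49 * 1000) = 490 and ceil(0.51 * 1000) = 510.
    double[] quantiles = df.stat().approxQuantile("id", new double[] {0.5}, 0.01);
    System.out.println("approximate median: " + quantiles[0]);

    spark.stop();
  }
}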
1 change: 1 addition & 0 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2458,6 +2458,7 @@ test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataF
expect_equal(count(unioned), 6)
expect_equal(first(unioned)$name, "Michael")
expect_equal(count(arrange(suppressWarnings(union(df, df2)), df$age)), 6)
expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)

df1 <- select(df2, "age", "name")
unioned1 <- arrange(unionByName(df1, df), df1$age)
@@ -371,26 +371,30 @@ private void assertErrorsContain(Set<String> errors, Set<String> contains) {

private void assertErrorAndClosed(RpcResult result, String expectedError) {
assertTrue("unexpected success: " + result.successMessages, result.successMessages.isEmpty());
// we expect 1 additional error, which should contain one of the follow messages:
// - "closed"
// - "Connection reset"
// - "java.nio.channels.ClosedChannelException"
Set<String> errors = result.errorMessages;
assertEquals("Expected 2 errors, got " + errors.size() + "errors: " +
errors, 2, errors.size());

// We expect 1 additional error due to closed connection and here are possible keywords in the
// error message.
Set<String> possibleClosedErrors = Sets.newHashSet(
"closed",
"Connection reset",
"java.nio.channels.ClosedChannelException",
"java.io.IOException: Broken pipe"
);
Set<String> containsAndClosed = Sets.newHashSet(expectedError);
containsAndClosed.add("closed");
containsAndClosed.add("Connection reset");
containsAndClosed.add("java.nio.channels.ClosedChannelException");
containsAndClosed.addAll(possibleClosedErrors);

Pair<Set<String>, Set<String>> r = checkErrorsContain(errors, containsAndClosed);

assertTrue("Got a non-empty set " + r.getLeft(), r.getLeft().isEmpty());

Set<String> errorsNotFound = r.getRight();
assertEquals(
"The size of " + errorsNotFound.toString() + " was not 2", 2, errorsNotFound.size());
"The size of " + errorsNotFound + " was not " + (possibleClosedErrors.size() - 1),
possibleClosedErrors.size() - 1,
errorsNotFound.size());
for (String err: errorsNotFound) {
assertTrue("Found a wrong error " + err, containsAndClosed.contains(err));
}
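
The rewritten assertion derives the expected count from possibleClosedErrors instead of hard-coding it: the intent is that one error matches expectedError and the other matches exactly one closed-connection keyword, leaving possibleClosedErrors.size() - 1 keywords unmatched. checkErrorsContain itself is not shown in this hunk, so the sketch below reimplements its assumed behaviour with illustrative names:

import java.util.HashSet;
import java.util.Set;

public class ContainCheckSketch {

  // Assumed shape of the helper: unmatchedErrors are errors matching no keyword,
  // unmatchedKeywords are keywords matching no error.
  static final class Result {
    final Set<String> unmatchedErrors = new HashSet<>();
    final Set<String> unmatchedKeywords = new HashSet<>();
  }

  static Result checkErrorsContain(Set<String> errors, Set<String> keywords) {
    Result r = new Result();
    r.unmatchedKeywords.addAll(keywords);
    for (String error : errors) {
      boolean matched = false;
      for (String keyword : keywords) {
        if (error.contains(keyword)) {
          r.unmatchedKeywords.remove(keyword);
          matched = true;
        }
      }
      if (!matched) {
        r.unmatchedErrors.add(error);
      }
    }
    return r;
  }

  public static void main(String[] args) {
    Set<String> errors = new HashSet<>();
    errors.add("RuntimeException: thrown as expected");      // the expected RPC failure
    errors.add("java.nio.channels.ClosedChannelException");  // the closed-connection error

    Set<String> keywords = new HashSet<>();
    keywords.add("thrown as expected");                      // stands in for expectedError
    keywords.add("closed");
    keywords.add("Connection reset");
    keywords.add("java.nio.channels.ClosedChannelException");
    keywords.add("java.io.IOException: Broken pipe");

    Result r = checkErrorsContain(errors, keywords);
    // Prints "true, 3": every error is explained, and exactly 4 - 1 = 3 of the
    // closed-connection keywords stay unmatched, the count the test now asserts.
    System.out.println(r.unmatchedErrors.isEmpty() + ", " + r.unmatchedKeywords.size());
  }
}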
10 changes: 5 additions & 5 deletions core/src/main/java/org/apache/spark/memory/MemoryConsumer.java
@@ -83,10 +83,10 @@ public void spill() throws IOException {
public abstract long spill(long size, MemoryConsumer trigger) throws IOException;

/**
* Allocates a LongArray of `size`. Note that this method may throw `OutOfMemoryError` if Spark
* doesn't have enough memory for this allocation, or throw `TooLargePageException` if this
* `LongArray` is too large to fit in a single page. The caller side should take care of these
* two exceptions, or make sure the `size` is small enough that won't trigger exceptions.
* Allocates a LongArray of `size`. Note that this method may throw `SparkOutOfMemoryError`
* if Spark doesn't have enough memory for this allocation, or throw `TooLargePageException`
* if this `LongArray` is too large to fit in a single page. The caller side should take care of
* these two exceptions, or make sure the `size` is small enough that it won't trigger exceptions.
*
* @throws SparkOutOfMemoryError
* @throws TooLargePageException
@@ -111,7 +111,7 @@ public void freeArray(LongArray array) {
/**
* Allocate a memory block with at least `required` bytes.
*
* @throws OutOfMemoryError
* @throws SparkOutOfMemoryError
*/
protected MemoryBlock allocatePage(long required) {
MemoryBlock page = taskMemoryManager.allocatePage(Math.max(pageSize, required), this);
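
The corrected javadoc above describes a caller-side contract: either catch SparkOutOfMemoryError and TooLargePageException, or keep the requested size small. A hypothetical MemoryConsumer subclass following that contract might look like the sketch below (PointerArrayConsumer and tryReserve are illustrative names, not Spark APIs; the constructor and exception types are assumed to be those available at the time of this change):

import java.io.IOException;

import org.apache.spark.memory.MemoryConsumer;
import org.apache.spark.memory.MemoryMode;
import org.apache.spark.memory.SparkOutOfMemoryError;
import org.apache.spark.memory.TaskMemoryManager;
import org.apache.spark.memory.TooLargePageException;
import org.apache.spark.unsafe.array.LongArray;

class PointerArrayConsumer extends MemoryConsumer {
  private LongArray pointers;

  PointerArrayConsumer(TaskMemoryManager memoryManager) {
    super(memoryManager, memoryManager.pageSizeBytes(), MemoryMode.ON_HEAP);
  }

  boolean tryReserve(long numEntries) {
    try {
      pointers = allocateArray(numEntries);
      return true;
    } catch (SparkOutOfMemoryError e) {
      // Spark's managed execution memory is exhausted: degrade gracefully
      // (e.g. spill or use a smaller buffer) instead of failing the task here.
      return false;
    } catch (TooLargePageException e) {
      // The request does not fit in a single page: retry with a smaller size.
      return false;
    }
  }

  @Override
  public long spill(long size, MemoryConsumer trigger) throws IOException {
    if (pointers == null) {
      return 0L;
    }
    long freed = pointers.memoryBlock().size();
    freeArray(pointers);
    pointers = null;
    return freed;
  }
}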
@@ -37,12 +37,11 @@
import org.apache.spark.Partitioner;
import org.apache.spark.ShuffleDependency;
import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.scheduler.MapStatus;
import org.apache.spark.scheduler.MapStatus$;
import org.apache.spark.serializer.Serializer;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
import org.apache.spark.shuffle.ShuffleWriter;
import org.apache.spark.storage.*;
@@ -79,7 +78,7 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
private final int numPartitions;
private final BlockManager blockManager;
private final Partitioner partitioner;
private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;
private final int shuffleId;
private final int mapId;
private final Serializer serializer;
@@ -103,8 +102,8 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
IndexShuffleBlockResolver shuffleBlockResolver,
BypassMergeSortShuffleHandle<K, V> handle,
int mapId,
TaskContext taskContext,
SparkConf conf) {
SparkConf conf,
ShuffleWriteMetricsReporter writeMetrics) {
// Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
this.fileBufferSize = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
this.transferToEnabled = conf.getBoolean("spark.file.transferTo", true);
@@ -114,7 +113,7 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
this.shuffleId = dep.shuffleId();
this.partitioner = dep.partitioner();
this.numPartitions = partitioner.numPartitions();
this.writeMetrics = taskContext.taskMetrics().shuffleWriteMetrics();
this.writeMetrics = writeMetrics;
this.serializer = dep.serializer();
this.shuffleBlockResolver = shuffleBlockResolver;
}
@@ -38,6 +38,7 @@
import org.apache.spark.memory.TooLargePageException;
import org.apache.spark.serializer.DummySerializerInstance;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.storage.BlockManager;
import org.apache.spark.storage.DiskBlockObjectWriter;
import org.apache.spark.storage.FileSegment;
@@ -75,7 +76,7 @@ final class ShuffleExternalSorter extends MemoryConsumer {
private final TaskMemoryManager taskMemoryManager;
private final BlockManager blockManager;
private final TaskContext taskContext;
private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;

/**
* Force this sorter to spill when there are this many elements in memory.
@@ -113,7 +114,7 @@ final class ShuffleExternalSorter extends MemoryConsumer {
int initialSize,
int numPartitions,
SparkConf conf,
ShuffleWriteMetrics writeMetrics) {
ShuffleWriteMetricsReporter writeMetrics) {
super(memoryManager,
(int) Math.min(PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES, memoryManager.pageSizeBytes()),
memoryManager.getTungstenMemoryMode());
@@ -144,7 +145,7 @@ final class ShuffleExternalSorter extends MemoryConsumer {
*/
private void writeSortedFile(boolean isLastFile) {

final ShuffleWriteMetrics writeMetricsToUse;
final ShuffleWriteMetricsReporter writeMetricsToUse;

if (isLastFile) {
// We're writing the final non-spill file, so we _do_ want to count this as shuffle bytes.
@@ -241,9 +242,14 @@ private void writeSortedFile(boolean isLastFile) {
//
// Note that we intentionally ignore the value of `writeMetricsToUse.shuffleWriteTime()`.
// Consistent with ExternalSorter, we do not count this IO towards shuffle write time.
// This means that this IO time is not accounted for anywhere; SPARK-3577 will fix this.
writeMetrics.incRecordsWritten(writeMetricsToUse.recordsWritten());
taskContext.taskMetrics().incDiskBytesSpilled(writeMetricsToUse.bytesWritten());
// SPARK-3577 tracks the spill time separately.

// This is guaranteed to be a ShuffleWriteMetrics based on the if check in the beginning
// of this method.
writeMetrics.incRecordsWritten(
((ShuffleWriteMetrics)writeMetricsToUse).recordsWritten());
taskContext.taskMetrics().incDiskBytesSpilled(
((ShuffleWriteMetrics)writeMetricsToUse).bytesWritten());
}
}
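
The new comment explains why the downcast is safe: when writeSortedFile() is writing a spill file, writeMetricsToUse is a freshly created, concrete ShuffleWriteMetrics, so its counters can be read back and folded into the writer's reporter (records) and into disk bytes spilled (bytes). A self-contained sketch of that pattern, with stand-in names rather than the Spark classes:

// Minimal stand-ins for ShuffleWriteMetricsReporter / ShuffleWriteMetrics.
interface Reporter {
  void incBytesWritten(long v);
  void incRecordsWritten(long v);
}

final class LocalMetrics implements Reporter {
  private long bytes;
  private long records;
  @Override public void incBytesWritten(long v) { bytes += v; }
  @Override public void incRecordsWritten(long v) { records += v; }
  long bytesWritten() { return bytes; }
  long recordsWritten() { return records; }
}

public class SpillMetricsSketch {
  private final Reporter writeMetrics;  // whatever reporter the writer was handed
  private long diskBytesSpilled;

  SpillMetricsSketch(Reporter writeMetrics) {
    this.writeMetrics = writeMetrics;
  }

  void writeSortedFile(boolean isLastFile, long bytes, long records) {
    // Final output file: report straight to the real reporter.
    // Spill file: collect into a private, concrete metrics object first.
    final Reporter writeMetricsToUse = isLastFile ? writeMetrics : new LocalMetrics();

    // ... pretend we wrote `records` records totalling `bytes` bytes ...
    writeMetricsToUse.incBytesWritten(bytes);
    writeMetricsToUse.incRecordsWritten(records);

    if (!isLastFile) {
      // Guaranteed to be LocalMetrics by the branch above, so the cast is safe.
      // Spill records still count as shuffle records written, while the bytes
      // count as disk bytes spilled rather than shuffle bytes written.
      LocalMetrics spillMetrics = (LocalMetrics) writeMetricsToUse;
      writeMetrics.incRecordsWritten(spillMetrics.recordsWritten());
      diskBytesSpilled += spillMetrics.bytesWritten();
    }
  }

  public static void main(String[] args) {
    LocalMetrics taskMetrics = new LocalMetrics();
    SpillMetricsSketch sorter = new SpillMetricsSketch(taskMetrics);
    sorter.writeSortedFile(false, 1024, 10);  // a spill file
    sorter.writeSortedFile(true, 2048, 20);   // the final output file
    // Prints "30 records, 2048 shuffle bytes, 1024 spilled bytes".
    System.out.println(taskMetrics.recordsWritten() + " records, "
        + taskMetrics.bytesWritten() + " shuffle bytes, "
        + sorter.diskBytesSpilled + " spilled bytes");
  }
}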

@@ -37,7 +37,6 @@

import org.apache.spark.*;
import org.apache.spark.annotation.Private;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.io.CompressionCodec;
import org.apache.spark.io.CompressionCodec$;
import org.apache.spark.io.NioBufferedFileInputStream;
@@ -47,6 +46,7 @@
import org.apache.spark.network.util.LimitedInputStream;
import org.apache.spark.scheduler.MapStatus;
import org.apache.spark.scheduler.MapStatus$;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.serializer.SerializationStream;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
@@ -73,7 +73,7 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
private final TaskMemoryManager memoryManager;
private final SerializerInstance serializer;
private final Partitioner partitioner;
private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;
private final int shuffleId;
private final int mapId;
private final TaskContext taskContext;
@@ -122,7 +122,8 @@ public UnsafeShuffleWriter(
SerializedShuffleHandle<K, V> handle,
int mapId,
TaskContext taskContext,
SparkConf sparkConf) throws IOException {
SparkConf sparkConf,
ShuffleWriteMetricsReporter writeMetrics) throws IOException {
final int numPartitions = handle.dependency().partitioner().numPartitions();
if (numPartitions > SortShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE()) {
throw new IllegalArgumentException(
@@ -138,7 +139,7 @@ public UnsafeShuffleWriter(
this.shuffleId = dep.shuffleId();
this.serializer = dep.serializer().newInstance();
this.partitioner = dep.partitioner();
this.writeMetrics = taskContext.taskMetrics().shuffleWriteMetrics();
this.writeMetrics = writeMetrics;
this.taskContext = taskContext;
this.sparkConf = sparkConf;
this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
@@ -21,7 +21,7 @@
import java.io.OutputStream;

import org.apache.spark.annotation.Private;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;

/**
* Intercepts write calls and tracks total time spent writing in order to update shuffle write
@@ -30,10 +30,11 @@
@Private
public final class TimeTrackingOutputStream extends OutputStream {

private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;
private final OutputStream outputStream;

public TimeTrackingOutputStream(ShuffleWriteMetrics writeMetrics, OutputStream outputStream) {
public TimeTrackingOutputStream(
ShuffleWriteMetricsReporter writeMetrics, OutputStream outputStream) {
this.writeMetrics = writeMetrics;
this.outputStream = outputStream;
}
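
Widening the constructor to the ShuffleWriteMetricsReporter interface works because this stream only needs to report time spent writing. A self-contained sketch of the same interception pattern (WriteTimeReporter and TimeTrackingStreamSketch are illustrative stand-ins, not the Spark classes):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Stand-in for the single metric this decorator needs from the reporter.
interface WriteTimeReporter {
  void incWriteTime(long nanos);
}

public class TimeTrackingStreamSketch extends OutputStream {
  private final WriteTimeReporter reporter;
  private final OutputStream out;

  public TimeTrackingStreamSketch(WriteTimeReporter reporter, OutputStream out) {
    this.reporter = reporter;
    this.out = out;
  }

  @Override
  public void write(int b) throws IOException {
    long start = System.nanoTime();
    out.write(b);
    reporter.incWriteTime(System.nanoTime() - start);  // charge the time to shuffle write
  }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    long start = System.nanoTime();
    out.write(b, off, len);
    reporter.incWriteTime(System.nanoTime() - start);
  }

  public static void main(String[] args) throws IOException {
    long[] totalNanos = new long[1];
    OutputStream stream = new TimeTrackingStreamSketch(
        nanos -> totalNanos[0] += nanos, new ByteArrayOutputStream());
    stream.write(new byte[] {1, 2, 3}, 0, 3);
    System.out.println("write time recorded: " + totalNanos[0] + " ns");
  }
}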
@@ -31,6 +31,7 @@
import org.apache.spark.SparkEnv;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.memory.MemoryConsumer;
import org.apache.spark.memory.SparkOutOfMemoryError;
import org.apache.spark.memory.TaskMemoryManager;
import org.apache.spark.serializer.SerializerManager;
import org.apache.spark.storage.BlockManager;
@@ -741,7 +742,7 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff
if (numKeys >= growthThreshold && longArray.size() < MAX_CAPACITY) {
try {
growAndRehash();
} catch (OutOfMemoryError oom) {
} catch (SparkOutOfMemoryError oom) {
canGrowArray = false;
}
}
@@ -757,7 +758,7 @@ private boolean acquireNewPage(long required) {
private boolean acquireNewPage(long required) {
try {
currentPage = allocatePage(required);
} catch (OutOfMemoryError e) {
} catch (SparkOutOfMemoryError e) {
return false;
}
dataPages.add(currentPage);
@@ -16,10 +16,10 @@
-->

<script id="executors-summary-template" type="text/html">
<h4 style="clear: left; display: inline-block;">Summary</h4>
<h4 class="title-table">Summary</h4>
<div class="container-fluid">
<div class="container-fluid">
<table id="summary-execs-table" class="table table-striped compact">
<table id="summary-execs-table" class="table table-striped compact cell-border">
<thead>
<th></th>
<th>RDD Blocks</th>
@@ -64,10 +64,10 @@ <h4 style="clear: left; display: inline-block;">Summary</h4>
</table>
</div>
</div>
<h4 style="clear: left; display: inline-block;">Executors</h4>
<h4 class="title-table">Executors</h4>
<div class="container-fluid">
<div class="container-fluid">
<table id="active-executors-table" class="table table-striped compact">
<table id="active-executors-table" class="table table-striped compact cell-border">
<thead>
<tr>
<th>