Merge branch 'master' into SPARK-30603

apache · Jan 23, 2020 · 3d54845 · 3d54845
2 parents 93d316a + 2330a56
commit 3d54845
Show file tree

Hide file tree

Showing 60 changed files with 1,769 additions and 627 deletions.
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
@@ -66,14 +66,6 @@ jobs:
         export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
         export MAVEN_CLI_OPTS="--no-transfer-progress"
         mkdir -p ~/.m2
-        # `Maven Central` is too flaky in terms of downloading artifacts in `GitHub Action` environment.
-        # `Google Maven Central Mirror` is too slow in terms of sycing upstream. To get the best combination,
-        #   1) we set `Google Maven Central` as a mirror of `central` in `GitHub Action` environment only.
-        #   2) we duplicates `Maven Central` in pom.xml with ID `central_without_mirror`.
-        # In other words, in GitHub Action environment, `central` is mirrored by `Google Maven Central` first.
-        # If `Google Maven Central` doesn't provide the artifact due to its slowness, `central_without_mirror` will be used.
-        # Note that we aim to achieve the above while keeping the existing behavior of non-`GitHub Action` environment unchanged.
-        echo "<settings><mirrors><mirror><id>google-maven-central</id><name>GCS Maven Central mirror</name><url>https://maven-central.storage-download.googleapis.com/repos/central/data/</url><mirrorOf>central</mirrorOf></mirror></mirrors></settings>" > ~/.m2/settings.xml
         ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install
         rm -rf ~/.m2/repository/org/apache/spark
 

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -335,6 +335,7 @@ exportMethods("%<=>%",
               "ntile",
               "otherwise",
               "over",
+              "overlay",
               "percent_rank",
               "pmod",
               "posexplode",

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
@@ -136,6 +136,14 @@ NULL
 #'           format to. See 'Details'.
 #'      }
 #' @param y Column to compute on.
+#' @param pos In \itemize{
+#'                \item \code{locate}: a start position of search.
+#'                \item \code{overlay}: a start position for replacement.
+#'                }
+#' @param len In \itemize{
+#'               \item \code{lpad}: the maximum length of each output result.
+#'               \item \code{overlay}: a number of bytes to replace.
+#'               }
 #' @param ... additional Columns.
 #' @name column_string_functions
 #' @rdname column_string_functions
@@ -1319,6 +1327,35 @@ setMethod("negate",
             column(jc)
           })
 
+#' @details
+#' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace},
+#' starting from byte position \code{pos} of \code{x} and proceeding for
+#' \code{len} bytes.
+#'
+#' @param replace a Column with replacement.
+#'
+#' @rdname column_string_functions
+#' @aliases overlay overlay,Column-method,numericOrColumn-method
+#' @note overlay since 3.0.0
+setMethod("overlay",
+  signature(x = "Column", replace = "Column", pos = "numericOrColumn"),
+  function(x, replace, pos, len = -1) {
+    if (is.numeric(pos)) {
+      pos <- lit(as.integer(pos))
+    }
+
+    if (is.numeric(len)) {
+      len <- lit(as.integer(len))
+    }
+
+    jc <- callJStatic(
+      "org.apache.spark.sql.functions", "overlay",
+      x@jc, replace@jc, pos@jc, len@jc
+    )
+
+    column(jc)
+  })
+
 #' @details
 #' \code{quarter}: Extracts the quarter as an integer from a given date/timestamp/string.
 #'
@@ -2819,7 +2856,6 @@ setMethod("window", signature(x = "Column"),
 #'
 #' @param substr a character string to be matched.
 #' @param str a Column where matches are sought for each entry.
-#' @param pos start position of search.
 #' @rdname column_string_functions
 #' @aliases locate locate,character,Column-method
 #' @note locate since 1.5.0
@@ -2834,7 +2870,6 @@ setMethod("locate", signature(substr = "character", str = "Column"),
 #' @details
 #' \code{lpad}: Left-padded with pad to a length of len.
 #'
-#' @param len maximum length of each output result.
 #' @param pad a character string to be padded with.
 #' @rdname column_string_functions
 #' @aliases lpad lpad,Column,numeric,character-method

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
@@ -1149,6 +1149,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
 #' @name NULL
 setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay") })
+
 #' @rdname column_window_functions
 #' @name NULL
 setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_rank") })

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1405,6 +1405,8 @@ test_that("column functions", {
     trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm")
   c24 <- date_trunc("hour", c) + date_trunc("minute", c) + date_trunc("week", c) +
     date_trunc("quarter", c) + current_date() + current_timestamp()
+  c25 <- overlay(c1, c2, c3, c3) + overlay(c1, c2, c3) + overlay(c1, c2, 1) +
+    overlay(c1, c2, 3, 4)
 
   # Test if base::is.nan() is exposed
   expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))

diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
@@ -317,7 +317,7 @@ final class ShuffleBlockFetcherIterator(
         collectFetchRequests(address, blockInfos, collectedRemoteRequests)
       }
     }
-    val totalBytes = localBlockBytes + remoteBlockBytes
+    val totalBytes = localBlockBytes + remoteBlockBytes + hostLocalBlockBytes
     logInfo(s"Getting $numBlocksToFetch (${Utils.bytesToString(totalBytes)}) non-empty blocks " +
       s"including ${localBlocks.size} (${Utils.bytesToString(localBlockBytes)}) local and " +
       s"${hostLocalBlocks.size} (${Utils.bytesToString(hostLocalBlockBytes)}) " +

diff --git a/docs/sql-ref-syntax-qry-select-groupby.md b/docs/sql-ref-syntax-qry-select-groupby.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: GROUPBY Clause
-displayTitle: GROUPBY Clause
+title: GROUP BY Clause
+displayTitle: GROUP BY Clause
 license: |
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
@@ -18,5 +18,208 @@ license: |
   See the License for the specific language governing permissions and
   limitations under the License.
 ---
+The <code>GROUP BY</code> clause is used to group the rows based on a set of specified grouping expressions and compute aggregations on 
+the group of rows based on one or more specified aggregate functions. Spark also supports advanced aggregations to do multiple 
+aggregations for the same input record set via `GROUPING SETS`, `CUBE`, `ROLLUP` clauses.
 
-**This page is under construction**
+### Syntax
+{% highlight sql %}
+GROUP BY [ GROUPING SETS grouping_sets ] grouping_expression [ , grouping_expression [ , ... ] ]
+    [ ( WITH ROLLUP | WITH CUBE | GROUPING SETS grouping_sets ) ]
+{% endhighlight %}
+
+### Parameters
+<dl>
+  <dt><code><em>GROUPING SETS</em></code></dt>
+  <dd>
+    Groups the rows for each subset of the expressions specified in the grouping sets. For example, 
+    <code>GROUP BY GROUPING SETS (warehouse, product)</code> is semantically equivalent
+    to the union of the results of <code>GROUP BY warehouse</code> and <code>GROUP BY product</code>. This clause
+    is shorthand for a <code>UNION ALL</code> where each leg of the <code>UNION ALL</code> 
+    operator performs aggregation of a subset of the columns specified in the <code>GROUPING SETS</code> clause.
+  </dd>
+  <dt><code><em>grouping_sets</em></code></dt>
+  <dd>
+    Specifies one or more groupings based on which the <code>GROUP BY</code> clause performs aggregations. A grouping
+    set is specified by a list of comma-separated expressions in parentheses.<br><br>
+    <b>Syntax:</b>
+      <code>
+        (() | (expression [ , ...]))
+      </code>
+  </dd>
+  <dt><code><em>grouping_expression</em></code></dt>
+  <dd>
+    Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on
+    result values of the grouping expressions. A grouping expression may be a column alias, a column position
+    or an expression.
+  </dd>
+  <dt><code><em>ROLLUP</em></code></dt>
+  <dd>
+    Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations 
+    based on multiple grouping sets. <code>ROLLUP</code> is shorthand for <code>GROUPING SETS</code>. For example,
+    <code>GROUP BY warehouse, product WITH ROLLUP</code> is equivalent to
+    <code>GROUP BY warehouse, product GROUPING SETS ((warehouse, product), (warehouse), ())</code>.
+    The N elements of a <code>ROLLUP</code> specification result in N+1 <code>GROUPING SETS</code>.
+  </dd>
+  <dt><code><em>CUBE</em></code></dt>
+  <dd>
+    <code>CUBE</code> clause is used to perform aggregations based on combination of grouping columns specified in the 
+    <code>GROUP BY</code> clause. For example, <code>GROUP BY warehouse, product  WITH CUBE</code> is equivalent 
+    to <code>GROUP BY warehouse, product GROUPING SETS ((warehouse, product), (warehouse), (product), ())</code>.
+    The N elements of a <code>CUBE</code> specification result in 2^N <code>GROUPING SETS</code>.
+  </dd>
+</dl>
+
+### Examples
+{% highlight sql %}
+CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT);
+INSERT INTO dealer VALUES (100, 'Fremont', 'Honda Civic', 10),
+                          (100, 'Fremont', 'Honda Accord', 15),
+                          (100, 'Fremont', 'Honda CRV', 7),
+                          (200, 'Dublin', 'Honda Civic', 20),
+                          (200, 'Dublin', 'Honda Accord', 10),
+                          (200, 'Dublin', 'Honda CRV', 3),
+                          (300, 'San Jose', 'Honda Civic', 5),
+                          (300, 'San Jose', 'Honda Accord', 8);
+
+-- Sum of quantity per dealership. Group by `id`.
+SELECT id, sum(quantity) FROM dealer GROUP BY id ORDER BY id;
+
+  +---+-------------+
+  |id |sum(quantity)|
+  +---+-------------+
+  |100|32           |
+  |200|33           |
+  |300|13           |
+  +---+-------------+
+
+-- Use column position in GROUP BY clause.
+SELECT id, sum(quantity) FROM dealer GROUP BY 1 ORDER BY 1;
+
+  +---+-------------+
+  |id |sum(quantity)|
+  +---+-------------+
+  |100|32           |
+  |200|33           |
+  |300|13           |
+  +---+-------------+
+
+-- Multiple aggregations.
+-- 1. Sum of quantity per dealership.
+-- 2. Max quantity per dealership. 
+SELECT id, sum(quantity) AS sum, max(quantity) AS max FROM dealer GROUP BY id ORDER BY id;
+
+  +---+---+---+
+  |id |sum|max|
+  +---+---+---+
+  |100|32 |15 |
+  |200|33 |20 |
+  |300|13 |8  |
+  +---+---+---+
+
+-- Aggregations using multiple sets of grouping columns in a single statement.
+-- The following performs aggregations based on four sets of grouping columns.
+-- 1. city, car_model
+-- 2. city
+-- 3. car_model
+-- 4. Empty grouping set. Returns quantities for all cities and car models.
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+   GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ())
+   ORDER BY city;
+
+  +--------+------------+---+
+  |city    |car_model   |sum|
+  +--------+------------+---+
+  |null    |null        |78 |
+  |null    |Honda Accord|33 |
+  |null    |Honda CRV   |10 |
+  |null    |Honda Civic |35 |
+  |Dublin  |null        |33 |
+  |Dublin  |Honda Accord|10 |
+  |Dublin  |Honda CRV   |3  |
+  |Dublin  |Honda Civic |20 |
+  |Fremont |null        |32 |
+  |Fremont |Honda Accord|15 |
+  |Fremont |Honda CRV   |7  |
+  |Fremont |Honda Civic |10 |
+  |San Jose|null        |13 |
+  |San Jose|Honda Accord|8  |
+  |San Jose|Honda Civic |5  |
+  +--------+------------+---+
+
+-- Alternate syntax for `GROUPING SETS` in which both `GROUP BY` and `GROUPING SETS` 
+-- specifications are present.
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+   GROUP BY city, car_model GROUPING SETS ((city, car_model), (city), (car_model), ())
+   ORDER BY city, car_model;
+
+  +--------+------------+---+
+  |city    |car_model   |sum|
+  +--------+------------+---+
+  |null    |null        |78 |
+  |null    |Honda Accord|33 |
+  |null    |Honda CRV   |10 |
+  |null    |Honda Civic |35 |
+  |Dublin  |null        |33 |
+  |Dublin  |Honda Accord|10 |
+  |Dublin  |Honda CRV   |3  |
+  |Dublin  |Honda Civic |20 |
+  |Fremont |null        |32 |
+  |Fremont |Honda Accord|15 |
+  |Fremont |Honda CRV   |7  |
+  |Fremont |Honda Civic |10 |
+  |San Jose|null        |13 |
+  |San Jose|Honda Accord|8  |
+  |San Jose|Honda Civic |5  |
+  +--------+------------+---+
+
+-- Group by processing with `ROLLUP` clause.
+-- Equivalent to GROUP BY GROUPING SETS ((city, car_model), (city), ())
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+   GROUP BY city, car_model WITH ROLLUP
+   ORDER BY city, car_model;
+
+  +--------+------------+---+
+  |city    |car_model   |sum|
+  +--------+------------+---+
+  |null    |null        |78 |
+  |Dublin  |null        |33 |
+  |Dublin  |Honda Accord|10 |
+  |Dublin  |Honda CRV   |3  |
+  |Dublin  |Honda Civic |20 |
+  |Fremont |null        |32 |
+  |Fremont |Honda Accord|15 |
+  |Fremont |Honda CRV   |7  |
+  |Fremont |Honda Civic |10 |
+  |San Jose|null        |13 |
+  |San Jose|Honda Accord|8  |
+  |San Jose|Honda Civic |5  |
+  +--------+------------+---+
+
+-- Group by processing with `CUBE` clause.
+-- Equivalent to GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ())
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+   GROUP BY city, car_model WITH CUBE 
+   ORDER BY city, car_model;
+
+  +--------+------------+---+
+  |city    |car_model   |sum|
+  +--------+------------+---+
+  |null    |null        |78 |
+  |null    |Honda Accord|33 |
+  |null    |Honda CRV   |10 |
+  |null    |Honda Civic |35 |
+  |Dublin  |null        |33 |
+  |Dublin  |Honda Accord|10 |
+  |Dublin  |Honda CRV   |3  |
+  |Dublin  |Honda Civic |20 |
+  |Fremont |null        |32 |
+  |Fremont |Honda Accord|15 |
+  |Fremont |Honda CRV   |7  |
+  |Fremont |Honda Civic |10 |
+  |San Jose|null        |13 |
+  |San Jose|Honda Accord|8  |
+  |San Jose|Honda Civic |5  |
+  +--------+------------+---+
+
+{% endhighlight %}