From 42df21610f925f19e570f513faf9aeb537d29483 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 29 Apr 2017 06:27:57 +0200
Subject: [PATCH 1/8] Implement grouping and grouping_id

---
 R/pkg/NAMESPACE                           |  2 +
 R/pkg/R/functions.R                       | 86 +++++++++++++++++++++++
 R/pkg/R/generics.R                        |  8 +++
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 56 ++++++++++++++-
 4 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index e8de34d9371a0..a52feef6cf25e 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -249,6 +249,8 @@ exportMethods("%<=>%",
               "getField",
               "getItem",
               "greatest",
+              "is_grouping",
+              "grouping_id",
               "hex",
               "histogram",
               "hour",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index f9687d680e7a2..4615a549092cf 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3890,3 +3890,89 @@ setMethod("not",
             jc <- callJStatic("org.apache.spark.sql.functions", "not", x@jc)
             column(jc)
           })
+
+#' is_grouping
+#'
+#' Indicates whether a specified column in a GROUP BY list is aggregated or not,
+#' returns 1 for aggregated or 0 for not aggregated in the result set.
+#'
+#' Same as \code{GROUPING} in SQL and \code{grouping} function in Scala.
+#'
+#' @param x Column to compute on
+#'
+#' @rdname is_grouping
+#' @name is_grouping
+#' @family agg_funcs
+#' @aliases is_grouping,Column-method
+#' @export
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#'
+#' # With cube
+#' agg(
+#'   cube(df, "cyl", "gear", "am"),
+#'   mean(df$mpg),
+#'   is_grouping(df$cyl), is_grouping(df$gear), is_grouping(df$am)
+#' )
+#'
+#' # With rollup
+#' agg(
+#'   rollup(df, "cyl", "gear", "am"),
+#'   mean(df$mpg),
+#'   is_grouping(df$cyl), is_grouping(df$gear), is_grouping(df$am)
+#' )
+#' }
+#' @note is_grouping since 2.3.0
+#' @seealso \link{cube}, \link{grouping_id}, \link{rollup}
+setMethod("is_grouping",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "grouping", x@jc)
+            column(jc)
+          })
+
+#' grouping_id
+#'
+#' Returns the level of grouping.
+#'
+#' Equals to \code{
+#' (is_grouping(c1) << (n-1)) + (is_grouping(c2) << (n-2)) + ... + is_grouping(cn)
+#' }
+#'
+#' @param x Column to compute on
+#' @param ... additional Column(s).
+#'
+#' @rdname grouping_id
+#' @name grouping_id
+#' @family agg_funcs
+#' @aliases grouping_id,Column-method
+#' @export
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#'
+#' # With cube
+#' agg(
+#'   cube(df, "cyl", "gear", "am"),
+#'   mean(df$mpg),
+#'   grouping_id(df$cyl, df$gear, df$am)
+#' )
+#'
+#' # With rollup
+#' agg(
+#'   rollup(df, "cyl", "gear", "am"),
+#'   mean(df$mpg),
+#'   grouping_id(df$cyl, df$gear, df$am)
+#' )
+#' }
+#' @note grouping_id since 2.3.0
+#' @seealso \link{cube}, \link{is_grouping}, \link{rollup}
+setMethod("grouping_id",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "grouping_id", jcols)
+            column(jc)
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index d4e4958dc078c..f105d97c5a760 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1052,6 +1052,14 @@ setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime")
 #' @export
 setGeneric("greatest", function(x, ...) { standardGeneric("greatest") })
 
+#' @rdname
+#' @export
+setGeneric("is_grouping", function(x) { standardGeneric("is_grouping") })
+
+#' @rdname
+#' @export
+setGeneric("grouping_id", function(x, ...) { standardGeneric("grouping_id") })
+
 #' @rdname hex
 #' @export
 setGeneric("hex", function(x) { standardGeneric("hex") })
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 08296354ca7ed..a32645a5cd377 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1848,7 +1848,11 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
     orderBy(
       agg(
         cube(df, "year", "department"),
-        expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary")
+        expr("sum(salary) AS total_salary"),
+        expr("avg(salary) AS average_salary"),
+        alias(is_grouping(df$year), "grouping_year"),
+        alias(is_grouping(df$department), "grouping_department"),
+        alias(grouping_id(df$year, df$department), "grouping_id")
       ),
       "year", "department"
     )
@@ -1875,6 +1879,30 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
       mean(c(21000, 32000, 22000)), # 2017
       22000, 32000, 21000 # 2017 each department
     ),
+    grouping_year = c(
+      1, # global
+      1, 1, 1, # by department
+      0, # 2016
+      0, 0, 0, # 2016 by department
+      0, # 2017
+      0, 0, 0 # 2017 by department
+    ),
+    grouping_department = c(
+      1, # global
+      0, 0, 0, # by department
+      1, # 2016
+      0, 0, 0, # 2016 by department
+      1, # 2017
+      0, 0, 0 # 2017 by department
+    ),
+    grouping_id = c(
+      3, # 0x11
+      2, 2, 2, # 0x10
+      1, # 0x01
+      0, 0, 0, # 0x00
+      1, # 0x01
+      0, 0, 0 # 0x00
+    ),
     stringsAsFactors = FALSE
   )
@@ -1896,7 +1924,10 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
     orderBy(
       agg(
         rollup(df, "year", "department"),
-        expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary")
+        expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary"),
+        alias(is_grouping(df$year), "grouping_year"),
+        alias(is_grouping(df$department), "grouping_department"),
+        alias(grouping_id(df$year, df$department), "grouping_id")
       ),
       "year", "department"
     )
@@ -1920,6 +1951,27 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
      mean(c(21000, 32000, 22000)), # 2017
      22000, 32000, 21000 # 2017 each department
    ),
+    grouping_year = c(
+      1, # global
+      0, # 2016
+      0, 0, 0, # 2016 each department
+      0, # 2017
+      0, 0, 0 # 2017 each department
+    ),
+    grouping_department = c(
+      1, # global
+      1, # 2016
+      0, 0, 0, # 2016 each department
+      1, # 2017
+      0, 0, 0 # 2017 each department
+    ),
+    grouping_id = c(
+      3, # 0x11
+      1, # 0x01
+      0, 0, 0, # 0x00
+      1, # 0x01
+      0, 0, 0 # 0x00
+    ),
     stringsAsFactors = FALSE
   )

From ae47854e4b6133f97edc380504a8e048e2976183 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 29 Apr 2017 08:10:25 +0200
Subject: [PATCH 2/8] Add missing rdnames

---
 R/pkg/R/generics.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index f105d97c5a760..806af5c3fb05e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1052,11 +1052,11 @@ setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime")
 #' @export
 setGeneric("greatest", function(x, ...) { standardGeneric("greatest") })
 
-#' @rdname
+#' @rdname is_grouping
 #' @export
 setGeneric("is_grouping", function(x) { standardGeneric("is_grouping") })
 
-#' @rdname
+#' @rdname grouping_id
 #' @export
 setGeneric("grouping_id", function(x, ...) { standardGeneric("grouping_id") })

From 5729e65c6bbff9a632ba40a501f2d46a88480dc7 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 29 Apr 2017 20:24:06 +0200
Subject: [PATCH 3/8] Remove notes

---
 R/pkg/R/functions.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 4615a549092cf..344c1d1e017e8 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3923,7 +3923,6 @@ setMethod("not",
 #' )
 #' }
 #' @note is_grouping since 2.3.0
-#' @seealso \link{cube}, \link{grouping_id}, \link{rollup}
 setMethod("is_grouping",
           signature(x = "Column"),
           function(x) {
@@ -3965,7 +3964,6 @@ setMethod("is_grouping",
 #' )
 #' }
 #' @note grouping_id since 2.3.0
-#' @seealso \link{cube}, \link{is_grouping}, \link{rollup}
 setMethod("grouping_id",
           signature(x = "Column"),
           function(x, ...) {

From fa21d6c137468ac25a2d6607a08006bd73ffabcc Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 29 Apr 2017 20:31:16 +0200
Subject: [PATCH 4/8] Rename is_grouping to grouping_col

---
 R/pkg/NAMESPACE                           |  2 +-
 R/pkg/R/functions.R                       | 18 +++++++++---------
 R/pkg/R/generics.R                        |  4 ++--
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  8 ++++----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index a52feef6cf25e..acb25bc319f22 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -249,7 +249,7 @@ exportMethods("%<=>%",
               "getField",
               "getItem",
               "greatest",
-              "is_grouping",
+              "grouping_col",
               "grouping_id",
               "hex",
               "histogram",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 344c1d1e017e8..fcd99a22249a4 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3891,7 +3891,7 @@ setMethod("not",
             column(jc)
           })
 
-#' is_grouping
+#' grouping_col
 #'
 #' Indicates whether a specified column in a GROUP BY list is aggregated or not,
 #' returns 1 for aggregated or 0 for not aggregated in the result set.
@@ -3900,10 +3900,10 @@ setMethod("not",
 #'
 #' @param x Column to compute on
 #'
-#' @rdname is_grouping
-#' @name is_grouping
+#' @rdname grouping_col
+#' @name grouping_col
 #' @family agg_funcs
-#' @aliases is_grouping,Column-method
+#' @aliases grouping_col,Column-method
 #' @export
 #' @examples \dontrun{
 #' df <- createDataFrame(mtcars)
@@ -3912,18 +3912,18 @@ setMethod("not",
 #' agg(
 #'   cube(df, "cyl", "gear", "am"),
 #'   mean(df$mpg),
-#'   is_grouping(df$cyl), is_grouping(df$gear), is_grouping(df$am)
+#'   grouping_col(df$cyl), grouping_col(df$gear), grouping_col(df$am)
 #' )
 #'
 #' # With rollup
 #' agg(
 #'   rollup(df, "cyl", "gear", "am"),
 #'   mean(df$mpg),
-#'   is_grouping(df$cyl), is_grouping(df$gear), is_grouping(df$am)
+#'   grouping_col(df$cyl), grouping_col(df$gear), grouping_col(df$am)
 #' )
 #' }
-#' @note is_grouping since 2.3.0
-setMethod("is_grouping",
+#' @note grouping_col since 2.3.0
+setMethod("grouping_col",
           signature(x = "Column"),
           function(x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "grouping", x@jc)
@@ -3935,7 +3935,7 @@ setMethod("grouping_col",
 #' Returns the level of grouping.
 #'
 #' Equals to \code{
-#' (is_grouping(c1) << (n-1)) + (is_grouping(c2) << (n-2)) + ... + is_grouping(cn)
+#' (grouping_col(c1) << (n-1)) + (grouping_col(c2) << (n-2)) + ... + grouping_col(cn)
 #' }
 #'
 #' @param x Column to compute on
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 806af5c3fb05e..75afb997155cb 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1052,9 +1052,9 @@ setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime")
 #' @export
 setGeneric("greatest", function(x, ...) { standardGeneric("greatest") })
 
-#' @rdname is_grouping
+#' @rdname grouping_col
 #' @export
-setGeneric("is_grouping", function(x) { standardGeneric("is_grouping") })
+setGeneric("grouping_col", function(x) { standardGeneric("grouping_col") })
 
 #' @rdname grouping_id
 #' @export
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index a32645a5cd377..5ef3e434128d8 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1850,8 +1850,8 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
         cube(df, "year", "department"),
         expr("sum(salary) AS total_salary"),
         expr("avg(salary) AS average_salary"),
-        alias(is_grouping(df$year), "grouping_year"),
-        alias(is_grouping(df$department), "grouping_department"),
+        alias(grouping_col(df$year), "grouping_year"),
+        alias(grouping_col(df$department), "grouping_department"),
         alias(grouping_id(df$year, df$department), "grouping_id")
       ),
       "year", "department"
@@ -1925,8 +1925,8 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
       agg(
         rollup(df, "year", "department"),
         expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary"),
-        alias(is_grouping(df$year), "grouping_year"),
-        alias(is_grouping(df$department), "grouping_department"),
+        alias(grouping_col(df$year), "grouping_year"),
+        alias(grouping_col(df$department), "grouping_department"),
         alias(grouping_id(df$year, df$department), "grouping_id")
       ),
       "year", "department"

From 7c2b12ad26316c71591dfaa33972e19488223608 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 29 Apr 2017 21:01:01 +0200
Subject: [PATCH 5/8] Note that additional columns are optional

---
 R/pkg/R/functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index fcd99a22249a4..6d3380acdba4e 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3939,7 +3939,7 @@ setMethod("grouping_col",
 #' }
 #'
 #' @param x Column to compute on
-#' @param ... additional Column(s).
+#' @param ... additional Column(s) (optional).
 #'

From 5b7f9bf68b986262f17a696b44563e66ac99cbc3 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sat, 29 Apr 2017 21:09:11 +0200
Subject: [PATCH 6/8] Adjust comments

---
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 5ef3e434128d8..7acca91b8304f 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1896,12 +1896,12 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
       0, 0, 0 # 2017 by department
     ),
     grouping_id = c(
-      3, # 0x11
-      2, 2, 2, # 0x10
-      1, # 0x01
-      0, 0, 0, # 0x00
-      1, # 0x01
-      0, 0, 0 # 0x00
+      3, # 11
+      2, 2, 2, # 10
+      1, # 01
+      0, 0, 0, # 00
+      1, # 01
+      0, 0, 0 # 00
     ),
     stringsAsFactors = FALSE
   )
@@ -1966,11 +1966,11 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
       0, 0, 0 # 2017 each department
     ),
     grouping_id = c(
-      3, # 0x11
-      1, # 0x01
-      0, 0, 0, # 0x00
-      1, # 0x01
-      0, 0, 0 # 0x00
+      3, # 11
+      1, # 01
+      0, 0, 0, # 00
+      1, # 01
+      0, 0, 0 # 00
     ),
     stringsAsFactors = FALSE
   )

From 927a6282f165fb0b3f4b39ff21a657998df358c4 Mon Sep 17 00:00:00 2001
From: zero323
Date: Sun, 30 Apr 2017 03:35:17 +0200
Subject: [PATCH 7/8] Rewrite grouping_id formula as a SparkR expression

---
 R/pkg/R/functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 6d3380acdba4e..3c3e6fd8065a8 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3935,7 +3935,7 @@ setMethod("grouping_col",
 #' Returns the level of grouping.
 #'
 #' Equals to \code{
-#' (grouping_col(c1) << (n-1)) + (grouping_col(c2) << (n-2)) + ... + grouping_col(cn)
+#' grouping_col(c1) * 2^(n - 1) + grouping_col(c2) * 2^(n - 2) + ... + grouping_col(cn)
 #' }
 #'
 #' @param x Column to compute on

From b4cccd90a8722478d20d3455975873c06c26cd2f Mon Sep 17 00:00:00 2001
From: zero323
Date: Mon, 1 May 2017 07:16:10 +0200
Subject: [PATCH 8/8] Rename grouping_col to grouping_bit

---
 R/pkg/NAMESPACE                           |  2 +-
 R/pkg/R/functions.R                       | 18 +++++++++---------
 R/pkg/R/generics.R                        |  4 ++--
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  8 ++++----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index acb25bc319f22..7ecd168137e8d 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -249,7 +249,7 @@ exportMethods("%<=>%",
               "getField",
               "getItem",
               "greatest",
-              "grouping_col",
+              "grouping_bit",
               "grouping_id",
               "hex",
               "histogram",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 3c3e6fd8065a8..38384a89919a2 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3891,7 +3891,7 @@ setMethod("not",
             column(jc)
           })
 
-#' grouping_col
+#' grouping_bit
 #'
 #' Indicates whether a specified column in a GROUP BY list is aggregated or not,
 #' returns 1 for aggregated or 0 for not aggregated in the result set.
@@ -3900,10 +3900,10 @@ setMethod("not",
 #'
 #' @param x Column to compute on
 #'
-#' @rdname grouping_col
-#' @name grouping_col
+#' @rdname grouping_bit
+#' @name grouping_bit
 #' @family agg_funcs
-#' @aliases grouping_col,Column-method
+#' @aliases grouping_bit,Column-method
 #' @export
 #' @examples \dontrun{
 #' df <- createDataFrame(mtcars)
@@ -3912,18 +3912,18 @@ setMethod("not",
 #' agg(
 #'   cube(df, "cyl", "gear", "am"),
 #'   mean(df$mpg),
-#'   grouping_col(df$cyl), grouping_col(df$gear), grouping_col(df$am)
+#'   grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
 #' )
 #'
 #' # With rollup
 #' agg(
 #'   rollup(df, "cyl", "gear", "am"),
 #'   mean(df$mpg),
-#'   grouping_col(df$cyl), grouping_col(df$gear), grouping_col(df$am)
+#'   grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
 #' )
 #' }
-#' @note grouping_col since 2.3.0
-setMethod("grouping_col",
+#' @note grouping_bit since 2.3.0
+setMethod("grouping_bit",
           signature(x = "Column"),
           function(x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "grouping", x@jc)
@@ -3935,7 +3935,7 @@ setMethod("grouping_col",
 #' Returns the level of grouping.
 #'
 #' Equals to \code{
-#' grouping_col(c1) * 2^(n - 1) + grouping_col(c2) * 2^(n - 2) + ... + grouping_col(cn)
+#' grouping_bit(c1) * 2^(n - 1) + grouping_bit(c2) * 2^(n - 2) + ... + grouping_bit(cn)
 #' }
 #'
 #' @param x Column to compute on
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 75afb997155cb..ca3fcc4d3060f 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1052,9 +1052,9 @@ setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime")
 #' @export
 setGeneric("greatest", function(x, ...) { standardGeneric("greatest") })
 
-#' @rdname grouping_col
+#' @rdname grouping_bit
 #' @export
-setGeneric("grouping_col", function(x) { standardGeneric("grouping_col") })
+setGeneric("grouping_bit", function(x) { standardGeneric("grouping_bit") })
 
 #' @rdname grouping_id
 #' @export
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 7acca91b8304f..12867c15d1f95 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1850,8 +1850,8 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
         cube(df, "year", "department"),
         expr("sum(salary) AS total_salary"),
         expr("avg(salary) AS average_salary"),
-        alias(grouping_col(df$year), "grouping_year"),
-        alias(grouping_col(df$department), "grouping_department"),
+        alias(grouping_bit(df$year), "grouping_year"),
+        alias(grouping_bit(df$department), "grouping_department"),
         alias(grouping_id(df$year, df$department), "grouping_id")
       ),
       "year", "department"
@@ -1925,8 +1925,8 @@ test_that("test multi-dimensional aggregations with cube and rollup", {
       agg(
         rollup(df, "year", "department"),
         expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary"),
-        alias(grouping_col(df$year), "grouping_year"),
-        alias(grouping_col(df$department), "grouping_department"),
+        alias(grouping_bit(df$year), "grouping_year"),
+        alias(grouping_bit(df$department), "grouping_department"),
         alias(grouping_id(df$year, df$department), "grouping_id")
       ),
       "year", "department"
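
A quick usage sketch of the API this series ends up with, for reference. It is a minimal,
hypothetical SparkR session: it assumes a Spark build that already contains these patches
(the wrappers are tagged "since 2.3.0") and a local master; the alias names (g_cyl, g_gear,
g_id) are arbitrary, and the columns come from the mtcars data used in the roxygen examples.

    library(SparkR)
    sparkR.session(master = "local[1]")  # assumes a local Spark installation

    df <- createDataFrame(mtcars)

    # cube() produces one output row per grouping set of (cyl, gear).
    # grouping_bit() is 1 when the column is rolled up (aggregated away) in that
    # row and 0 when the row is grouped by it; grouping_id() packs those bits into
    # a single integer, here cyl_bit * 2 + gear_bit.
    result <- agg(
      cube(df, "cyl", "gear"),
      mean(df$mpg),
      alias(grouping_bit(df$cyl), "g_cyl"),
      alias(grouping_bit(df$gear), "g_gear"),
      alias(grouping_id(df$cyl, df$gear), "g_id")
    )

    head(orderBy(result, "g_id", "cyl", "gear"))

    sparkR.session.stop()

The grand-total row comes back with g_cyl = g_gear = 1 and g_id = 3, while fully grouped
rows have g_id = 0 -- the same values asserted for year/department in the test_sparkSQL.R
expectations above.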