From 769780697d81f91e911b5af516c24b8b4291f27d Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 3 Mar 2017 16:53:22 -1000 Subject: [PATCH 1/8] union checks for name consistency --- R/pkg/R/DataFrame.R | 3 +++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 3 +++ 2 files changed, 6 insertions(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index cc4cfa3423ced..df8ded01bfaed 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2666,6 +2666,9 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { setMethod("union", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { + if (!all.equal(names(x), names(y))){ + stop("Names of input data frames are different.") + } unioned <- callJMethod(x@sdf, "union", y@sdf) dataFrame(unioned) }) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index ce0f5a198a259..98b88ea27ea1f 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1822,6 +1822,9 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", { expect_equal(count(excepted), 2) expect_equal(first(excepted)$name, "Justin") + expected_error(union(df, df2[, c(2, 1)]), + "Names of input data frames are different.") + intersected <- arrange(intersect(df, df2), df$age) expect_is(unioned, "SparkDataFrame") expect_equal(count(intersected), 1) From 293dc35fd203c0926aeb1e0b483372eb525aeec3 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 3 Mar 2017 22:08:35 -1000 Subject: [PATCH 2/8] fix test issue --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 98b88ea27ea1f..40d49b076e4ad 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1822,7 +1822,7 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", { expect_equal(count(excepted), 2) expect_equal(first(excepted)$name, "Justin") - expected_error(union(df, df2[, c(2, 1)]), + expect_error(union(df, df2[, c(2, 1)]), "Names of input data frames are different.") intersected <- arrange(intersect(df, df2), df$age) From ef8450157fb6c6535f1608899bc3898974ba8454 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 3 Mar 2017 23:12:59 -1000 Subject: [PATCH 3/8] fix equal test --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index df8ded01bfaed..230cbd516505a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2666,7 +2666,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { setMethod("union", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { - if (!all.equal(names(x), names(y))){ + if (!isTRUE(all.equal(names(x), names(y)))) { stop("Names of input data frames are different.") } unioned <- callJMethod(x@sdf, "union", y@sdf) From 7ea0c4a3929630e1f3508931f300b433725cfe05 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 5 Mar 2017 11:16:10 -0800 Subject: [PATCH 4/8] check names in rbind rather than union --- R/pkg/R/DataFrame.R | 7 ++++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 10 +++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 230cbd516505a..475c089b93dc5 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2666,9 +2666,6 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { setMethod("union", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { - if (!isTRUE(all.equal(names(x), names(y)))) { - stop("Names of input data frames are different.") - } unioned <- callJMethod(x@sdf, "union", y@sdf) dataFrame(unioned) }) @@ -2712,6 +2709,10 @@ setMethod("unionAll", setMethod("rbind", signature(... = "SparkDataFrame"), function(x, ..., deparse.level = 1) { + nm <- lapply(list(x, ...), names) + if (!isTRUE(Reduce(all.equal, nm))) { + stop("Names of input data frames are different.") + } if (nargs() == 3) { union(x, ...) } else { diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 40d49b076e4ad..784f51d48b09a 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1817,14 +1817,18 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", { expect_equal(count(unioned2), 12) expect_equal(first(unioned2)$name, "Michael") + df3 <- df2 + names(df3)[1] <- "newName" + expect_error(union(df, df3), + "Names of input data frames are different.") + expect_error(union(df, df2, df3), + "Names of input data frames are different.") + excepted <- arrange(except(df, df2), desc(df$age)) expect_is(unioned, "SparkDataFrame") expect_equal(count(excepted), 2) expect_equal(first(excepted)$name, "Justin") - expect_error(union(df, df2[, c(2, 1)]), - "Names of input data frames are different.") - intersected <- arrange(intersect(df, df2), df$age) expect_is(unioned, "SparkDataFrame") expect_equal(count(intersected), 1) From b8b96d61d48417037000372f021ed012928ee2dd Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 5 Mar 2017 14:34:28 -0800 Subject: [PATCH 5/8] update doc and test --- R/pkg/R/DataFrame.R | 6 ++++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 475c089b93dc5..24eed5db00173 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame #' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL. +#' Input SparkDataFrames can have different schemas (names and data types). #' #' Note: This does not remove duplicate rows across the two SparkDataFrames. #' @@ -2685,7 +2686,8 @@ setMethod("unionAll", #' Union two or more SparkDataFrames #' -#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL. +#' Union two or more SparkDataFrames by row. In constrast with \link{union}, this method +#' requires that the SparkDataFrames to be unioned have the same column names. #' #' Note: This does not remove duplicate rows across the two SparkDataFrames. #' @@ -2712,7 +2714,7 @@ setMethod("rbind", nm <- lapply(list(x, ...), names) if (!isTRUE(Reduce(all.equal, nm))) { stop("Names of input data frames are different.") - } + } if (nargs() == 3) { union(x, ...) } else { diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 784f51d48b09a..3e4210442c114 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1819,9 +1819,9 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", { df3 <- df2 names(df3)[1] <- "newName" - expect_error(union(df, df3), + expect_error(rbind(df, df3), "Names of input data frames are different.") - expect_error(union(df, df2, df3), + expect_error(rbind(df, df2, df3), "Names of input data frames are different.") excepted <- arrange(except(df, df2), desc(df$age)) From decc4683c536e328cd040c2bd3d80ad77fed588a Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 5 Mar 2017 14:36:02 -0800 Subject: [PATCH 6/8] update doc --- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 24eed5db00173..243302d9e405c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2686,8 +2686,8 @@ setMethod("unionAll", #' Union two or more SparkDataFrames #' -#' Union two or more SparkDataFrames by row. In constrast with \link{union}, this method -#' requires that the SparkDataFrames to be unioned have the same column names. +#' Union two or more SparkDataFrames by row. In constrast to \link{union}, this method +#' requires that the input SparkDataFrames have the same column names. #' #' Note: This does not remove duplicate rows across the two SparkDataFrames. #' From cc80de34f3919c366dfb51d4e7e89e1161ea1331 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 5 Mar 2017 15:46:29 -0800 Subject: [PATCH 7/8] fix test issue --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 243302d9e405c..e2489019669b8 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2712,7 +2712,7 @@ setMethod("rbind", signature(... = "SparkDataFrame"), function(x, ..., deparse.level = 1) { nm <- lapply(list(x, ...), names) - if (!isTRUE(Reduce(all.equal, nm))) { + if (length(unique(nm)) != 1) { stop("Names of input data frames are different.") } if (nargs() == 3) { From 54427d505b7771cb558fcd3d764ce559ba764c7a Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 5 Mar 2017 18:33:26 -0800 Subject: [PATCH 8/8] update doc --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index e2489019669b8..7198af89e26dc 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2686,7 +2686,7 @@ setMethod("unionAll", #' Union two or more SparkDataFrames #' -#' Union two or more SparkDataFrames by row. In constrast to \link{union}, this method +#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method #' requires that the input SparkDataFrames have the same column names. #' #' Note: This does not remove duplicate rows across the two SparkDataFrames.