
[SPARK-32799][R][SQL] Add allowMissingColumns to SparkR unionByName #29813

Closed
wants to merge 3 commits
Changes from 2 commits
15 changes: 13 additions & 2 deletions R/pkg/R/DataFrame.R
@@ -2863,11 +2863,19 @@ setMethod("unionAll",
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
#' into account. Input SparkDataFrames can have different data types in the schema.
#'
#' When the parameter `allowMissingColumns` is `TRUE`, this function allows
#' different sets of column names between the two `SparkDataFrames`.
#' Missing columns on each side will be filled with null values.
#' The columns missing from the left `SparkDataFrame` will be added at the end
#' of the schema of the union result.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
#' @param allowMissingColumns logical, whether to allow the inputs to have different sets of column names (default `FALSE`).
#' @param ... further arguments to be passed to or from other methods.
Contributor:
`...` is not actually supported?

Contributor:
Never mind, I see below that it's added to the generic.

Member Author:

That's correct, but I am not sure if there is a better way of handling that.

Right now we have the generic defined as follows:

setGeneric("unionByName", function(x, y, ...) { standardGeneric("unionByName") })

As far as I am aware, this is the convention we use in SparkR for handling optional arguments.

Technically speaking we could have

setGeneric("unionByName", function(x, y, allowMissingColumns) { standardGeneric("unionByName") })

but then we'd have to support

signature(x = "SparkDataFrame", y = "SparkDataFrame", allowMissingColumns = "missing")

and

signature(x = "SparkDataFrame", y = "SparkDataFrame", allowMissingColumns = "logical")

if I am not mistaken, and in the past I've been told that's too much.

Am I missing something?
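
For reference, the signature-based alternative described above would look roughly like the sketch below; this is only an illustration of the option being discussed, not code from this PR:

setGeneric("unionByName",
           function(x, y, allowMissingColumns) { standardGeneric("unionByName") })

# Method used when the caller omits the argument: delegate with the default value.
setMethod("unionByName",
          signature(x = "SparkDataFrame", y = "SparkDataFrame",
                    allowMissingColumns = "missing"),
          function(x, y, allowMissingColumns) {
            unionByName(x, y, allowMissingColumns = FALSE)
          })

# Method used when the caller passes an explicit logical value.
setMethod("unionByName",
          signature(x = "SparkDataFrame", y = "SparkDataFrame",
                    allowMissingColumns = "logical"),
          function(x, y, allowMissingColumns) {
            dataFrame(callJMethod(x@sdf, "unionByName", y@sdf, allowMissingColumns))
          })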

Contributor:
The way you've done it looks natural to me.

#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
@@ -2880,12 +2888,15 @@ setMethod("unionAll",
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
#'
#' df3 <- select(createDataFrame(mtcars), "carb")
#' head(unionByName(df1, df3, allowMissingColumns = TRUE))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
- function(x, y) {
- unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
+ function(x, y, allowMissingColumns=FALSE) {
+ unioned <- callJMethod(x@sdf, "unionByName", y@sdf, allowMissingColumns)
dataFrame(unioned)
})
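
To illustrate the documented behaviour, a minimal sketch assuming an active SparkR session and the df1/df3 frames from the @examples above (df1 has carb, am, gear; df3 has only carb):

u <- unionByName(df1, df3, allowMissingColumns = TRUE)
printSchema(u)                    # carb, am, gear: the columns absent from df3 are kept
count(where(u, isNull(u$gear)))   # rows contributed by df3 have null am and gear

With the arguments swapped, unionByName(df3, df1, allowMissingColumns = TRUE) keeps carb first and appends the columns missing from the left side (am and gear) at the end of the result schema.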

2 changes: 1 addition & 1 deletion R/pkg/R/generics.R
@@ -638,7 +638,7 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })
setGeneric("unionByName", function(x, y, ...) { standardGeneric("unionByName") })

#' @rdname unpersist
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
13 changes: 13 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2696,6 +2696,19 @@ test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataF
expect_error(rbind(df, df2, df3),
"Names of input data frames are different.")


df4 <- unionByName(df2, select(df2, "age"), TRUE)

expect_equal(
  sum(collect(
    select(df4, alias(isNull(df4$name), "missing_name")
  ))$missing_name),
  3
)

testthat::expect_error(unionByName(df2, select(df2, "age"), FALSE))
testthat::expect_error(unionByName(df2, select(df2, "age")))

excepted <- arrange(except(df, df2), desc(df$age))
expect_is(unioned, "SparkDataFrame")
expect_equal(count(excepted), 2)