From 6fc97e02975909cb72e27077aac97d4f90b332d5 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 23 Feb 2016 15:48:55 -0800 Subject: [PATCH 01/31] Support for collect() on Columns --- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/column.R | 63 +++-- R/pkg/R/functions.R | 265 +++++++++++----------- R/pkg/R/generics.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 41 +++- 5 files changed, 209 insertions(+), 164 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 3b7b8250b94f7..0d35429a9bcd3 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1177,7 +1177,7 @@ setMethod("foreachPartition", ############################## SELECT ################################## getColumn <- function(x, c) { - column(callJMethod(x@sdf, "col", c)) + column(callJMethod(x@sdf, "col", c), x) } #' @rdname select diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 3ffd9a9890b2e..08fa4960c3391 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +setOldClass("DataFrame") +setClassUnion("DataFrameOrNULL", c("DataFrame", "NULL")) # Column Class @@ -29,24 +31,43 @@ setOldClass("jobj") #' @slot jc reference to JVM DataFrame column #' @export setClass("Column", - slots = list(jc = "jobj")) + slots = list(jc = "jobj", df = "DataFrameOrNULL")) -setMethod("initialize", "Column", function(.Object, jc) { +setMethod("initialize", "Column", function(.Object, jc, df) { .Object@jc <- jc + + # Some Column objects don't have any referencing DataFrame. In such case, df will be NULL. + if (missing(df)) { + df <- NULL + } + .Object@df <- df .Object }) +setMethod("show", signature="Column", definition=function(object) { + MAX_ELEMENTS <- 20 + show(head(object, MAX_ELEMENTS)) + cat(paste0("\b...\nDisplaying up to ", as.character(MAX_ELEMENTS) ," elements only.")) +}) + +setMethod("collect", signature="Column", definition=function(x) { + if (is.null(x@df)) { + stop("This column cannot be collected as it's not associated to any DataFrame.") + } + collect(select(x@df, x))[, 1] +}) + +setMethod("head", signature="Column", definition=function(x, n=6) { + if (is.null(x@df)) { + stop("This column cannot be collected as it's not associated to any DataFrame.") + } + head(select(x@df, x), n)[, 1] +}) + setMethod("column", signature(x = "jobj"), - function(x) { - new("Column", x) - }) - -#' @rdname show -#' @name show -setMethod("show", "Column", - function(object) { - cat("Column", callJMethod(object@jc, "toString"), "\n") + function(x, df=NA) { + new("Column", x, df) }) operators <- list( @@ -79,7 +100,7 @@ createOperator <- function(op) { callJMethod(e1@jc, operators[[op]], e2) } } - column(jc) + column(jc, e1@df) }) } @@ -87,7 +108,7 @@ createColumnFunction1 <- function(name) { setMethod(name, signature(x = "Column"), function(x) { - column(callJMethod(x@jc, name)) + column(callJMethod(x@jc, name), x@df) }) } @@ -99,7 +120,7 @@ createColumnFunction2 <- function(name) { data <- data@jc } jc <- callJMethod(x@jc, name, data) - column(jc) + column(jc, x@df) }) } @@ -129,7 +150,7 @@ setMethod("alias", signature(object = "Column"), function(object, data) { if (is.character(data)) { - column(callJMethod(object@jc, "as", data)) + column(callJMethod(object@jc, "as", data), object@df) } else { stop("data should be character") } @@ -148,7 +169,7 @@ setMethod("alias", setMethod("substr", signature(x = "Column"), function(x, start, stop) { jc <- callJMethod(x@jc, "substr", as.integer(start - 1), 
as.integer(stop - start + 1)) - column(jc) + column(jc, x@df) }) #' between @@ -164,7 +185,7 @@ setMethod("between", signature(x = "Column"), function(x, bounds) { if (is.vector(bounds) && length(bounds) == 2) { jc <- callJMethod(x@jc, "between", bounds[1], bounds[2]) - column(jc) + column(jc, x@df) } else { stop("bounds should be a vector of lower and upper bounds") } @@ -184,11 +205,11 @@ setMethod("cast", signature(x = "Column"), function(x, dataType) { if (is.character(dataType)) { - column(callJMethod(x@jc, "cast", dataType)) + column(callJMethod(x@jc, "cast", dataType), x@df) } else if (is.list(dataType)) { json <- tojson(dataType) jdataType <- callJStatic("org.apache.spark.sql.types.DataType", "fromJson", json) - column(callJMethod(x@jc, "cast", jdataType)) + column(callJMethod(x@jc, "cast", jdataType), x@df) } else { stop("dataType should be character or list") } @@ -210,7 +231,7 @@ setMethod("%in%", signature(x = "Column"), function(x, table) { jc <- callJMethod(x@jc, "isin", as.list(table)) - return(column(jc)) + column(jc, x@df) }) #' otherwise @@ -227,5 +248,5 @@ setMethod("otherwise", function(x, value) { value <- if (class(value) == "Column") { value@jc } else { value } jc <- callJMethod(x@jc, "otherwise", value) - column(jc) + column(jc, x@df) }) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index e5521f3cffadf..97ccebf42666a 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -54,7 +54,7 @@ setMethod("abs", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "abs", x@jc) - column(jc) + column(jc, x@df) }) #' acos @@ -71,7 +71,7 @@ setMethod("acos", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "acos", x@jc) - column(jc) + column(jc, x@df) }) #' approxCountDistinct @@ -87,7 +87,7 @@ setMethod("approxCountDistinct", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc) - column(jc) + column(jc, x@df) }) #' ascii @@ -104,7 +104,7 @@ setMethod("ascii", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "ascii", x@jc) - column(jc) + column(jc, x@df) }) #' asin @@ -121,7 +121,7 @@ setMethod("asin", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "asin", x@jc) - column(jc) + column(jc, x@df) }) #' atan @@ -137,7 +137,7 @@ setMethod("atan", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "atan", x@jc) - column(jc) + column(jc, x@df) }) #' avg @@ -153,7 +153,7 @@ setMethod("avg", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "avg", x@jc) - column(jc) + column(jc, x@df) }) #' base64 @@ -170,7 +170,7 @@ setMethod("base64", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "base64", x@jc) - column(jc) + column(jc, x@df) }) #' bin @@ -187,7 +187,7 @@ setMethod("bin", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "bin", x@jc) - column(jc) + column(jc, x@df) }) #' bitwiseNOT @@ -203,7 +203,7 @@ setMethod("bitwiseNOT", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "bitwiseNOT", x@jc) - column(jc) + column(jc, x@df) }) #' cbrt @@ -219,7 +219,7 @@ setMethod("cbrt", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "cbrt", x@jc) - column(jc) + column(jc, x@df) }) #' ceil @@ -235,7 +235,7 @@ 
setMethod("ceil", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "ceil", x@jc) - column(jc) + column(jc, x@df) }) #' Though scala functions has "col" function, we don't expose it in SparkR @@ -272,7 +272,7 @@ setMethod("corr", signature(x = "Column"), function(x, col2) { stopifnot(class(col2) == "Column") jc <- callJStatic("org.apache.spark.sql.functions", "corr", x@jc, col2@jc) - column(jc) + column(jc, x@df) }) #' cov @@ -301,12 +301,14 @@ setMethod("cov", signature(x = "characterOrColumn"), setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"), function(col1, col2) { stopifnot(class(col1) == class(col2)) + df <- NULL if (class(col1) == "Column") { + df <- col1@df col1 <- col1@jc col2 <- col2@jc } jc <- callJStatic("org.apache.spark.sql.functions", "covar_samp", col1, col2) - column(jc) + column(jc, df) }) #' covar_pop @@ -325,12 +327,14 @@ setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterO setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"), function(col1, col2) { stopifnot(class(col1) == class(col2)) + df <- NULL if (class(col1) == "Column") { + df <- col1@df col1 <- col1@jc col2 <- col2@jc } jc <- callJStatic("org.apache.spark.sql.functions", "covar_pop", col1, col2) - column(jc) + column(jc, df) }) #' cos @@ -346,7 +350,7 @@ setMethod("cos", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "cos", x@jc) - column(jc) + column(jc, x@df) }) #' cosh @@ -362,7 +366,7 @@ setMethod("cosh", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "cosh", x@jc) - column(jc) + column(jc, x@df) }) #' count @@ -378,7 +382,7 @@ setMethod("count", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "count", x@jc) - column(jc) + column(jc, x@df) }) #' crc32 @@ -395,7 +399,7 @@ setMethod("crc32", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "crc32", x@jc) - column(jc) + column(jc, x@df) }) #' hash @@ -415,7 +419,7 @@ setMethod("hash", x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "hash", jcols) - column(jc) + column(jc, x@df) }) #' dayofmonth @@ -431,7 +435,7 @@ setMethod("dayofmonth", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "dayofmonth", x@jc) - column(jc) + column(jc, x@df) }) #' dayofyear @@ -447,7 +451,7 @@ setMethod("dayofyear", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "dayofyear", x@jc) - column(jc) + column(jc, x@df) }) #' decode @@ -464,7 +468,7 @@ setMethod("decode", signature(x = "Column", charset = "character"), function(x, charset) { jc <- callJStatic("org.apache.spark.sql.functions", "decode", x@jc, charset) - column(jc) + column(jc, x@df) }) #' encode @@ -481,7 +485,7 @@ setMethod("encode", signature(x = "Column", charset = "character"), function(x, charset) { jc <- callJStatic("org.apache.spark.sql.functions", "encode", x@jc, charset) - column(jc) + column(jc, x@df) }) #' exp @@ -497,7 +501,7 @@ setMethod("exp", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "exp", x@jc) - column(jc) + column(jc, x@df) }) #' expm1 @@ -513,7 +517,7 @@ setMethod("expm1", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "expm1", x@jc) - column(jc) + column(jc, x@df) }) #' factorial @@ -529,7 +533,7 @@ 
setMethod("factorial", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "factorial", x@jc) - column(jc) + column(jc, x@df) }) #' first @@ -545,7 +549,7 @@ setMethod("first", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "first", x@jc) - column(jc) + column(jc, x@df) }) #' floor @@ -561,7 +565,7 @@ setMethod("floor", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "floor", x@jc) - column(jc) + column(jc, x@df) }) #' hex @@ -577,7 +581,7 @@ setMethod("hex", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "hex", x@jc) - column(jc) + column(jc, x@df) }) #' hour @@ -593,7 +597,7 @@ setMethod("hour", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "hour", x@jc) - column(jc) + column(jc, x@df) }) #' initcap @@ -612,7 +616,7 @@ setMethod("initcap", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "initcap", x@jc) - column(jc) + column(jc, x@df) }) #' is.nan @@ -640,7 +644,7 @@ setMethod("isnan", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "isnan", x@jc) - column(jc) + column(jc, x@df) }) #' kurtosis @@ -656,7 +660,7 @@ setMethod("kurtosis", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "kurtosis", x@jc) - column(jc) + column(jc, x@df) }) #' last @@ -672,7 +676,7 @@ setMethod("last", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "last", x@jc) - column(jc) + column(jc, x@df) }) #' last_day @@ -690,7 +694,7 @@ setMethod("last_day", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "last_day", x@jc) - column(jc) + column(jc, x@df) }) #' length @@ -706,7 +710,7 @@ setMethod("length", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "length", x@jc) - column(jc) + column(jc, x@df) }) #' log @@ -722,7 +726,7 @@ setMethod("log", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "log", x@jc) - column(jc) + column(jc, x@df) }) #' log10 @@ -738,7 +742,7 @@ setMethod("log10", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "log10", x@jc) - column(jc) + column(jc, x@df) }) #' log1p @@ -754,7 +758,7 @@ setMethod("log1p", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "log1p", x@jc) - column(jc) + column(jc, x@df) }) #' log2 @@ -770,7 +774,7 @@ setMethod("log2", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "log2", x@jc) - column(jc) + column(jc, x@df) }) #' lower @@ -786,7 +790,7 @@ setMethod("lower", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "lower", x@jc) - column(jc) + column(jc, x@df) }) #' ltrim @@ -802,7 +806,7 @@ setMethod("ltrim", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "ltrim", x@jc) - column(jc) + column(jc, x@df) }) #' max @@ -818,7 +822,7 @@ setMethod("max", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "max", x@jc) - column(jc) + column(jc, x@df) }) #' md5 @@ -835,7 +839,7 @@ setMethod("md5", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "md5", x@jc) - column(jc) + 
column(jc, x@df) }) #' mean @@ -852,7 +856,7 @@ setMethod("mean", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "mean", x@jc) - column(jc) + column(jc, x@df) }) #' min @@ -868,7 +872,7 @@ setMethod("min", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "min", x@jc) - column(jc) + column(jc, x@df) }) #' minute @@ -884,7 +888,7 @@ setMethod("minute", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "minute", x@jc) - column(jc) + column(jc, x@df) }) #' month @@ -900,7 +904,7 @@ setMethod("month", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "month", x@jc) - column(jc) + column(jc, x@df) }) #' negate @@ -916,7 +920,7 @@ setMethod("negate", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "negate", x@jc) - column(jc) + column(jc, x@df) }) #' quarter @@ -932,7 +936,7 @@ setMethod("quarter", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "quarter", x@jc) - column(jc) + column(jc, x@df) }) #' reverse @@ -948,7 +952,7 @@ setMethod("reverse", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "reverse", x@jc) - column(jc) + column(jc, x@df) }) #' rint @@ -965,7 +969,7 @@ setMethod("rint", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "rint", x@jc) - column(jc) + column(jc, x@df) }) #' round @@ -981,7 +985,7 @@ setMethod("round", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "round", x@jc) - column(jc) + column(jc, x@df) }) #' rtrim @@ -997,7 +1001,7 @@ setMethod("rtrim", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "rtrim", x@jc) - column(jc) + column(jc, x@df) }) #' sd @@ -1035,7 +1039,7 @@ setMethod("second", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "second", x@jc) - column(jc) + column(jc, x@df) }) #' sha1 @@ -1052,7 +1056,7 @@ setMethod("sha1", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "sha1", x@jc) - column(jc) + column(jc, x@df) }) #' signum @@ -1068,7 +1072,7 @@ setMethod("signum", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "signum", x@jc) - column(jc) + column(jc, x@df) }) #' sin @@ -1084,7 +1088,7 @@ setMethod("sin", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "sin", x@jc) - column(jc) + column(jc, x@df) }) #' sinh @@ -1100,7 +1104,7 @@ setMethod("sinh", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "sinh", x@jc) - column(jc) + column(jc, x@df) }) #' skewness @@ -1116,7 +1120,7 @@ setMethod("skewness", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "skewness", x@jc) - column(jc) + column(jc, x@df) }) #' soundex @@ -1132,7 +1136,7 @@ setMethod("soundex", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "soundex", x@jc) - column(jc) + column(jc, x@df) }) #' @rdname sd @@ -1141,7 +1145,7 @@ setMethod("stddev", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "stddev", x@jc) - column(jc) + column(jc, x@df) }) #' stddev_pop @@ -1158,7 +1162,7 @@ setMethod("stddev_pop", signature(x = "Column"), function(x) 
{ jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc) - column(jc) + column(jc, x@df) }) #' stddev_samp @@ -1175,7 +1179,7 @@ setMethod("stddev_samp", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc) - column(jc) + column(jc, x@df) }) #' struct @@ -1197,10 +1201,11 @@ setMethod("struct", if (class(x) == "Column") { jcols <- lapply(list(x, ...), function(x) { x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "struct", jcols) + column(jc, x@df) } else { jc <- callJStatic("org.apache.spark.sql.functions", "struct", x, list(...)) + column(jc) } - column(jc) }) #' sqrt @@ -1216,7 +1221,7 @@ setMethod("sqrt", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "sqrt", x@jc) - column(jc) + column(jc, x@df) }) #' sum @@ -1232,7 +1237,7 @@ setMethod("sum", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "sum", x@jc) - column(jc) + column(jc, x@df) }) #' sumDistinct @@ -1248,7 +1253,7 @@ setMethod("sumDistinct", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "sumDistinct", x@jc) - column(jc) + column(jc, x@df) }) #' tan @@ -1264,7 +1269,7 @@ setMethod("tan", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "tan", x@jc) - column(jc) + column(jc, x@df) }) #' tanh @@ -1280,7 +1285,7 @@ setMethod("tanh", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "tanh", x@jc) - column(jc) + column(jc, x@df) }) #' toDegrees @@ -1296,7 +1301,7 @@ setMethod("toDegrees", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "toDegrees", x@jc) - column(jc) + column(jc, x@df) }) #' toRadians @@ -1312,7 +1317,7 @@ setMethod("toRadians", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "toRadians", x@jc) - column(jc) + column(jc, x@df) }) #' to_date @@ -1328,7 +1333,7 @@ setMethod("to_date", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "to_date", x@jc) - column(jc) + column(jc, x@df) }) #' trim @@ -1344,7 +1349,7 @@ setMethod("trim", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "trim", x@jc) - column(jc) + column(jc, x@df) }) #' unbase64 @@ -1361,7 +1366,7 @@ setMethod("unbase64", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "unbase64", x@jc) - column(jc) + column(jc, x@df) }) #' unhex @@ -1378,7 +1383,7 @@ setMethod("unhex", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "unhex", x@jc) - column(jc) + column(jc, x@df) }) #' upper @@ -1394,7 +1399,7 @@ setMethod("upper", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "upper", x@jc) - column(jc) + column(jc, x@df) }) #' var @@ -1425,7 +1430,7 @@ setMethod("variance", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "variance", x@jc) - column(jc) + column(jc, x@df) }) #' var_pop @@ -1442,7 +1447,7 @@ setMethod("var_pop", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc) - column(jc) + column(jc, x@df) }) #' var_samp @@ -1459,7 +1464,7 @@ setMethod("var_samp", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc) - column(jc) 
+ column(jc, x@df) }) #' weekofyear @@ -1475,7 +1480,7 @@ setMethod("weekofyear", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "weekofyear", x@jc) - column(jc) + column(jc, x@df) }) #' year @@ -1491,7 +1496,7 @@ setMethod("year", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "year", x@jc) - column(jc) + column(jc, x@df) }) #' atan2 @@ -1510,7 +1515,7 @@ setMethod("atan2", signature(y = "Column"), x <- x@jc } jc <- callJStatic("org.apache.spark.sql.functions", "atan2", y@jc, x) - column(jc) + column(jc, y@df) }) #' datediff @@ -1528,7 +1533,7 @@ setMethod("datediff", signature(y = "Column"), x <- x@jc } jc <- callJStatic("org.apache.spark.sql.functions", "datediff", y@jc, x) - column(jc) + column(jc, y@df) }) #' hypot @@ -1546,7 +1551,7 @@ setMethod("hypot", signature(y = "Column"), x <- x@jc } jc <- callJStatic("org.apache.spark.sql.functions", "hypot", y@jc, x) - column(jc) + column(jc, y@df) }) #' levenshtein @@ -1564,7 +1569,7 @@ setMethod("levenshtein", signature(y = "Column"), x <- x@jc } jc <- callJStatic("org.apache.spark.sql.functions", "levenshtein", y@jc, x) - column(jc) + column(jc, y@df) }) #' months_between @@ -1582,7 +1587,7 @@ setMethod("months_between", signature(y = "Column"), x <- x@jc } jc <- callJStatic("org.apache.spark.sql.functions", "months_between", y@jc, x) - column(jc) + column(jc, y@df) }) #' nanvl @@ -1601,7 +1606,7 @@ setMethod("nanvl", signature(y = "Column"), x <- x@jc } jc <- callJStatic("org.apache.spark.sql.functions", "nanvl", y@jc, x) - column(jc) + column(jc, y@df) }) #' pmod @@ -1620,7 +1625,7 @@ setMethod("pmod", signature(y = "Column"), x <- x@jc } jc <- callJStatic("org.apache.spark.sql.functions", "pmod", y@jc, x) - column(jc) + column(jc, y@df) }) @@ -1636,7 +1641,7 @@ setMethod("approxCountDistinct", signature(x = "Column"), function(x, rsd = 0.05) { jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd) - column(jc) + column(jc, x@df) }) #' Count Distinct @@ -1656,7 +1661,7 @@ setMethod("countDistinct", }) jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc, jcols) - column(jc) + column(jc, x@df) }) @@ -1677,7 +1682,7 @@ setMethod("concat", x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "concat", jcols) - column(jc) + column(jc, x@df) }) #' greatest @@ -1699,7 +1704,7 @@ setMethod("greatest", x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "greatest", jcols) - column(jc) + column(jc, x@df) }) #' least @@ -1721,7 +1726,7 @@ setMethod("least", x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "least", jcols) - column(jc) + column(jc, x@df) }) #' ceiling @@ -1796,7 +1801,7 @@ setMethod("n", signature(x = "Column"), setMethod("date_format", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "date_format", y@jc, x) - column(jc) + column(jc, y@df) }) #' from_utc_timestamp @@ -1811,7 +1816,7 @@ setMethod("date_format", signature(y = "Column", x = "character"), setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "from_utc_timestamp", y@jc, x) - column(jc) + column(jc, y@df) }) #' instr @@ -1830,7 +1835,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), setMethod("instr", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "instr", y@jc, x) - 
column(jc) + column(jc, y@df) }) #' next_day @@ -1856,7 +1861,7 @@ setMethod("instr", signature(y = "Column", x = "character"), setMethod("next_day", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "next_day", y@jc, x) - column(jc) + column(jc, y@df) }) #' to_utc_timestamp @@ -1871,7 +1876,7 @@ setMethod("next_day", signature(y = "Column", x = "character"), setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "to_utc_timestamp", y@jc, x) - column(jc) + column(jc, y@df) }) #' add_months @@ -1886,7 +1891,7 @@ setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), setMethod("add_months", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "add_months", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' date_add @@ -1901,7 +1906,7 @@ setMethod("add_months", signature(y = "Column", x = "numeric"), setMethod("date_add", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "date_add", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' date_sub @@ -1916,7 +1921,7 @@ setMethod("date_add", signature(y = "Column", x = "numeric"), setMethod("date_sub", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "date_sub", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' format_number @@ -1939,7 +1944,7 @@ setMethod("format_number", signature(y = "Column", x = "numeric"), jc <- callJStatic("org.apache.spark.sql.functions", "format_number", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' sha2 @@ -1957,7 +1962,7 @@ setMethod("format_number", signature(y = "Column", x = "numeric"), setMethod("sha2", signature(y = "Column", x = "numeric"), function(y, x) { jc <- callJStatic("org.apache.spark.sql.functions", "sha2", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' shiftLeft @@ -1975,7 +1980,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"), jc <- callJStatic("org.apache.spark.sql.functions", "shiftLeft", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' shiftRight @@ -1993,7 +1998,7 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), jc <- callJStatic("org.apache.spark.sql.functions", "shiftRight", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' shiftRightUnsigned @@ -2011,7 +2016,7 @@ setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), jc <- callJStatic("org.apache.spark.sql.functions", "shiftRightUnsigned", y@jc, as.integer(x)) - column(jc) + column(jc, y@df) }) #' concat_ws @@ -2028,7 +2033,7 @@ setMethod("concat_ws", signature(sep = "character", x = "Column"), function(sep, x, ...) 
{ jcols <- lapply(list(x, ...), function(x) { x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "concat_ws", sep, jcols) - column(jc) + column(jc, x@df) }) #' conv @@ -2047,7 +2052,7 @@ setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeri jc <- callJStatic("org.apache.spark.sql.functions", "conv", x@jc, fromBase, toBase) - column(jc) + column(jc, x@df) }) #' expr @@ -2081,7 +2086,7 @@ setMethod("format_string", signature(format = "character", x = "Column"), jc <- callJStatic("org.apache.spark.sql.functions", "format_string", format, jcols) - column(jc) + column(jc, x@df) }) #' from_unixtime @@ -2104,7 +2109,7 @@ setMethod("from_unixtime", signature(x = "Column"), jc <- callJStatic("org.apache.spark.sql.functions", "from_unixtime", x@jc, format) - column(jc) + column(jc, x@df) }) #' locate @@ -2123,7 +2128,7 @@ setMethod("locate", signature(substr = "character", str = "Column"), jc <- callJStatic("org.apache.spark.sql.functions", "locate", substr, str@jc, as.integer(pos)) - column(jc) + column(jc, str@df) }) #' lpad @@ -2140,7 +2145,7 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), jc <- callJStatic("org.apache.spark.sql.functions", "lpad", x@jc, as.integer(len), pad) - column(jc) + column(jc, x@df) }) #' rand @@ -2206,7 +2211,7 @@ setMethod("regexp_extract", jc <- callJStatic("org.apache.spark.sql.functions", "regexp_extract", x@jc, pattern, as.integer(idx)) - column(jc) + column(jc, x@df) }) #' regexp_replace @@ -2224,7 +2229,7 @@ setMethod("regexp_replace", jc <- callJStatic("org.apache.spark.sql.functions", "regexp_replace", x@jc, pattern, replacement) - column(jc) + column(jc, x@df) }) #' rpad @@ -2241,7 +2246,7 @@ setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"), jc <- callJStatic("org.apache.spark.sql.functions", "rpad", x@jc, as.integer(len), pad) - column(jc) + column(jc, x@df) }) #' substring_index @@ -2266,7 +2271,7 @@ setMethod("substring_index", jc <- callJStatic("org.apache.spark.sql.functions", "substring_index", x@jc, delim, as.integer(count)) - column(jc) + column(jc, x@df) }) #' translate @@ -2286,7 +2291,7 @@ setMethod("translate", function(x, matchingString, replaceString) { jc <- callJStatic("org.apache.spark.sql.functions", "translate", x@jc, matchingString, replaceString) - column(jc) + column(jc, x@df) }) #' unix_timestamp @@ -2315,7 +2320,7 @@ setMethod("unix_timestamp", signature(x = "missing", format = "missing"), setMethod("unix_timestamp", signature(x = "Column", format = "missing"), function(x, format) { jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc) - column(jc) + column(jc, x@df) }) #' @rdname unix_timestamp @@ -2324,7 +2329,7 @@ setMethod("unix_timestamp", signature(x = "Column", format = "missing"), setMethod("unix_timestamp", signature(x = "Column", format = "character"), function(x, format = "yyyy-MM-dd HH:mm:ss") { jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc, format) - column(jc) + column(jc, x@df) }) #' when #' @@ -2339,10 +2344,9 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), #' @examples \dontrun{when(df$age == 2, df$age + 1)} setMethod("when", signature(condition = "Column", value = "ANY"), function(condition, value) { - condition <- condition@jc value <- if (class(value) == "Column") { value@jc } else { value } - jc <- callJStatic("org.apache.spark.sql.functions", "when", condition, value) - column(jc) + jc <- callJStatic("org.apache.spark.sql.functions", 
"when", condition@jc, value) + column(jc, condition@df) }) #' ifelse @@ -2362,14 +2366,13 @@ setMethod("when", signature(condition = "Column", value = "ANY"), setMethod("ifelse", signature(test = "Column", yes = "ANY", no = "ANY"), function(test, yes, no) { - test <- test@jc yes <- if (class(yes) == "Column") { yes@jc } else { yes } no <- if (class(no) == "Column") { no@jc } else { no } jc <- callJMethod(callJStatic("org.apache.spark.sql.functions", "when", - test, yes), + test@jc, yes), "otherwise", no) - column(jc) + column(jc, test@df) }) ###################### Window functions###################### @@ -2579,7 +2582,7 @@ setMethod("array_contains", signature(x = "Column", value = "ANY"), function(x, value) { jc <- callJStatic("org.apache.spark.sql.functions", "array_contains", x@jc, value) - column(jc) + column(jc, x@df) }) #' explode @@ -2595,7 +2598,7 @@ setMethod("explode", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "explode", x@jc) - column(jc) + column(jc, x@df) }) #' size @@ -2611,7 +2614,7 @@ setMethod("size", signature(x = "Column"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "size", x@jc) - column(jc) + column(jc, x@df) }) #' sort_array @@ -2636,5 +2639,5 @@ setMethod("sort_array", signature(x = "Column"), function(x, asc = TRUE) { jc <- callJStatic("org.apache.spark.sql.functions", "sort_array", x@jc, asc) - column(jc) + column(jc, x@df) }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 2dba71abec689..5d431ff471a64 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -754,7 +754,7 @@ setGeneric("ceil", function(x) { standardGeneric("ceil") }) #' @rdname col #' @export -setGeneric("column", function(x) { standardGeneric("column") }) +setGeneric("column", function(x, df) { standardGeneric("column") }) #' @rdname concat #' @export diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index cc118108f61cc..300e4e53cfdb1 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -811,9 +811,9 @@ test_that("select operators", { expect_is(df[[2]], "Column") expect_is(df[["age"]], "Column") - expect_is(df[,1], "DataFrame") - expect_equal(columns(df[,1]), c("name")) - expect_equal(columns(df[,"age"]), c("age")) + expect_is(df[,1,drop=F], "DataFrame") + expect_equal(columns(df[,1,drop=F]), c("name")) + expect_equal(columns(df[,"age",drop=F]), c("age")) df2 <- df[,c("age", "name")] expect_is(df2, "DataFrame") expect_equal(columns(df2), c("age", "name")) @@ -878,13 +878,13 @@ test_that("subsetting", { expect_equal(columns(filtered), c("name", "age")) expect_equal(collect(filtered)$name, "Andy") - df2 <- df[df$age == 19, 1] + df2 <- df[df$age == 19, 1, drop=F] expect_is(df2, "DataFrame") expect_equal(count(df2), 1) expect_equal(columns(df2), c("name")) expect_equal(collect(df2)$name, "Justin") - df3 <- df[df$age > 20, 2] + df3 <- df[df$age > 20, 2, drop=F] expect_equal(count(df3), 1) expect_equal(columns(df3), c("age")) @@ -900,7 +900,7 @@ test_that("subsetting", { expect_equal(count(df6), 1) expect_equal(columns(df6), c("name", "age")) - df7 <- subset(df, select = "name") + df7 <- subset(df, select = "name", drop=F) expect_equal(count(df7), 3) expect_equal(columns(df7), c("name")) @@ -1047,13 +1047,13 @@ test_that("column functions", { schema = c("a", "b", "c")) result <- collect(select(df, struct("a", "c"))) expected <- data.frame(row.names = 1:2) - expected$"struct(a, c)" <- list(listToStruct(list(a = 1L, c = 
3L)), + expected$"struct(a,c)" <- list(listToStruct(list(a = 1L, c = 3L)), listToStruct(list(a = 4L, c = 6L))) expect_equal(result, expected) result <- collect(select(df, struct(df$a, df$b))) expected <- data.frame(row.names = 1:2) - expected$"struct(a, b)" <- list(listToStruct(list(a = 1L, b = 2L)), + expected$"struct(a,b)" <- list(listToStruct(list(a = 1L, b = 2L)), listToStruct(list(a = 4L, b = 5L))) expect_equal(result, expected) @@ -1813,7 +1813,7 @@ test_that("attach() on a DataFrame", { stat2 <- summary(age) expect_equal(collect(stat2)[5, "age"], "30") detach("df") - stat3 <- summary(df[, "age"]) + stat3 <- summary(df[, "age", drop=F]) expect_equal(collect(stat3)[5, "age"], "30") expect_error(age) }) @@ -1853,7 +1853,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { df1 <- select(df, cast(df$age, "integer")) coltypes(df) <- c("character", "integer") expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"))) - value <- collect(df[, 2])[[3, 1]] + value <- collect(df[, 2, drop=F])[[3, 1]] expect_equal(value, collect(df1)[[3, 1]]) expect_equal(value, 22) @@ -1897,6 +1897,27 @@ test_that("Method str()", { expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris))) }) +test_that("collect/show/head on Columns", { + + # collect + x <- irisDF$Sepal_Length + 100 + y <- cos(x + irisDF$Sepal_Width) ^ 2 + z <- sin(x + irisDF$Sepal_Width) ^ 2 + expect_equal(any(collect(y + z) == 1), TRUE) + + # show and print + expect_equal(capture.output(show(z + y))[1], " [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1") + expect_equal(capture.output(print(z + y))[1], " [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1") + + # head + expect_equal(all(round(head(z + y)) == 1), TRUE) + expect_equal(length(head(z + y, 100)), 100) + + # Columns without parent DataFrame + expect_error(x <- collect(rand()), + "This column cannot be collected as it's not associated to any DataFrame.") +}) + unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) From fbf9b02b478b8eb4845232e09932d068cb393fd8 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 23 Feb 2016 16:00:52 -0800 Subject: [PATCH 02/31] Removed drop=F from other PR --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 300e4e53cfdb1..bc6fb2eebe2e7 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -811,9 +811,9 @@ test_that("select operators", { expect_is(df[[2]], "Column") expect_is(df[["age"]], "Column") - expect_is(df[,1,drop=F], "DataFrame") - expect_equal(columns(df[,1,drop=F]), c("name")) - expect_equal(columns(df[,"age",drop=F]), c("age")) + expect_is(df[,1], "DataFrame") + expect_equal(columns(df[,1]), c("name")) + expect_equal(columns(df[,"age"]), c("age")) df2 <- df[,c("age", "name")] expect_is(df2, "DataFrame") expect_equal(columns(df2), c("age", "name")) @@ -878,13 +878,13 @@ test_that("subsetting", { expect_equal(columns(filtered), c("name", "age")) expect_equal(collect(filtered)$name, "Andy") - df2 <- df[df$age == 19, 1, drop=F] + df2 <- df[df$age == 19, 1] expect_is(df2, "DataFrame") expect_equal(count(df2), 1) expect_equal(columns(df2), c("name")) expect_equal(collect(df2)$name, "Justin") - df3 <- df[df$age > 20, 2, drop=F] + df3 <- df[df$age > 20, 2] expect_equal(count(df3), 1) expect_equal(columns(df3), c("age")) @@ -900,7 +900,7 @@ test_that("subsetting", { expect_equal(count(df6), 1) expect_equal(columns(df6), c("name", "age")) - df7 <- subset(df, select = "name", drop=F) + df7 <- subset(df, select = "name") expect_equal(count(df7), 3) expect_equal(columns(df7), c("name")) @@ -1813,7 +1813,7 @@ test_that("attach() on a DataFrame", { stat2 <- summary(age) expect_equal(collect(stat2)[5, "age"], "30") detach("df") - stat3 <- summary(df[, "age", drop=F]) + stat3 <- summary(df[, "age"]) expect_equal(collect(stat3)[5, "age"], "30") expect_error(age) }) @@ -1853,7 +1853,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { df1 <- select(df, cast(df$age, "integer")) coltypes(df) <- c("character", "integer") expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"))) - value <- collect(df[, 2, drop=F])[[3, 1]] + value <- collect(df[, 2])[[3, 1]] expect_equal(value, collect(df1)[[3, 1]]) expect_equal(value, 22) From 04e728b58ea1dde43fb12a1f458f2b175161c7de Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 23 Feb 2016 17:35:34 -0800 Subject: [PATCH 03/31] Removed NA default value for df in column() method --- R/pkg/R/column.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 08fa4960c3391..2787d7e610fe7 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -66,7 +66,7 @@ setMethod("head", signature="Column", definition=function(x, n=6) { setMethod("column", signature(x = "jobj"), - function(x, df=NA) { + function(x, df) { new("Column", x, df) }) From 2d9ee18dbdbcd25882a18e8fa0fa404d87c497c1 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 23 Feb 2016 19:51:17 -0800 Subject: [PATCH 04/31] Added docs to head(Column) --- R/pkg/R/DataFrame.R | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0d35429a9bcd3..17ee55f0583bd 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -966,13 +966,14 @@ setMethod("take", #' Head #' -#' Return the first NUM rows of a DataFrame as a data.frame. 
If NUM is NULL, -#' then head() returns the first 6 rows in keeping with the current data.frame -#' convention in R. +#' Return the first elements of a dataset. If \code{x} is a DataFrame, its first +#' rows will be returned as a data.frame. If the dataset is a \code{Column}, its first +#' elements will be returned as a vector. The number of elements to be returned +#' is given by parameter \code{num}. Default value for \code{num} is 6. #' -#' @param x A SparkSQL DataFrame +#' @param x A Spark DataFrame or Column #' @param num The number of rows to return. Default is 6. -#' @return A data.frame +#' @return A data.frame or vector #' #' @family DataFrame functions #' @rdname head @@ -980,11 +981,18 @@ setMethod("take", #' @export #' @examples #'\dontrun{ +#' # Initialize Spark context and SQL context #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) -#' path <- "path/to/file.json" -#' df <- read.json(sqlContext, path) -#' head(df) +#' +#' # Create a DataFrame from the Iris dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Get the first 6 elements of the DataFrame +#' head(irisDF) +#' +#' # Get the first 20 elements of a Column +#' head(irisDF$Sepal_Length) #' } setMethod("head", signature(x = "DataFrame"), From dc3df1926bb3931459a0ea381e5695b7bb56ca15 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 23 Feb 2016 21:23:48 -0800 Subject: [PATCH 05/31] Still can't recreate build issue. Added default value df=NULL --- R/pkg/R/column.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 2787d7e610fe7..fac622f9954f4 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -66,7 +66,7 @@ setMethod("head", signature="Column", definition=function(x, n=6) { setMethod("column", signature(x = "jobj"), - function(x, df) { + function(x, df=NULL) { new("Column", x, df) }) From 1e06d3ccbb0cc441b9929f626812b8bc7dcbbda3 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 23 Feb 2016 22:09:41 -0800 Subject: [PATCH 06/31] Noticed collate order is automatically changed after running install-dev.sh. Therefore, adding DESCRIPTION --- R/pkg/DESCRIPTION | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 465bc37788e5d..0cd0d75df0f70 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -18,10 +18,10 @@ Collate: 'schema.R' 'generics.R' 'jobj.R' - 'RDD.R' - 'pairRDD.R' 'column.R' 'group.R' + 'RDD.R' + 'pairRDD.R' 'DataFrame.R' 'SQLContext.R' 'backend.R' @@ -36,3 +36,4 @@ Collate: 'stats.R' 'types.R' 'utils.R' +RoxygenNote: 5.0.1 From d697d447ab8467eedade5d6e4c130dcf9a251e5e Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 23 Feb 2016 23:53:06 -0800 Subject: [PATCH 07/31] Still can't recreate build issue. 
Added minimal test case to debug it --- R/pkg/R/column.R | 7 +++++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 5 +++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index fac622f9954f4..d94ea0b3df2bf 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -66,8 +66,11 @@ setMethod("head", signature="Column", definition=function(x, n=6) { setMethod("column", signature(x = "jobj"), - function(x, df=NULL) { - new("Column", x, df) + function(x, df) { + if (missing(df)) { + df <- NULL + } + new("Column", jc=x, df=df) }) operators <- list( diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index bc6fb2eebe2e7..aa377d4d80277 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1918,6 +1918,11 @@ test_that("collect/show/head on Columns", { "This column cannot be collected as it's not associated to any DataFrame.") }) +test_that("Minimal column test.", { + x <- column(irisDF$Sepal_Length@jc) + expect_equal(collect(select(irisDF, x))[1, 1], 5.1) +}) + unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) From 24b6154b127ca57e17fa79831b2fee1edcc38b1b Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 24 Feb 2016 00:32:03 -0800 Subject: [PATCH 08/31] Reverted removing spaces in test cases; this works in my environment but fails on Jenkins --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index aa377d4d80277..dbc3d6b4fe1ec 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1047,13 +1047,13 @@ test_that("column functions", { schema = c("a", "b", "c")) result <- collect(select(df, struct("a", "c"))) expected <- data.frame(row.names = 1:2) - expected$"struct(a,c)" <- list(listToStruct(list(a = 1L, c = 3L)), + expected$"struct(a, c)" <- list(listToStruct(list(a = 1L, c = 3L)), listToStruct(list(a = 4L, c = 6L))) expect_equal(result, expected) result <- collect(select(df, struct(df$a, df$b))) expected <- data.frame(row.names = 1:2) - expected$"struct(a,b)" <- list(listToStruct(list(a = 1L, b = 2L)), + expected$"struct(a, b)" <- list(listToStruct(list(a = 1L, b = 2L)), listToStruct(list(a = 4L, b = 5L))) expect_equal(result, expected) From 6a38a3ca79087538dd7d3452a3cb487f9b0b6dd2 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 24 Feb 2016 13:52:33 -0800 Subject: [PATCH 09/31] Added docs for collect(Column) --- R/pkg/R/DataFrame.R | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 17ee55f0583bd..5bb57153c9bca 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -846,9 +846,15 @@ setMethod("dim", c(count(x), ncol(x)) }) -#' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame. +#' Download Spark datasets into R #' -#' @param x A SparkSQL DataFrame +#' If applied to a DataFrame, \code{collect} returns a data.frame. If applied to a +#' Column, it returns a vector of the same type. +#' +#' \strong{Note:} Since R data.frames and vectors are +#' held in memory, ensure that you have enough memory on your system to +#' accommodate the contents.
+#' @param x A Spark DataFrame or Column #' @param stringsAsFactors (Optional) A logical indicating whether or not string columns #' should be converted to factors. FALSE by default. @@ -858,12 +864,18 @@ setMethod("dim", #' @export #' @examples #'\dontrun{ +#' # Initialize Spark context and SQL context #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) -#' path <- "path/to/file.json" -#' df <- read.json(sqlContext, path) -#' collected <- collect(df) -#' firstName <- collected[[1]]$name +#' +#' # Create a DataFrame from the Iris dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Collect it +#' df <- collect(irisDF) +#' +#' # Collect a column +#' v <- collect(irisDF$Sepal_Length * 100) #' } setMethod("collect", signature(x = "DataFrame"), From bf4df2628f2dc214b607ddaa606728771aa685d0 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 30 Mar 2016 11:59:56 -0700 Subject: [PATCH 10/31] Handled case of 'orphan' Columns with no parent DataFrame --- R/pkg/R/column.R | 24 ++++++++++++++++++------ R/pkg/R/functions.R | 27 ++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index d94ea0b3df2bf..bea915ebca036 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -46,22 +46,34 @@ setMethod("initialize", "Column", function(.Object, jc, df) { setMethod("show", signature="Column", definition=function(object) { MAX_ELEMENTS <- 20 - show(head(object, MAX_ELEMENTS)) - cat(paste0("\b...\nDisplaying up to ", as.character(MAX_ELEMENTS) ," elements only.")) + head.df <- head(object, MAX_ELEMENTS) + + if (length(head.df) == 0) { + colname <- callJMethod(object@jc, "toString") + cat(paste0(colname, "\n")) + cat(paste0("\n")) + } else { + show(head.df) + } + if (length(head.df) == MAX_ELEMENTS) { + cat(paste0("\b...\nDisplaying up to ", as.character(MAX_ELEMENTS) ," elements only.")) + } }) setMethod("collect", signature="Column", definition=function(x) { if (is.null(x@df)) { - stop("This column cannot be collected as it's not associated to any DataFrame.") + character(0) + } else { + collect(select(x@df, x))[, 1] } - collect(select(x@df, x))[, 1] }) setMethod("head", signature="Column", definition=function(x, n=6) { if (is.null(x@df)) { - stop("This column cannot be collected as it's not associated to any DataFrame.") + collect(x) + } else { + head(select(x@df, x), n)[, 1] } - head(select(x@df, x), n)[, 1] }) setMethod("column", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 97ccebf42666a..a41f17b1d9c65 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2160,7 +2160,11 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), setMethod("rand", signature(seed = "missing"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "rand") - column(jc) + + # By assigning a one-row data.frame, the result of this function can be collected + # returning a one-element Column + df <- as.DataFrame(sparkRSQL.init(), data.frame(0)) + column(jc, df) }) #' @rdname rand @@ -2169,7 +2173,11 @@ setMethod("rand", signature(seed = "numeric"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "rand", as.integer(seed)) - column(jc) + + # By assigning a one-row data.frame, the result of this function can be collected + # returning a one-element Column + df <- as.DataFrame(sparkRSQL.init(), data.frame(0)) + column(jc, df) }) #' randn @@ -2184,7 +2192,11 @@ setMethod("rand", signature(seed = 
"numeric"), setMethod("randn", signature(seed = "missing"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "randn") - column(jc) + + # By assigning a one-row data.frame, the result of this function can be collected + # returning a one-element Column + df <- as.DataFrame(sparkRSQL.init(), data.frame(0)) + column(jc, df) }) #' @rdname randn @@ -2193,7 +2205,8 @@ setMethod("randn", signature(seed = "missing"), setMethod("randn", signature(seed = "numeric"), function(seed) { jc <- callJStatic("org.apache.spark.sql.functions", "randn", as.integer(seed)) - column(jc) + df <- as.DataFrame(sparkRSQL.init(), data.frame(0)) + column(jc, df) }) #' regexp_extract @@ -2311,7 +2324,11 @@ setMethod("translate", setMethod("unix_timestamp", signature(x = "missing", format = "missing"), function(x, format) { jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp") - column(jc) + + # By assigning a one-row data.frame, the result of this function can be collected + # returning a one-element Column + df <- as.DataFrame(sparkRSQL.init(), data.frame(0)) + column(jc, df) }) #' @rdname unix_timestamp From c86bebb9ce02e5623d328d33109fff1ee8eeca56 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 30 Mar 2016 12:02:14 -0700 Subject: [PATCH 11/31] Added tests for orphan Columns --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index dbc3d6b4fe1ec..839b07dd1f10c 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1914,8 +1914,7 @@ test_that("collect/show/head on Columns", { expect_equal(length(head(z + y, 100)), 100) # Columns without parent DataFrame - expect_error(x <- collect(rand()), - "This column cannot be collected as it's not associated to any DataFrame.") + expect_equal(is.numeric(collect(rand())), TRUE) }) test_that("Minimal column test.", { From e5659ee85391937cdf301bd1acc01487bc55c129 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Wed, 30 Mar 2016 15:30:21 -0700 Subject: [PATCH 12/31] Fixed style issues --- R/pkg/R/column.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index bea915ebca036..450026f54246b 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -44,7 +44,7 @@ setMethod("initialize", "Column", function(.Object, jc, df) { .Object }) -setMethod("show", signature="Column", definition=function(object) { +setMethod("show", signature = "Column", definition = function(object) { MAX_ELEMENTS <- 20 head.df <- head(object, MAX_ELEMENTS) @@ -56,11 +56,11 @@ setMethod("show", signature="Column", definition=function(object) { show(head.df) } if (length(head.df) == MAX_ELEMENTS) { - cat(paste0("\b...\nDisplaying up to ", as.character(MAX_ELEMENTS) ," elements only.")) + cat(paste0("\b...\nDisplaying up to ", as.character(MAX_ELEMENTS), " elements only.")) } }) -setMethod("collect", signature="Column", definition=function(x) { +setMethod("collect", signature = "Column", definition = function(x) { if (is.null(x@df)) { character(0) } else { @@ -68,7 +68,7 @@ setMethod("collect", signature="Column", definition=function(x) { } }) -setMethod("head", signature="Column", definition=function(x, n=6) { +setMethod("head", signature = "Column", definition = function(x, n=6) { if (is.null(x@df)) { collect(x) } else { @@ -82,7 +82,7 @@ setMethod("column", if (missing(df)) { df <- NULL } - new("Column", jc=x, df=df) + new("Column", jc = x, df = df) }) operators <- list( From 9c1661f0fe892fd3dcd40e8e489a940c9b129747 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Fri, 6 May 2016 16:04:31 -0700 Subject: [PATCH 13/31] pkg/R/columnR --- R/pkg/R/column.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 7e80f298992d1..f3df8c4247be8 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # -setOldClass("DataFrame") -setClassUnion("DataFrameOrNULL", c("DataFrame", "NULL")) +setOldClass("SparkDataFrame") +setClassUnion("SparkDataFrameOrNull", c("SparkDataFrame", "NULL")) # Column Class @@ -31,7 +31,7 @@ setOldClass("jobj") #' @slot jc reference to JVM SparkDataFrame column #' @export setClass("Column", - slots = list(jc = "jobj", df = "DataFrameOrNULL")) + slots = list(jc = "jobj", df = "SparkDataFrameOrNull")) setMethod("initialize", "Column", function(.Object, jc, df) { .Object@jc <- jc From ed0abf24d7f65ad2381f6d664ba23e440013c97a Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 11 Oct 2016 15:39:15 -0700 Subject: [PATCH 14/31] Removed method collect() --- R/pkg/R/DataFrame.R | 9 +++------ R/pkg/R/column.R | 8 -------- R/pkg/inst/tests/testthat/test_sparkSQL.R | 6 +++--- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 6c5d8e486c2fb..b52610d557099 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1037,13 +1037,12 @@ setMethod("dim", #' Download Spark datasets into R #' -#' If applied to a SparkDataFrame, \code{collect} returns a data.frame. If applied to a -#' Column, it returns a vector of the same type. +#' Converts a SparkDataFrame into a data.frame. #' -#' \strong{Note:} Since R data.frames and vectors are +#' \strong{Note:} Since R data.frames are #' held in memory, ensure that you have enough memory on your system to #' accommodate the contents. 
-#' @param x A SparkDataFrame or Column +#' @param x A SparkDataFrame #' @param stringsAsFactors (Optional) A logical indicating whether or not string columns #' should be converted to factors. FALSE by default. @@ -1066,8 +1065,6 @@ setMethod("dim", #' # Collect it #' df <- collect(irisDF) #' -#' # Collect a column -#' v <- collect(irisDF$Sepal_Length * 100) #' } #' @note collect since 1.4.0 setMethod("collect", diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 6d9bed6fbb559..3140b3dfcf85d 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -67,14 +67,6 @@ setMethod("show", signature = "Column", definition = function(object) { } }) -setMethod("collect", signature = "Column", definition = function(x) { - if (is.null(x@df)) { - character(0) - } else { - collect(select(x@df, x))[, 1] - } -}) - setMethod("head", signature = "Column", definition = function(x, n=6) { if (is.null(x@df)) { collect(x) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3eea9f8ae7925..b44a8f24a2506 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2258,7 +2258,7 @@ test_that("collect/show/head on Columns", { x <- irisDF$Sepal_Length + 100 y <- cos(x + irisDF$Sepal_Width) ^ 2 z <- sin(x + irisDF$Sepal_Width) ^ 2 - expect_equal(any(collect(y + z) == 1), TRUE) + expect_equal(any(head(y + z) == 1), TRUE) # show and print expect_equal(capture.output(show(z + y))[1], " [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1") @@ -2269,12 +2269,12 @@ test_that("collect/show/head on Columns", { expect_equal(length(head(z + y, 100)), 100) # Columns without parent DataFrame - expect_equal(is.numeric(collect(rand())), TRUE) + expect_equal(is.numeric(head(rand())), TRUE) }) test_that("Minimal column test.", { x <- column(irisDF$Sepal_Length@jc) - expect_equal(collect(select(irisDF, x))[1, 1], 5.1) + expect_equal(head(select(irisDF, x))[1, 1], 5.1) }) test_that("Histogram", { From d2470fa736e92ee907426a031bce98e34fa3eb86 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 12 Oct 2016 09:58:59 -0700 Subject: [PATCH 15/31] Removed spark.init() call, fixed docs --- R/pkg/R/DataFrame.R | 41 +++++++++++++---------------------------- R/pkg/R/column.R | 2 +- R/pkg/R/functions.R | 10 +++++----- R/pkg/R/generics.R | 4 ++++ 4 files changed, 23 insertions(+), 34 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index b52610d557099..75b733f3ccb5f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1035,16 +1035,10 @@ setMethod("dim", c(count(x), ncol(x)) }) -#' Download Spark datasets into R +#' Collects all the elements of a SparkDataFrame and coerces them into an R data.frame. #' -#' Converts a SparkDataFrame into a data.frame. -#' -#' \strong{Note:} Since R data.frames are -#' held in memory, ensure that you have enough memory on your system to -#' accommodate the contents. -#' @param x A SparkDataFrame -#' @param stringsAsFactors (Optional) A logical indicating whether or not string columns - +#' @param x a SparkDataFrame. +#' @param stringsAsFactors (Optional) a logical indicating whether or not string columns #' should be converted to factors. FALSE by default. #' @param ... further arguments to be passed to or from other methods. 
From d2470fa736e92ee907426a031bce98e34fa3eb86 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Wed, 12 Oct 2016 09:58:59 -0700
Subject: [PATCH 15/31] Removed sparkR.init() call, fixed docs

---
 R/pkg/R/DataFrame.R | 41 +++++++++++++----------------------
 R/pkg/R/column.R    |  2 +-
 R/pkg/R/functions.R | 10 +++++-----
 R/pkg/R/generics.R  |  4 ++++
 4 files changed, 23 insertions(+), 34 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index b52610d557099..75b733f3ccb5f 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1035,16 +1035,10 @@ setMethod("dim",
     c(count(x), ncol(x))
   })

-#' Download Spark datasets into R
+#' Collects all the elements of a SparkDataFrame and coerces them into an R data.frame.
 #'
-#' Converts a SparkDataFrame into a data.frame.
-#'
-#' \strong{Note:} Since R data.frames are
-#' held in memory, ensure that you have enough memory on your system to
-#' accommodate the contents.
-#' @param x A SparkDataFrame
-#' @param stringsAsFactors (Optional) A logical indicating whether or not string columns
-
+#' @param x a SparkDataFrame.
+#' @param stringsAsFactors (Optional) a logical indicating whether or not string columns
 #' should be converted to factors. FALSE by default.
 #' @param ... further arguments to be passed to or from other methods.
 #'
@@ -1055,16 +1049,11 @@
 #' @export
 #' @examples
 #'\dontrun{
-#' # Initialize Spark context and SQL context
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#'
-#' # Create a DataFrame from the Iris dataset
-#' irisDF <- createDataFrame(sqlContext, iris)
-#'
-#' # Collect it
-#' df <- collect(irisDF)
-#'
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' collected <- collect(df)
+#' firstName <- collected[[1]]$name
 #' }
 #' @note collect since 1.4.0
 setMethod("collect",
@@ -1178,25 +1167,22 @@ setMethod("take",
   })

 #' Head
-#'
-#' Return the first elements of a dataset. If \code{x} is a SparkDataFrame, its first
+#' Return the first elements of a SparkDataFrame or Column. If \code{x} is a SparkDataFrame, its first
 #' rows will be returned as a data.frame. If the dataset is a \code{Column}, its first
 #' elements will be returned as a vector. The number of elements to be returned
 #' is given by parameter \code{num}. Default value for \code{num} is 6.
 #'
-#' @param x A SparkDataFrame or Column
-#' @param num The number of rows to return. Default is 6.
-#' @return A data.frame or vector
+#' @param num the number of rows to return. Default is 6.
+#' @return A data.frame.
 #'
 #' @family SparkDataFrame functions
 #' @aliases head,SparkDataFrame-method
-#' @rdname head
 #' @name head
 #' @export
 #' @examples
 #'\dontrun{
 #' # Initialize Spark context and SQL context
-#' sc <- sparkR.init()
+#' sc <- sparkR.session()
 #' sqlContext <- sparkRSQL.init(sc)
 #'
 #' # Create a DataFrame from the Iris dataset
@@ -1206,8 +1192,7 @@
 #' head(irisDF)
 #'
 #' # Get the first 20 elements of a Column
-#' head(irisDF$Sepal_Length)
-#' }
+#' head(irisDF$Sepal_Length, 20)
 #' @note head since 1.4.0
 setMethod("head",
           signature(x = "SparkDataFrame"),

diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 3140b3dfcf85d..d0429091385ea 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -51,7 +51,7 @@ setMethod("initialize", "Column", function(.Object, jc, df) {
   .Object
 })

-setMethod("show", signature = "Column", definition = function(object) {
+setMethod("show", signature = "Column", function(object) {
   MAX_ELEMENTS <- 20
   head.df <- head(object, MAX_ELEMENTS)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index d2e5a531b5f2c..a053994a5b762 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2848,7 +2848,7 @@ setMethod("rand", signature(seed = "missing"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(sparkRSQL.init(), data.frame(0))
+            df <- as.DataFrame(data.frame(0))
             column(jc, df)
           })

@@ -2863,7 +2863,7 @@ setMethod("rand", signature(seed = "numeric"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(sparkRSQL.init(), data.frame(0))
+            df <- as.DataFrame(data.frame(0))
             column(jc, df)
           })

@@ -2885,7 +2885,7 @@ setMethod("randn", signature(seed = "missing"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(sparkRSQL.init(), data.frame(0))
+            df <- as.DataFrame(data.frame(0))
             column(jc, df)
           })

@@ -2897,7 +2897,7 @@ setMethod("randn", signature(seed = "missing"),
 setMethod("randn", signature(seed = "numeric"),
           function(seed) {
             jc <- callJStatic("org.apache.spark.sql.functions", "randn", as.integer(seed))
-            df <- as.DataFrame(sparkRSQL.init(), data.frame(0))
+            df <- as.DataFrame(data.frame(0))
             column(jc, df)
           })

@@ -3051,7 +3051,7 @@ setMethod("unix_timestamp", signature(x = "missing", format = "missing"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(sparkRSQL.init(), data.frame(0))
+            df <- as.DataFrame(data.frame(0))
             column(jc, df)
           })

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 24c062c8e366e..7cb2348d93614 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -550,6 +550,10 @@ setGeneric("group_by", function(x, ...) { standardGeneric("group_by") })
 #' @export
 setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") })

+#' @param x a SparkDataFrame.
+#' @rdname head
+setGeneric("head")
+
 #' @rdname insertInto
 #' @export
 setGeneric("insertInto", function(x, tableName, ...) { standardGeneric("insertInto") })

From bf739e426da714c9a4ac38fea1f7fc6aff5198a3 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Wed, 12 Oct 2016 10:33:33 -0700
Subject: [PATCH 16/31] Used a singleton to generate empty DataFrame

---
 R/pkg/R/DataFrame.R | 18 +++++++++++++-----
 R/pkg/R/functions.R | 10 +++++-----
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 75b733f3ccb5f..174058290765b 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1166,8 +1166,9 @@ setMethod("take",
     collect(limited)
   })

-#' Head
-#' Return the first elements of a SparkDataFrame or Column. If \code{x} is a SparkDataFrame, its first
+#' Return the first part of a SparkDataFrame or Column
+#'
+#' If \code{x} is a SparkDataFrame, its first
 #' rows will be returned as a data.frame. If the dataset is a \code{Column}, its first
 #' elements will be returned as a vector. The number of elements to be returned
 #' is given by parameter \code{num}. Default value for \code{num} is 6.
@@ -1182,11 +1183,10 @@ setMethod("take",
 #' @examples
 #'\dontrun{
 #' # Initialize Spark context and SQL context
-#' sc <- sparkR.session()
-#' sqlContext <- sparkRSQL.init(sc)
+#' sparkR.session()
 #'
 #' # Create a DataFrame from the Iris dataset
-#' irisDF <- createDataFrame(sqlContext, iris)
+#' irisDF <- as.DataFrame(iris)
 #'
 #' # Get the first 6 elements of the DataFrame
 #' head(irisDF)
@@ -3327,3 +3327,11 @@ setMethod("randomSplit",
   }
   sapply(sdfs, dataFrame)
 })
+
+# A global singleton for an empty SparkR DataFrame.
+getEmptySparkRDataFrame <- function() {
+  if (is.null(.sparkREnv$EMPTY_DF)) {
+    .sparkREnv$EMPTY_DF <- as.DataFrame(data.frame(0))
+  }
+  return(.sparkREnv$EMPTY_DF)
+}
\ No newline at end of file

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index a053994a5b762..923c198fca293 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2848,7 +2848,7 @@ setMethod("rand", signature(seed = "missing"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(data.frame(0))
+            df <- getEmptySparkRDataFrame()
             column(jc, df)
           })

@@ -2863,7 +2863,7 @@ setMethod("rand", signature(seed = "numeric"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(data.frame(0))
+            df <- getEmptySparkRDataFrame()
             column(jc, df)
           })

@@ -2885,7 +2885,7 @@ setMethod("randn", signature(seed = "missing"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(data.frame(0))
+            df <- getEmptySparkRDataFrame()
             column(jc, df)
           })

@@ -2897,7 +2897,7 @@ setMethod("randn", signature(seed = "missing"),
 setMethod("randn", signature(seed = "numeric"),
           function(seed) {
             jc <- callJStatic("org.apache.spark.sql.functions", "randn", as.integer(seed))
-            df <- as.DataFrame(data.frame(0))
+            df <- getEmptySparkRDataFrame()
             column(jc, df)
           })

@@ -3051,7 +3051,7 @@ setMethod("unix_timestamp", signature(x = "missing", format = "missing"),

             # By assigning a one-row data.frame, the result of this function can be collected
             # returning a one-element Column
-            df <- as.DataFrame(data.frame(0))
+            df <- getEmptySparkRDataFrame()
             column(jc, df)
           })

From 20e53e83a4d564c17d0b180ae5eab8ca9d6c1410 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Wed, 12 Oct 2016 10:37:36 -0700
Subject: [PATCH 17/31] Minor docs change

---
 R/pkg/R/generics.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 7cb2348d93614..27d5f97deda5f 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -550,7 +550,7 @@ setGeneric("group_by", function(x, ...) { standardGeneric("group_by") })
 #' @export
 setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") })

-#' @param x a SparkDataFrame.
+#' @param x a SparkDataFrame or Column
 #' @rdname head
 setGeneric("head")

From 266d5ffd87929adf081944de19ee36f892125977 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Wed, 12 Oct 2016 11:40:13 -0700
Subject: [PATCH 18/31] Added missing bracket

---
 R/pkg/R/DataFrame.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 174058290765b..4225de65a6f7d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1193,6 +1193,7 @@ setMethod("take",
 #'
 #' # Get the first 20 elements of a Column
 #' head(irisDF$Sepal_Length, 20)
+#' }
 #' @note head since 1.4.0
 setMethod("head",
           signature(x = "SparkDataFrame"),
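The point of PATCH 16 shows up in rand(), randn(), and unix_timestamp(), which mint Columns that have no parent SparkDataFrame: they now all share one cached one-row DataFrame instead of building a fresh one per call, so their results can still be materialized. A sketch of the observable behavior, under the same session assumptions as the earlier example:

    r <- rand()               # a Column backed by the shared one-row DataFrame
    head(r)                   # one random value, since the backing frame has one row
    is.numeric(head(rand()))  # TRUE, as asserted in the updated test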
Lara Yejas" Date: Wed, 12 Oct 2016 14:18:59 -0700 Subject: [PATCH 19/31] Added documentation for parameter df of class Column --- R/pkg/R/column.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index d0429091385ea..a93e9f0d36a0b 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -31,6 +31,7 @@ setOldClass("jobj") #' @rdname column #' #' @slot jc reference to JVM SparkDataFrame column +#' @slot df the parent SparkDataFrame of the Column object #' @export #' @note Column since 1.4.0 setClass("Column", From 0691c32cacc3218742c7f345b4bc498ba2f826e5 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 13 Oct 2016 12:04:58 -0700 Subject: [PATCH 20/31] Fixed docs issues --- R/pkg/R/column.R | 2 ++ R/pkg/R/generics.R | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index a93e9f0d36a0b..249c03f8e796e 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -52,6 +52,7 @@ setMethod("initialize", "Column", function(.Object, jc, df) { .Object }) +#' @rdname show setMethod("show", signature = "Column", function(object) { MAX_ELEMENTS <- 20 head.df <- head(object, MAX_ELEMENTS) @@ -68,6 +69,7 @@ setMethod("show", signature = "Column", function(object) { } }) +#' @rdname head setMethod("head", signature = "Column", definition = function(x, n=6) { if (is.null(x@df)) { collect(x) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 27d5f97deda5f..65adedead9a2e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -552,7 +552,7 @@ setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") }) #' @param x a SparkDataFrame or Column #' @rdname head -setGeneric("head") +#setGeneric("head") #' @rdname insertInto #' @export From 445407caa11b0f555e32d51008fdf83d5c4ecd97 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 13 Oct 2016 13:49:40 -0700 Subject: [PATCH 21/31] Removed commented code --- R/pkg/R/generics.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 65adedead9a2e..57628f7e66922 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -552,7 +552,6 @@ setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") }) #' @param x a SparkDataFrame or Column #' @rdname head -#setGeneric("head") #' @rdname insertInto #' @export From 1ace2e5785030df790b3565d2a0b24358229d655 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 13 Oct 2016 17:00:01 -0700 Subject: [PATCH 22/31] Fixed docs issues. Renamed parameter n as num for method head to have consistency with DataFrame's head --- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/column.R | 4 ++-- R/pkg/R/generics.R | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 4225de65a6f7d..799bcd9d12a5e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1172,7 +1172,7 @@ setMethod("take", #' rows will be returned as a data.frame. If the dataset is a \code{Column}, its first #' elements will be returned as a vector. The number of elements to be returned #' is given by parameter \code{num}. Default value for \code{num} is 6. -#' +#' @param x a SparkDataFrame or Column #' @param num the number of rows to return. Default is 6. #' @return A data.frame. 
 #'
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 249c03f8e796e..5c490707c3521 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -70,11 +70,11 @@ setMethod("show", signature = "Column", function(object) {
 })

 #' @rdname head
-setMethod("head", signature = "Column", definition = function(x, n=6) {
+setMethod("head", signature = "Column", definition = function(x, num = 6) {
   if (is.null(x@df)) {
     collect(x)
   } else {
-    head(select(x@df, x), n)[, 1]
+    head(select(x@df, x), num)[, 1]
   }
 })

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 57628f7e66922..24c062c8e366e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -550,9 +550,6 @@ setGeneric("group_by", function(x, ...) { standardGeneric("group_by") })
 #' @export
 setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") })

-#' @param x a SparkDataFrame or Column
-#' @rdname head
-
 #' @rdname insertInto
 #' @export
 setGeneric("insertInto", function(x, tableName, ...) { standardGeneric("insertInto") })

From 2bfb8a62adf81e6953e398147e5f8d9122bc47ef Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Fri, 14 Oct 2016 10:32:06 -0700
Subject: [PATCH 23/31] Style fixes

---
 R/pkg/R/column.R                          | 16 +++++++++-------
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 5c490707c3521..d8369a2f8b48d 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -70,13 +70,15 @@ setMethod("show", signature = "Column", function(object) {
 })

 #' @rdname head
-setMethod("head", signature = "Column", definition = function(x, num = 6) {
-  if (is.null(x@df)) {
-    collect(x)
-  } else {
-    head(select(x@df, x), num)[, 1]
-  }
-})
+setMethod("head",
+          signature = "Column",
+          function(x, num = 6L) {
+            if (is.null(x@df)) {
+              collect(x)
+            } else {
+              head(select(x@df, x), num)[, 1]
+            }
+          })

 #' @rdname column
 #' @name column

diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index b44a8f24a2506..8e8fdb42c4aa4 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -2252,7 +2252,7 @@ test_that("Method str()", {
   expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris)))
 })

-test_that("collect/show/head on Columns", {
+test_that("show/head on Columns", {

   # collect

From e0bba0a0655f750751a6b9b05a7f6cf5eb807e05 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Fri, 14 Oct 2016 11:50:45 -0700
Subject: [PATCH 24/31] Fixed docs issues

---
 R/pkg/R/column.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index d8369a2f8b48d..330d35c9cb6cc 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -82,6 +82,7 @@ setMethod("head",

 #' @rdname column
 #' @name column
+#' @param df the parent SparkDataFrame. This is used to retrieve the contents of the column through method head.
 #' @aliases column,jobj-method
 setMethod("column",
           signature(x = "jobj"),
Lara Yejas" Date: Wed, 19 Oct 2016 12:40:25 -0700 Subject: [PATCH 25/31] Cosmetic changes --- R/pkg/R/DataFrame.R | 10 +++++----- R/pkg/R/column.R | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 799bcd9d12a5e..5804d25b6b54c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1178,21 +1178,21 @@ setMethod("take", #' #' @family SparkDataFrame functions #' @aliases head,SparkDataFrame-method +#' @rdname head #' @name head #' @export #' @examples #'\dontrun{ -#' # Initialize Spark context and SQL context #' sparkR.session() #' #' # Create a DataFrame from the Iris dataset -#' irisDF <- as.DataFrame(iris) +#' df <- as.DataFrame(airquality) #' #' # Get the first 6 elements of the DataFrame -#' head(irisDF) +#' head(df) #' #' # Get the first 20 elements of a Column -#' head(irisDF$Sepal_Length, 20) +#' head(df$Ozone, 20) #' } #' @note head since 1.4.0 setMethod("head", @@ -3334,5 +3334,5 @@ getEmptySparkRDataFrame <- function() { if (is.null(.sparkREnv$EMPTY_DF)) { .sparkREnv$EMPTY_DF <- as.DataFrame(data.frame(0)) } - return(.sparkREnv$EMPTY_DF) + .sparkREnv$EMPTY_DF } \ No newline at end of file diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 330d35c9cb6cc..31ef5a632b21e 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -74,7 +74,7 @@ setMethod("head", signature = "Column", function(x, num = 6L) { if (is.null(x@df)) { - collect(x) + character(0) } else { head(select(x@df, x), num)[, 1] } From 76061ad8320c0ab1a52da1d0a7ea96e2db5d8a68 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 19 Oct 2016 15:42:29 -0700 Subject: [PATCH 26/31] More cosmetics --- R/pkg/R/DataFrame.R | 6 +++--- R/pkg/R/column.R | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 5804d25b6b54c..a4c29b80662ab 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3331,8 +3331,8 @@ setMethod("randomSplit", # A global singleton for an empty SparkR DataFrame. getEmptySparkRDataFrame <- function() { - if (is.null(.sparkREnv$EMPTY_DF)) { - .sparkREnv$EMPTY_DF <- as.DataFrame(data.frame(0)) + if (is.null(.sparkREnv$.emptyDataFrame)) { + .sparkREnv$.emptyDataFrame <- as.DataFrame(data.frame(0)) } - .sparkREnv$EMPTY_DF + .sparkREnv$.emptyDataFrame } \ No newline at end of file diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 31ef5a632b21e..91032ebdb7316 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -71,7 +71,7 @@ setMethod("show", signature = "Column", function(object) { #' @rdname head setMethod("head", - signature = "Column", + signature(x = "Column"), function(x, num = 6L) { if (is.null(x@df)) { character(0) From ed1b382524265340d0b692a89c6be388eddb549a Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Fri, 21 Oct 2016 12:38:13 -0700 Subject: [PATCH 27/31] More cosmetics --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a4c29b80662ab..4b53604b58af4 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3331,7 +3331,7 @@ setMethod("randomSplit", # A global singleton for an empty SparkR DataFrame. getEmptySparkRDataFrame <- function() { - if (is.null(.sparkREnv$.emptyDataFrame)) { + if (!exists(".emptyDataFrame", envir=.sparkREnv)) { .sparkREnv$.emptyDataFrame <- as.DataFrame(data.frame(0)) } .sparkREnv$.emptyDataFrame From 1338d71a8f8f1387e08326851e15fcc84d5092f9 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Fri, 21 Oct 2016 13:55:31 -0700 Subject: [PATCH 28/31] Fixed style issues --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 4b53604b58af4..6cd6ec60a8433 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3331,7 +3331,7 @@ setMethod("randomSplit", # A global singleton for an empty SparkR DataFrame. getEmptySparkRDataFrame <- function() { - if (!exists(".emptyDataFrame", envir=.sparkREnv)) { + if (!exists(".emptyDataFrame", envir = .sparkREnv)) { .sparkREnv$.emptyDataFrame <- as.DataFrame(data.frame(0)) } .sparkREnv$.emptyDataFrame From c0f1906f001a9df1efb9968a3c370f11e9378b7e Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 25 Oct 2016 15:29:49 -0700 Subject: [PATCH 29/31] Added test for Columns with no parent DataFrame --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 8e8fdb42c4aa4..01a6cbb1e5603 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2270,6 +2270,10 @@ test_that("show/head on Columns", { # Columns without parent DataFrame expect_equal(is.numeric(head(rand())), TRUE) + + # Columns with NULL parent DataFrame + expect_equal(length(head(column("abc"))), 0) + }) test_that("Minimal column test.", { From 777aee3e8f2adbc7fefca20a8d1fb14faffd96c2 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 25 Oct 2016 15:35:16 -0700 Subject: [PATCH 30/31] Moved head documentation to generics.R --- R/pkg/R/DataFrame.R | 28 ---------------------------- R/pkg/R/generics.R | 30 ++++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 6cd6ec60a8433..3c4d44d1ef2a7 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1166,35 +1166,7 @@ setMethod("take", collect(limited) }) -#' Return the first part of a SparkDataFrame or Column -#' -#' If \code{x} is a SparkDataFrame, its first -#' rows will be returned as a data.frame. If the dataset is a \code{Column}, its first -#' elements will be returned as a vector. The number of elements to be returned -#' is given by parameter \code{num}. Default value for \code{num} is 6. -#' @param x a SparkDataFrame or Column -#' @param num the number of rows to return. Default is 6. -#' @return A data.frame. -#' -#' @family SparkDataFrame functions -#' @aliases head,SparkDataFrame-method #' @rdname head -#' @name head -#' @export -#' @examples -#'\dontrun{ -#' sparkR.session() -#' -#' # Create a DataFrame from the Iris dataset -#' df <- as.DataFrame(airquality) -#' -#' # Get the first 6 elements of the DataFrame -#' head(df) -#' -#' # Get the first 20 elements of a Column -#' head(df$Ozone, 20) -#' } -#' @note head since 1.4.0 setMethod("head", signature(x = "SparkDataFrame"), function(x, num = 6L) { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 24c062c8e366e..1753251f96c03 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -980,6 +980,36 @@ setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") #' @export setGeneric("greatest", function(x, ...) { standardGeneric("greatest") }) +#' Return the first part of a SparkDataFrame or Column +#' +#' If \code{x} is a SparkDataFrame, its first +#' rows will be returned as a data.frame. 
+#' elements will be returned as a vector. The number of elements to be returned
+#' is given by parameter \code{num}. Default value for \code{num} is 6.
+#' @param x a SparkDataFrame or Column
+#' @param num the number of rows to return. Default is 6.
+#' @return A data.frame.
+#'
+#' @family SparkDataFrame functions
+#' @aliases head,SparkDataFrame-method
+#' @rdname head
+#' @name head
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#'
+#' # Create a DataFrame from the Iris dataset
+#' df <- as.DataFrame(airquality)
+#'
+#' # Get the first 6 elements of the DataFrame
+#' head(df)
+#'
+#' # Get the first 20 elements of a Column
+#' head(df$Ozone, 20)
+#' }
+#' @note head since 1.4.0
+
 #' @rdname hex
 #' @export
 setGeneric("hex", function(x) { standardGeneric("hex") })

From 619f23b367b94a47f3f63dea008ab4d674218b34 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Tue, 25 Oct 2016 15:40:29 -0700
Subject: [PATCH 31/31] Moved documentation for show method to generics.R

---
 R/pkg/R/DataFrame.R | 18 ------------------
 R/pkg/R/generics.R  | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 3c4d44d1ef2a7..6fa72767de23d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -223,25 +223,7 @@ setMethod("showDF",
     cat(s)
   })

-#' show
-#'
-#' Print class and type information of a Spark object.
-#'
-#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec.
-#'
-#' @family SparkDataFrame functions
 #' @rdname show
-#' @aliases show,SparkDataFrame-method
-#' @name show
-#' @export
-#' @examples
-#'\dontrun{
-#' sparkR.session()
-#' path <- "path/to/file.json"
-#' df <- read.json(path)
-#' show(df)
-#'}
-#' @note show(SparkDataFrame) since 1.4.0
 setMethod("show", "SparkDataFrame",
           function(object) {
             cols <- lapply(dtypes(object), function(l) {

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 1753251f96c03..fd94838942038 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -683,6 +683,27 @@ setGeneric("select", function(x, col, ...) { standardGeneric("select") } )
 #' @export
 setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") })

+#' show
+#'
+#' Print class and type information of a Spark object.
+#'
+#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec.
+#'
+#' @family SparkDataFrame functions
+#' @rdname show
+#' @aliases show,SparkDataFrame-method
+#' @name show
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' show(df)
+#'}
+#' @note show(SparkDataFrame) since 1.4.0
+NULL
+
 #' @rdname showDF
 #' @export
 setGeneric("showDF", function(x, ...) { standardGeneric("showDF") })
@@ -1009,6 +1030,7 @@ setGeneric("greatest", function(x, ...) { standardGeneric("greatest") })
 #' head(df$Ozone, 20)
 #' }
 #' @note head since 1.4.0
+NULL

 #' @rdname hex
 #' @export
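Taken together, the series leaves SparkR with the following user-visible behavior; a closing sketch, assuming Spark 2.x with these patches applied (airquality mirrors the final documentation examples):

    library(SparkR)
    sparkR.session()
    df <- as.DataFrame(airquality)

    head(df$Ozone)        # first 6 elements of the Column, as an R vector
    head(df$Ozone, 20)    # explicit element count
    df$Ozone              # show() prints at most 20 elements, then a "..." marker

    head(column("abc"))   # Column with no parent SparkDataFrame: length-0 result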