From d2a7ca8a78a0574fd592f705755461fd6724a1b0 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 26 Jun 2017 11:03:44 -0700 Subject: [PATCH 1/5] update doc for column nonaggregate functions --- R/pkg/R/functions.R | 408 ++++++++++++++++++-------------------------- R/pkg/R/generics.R | 63 ++++--- 2 files changed, 205 insertions(+), 266 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 31028585aaa13..7f3c4264e91fb 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -86,23 +86,47 @@ NULL #' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))} NULL -#' lit +#' Non-aggregate functions for Column operations #' -#' A new \linkS4class{Column} is created to represent the literal value. +#' Non-aggregate functions defined for \code{Column}. +#' +#' @param x Column to compute on. Note the difference in the following methods: +#' \itemize{ +#' \item \code{lit}: it is a literal value or a Column. +#' \item \code{to_json}: it is the column containing the struct or array of the structs. +#' \item \code{from_json}: it is column containing the JSON string. +#' } +#' @param y Column to compute on. +#' @param ... additional argument(s). Note the difference in the following methods: +#' \itemize{ +#' \item \code{to_json}, \code{from_json}: this contains additional named properties to +#' control how it is converted, accepts the same options as the JSON data source. +#' \item \code{expr}: it contains an expression character object to be parsed. +#' } +#' @name column_nonaggregate_functions +#' @rdname column_nonaggregate_functions +#' @seealso coalesce,SparkDataFrame-method +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} +NULL + +#' @details +#' \code{lit}: A new \linkS4class{Column} is created to represent the literal value. #' If the parameter is a \linkS4class{Column}, it is returned unchanged. #' -#' @param x a literal value or a Column. 
-#' @family non-aggregate functions -#' @rdname lit -#' @name lit +#' @rdname column_nonaggregate_functions #' @export -#' @aliases lit,ANY-method +#' @aliases lit lit,ANY-method #' @examples +#' #' \dontrun{ -#' lit(df$name) -#' select(df, lit("x")) -#' select(df, lit("2015-01-01")) -#'} +#' tmp <- mutate(df, v1 = lit(df$mpg), v2 = lit("x"), v3 = lit("2015-01-01"), +#' v4 = negate(df$mpg), v5 = expr('length(model)'), +#' v6 = greatest(df$vs, df$am), v7 = least(df$vs, df$am), +#' v8 = column("mpg")) +#' head(tmp)} #' @note lit since 1.5.0 setMethod("lit", signature("ANY"), function(x) { @@ -297,18 +321,16 @@ setMethod("bin", column(jc) }) -#' bitwiseNOT -#' -#' Computes bitwise NOT. -#' -#' @param x Column to compute on. +#' @details +#' \code{bitwiseNOT}: Computes bitwise NOT. #' -#' @rdname bitwiseNOT -#' @name bitwiseNOT -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions #' @export -#' @aliases bitwiseNOT,Column-method -#' @examples \dontrun{bitwiseNOT(df$c)} +#' @aliases bitwiseNOT bitwiseNOT,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, bitwiseNOT(cast(df$vs, "int"))))} #' @note bitwiseNOT since 1.5.0 setMethod("bitwiseNOT", signature(x = "Column"), @@ -357,16 +379,12 @@ setMethod("ceil", column(jc) }) -#' Returns the first column that is not NA -#' -#' Returns the first column that is not NA, or NA if all inputs are. +#' @details +#' \code{coalesce}: Returns the first column that is not NA, or NA if all inputs are. 
#' -#' @rdname coalesce -#' @name coalesce -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions #' @export #' @aliases coalesce,Column-method -#' @examples \dontrun{coalesce(df$c, df$d, df$e)} #' @note coalesce(Column) since 2.1.1 setMethod("coalesce", signature(x = "Column"), @@ -387,18 +405,12 @@ col <- function(x) { column(callJStatic("org.apache.spark.sql.functions", "col", x)) } -#' Returns a Column based on the given column name -#' -#' Returns a Column based on the given column name. -#' -#' @param x Character column name. +#' @details +#' \code{column}: Returns a Column based on the given column name. #' -#' @rdname column -#' @name column -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions #' @export -#' @aliases column,character-method -#' @examples \dontrun{column("name")} +#' @aliases column column,character-method #' @note column since 1.6.0 setMethod("column", signature(x = "character"), @@ -856,22 +868,12 @@ setMethod("initcap", column(jc) }) -#' is.nan -#' -#' Return true if the column is NaN, alias for \link{isnan} -#' -#' @param x Column to compute on. +#' @details +#' \code{is.nan}: Alias for \link{isnan}. #' -#' @rdname is.nan -#' @name is.nan -#' @family non-aggregate functions -#' @aliases is.nan,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases is.nan is.nan,Column-method #' @export -#' @examples -#' \dontrun{ -#' is.nan(df$c) -#' isnan(df$c) -#' } #' @note is.nan since 2.0.0 setMethod("is.nan", signature(x = "Column"), @@ -879,8 +881,9 @@ setMethod("is.nan", isnan(x) }) -#' @rdname is.nan -#' @name isnan +#' @details +#' \code{is.nan}: Returns true if the column is NaN. +#' @rdname column_nonaggregate_functions #' @aliases isnan,Column-method #' @note isnan since 2.0.0 setMethod("isnan", @@ -1233,18 +1236,12 @@ setMethod("month", column(jc) }) -#' negate -#' -#' Unary minus, i.e. negate the expression. -#' -#' @param x Column to compute on. 
+#' @details +#' \code{negate}: Unary minus, i.e. negate the expression. #' -#' @rdname negate -#' @name negate -#' @family non-aggregate functions -#' @aliases negate,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases negate negate,Column-method #' @export -#' @examples \dontrun{negate(df$c)} #' @note negate since 1.5.0 setMethod("negate", signature(x = "Column"), @@ -1586,23 +1583,19 @@ setMethod("stddev_samp", column(jc) }) -#' struct -#' -#' Creates a new struct column that composes multiple input columns. -#' -#' @param x a column to compute on. -#' @param ... optional column(s) to be included. +#' @details +#' \code{struct}: Creates a new struct column that composes multiple input columns. #' -#' @rdname struct -#' @name struct -#' @family non-aggregate functions -#' @aliases struct,characterOrColumn-method +#' @rdname column_nonaggregate_functions +#' @aliases struct struct,characterOrColumn-method #' @export #' @examples +#' #' \dontrun{ -#' struct(df$c, df$d) -#' struct("col1", "col2") -#' } +#' tmp <- mutate(df, v1 = struct(df$mpg, df$cyl), v2 = struct("hp", "wt", "vs"), +#' v3 = create_array(df$mpg, df$cyl, df$hp), +#' v4 = create_map(lit("x"), lit(1.0), lit("y"), lit(-1.0))) +#' head(tmp)} #' @note struct since 1.6.0 setMethod("struct", signature(x = "characterOrColumn"), @@ -1790,30 +1783,23 @@ setMethod("to_date", column(jc) }) -#' to_json -#' -#' Converts a column containing a \code{structType} or array of \code{structType} into a Column -#' of JSON string. Resolving the Column can fail if an unsupported type is encountered. -#' -#' @param x Column containing the struct or array of the structs -#' @param ... additional named properties to control how it is converted, accepts the same options -#' as the JSON data source. +#' @details +#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType} +#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered. 
#' -#' @family non-aggregate functions -#' @rdname to_json -#' @name to_json -#' @aliases to_json,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases to_json to_json,Column-method #' @export #' @examples +#' #' \dontrun{ #' # Converts a struct into a JSON object -#' df <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") -#' select(df, to_json(df$d, dateFormat = 'dd/MM/yyyy')) +#' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") +#' select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy')) #' #' # Converts an array of structs into a JSON array -#' df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") -#' select(df, to_json(df$people)) -#'} +#' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") +#' df2 <- mutate(df2, people_json = to_json(df2$people))} #' @note to_json since 2.2.0 setMethod("to_json", signature(x = "Column"), function(x, ...) { @@ -2130,20 +2116,13 @@ setMethod("months_between", signature(y = "Column"), column(jc) }) -#' nanvl -#' -#' Returns col1 if it is not NaN, or col2 if col1 is NaN. -#' Both inputs should be floating point columns (DoubleType or FloatType). -#' -#' @param x first Column. -#' @param y second Column. +#' @details +#' \code{nanvl}: Returns the first column (\code{y}) if it is not NaN, or the second column (\code{x}) if +#' the first column is NaN. Both inputs should be floating point columns (DoubleType or FloatType). 
#' -#' @rdname nanvl -#' @name nanvl -#' @family non-aggregate functions -#' @aliases nanvl,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases nanvl nanvl,Column-method #' @export -#' @examples \dontrun{nanvl(df$c, x)} #' @note nanvl since 1.5.0 setMethod("nanvl", signature(y = "Column"), function(y, x) { @@ -2236,20 +2215,13 @@ setMethod("concat", column(jc) }) -#' greatest -#' -#' Returns the greatest value of the list of column names, skipping null values. +#' @details +#' \code{greatest}: Returns the greatest value of the list of column names, skipping null values. #' This function takes at least 2 parameters. It will return null if all parameters are null. #' -#' @param x Column to compute on -#' @param ... other columns -#' -#' @family non-aggregate functions -#' @rdname greatest -#' @name greatest -#' @aliases greatest,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases greatest greatest,Column-method #' @export -#' @examples \dontrun{greatest(df$c, df$d)} #' @note greatest since 1.5.0 setMethod("greatest", signature(x = "Column"), @@ -2263,20 +2235,13 @@ setMethod("greatest", column(jc) }) -#' least -#' -#' Returns the least value of the list of column names, skipping null values. +#' @details +#' \code{least}: Returns the least value of the list of column names, skipping null values. #' This function takes at least 2 parameters. It will return null if all parameters are null. #' -#' @param x Column to compute on -#' @param ... 
other columns -#' -#' @family non-aggregate functions -#' @rdname least -#' @aliases least,Column-method -#' @name least +#' @rdname column_nonaggregate_functions +#' @aliases least least,Column-method #' @export -#' @examples \dontrun{least(df$c, df$d)} #' @note least since 1.5.0 setMethod("least", signature(x = "Column"), @@ -2357,28 +2322,23 @@ setMethod("date_format", signature(y = "Column", x = "character"), column(jc) }) -#' from_json -#' -#' Parses a column containing a JSON string into a Column of \code{structType} with the specified +#' @details +#' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType} with the specified #' \code{schema} or array of \code{structType} if \code{as.json.array} is set to \code{TRUE}. #' If the string is unparseable, the Column will contains the value NA. #' -#' @param x Column containing the JSON string. +#' @rdname column_nonaggregate_functions #' @param schema a structType object to use as the schema to use when parsing the JSON string. #' @param as.json.array indicating if input string is JSON array of objects or a single object. -#' @param ... additional named properties to control how the json is parsed, accepts the same -#' options as the JSON data source. 
-#' -#' @family non-aggregate functions -#' @rdname from_json -#' @name from_json -#' @aliases from_json,Column,structType-method +#' @aliases from_json from_json,Column,structType-method #' @export #' @examples +#' #' \dontrun{ -#' schema <- structType(structField("name", "string"), -#' select(df, from_json(df$value, schema, dateFormat = "dd/MM/yyyy")) -#'} +#' df2 <- sql("SELECT named_struct('name', 'Bob') as people") +#' df2 <- mutate(df2, people_json = to_json(df2$people)) +#' schema <- structType(structField("name", "string")) +#' head(select(df2, from_json(df2$people_json, schema)))} #' @note from_json since 2.2.0 setMethod("from_json", signature(x = "Column", schema = "structType"), function(x, schema, as.json.array = FALSE, ...) { @@ -2681,18 +2641,13 @@ setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeri column(jc) }) -#' expr -#' -#' Parses the expression string into the column that it represents, similar to -#' SparkDataFrame.selectExpr +#' @details +#' \code{expr}: Parses the expression string into the column that it represents, similar to +#' \code{SparkDataFrame.selectExpr} #' -#' @param x an expression character object to be parsed. -#' @family non-aggregate functions -#' @rdname expr -#' @aliases expr,character-method -#' @name expr +#' @rdname column_nonaggregate_functions +#' @aliases expr expr,character-method #' @export -#' @examples \dontrun{expr('length(name)')} #' @note expr since 1.5.0 setMethod("expr", signature(x = "character"), function(x) { @@ -2869,18 +2824,19 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), column(jc) }) -#' rand -#' -#' Generate a random column with independent and identically distributed (i.i.d.) samples +#' @details +#' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.) samples #' from U[0.0, 1.0]. #' +#' @rdname column_nonaggregate_functions #' @param seed a random seed. Can be missing. 
-#' @family non-aggregate functions -#' @rdname rand -#' @name rand -#' @aliases rand,missing-method +#' @aliases rand rand,missing-method #' @export -#' @examples \dontrun{rand()} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, r1 = rand(), r2 = rand(10), r3 = randn(), r4 = randn(10)) +#' head(tmp)} #' @note rand since 1.5.0 setMethod("rand", signature(seed = "missing"), function(seed) { @@ -2888,8 +2844,7 @@ setMethod("rand", signature(seed = "missing"), column(jc) }) -#' @rdname rand -#' @name rand +#' @rdname column_nonaggregate_functions #' @aliases rand,numeric-method #' @export #' @note rand(numeric) since 1.5.0 @@ -2899,18 +2854,13 @@ setMethod("rand", signature(seed = "numeric"), column(jc) }) -#' randn -#' -#' Generate a column with independent and identically distributed (i.i.d.) samples from +#' @details +#' \code{randn}: Generates a column with independent and identically distributed (i.i.d.) samples from #' the standard normal distribution. #' -#' @param seed a random seed. Can be missing. -#' @family non-aggregate functions -#' @rdname randn -#' @name randn -#' @aliases randn,missing-method +#' @rdname column_nonaggregate_functions +#' @aliases randn randn,missing-method #' @export -#' @examples \dontrun{randn()} #' @note randn since 1.5.0 setMethod("randn", signature(seed = "missing"), function(seed) { @@ -2918,8 +2868,7 @@ setMethod("randn", signature(seed = "missing"), column(jc) }) -#' @rdname randn -#' @name randn +#' @rdname column_nonaggregate_functions #' @aliases randn,numeric-method #' @export #' @note randn(numeric) since 1.5.0 @@ -3089,20 +3038,26 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc, format) column(jc) }) -#' when -#' -#' Evaluates a list of conditions and returns one of multiple possible result expressions. 
+ +#' @details +#' \code{when}: Evaluates a list of conditions and returns one of multiple possible result expressions. #' For unmatched expressions null is returned. #' +#' @rdname column_nonaggregate_functions #' @param condition the condition to test on. Must be a Column expression. #' @param value result expression. -#' @family non-aggregate functions -#' @rdname when -#' @name when -#' @aliases when,Column-method -#' @seealso \link{ifelse} +#' @aliases when when,Column-method #' @export -#' @examples \dontrun{when(df$age == 2, df$age + 1)} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, mpg_na = otherwise(when(df$mpg > 20, df$mpg), lit(NaN)), +#' mpg2 = ifelse(df$mpg > 20 & df$am > 0, 0, 1), +#' mpg3 = ifelse(df$mpg > 20, df$mpg, 20.0)) +#' head(tmp) +#' tmp <- mutate(tmp, ind_na1 = is.nan(tmp$mpg_na), ind_na2 = isnan(tmp$mpg_na)) +#' head(select(tmp, coalesce(tmp$mpg_na, tmp$mpg))) +#' head(select(tmp, nanvl(tmp$mpg_na, tmp$hp)))} #' @note when since 1.5.0 setMethod("when", signature(condition = "Column", value = "ANY"), function(condition, value) { @@ -3112,25 +3067,16 @@ setMethod("when", signature(condition = "Column", value = "ANY"), column(jc) }) -#' ifelse -#' -#' Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. +#' @details +#' \code{ifelse}: Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. #' Otherwise \code{no} is returned for unmatched conditions. #' +#' @rdname column_nonaggregate_functions #' @param test a Column expression that describes the condition. #' @param yes return values for \code{TRUE} elements of test. #' @param no return values for \code{FALSE} elements of test. 
-#' @family non-aggregate functions -#' @rdname ifelse -#' @name ifelse -#' @aliases ifelse,Column-method -#' @seealso \link{when} +#' @aliases ifelse ifelse,Column-method #' @export -#' @examples -#' \dontrun{ -#' ifelse(df$a > 1 & df$b > 2, 0, 1) -#' ifelse(df$a > 1, df$a, 1) -#' } #' @note ifelse since 1.5.0 setMethod("ifelse", signature(test = "Column", yes = "ANY", no = "ANY"), @@ -3533,19 +3479,12 @@ setMethod("posexplode", column(jc) }) -#' create_array -#' -#' Creates a new array column. The input columns must all have the same data type. -#' -#' @param x Column to compute on -#' @param ... additional Column(s). +#' @details +#' \code{create_array}: Creates a new array column. The input columns must all have the same data type. #' -#' @family non-aggregate functions -#' @rdname create_array -#' @name create_array -#' @aliases create_array,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases create_array create_array,Column-method #' @export -#' @examples \dontrun{create_array(df$x, df$y, df$z)} #' @note create_array since 2.3.0 setMethod("create_array", signature(x = "Column"), @@ -3558,22 +3497,15 @@ setMethod("create_array", column(jc) }) -#' create_map -#' -#' Creates a new map column. The input columns must be grouped as key-value pairs, +#' @details +#' \code{create_map}: Creates a new map column. The input columns must be grouped as key-value pairs, #' e.g. (key1, value1, key2, value2, ...). #' The key columns must all have the same data type, and can't be null. #' The value columns must all have the same data type. #' -#' @param x Column to compute on -#' @param ... additional Column(s). 
-#' -#' @family non-aggregate functions -#' @rdname create_map -#' @name create_map -#' @aliases create_map,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases create_map create_map,Column-method #' @export -#' @examples \dontrun{create_map(lit("x"), lit(1.0), lit("y"), lit(-1.0))} #' @note create_map since 2.3.0 setMethod("create_map", signature(x = "Column"), @@ -3738,31 +3670,25 @@ setMethod("posexplode_outer", column(jc) }) -#' not -#' -#' Inversion of boolean expression. -#' -#' \code{not} and \code{!} cannot be applied directly to numerical column. -#' To achieve R-like truthiness column has to be casted to \code{BooleanType}. +#' @details +#' \code{not}: Inversion of boolean expression. \code{not} and \code{!} cannot be applied +#' directly to numerical column. To achieve R-like truthiness column has to be casted to \code{BooleanType}. #' -#' @param x Column to compute on -#' @rdname not -#' @name not -#' @aliases not,Column-method -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions +#' @aliases not not,Column-method #' @export #' @examples +#' #' \dontrun{ -#' df <- createDataFrame(data.frame( +#' df2 <- createDataFrame(data.frame( #' is_true = c(TRUE, FALSE, NA), #' flag = c(1, 0, 1) #' )) #' -#' head(select(df, not(df$is_true))) +#' head(select(df2, not(df2$is_true))) #' #' # Explicit cast is required when working with numeric column -#' head(select(df, not(cast(df$flag, "boolean")))) -#' } +#' head(select(df2, not(cast(df2$flag, "boolean"))))} #' @note not since 2.3.0 setMethod("not", signature(x = "Column"), @@ -3840,21 +3766,17 @@ setMethod("grouping_id", column(jc) }) -#' input_file_name -#' -#' Creates a string column with the input file name for a given row +#' @details +#' \code{input_file_name}: Creates a string column with the input file name for a given row. 
#' -#' @rdname input_file_name -#' @name input_file_name -#' @family non-aggregate functions -#' @aliases input_file_name,missing-method +#' @rdname column_nonaggregate_functions +#' @aliases input_file_name input_file_name,missing-method #' @export #' @examples -#' \dontrun{ -#' df <- read.text("README.md") #' -#' head(select(df, input_file_name())) -#' } +#' \dontrun{ +#' tmp <- read.text("README.md") +#' head(select(tmp, input_file_name()))} #' @note input_file_name since 2.3.0 setMethod("input_file_name", signature("missing"), function() { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index f105174cea70d..f2f711f484bb1 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -422,9 +422,8 @@ setGeneric("cache", function(x) { standardGeneric("cache") }) setGeneric("checkpoint", function(x, eager = TRUE) { standardGeneric("checkpoint") }) #' @rdname coalesce -#' @param x a Column or a SparkDataFrame. -#' @param ... additional argument(s). If \code{x} is a Column, additional Columns can be optionally -#' provided. +#' @param x a SparkDataFrame. +#' @param ... additional argument(s). #' @export setGeneric("coalesce", function(x, ...) { standardGeneric("coalesce") }) @@ -863,12 +862,14 @@ setGeneric("rlike", function(x, ...) 
{ standardGeneric("rlike") }) #' @export setGeneric("startsWith", function(x, prefix) { standardGeneric("startsWith") }) -#' @rdname when +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("when", function(condition, value) { standardGeneric("when") }) -#' @rdname otherwise +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") }) #' @rdname over @@ -935,8 +936,9 @@ setGeneric("base64", function(x) { standardGeneric("base64") }) #' @export setGeneric("bin", function(x) { standardGeneric("bin") }) -#' @rdname bitwiseNOT +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) #' @rdname bround @@ -986,12 +988,14 @@ setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") #' @export setGeneric("crc32", function(x) { standardGeneric("crc32") }) -#' @rdname create_array +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("create_array", function(x, ...) { standardGeneric("create_array") }) -#' @rdname create_map +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("create_map", function(x, ...) { standardGeneric("create_map") }) #' @rdname hash @@ -1054,8 +1058,9 @@ setGeneric("explode", function(x) { standardGeneric("explode") }) #' @export setGeneric("explode_outer", function(x) { standardGeneric("explode_outer") }) -#' @rdname expr +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("expr", function(x) { standardGeneric("expr") }) #' @rdname column_datetime_diff_functions @@ -1071,8 +1076,9 @@ setGeneric("format_number", function(y, x) { standardGeneric("format_number") }) #' @export setGeneric("format_string", function(format, x, ...) 
{ standardGeneric("format_string") }) -#' @rdname from_json +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("from_json", function(x, schema, ...) { standardGeneric("from_json") }) #' @rdname column_datetime_functions @@ -1080,8 +1086,9 @@ setGeneric("from_json", function(x, schema, ...) { standardGeneric("from_json") #' @name NULL setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") }) -#' @rdname greatest +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("greatest", function(x, ...) { standardGeneric("greatest") }) #' @rdname column_aggregate_functions @@ -1111,9 +1118,9 @@ setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) #' @export setGeneric("initcap", function(x) { standardGeneric("initcap") }) -#' @param x empty. Should be used with no argument. -#' @rdname input_file_name +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("input_file_name", function(x = "missing") { standardGeneric("input_file_name") }) @@ -1121,8 +1128,9 @@ setGeneric("input_file_name", #' @export setGeneric("instr", function(y, x) { standardGeneric("instr") }) -#' @rdname is.nan +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("isnan", function(x) { standardGeneric("isnan") }) #' @rdname column_aggregate_functions @@ -1147,16 +1155,18 @@ setGeneric("last_day", function(x) { standardGeneric("last_day") }) #' @export setGeneric("lead", function(x, offset, defaultValue = NULL) { standardGeneric("lead") }) -#' @rdname least +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("least", function(x, ...) 
{ standardGeneric("least") }) #' @rdname levenshtein #' @export setGeneric("levenshtein", function(y, x) { standardGeneric("levenshtein") }) -#' @rdname lit +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("lit", function(x) { standardGeneric("lit") }) #' @rdname locate @@ -1204,16 +1214,19 @@ setGeneric("months_between", function(y, x) { standardGeneric("months_between") #' @export setGeneric("n", function(x) { standardGeneric("n") }) -#' @rdname nanvl +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") }) -#' @rdname negate +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("negate", function(x) { standardGeneric("negate") }) -#' @rdname not +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("not", function(x) { standardGeneric("not") }) #' @rdname column_datetime_diff_functions @@ -1252,12 +1265,14 @@ setGeneric("posexplode_outer", function(x) { standardGeneric("posexplode_outer") #' @name NULL setGeneric("quarter", function(x) { standardGeneric("quarter") }) -#' @rdname rand +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("rand", function(seed) { standardGeneric("rand") }) -#' @rdname randn +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("randn", function(seed) { standardGeneric("randn") }) #' @rdname rank @@ -1373,8 +1388,9 @@ setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") }) #' @name NULL setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") }) -#' @rdname struct +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("struct", function(x, ...) 
{ standardGeneric("struct") }) #' @rdname substring_index @@ -1399,8 +1415,9 @@ setGeneric("toRadians", function(x) { standardGeneric("toRadians") }) #' @name NULL setGeneric("to_date", function(x, format) { standardGeneric("to_date") }) -#' @rdname to_json +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("to_json", function(x, ...) { standardGeneric("to_json") }) #' @rdname column_datetime_functions From 4e77d7b73015a175a6b81002f47aec4031f3a2fe Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 26 Jun 2017 14:50:33 -0700 Subject: [PATCH 2/5] fix test error --- R/pkg/R/functions.R | 43 ++++++++++++++++++++++++++++--------------- R/pkg/R/generics.R | 6 ++---- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 7f3c4264e91fb..4fcf8332dfcfd 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -106,6 +106,7 @@ NULL #' @name column_nonaggregate_functions #' @rdname column_nonaggregate_functions #' @seealso coalesce,SparkDataFrame-method +#' @family non-aggregate functions #' @examples #' \dontrun{ #' # Dataframe used throughout this doc @@ -405,12 +406,18 @@ col <- function(x) { column(callJStatic("org.apache.spark.sql.functions", "col", x)) } -#' @details -#' \code{column}: Returns a Column based on the given column name. +#' Returns a Column based on the given column name #' -#' @rdname column_nonaggregate_functions +#' Returns a Column based on the given column name. +#' +#' @param x Character column name. +#' +#' @rdname column +#' @name column +#' @family non-aggregate functions #' @export -#' @aliases column column,character-method +#' @aliases column,character-method +#' @examples \dontrun{column("name")} #' @note column since 1.6.0 setMethod("column", signature(x = "character"), @@ -882,9 +889,9 @@ setMethod("is.nan", }) #' @details -#' \code{is.nan}: Returns true if the column is NaN. +#' \code{isnan}: Returns true if the column is NaN. 
#' @rdname column_nonaggregate_functions -#' @aliases isnan,Column-method +#' @aliases isnan isnan,Column-method #' @note isnan since 2.0.0 setMethod("isnan", signature(x = "Column"), @@ -3670,25 +3677,31 @@ setMethod("posexplode_outer", column(jc) }) -#' @details -#' \code{not}: Inversion of boolean expression. \code{not} and \code{!} cannot be applied -#' directly to numerical column. To achieve R-like truthiness column has to be casted to \code{BooleanType}. +#' not #' -#' @rdname column_nonaggregate_functions -#' @aliases not not,Column-method +#' Inversion of boolean expression. +#' +#' \code{not} and \code{!} cannot be applied directly to a numerical column. +#' To achieve R-like truthiness, the column has to be cast to \code{BooleanType}. +#' +#' @param x Column to compute on +#' @rdname not +#' @name not +#' @aliases not,Column-method +#' @family non-aggregate functions #' @export #' @examples -#' #' \dontrun{ -#' df2 <- createDataFrame(data.frame( +#' df <- createDataFrame(data.frame( #' is_true = c(TRUE, FALSE, NA), #' flag = c(1, 0, 1) #' )) #' -#' head(select(df2, not(df2$is_true))) +#' head(select(df, not(df$is_true))) #' #' # Explicit cast is required when working with numeric column -#' head(select(df2, not(cast(df2$flag, "boolean"))))} +#' head(select(df, not(cast(df$flag, "boolean")))) +#' } #' @note not since 2.3.0 setMethod("not", signature(x = "Column"), diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index f2f711f484bb1..cde0d8a48ad76 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -867,9 +867,8 @@ setGeneric("startsWith", function(x, prefix) { standardGeneric("startsWith") }) #' @name NULL setGeneric("when", function(condition, value) { standardGeneric("when") }) -#' @rdname column_nonaggregate_functions +#' @rdname otherwise #' @export -#' @name NULL setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") }) #' @rdname over @@ -1224,9 +1223,8 @@ setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") }) #' 
@name NULL setGeneric("negate", function(x) { standardGeneric("negate") }) -#' @rdname column_nonaggregate_functions +#' @rdname not #' @export -#' @name NULL setGeneric("not", function(x) { standardGeneric("not") }) #' @rdname column_datetime_diff_functions From e32aacec4ddf9ad9df4b7cab5747201fe0da1c3e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 28 Jun 2017 10:35:34 -0700 Subject: [PATCH 3/5] revert from_json and to_json --- R/pkg/R/functions.R | 65 ++++++++++++++++++++++++--------------------- R/pkg/R/generics.R | 6 ++--- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 4fcf8332dfcfd..ce6837e052da5 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -90,19 +90,10 @@ NULL #' #' Non-aggregate functions defined for \code{Column}. #' -#' @param x Column to compute on. Note the difference in the following methods: -#' \itemize{ -#' \item \code{lit}: it is a literal value or a Column. -#' \item \code{to_json}: it is the column containing the struct or array of the structs. -#' \item \code{from_json}: it is column containing the JSON string. -#' } +#' @param x Column to compute on. In \code{lit}, it is a literal value or a Column. #' @param y Column to compute on. -#' @param ... additional argument(s). Note the difference in the following methods: -#' \itemize{ -#' \item \code{to_json}, \code{from_json}: this contains additional named properties to -#' control how it is converted, accepts the same options as the JSON data source. -#' \item \code{expr}: it contains an expression character object to be parsed. -#' } +#' @param ... additional argument(s). In \code{expr}, it contains an expression character +#' object to be parsed. 
#' @name column_nonaggregate_functions #' @rdname column_nonaggregate_functions #' @seealso coalesce,SparkDataFrame-method @@ -1790,23 +1781,30 @@ setMethod("to_date", column(jc) }) -#' @details -#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType} -#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered. +#' to_json #' -#' @rdname column_nonaggregate_functions -#' @aliases to_json to_json,Column-method +#' Converts a column containing a \code{structType} or array of \code{structType} into a Column +#' of JSON string. Resolving the Column can fail if an unsupported type is encountered. +#' +#' @param x Column containing the struct or array of the structs +#' @param ... additional named properties to control how it is converted, accepts the same options +#' as the JSON data source. +#' +#' @family non-aggregate functions +#' @rdname to_json +#' @name to_json +#' @aliases to_json,Column-method #' @export #' @examples -#' #' \dontrun{ #' # Converts a struct into a JSON object -#' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") -#' select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy')) +#' df <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") +#' select(df, to_json(df$d, dateFormat = 'dd/MM/yyyy')) #' #' # Converts an array of structs into a JSON array -#' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") -#' df2 <- mutate(df2, people_json = to_json(df2$people))} +#' df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") +#' select(df, to_json(df$people)) +#'} #' @note to_json since 2.2.0 setMethod("to_json", signature(x = "Column"), function(x, ...) 
{ @@ -2329,23 +2327,28 @@ setMethod("date_format", signature(y = "Column", x = "character"), column(jc) }) -#' @details -#' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType} with the specified +#' from_json +#' +#' Parses a column containing a JSON string into a Column of \code{structType} with the specified #' \code{schema} or array of \code{structType} if \code{as.json.array} is set to \code{TRUE}. #' If the string is unparseable, the Column will contains the value NA. #' -#' @rdname column_nonaggregate_functions +#' @param x Column containing the JSON string. #' @param schema a structType object to use as the schema to use when parsing the JSON string. #' @param as.json.array indicating if input string is JSON array of objects or a single object. -#' @aliases from_json from_json,Column,structType-method +#' @param ... additional named properties to control how the json is parsed, accepts the same +#' options as the JSON data source. +#' +#' @family non-aggregate functions +#' @rdname from_json +#' @name from_json +#' @aliases from_json,Column,structType-method #' @export #' @examples -#' #' \dontrun{ -#' df2 <- sql("SELECT named_struct('name', 'Bob') as people") -#' df2 <- mutate(df2, people_json = to_json(df2$people)) -#' schema <- structType(structField("name", "string")) -#' head(select(df2, from_json(df2$people_json, schema)))} +#' schema <- structType(structField("name", "string")) +#' select(df, from_json(df$value, schema, dateFormat = "dd/MM/yyyy")) +#'} #' @note from_json since 2.2.0 setMethod("from_json", signature(x = "Column", schema = "structType"), function(x, schema, as.json.array = FALSE, ...) { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index cde0d8a48ad76..ccd5289b6a411 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1075,9 +1075,8 @@ setGeneric("format_number", function(y, x) { standardGeneric("format_number") }) #' @export setGeneric("format_string", function(format, x, ...) 
{ standardGeneric("format_string") }) -#' @rdname column_nonaggregate_functions +#' @rdname from_json #' @export -#' @name NULL setGeneric("from_json", function(x, schema, ...) { standardGeneric("from_json") }) #' @rdname column_datetime_functions @@ -1413,9 +1412,8 @@ setGeneric("toRadians", function(x) { standardGeneric("toRadians") }) #' @name NULL setGeneric("to_date", function(x, format) { standardGeneric("to_date") }) -#' @rdname column_nonaggregate_functions +#' @rdname to_json #' @export -#' @name NULL setGeneric("to_json", function(x, ...) { standardGeneric("to_json") }) #' @rdname column_datetime_functions From 83ecb984682ffbe8243a647f9584f78201f55d78 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 28 Jun 2017 10:41:43 -0700 Subject: [PATCH 4/5] add doc for monotonically_increasing_id --- R/pkg/R/functions.R | 29 +++++++++++++---------------- R/pkg/R/generics.R | 4 ++-- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index ce6837e052da5..bdcded11fbc4f 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -91,6 +91,7 @@ NULL #' Non-aggregate functions defined for \code{Column}. #' #' @param x Column to compute on. In \code{lit}, it is a literal value or a Column. +#' In \code{monotonically_increasing_id}, it should be empty. #' @param y Column to compute on. #' @param ... additional argument(s). In \code{expr}, it contains an expression character #' object to be parsed. @@ -1192,27 +1193,23 @@ setMethod("minute", column(jc) }) -#' monotonically_increasing_id -#' -#' Return a column that generates monotonically increasing 64-bit integers. -#' -#' The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. -#' The current implementation puts the partition ID in the upper 31 bits, and the record number -#' within each partition in the lower 33 bits. 
The assumption is that the SparkDataFrame has -#' less than 1 billion partitions, and each partition has less than 8 billion records. -#' -#' As an example, consider a SparkDataFrame with two partitions, each with 3 records. +#' @details +#' \code{monotonically_increasing_id}: Returns a column that generates monotonically increasing +#' 64-bit integers. The generated ID is guaranteed to be monotonically increasing and unique, +#' but not consecutive. The current implementation puts the partition ID in the upper 31 bits, +#' and the record number within each partition in the lower 33 bits. The assumption is that the +#' SparkDataFrame has less than 1 billion partitions, and each partition has less than 8 billion +#' records. As an example, consider a SparkDataFrame with two partitions, each with 3 records. #' This expression would return the following IDs: #' 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. -#' #' This is equivalent to the MONOTONICALLY_INCREASING_ID function in SQL. #' -#' @rdname monotonically_increasing_id -#' @aliases monotonically_increasing_id,missing-method -#' @name monotonically_increasing_id -#' @family misc functions +#' @rdname column_nonaggregate_functions +#' @aliases monotonically_increasing_id monotonically_increasing_id,missing-method #' @export -#' @examples \dontrun{select(df, monotonically_increasing_id())} +#' @examples +#' +#' \dontrun{head(select(df, monotonically_increasing_id()))} setMethod("monotonically_increasing_id", signature("missing"), function() { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ccd5289b6a411..1c277b5807c89 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1192,9 +1192,9 @@ setGeneric("md5", function(x) { standardGeneric("md5") }) #' @name NULL setGeneric("minute", function(x) { standardGeneric("minute") }) -#' @param x empty. Should be used with no argument. 
-#' @rdname monotonically_increasing_id +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("monotonically_increasing_id", function(x = "missing") { standardGeneric("monotonically_increasing_id") }) From 1d0989a4b075515663db1d41022498f787184a5c Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 28 Jun 2017 23:19:40 -0700 Subject: [PATCH 5/5] address comments --- R/pkg/R/functions.R | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index fd3ba0b696d6b..cb09e847d739a 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -137,10 +137,9 @@ NULL #' Non-aggregate functions defined for \code{Column}. #' #' @param x Column to compute on. In \code{lit}, it is a literal value or a Column. -#' In \code{monotonically_increasing_id}, it should be empty. +#' In \code{expr}, it contains an expression character object to be parsed. #' @param y Column to compute on. -#' @param ... additional argument(s). In \code{expr}, it contains an expression character -#' object to be parsed. +#' @param ... additional Columns. #' @name column_nonaggregate_functions #' @rdname column_nonaggregate_functions #' @seealso coalesce,SparkDataFrame-method @@ -152,8 +151,8 @@ NULL NULL #' @details -#' \code{lit}: A new \linkS4class{Column} is created to represent the literal value. -#' If the parameter is a \linkS4class{Column}, it is returned unchanged. +#' \code{lit}: A new Column is created to represent the literal value. +#' If the parameter is a Column, it is returned unchanged. #' #' @rdname column_nonaggregate_functions #' @export @@ -835,6 +834,18 @@ setMethod("initcap", column(jc) }) +#' @details +#' \code{isnan}: Returns true if the column is NaN. 
+#' @rdname column_nonaggregate_functions +#' @aliases isnan isnan,Column-method +#' @note isnan since 2.0.0 +setMethod("isnan", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "isnan", x@jc) + column(jc) + }) + #' @details #' \code{is.nan}: Alias for \link{isnan}. #' @@ -848,18 +859,6 @@ setMethod("is.nan", isnan(x) }) -#' @details -#' \code{isnan}: Returns true if the column is NaN. -#' @rdname column_nonaggregate_functions -#' @aliases isnan isnan,Column-method -#' @note isnan since 2.0.0 -setMethod("isnan", - signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "isnan", x@jc) - column(jc) - }) - #' @details #' \code{kurtosis}: Returns the kurtosis of the values in a group. #' @@ -1141,6 +1140,7 @@ setMethod("minute", #' This expression would return the following IDs: #' 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. #' This is equivalent to the MONOTONICALLY_INCREASING_ID function in SQL. +#' The method should be used with no argument. #' #' @rdname column_nonaggregate_functions #' @aliases monotonically_increasing_id monotonically_increasing_id,missing-method @@ -3495,6 +3495,7 @@ setMethod("grouping_id", #' @details #' \code{input_file_name}: Creates a string column with the input file name for a given row. +#' The method should be used with no argument. #' #' @rdname column_nonaggregate_functions #' @aliases input_file_name input_file_name,missing-method