Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-13766: [R] Add slice_*() methods #14361

Merged
merged 11 commits into from
Oct 13, 2022
1 change: 1 addition & 0 deletions r/.lintr
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ linters: linters_with_defaults(
)
exclusions: list(
"R/arrowExports.R",
"R/dplyr-funcs-doc.R",
"data-raw/codegen.R"
)
1 change: 1 addition & 0 deletions r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ Collate:
'dplyr-join.R'
'dplyr-mutate.R'
'dplyr-select.R'
'dplyr-slice.R'
'dplyr-summarize.R'
'dplyr-union.R'
'record-batch.R'
Expand Down
3 changes: 3 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,8 @@ importFrom(rlang,as_quosure)
importFrom(rlang,call2)
importFrom(rlang,call_args)
importFrom(rlang,caller_env)
importFrom(rlang,check_dots_empty)
importFrom(rlang,dots_list)
importFrom(rlang,dots_n)
importFrom(rlang,enexpr)
importFrom(rlang,enexprs)
Expand Down Expand Up @@ -472,6 +474,7 @@ importFrom(stats,na.fail)
importFrom(stats,na.omit)
importFrom(stats,na.pass)
importFrom(stats,quantile)
importFrom(stats,runif)
importFrom(tidyselect,all_of)
importFrom(tidyselect,contains)
importFrom(tidyselect,ends_with)
Expand Down
4 changes: 2 additions & 2 deletions r/R/array.R
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ stop_cant_convert_array <- function(x, type) {
"Can't create Array from object of type %s",
paste(class(x), collapse = " / ")
),
call = rlang::caller_env()
call = caller_env()
)
} else {
abort(
Expand All @@ -358,7 +358,7 @@ stop_cant_convert_array <- function(x, type) {
format(type$code()),
paste(class(x), collapse = " / ")
),
call = rlang::caller_env()
call = caller_env()
)
}
}
Expand Down
6 changes: 6 additions & 0 deletions r/R/arrow-datum.R
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,9 @@ head.ArrowDatum <- function(x, n = 6L, ...) {
} else {
n <- min(len, n)
}
if (!is.integer(n)) {
n <- floor(n)
}
if (n == len) {
return(x)
}
Expand All @@ -310,6 +313,9 @@ head.ArrowDatum <- function(x, n = 6L, ...) {
tail.ArrowDatum <- function(x, n = 6L, ...) {
assert_is(n, c("numeric", "integer"))
assert_that(length(n) == 1)
if (!is.integer(n)) {
n <- floor(n)
}
len <- NROW(x)
if (n < 0) {
# tail(x, negative) means all but the first n rows
Expand Down
27 changes: 26 additions & 1 deletion r/R/arrow-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args
#' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure
#' @importFrom rlang new_quosures expr_text
#' @importFrom rlang new_quosures expr_text caller_env check_dots_empty dots_list
#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
#' @importFrom glue glue
#' @useDynLib arrow, .registration = TRUE
Expand Down Expand Up @@ -64,6 +64,31 @@ supported_dplyr_methods <- list(
rename_with = NULL,
union = NULL,
union_all = NULL,
slice_head = c(
"slicing within groups not supported;",
"Arrow datasets do not have row order, so head is non-deterministic;",
"`prop` only supported on queries where `nrow()` is knowable without evaluating"
),
slice_tail = c(
"slicing within groups not supported;",
"Arrow datasets do not have row order, so tail is non-deterministic;",
"`prop` only supported on queries where `nrow()` is knowable without evaluating"
),
slice_min = c(
"slicing within groups not supported;",
"`with_ties = TRUE` (dplyr default) is not supported;",
"`prop` only supported on queries where `nrow()` is knowable without evaluating"
),
slice_max = c(
"slicing within groups not supported;",
"`with_ties = TRUE` (dplyr default) is not supported;",
"`prop` only supported on queries where `nrow()` is knowable without evaluating"
),
slice_sample = c(
"slicing within groups not supported;",
"`replace = TRUE` and the `weight_by` argument not supported;",
"`n` only supported on queries where `nrow()` is knowable without evaluating"
),
glimpse = NULL,
show_query = NULL,
explain = NULL
Expand Down
16 changes: 13 additions & 3 deletions r/R/dataset-scan.R
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,14 @@ names.Scanner <- function(x) names(x$schema)

#' @export
head.Scanner <- function(x, n = 6L, ...) {
  # Return the first `n` rows produced by Scanner `x`.
  #
  # x:   the Scanner to read from.
  # n:   a single non-negative number; non-integer values are rounded down.
  # ...: accepted for compatibility with the head() generic; not used here.
  #
  # The value is whatever dataset___Scanner__head() returns.
  assert_is(n, c("numeric", "integer"))
  assert_that(length(n) == 1)
  # Negative n requires knowing nrow(x), which requires a scan itself
  assert_that(n >= 0)
  # Normalise fractional counts exactly once, leaving integer input untouched.
  if (!is.integer(n)) {
    n <- floor(n)
  }
  # Single call only: an earlier, discarded call with the un-normalised `n`
  # would read from the scanner a second time for no benefit.
  dataset___Scanner__head(x, n)
}

#' @export
Expand All @@ -168,8 +173,13 @@ tail.Scanner <- function(x, n = 6L, ...) {
}

tail_from_batches <- function(batches, n) {
assert_is(n, c("numeric", "integer"))
assert_that(length(n) == 1)
# Negative n requires knowing nrow(x), which requires a scan itself
assert_that(n >= 0) # For now
assert_that(n >= 0)
if (!is.integer(n)) {
n <- floor(n)
}
result <- list()
batch_num <- 0
# Given a list of batches, iterate from the back
Expand Down Expand Up @@ -224,7 +234,7 @@ map_batches <- function(X, FUN, ..., .schema = NULL, .lazy = FALSE, .data.frame
}
FUN <- as_mapper(FUN)
reader <- as_record_batch_reader(X)
dots <- rlang::list2(...)
dots <- list2(...)

# If no schema is supplied, we have to evaluate the first batch here
if (is.null(.schema)) {
Expand Down
17 changes: 11 additions & 6 deletions r/R/dplyr-funcs-doc.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@

#' Functions available in Arrow dplyr queries
#'
#' The `arrow` package contains methods for 32 `dplyr` table functions, many of
#' The `arrow` package contains methods for 37 `dplyr` table functions, many of
#' which are "verbs" that do transformations to one or more tables.
#' The package also has mappings of 205 R functions to the corresponding
#' The package also has mappings of 207 R functions to the corresponding
#' functions in the Arrow compute library. These allow you to write code inside
#' of `dplyr` methods that call R functions, including many in packages like
#' `stringr` and `lubridate`, and they will get translated to Arrow and run
Expand Down Expand Up @@ -62,6 +62,11 @@
#' * [`select()`][dplyr::select()]
#' * [`semi_join()`][dplyr::semi_join()]
#' * [`show_query()`][dplyr::show_query()]
#' * [`slice_head()`][dplyr::slice_head()]: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; `prop` only supported on queries where `nrow()` is knowable without evaluating
#' * [`slice_max()`][dplyr::slice_max()]: slicing within groups not supported; `with_ties = TRUE` (dplyr default) is not supported; `prop` only supported on queries where `nrow()` is knowable without evaluating
#' * [`slice_min()`][dplyr::slice_min()]: slicing within groups not supported; `with_ties = TRUE` (dplyr default) is not supported; `prop` only supported on queries where `nrow()` is knowable without evaluating
#' * [`slice_sample()`][dplyr::slice_sample()]: slicing within groups not supported; `replace = TRUE` and the `weight_by` argument not supported; `n` only supported on queries where `nrow()` is knowable without evaluating
#' * [`slice_tail()`][dplyr::slice_tail()]: slicing within groups not supported; Arrow datasets do not have row order, so tail is non-deterministic; `prop` only supported on queries where `nrow()` is knowable without evaluating
#' * [`summarise()`][dplyr::summarise()]
#' * [`tally()`][dplyr::tally()]
#' * [`transmute()`][dplyr::transmute()]
Expand All @@ -78,7 +83,7 @@
#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both
#' `str_sub()` and `stringr::str_sub()` work.
#'
#' In addition to these functions, you can call any of Arrow's 243 compute
#' In addition to these functions, you can call any of Arrow's 244 compute
#' functions directly. Arrow has many functions that don't map to an existing R
#' function. In other cases where there is an R function mapping, you can still
#' call the Arrow function directly if you don't want the adaptations that the R
Expand Down Expand Up @@ -185,13 +190,13 @@
#'
#' ## dplyr
#'
#' * [`across()`][dplyr::across()]: not yet supported inside `filter()`;
#' purrr-style lambda functions
#' and use of `where()` selection helper not yet supported
#' * [`across()`][dplyr::across()]: Use of `where()` selection helper not yet supported
#' * [`between()`][dplyr::between()]
#' * [`case_when()`][dplyr::case_when()]
#' * [`coalesce()`][dplyr::coalesce()]
#' * [`desc()`][dplyr::desc()]
#' * [`if_all()`][dplyr::if_all()]
#' * [`if_any()`][dplyr::if_any()]
#' * [`if_else()`][dplyr::if_else()]
#' * [`n()`][dplyr::n()]
#' * [`n_distinct()`][dplyr::n_distinct()]
Expand Down
4 changes: 2 additions & 2 deletions r/R/dplyr-funcs-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ register_bindings_type_cast <- function() {
# it is difficult to replicate the .name_repair semantics and expanding of
# unnamed data frame arguments in the same way that the tibble() constructor
# does.
args <- rlang::dots_list(..., .named = TRUE, .homonyms = "error")
args <- dots_list(..., .named = TRUE, .homonyms = "error")

build_expr(
"make_struct",
Expand All @@ -151,7 +151,7 @@ register_bindings_type_cast <- function() {
if (!is.null(row.names)) arrow_not_supported("row.names")
if (!is.null(check.rows)) arrow_not_supported("check.rows")

args <- rlang::dots_list(..., .named = fix.empty.names)
args <- dots_list(..., .named = fix.empty.names)
if (is.null(names(args))) {
names(args) <- rep("", length(args))
}
Expand Down
14 changes: 13 additions & 1 deletion r/R/dplyr-funcs.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,9 @@ call_binding_agg <- function(fun_name, ...) {
agg_funcs[[fun_name]](...)
}

# Called in .onLoad()
#' @importFrom stats runif
create_binding_cache <- function() {
# Called in .onLoad()
.cache$docs <- list()

# Register all available Arrow Compute functions, namespaced as arrow_fun.
Expand All @@ -160,6 +161,17 @@ create_binding_cache <- function() {
register_bindings_type()
register_bindings_augmented()

# HACK because random() doesn't work (ARROW-17974)
register_scalar_function(
"_random_along",
function(context, x) {
Array$create(runif(length(x)))
},
nealrichardson marked this conversation as resolved.
Show resolved Hide resolved
in_type = schema(x = boolean()),
out_type = float64(),
auto_convert = FALSE
)

# We only create the cache for nse_funcs and not agg_funcs
.cache$functions <- c(as.list(nse_funcs), arrow_funcs)
}
Expand Down
Loading