apache · nealrichardson · Aug 24, 2021 · Aug 24, 2021 · Aug 24, 2021 · Aug 24, 2021
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
@@ -247,6 +247,7 @@ jobs:
           Sys.setenv(
             RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "libarrow.zip"),
             MAKEFLAGS = paste0("-j", parallel::detectCores()),
+            ARROW_R_DEV = TRUE,
             "_R_CHECK_FORCE_SUGGESTS_" = FALSE
           )
           rcmdcheck::rcmdcheck("r",

diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -291,6 +291,7 @@ importFrom(bit64,print.integer64)
 importFrom(bit64,str.integer64)
 importFrom(methods,as)
 importFrom(purrr,as_mapper)
+importFrom(purrr,imap)
 importFrom(purrr,imap_chr)
 importFrom(purrr,keep)
 importFrom(purrr,map)

diff --git a/r/NEWS.md b/r/NEWS.md
@@ -19,6 +19,11 @@
 
 # arrow 5.0.0.9000
 
+## Breaking changes
+
+* `dplyr::summarize()` on an in-memory Arrow Table or RecordBatch no longer eagerly evaluates. Call `compute()` or `collect()` to evaluate the query.
+* Row order of data from a Dataset query is no longer deterministic. If you need a stable sort order, you should explicitly `arrange()` the query. For calls to `summarize()`, you can set `options(arrow.summarise.sort = TRUE)` to match the current `dplyr` behavior of sorting on the grouping columns.
+
 # arrow 5.0.0
 
 ## More dplyr

diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -17,7 +17,7 @@
 
 #' @importFrom stats quantile median na.omit na.exclude na.pass na.fail
 #' @importFrom R6 R6Class
-#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap_chr
+#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr
 #' @importFrom assertthat assert_that is.string
 #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos
 #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec
@@ -35,7 +35,7 @@
     c(
       "select", "filter", "collect", "summarise", "group_by", "groups",
       "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
-      "arrange", "rename", "pull", "relocate", "compute"
+      "arrange", "rename", "pull", "relocate", "compute", "collapse"
     )
   )
   for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) {

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R
@@ -73,18 +73,14 @@ Scanner$create <- function(dataset,
                            projection = NULL,
                            filter = TRUE,
                            use_threads = option_use_threads(),
-                           use_async = NULL,
+                           use_async = getOption("arrow.use_async", FALSE),
                            batch_size = NULL,
                            fragment_scan_options = NULL,
                            ...) {
-  if (is.null(use_async)) {
-    use_async <- getOption("arrow.use_async", FALSE)
-  }
-
   if (inherits(dataset, "arrow_dplyr_query")) {
-    if (inherits(dataset$.data, "ArrowTabular")) {
-      # To handle mutate() on Table/RecordBatch, we need to collect(as_data_frame=FALSE) now
-      dataset <- dplyr::collect(dataset, as_data_frame = FALSE)
+    if (is_collapsed(dataset)) {
+      # TODO: Is there a way to get a RecordBatchReader rather than evaluating?
+      dataset$.data <- as_adq(dplyr::compute(dataset$.data))$.data
     }
 
     proj <- c(dataset$selected_columns, dataset$temp_columns)
@@ -117,7 +113,7 @@ Scanner$create <- function(dataset,
       ...
     ))
   }
-  if (inherits(dataset, c("data.frame", "RecordBatch", "Table"))) {
+  if (inherits(dataset, c("data.frame", "ArrowTabular"))) {
     dataset <- InMemoryDataset$create(dataset)
   }
   assert_is(dataset, "Dataset")

diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R
@@ -30,7 +30,7 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) {
     # Nothing to do
     return(.data)
   }
-  .data <- arrow_dplyr_query(.data)
+  .data <- as_adq(.data)
   # find and remove any dplyr::desc() and tidy-eval
   # the arrange expressions inside an Arrow data_mask
   sorts <- vector("list", length(exprs))

diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
@@ -19,19 +19,8 @@
 # The following S3 methods are registered on load if dplyr is present
 
 collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
-  x <- ensure_group_vars(x)
-  x <- ensure_arrange_vars(x) # this sets x$temp_columns
-  # Pull only the selected rows and cols into R
-  # See dataset.R for Dataset and Scanner(Builder) classes
-  tab <- Scanner$create(x)$ToTable()
-  # Arrange rows
-  if (length(x$arrange_vars) > 0) {
-    tab <- tab[
-      tab$SortIndices(names(x$arrange_vars), x$arrange_desc),
-      names(x$selected_columns), # this omits x$temp_columns from the result
-      drop = FALSE
-    ]
-  }
+  # See query-engine.R for ExecPlan/Nodes
+  tab <- do_exec_plan(x)
   if (as_data_frame) {
     df <- as.data.frame(tab)
     tab$invalidate()
@@ -47,16 +36,71 @@ collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
     x
   }
 }
-collect.Dataset <- function(x, ...) dplyr::collect(arrow_dplyr_query(x), ...)
+collect.Dataset <- function(x, ...) dplyr::collect(as_adq(x), ...)
 
 compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame = FALSE)
 compute.ArrowTabular <- function(x, ...) x
 compute.Dataset <- compute.arrow_dplyr_query
 
 pull.arrow_dplyr_query <- function(.data, var = -1) {
-  .data <- arrow_dplyr_query(.data)
+  .data <- as_adq(.data)
   var <- vars_pull(names(.data), !!enquo(var))
   .data$selected_columns <- set_names(.data$selected_columns[var], var)
   dplyr::collect(.data)[[1]]
 }
 pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query
+
+# TODO: Correctly handle group_vars after summarize; also in collapse()
+restore_dplyr_features <- function(df, query) {
+  # Carry over to `df` the features that an arrow_dplyr_query holds but that
+  # Arrow doesn't know about. Only the groupings are handled here.
+  if (length(query$group_by_vars) == 0) {
+    # No groupings on the query: `df` is returned untouched
+    return(df)
+  }
+  if (!is.data.frame(df)) {
+    # This is a Table, via compute() or collect(as_data_frame = FALSE):
+    # convert it with as_adq() and copy the grouping fields onto the result
+    grouped <- as_adq(df)
+    grouped$group_by_vars <- query$group_by_vars
+    grouped$drop_empty_groups <- query$drop_empty_groups
+    return(grouped)
+  }
+  dplyr::grouped_df(
+    df,
+    dplyr::group_vars(query),
+    drop = dplyr::group_by_drop_default(query)
+  )
+}
+
+collapse.arrow_dplyr_query <- function(x, ...) {
+  # Store the schema that will result from the query, then nest it in a new one
+  annotated <- x
+  annotated$schema <- implicit_schema(x)
+  arrow_dplyr_query(annotated)
+}
+# collapse() on a Dataset or ArrowTabular: pass `x` to arrow_dplyr_query()
+collapse.ArrowTabular <- function(x, ...) arrow_dplyr_query(x)
+collapse.Dataset <- collapse.ArrowTabular
+
+implicit_schema <- function(.data) {
+  # Figure out the schema that will result from the query `.data`
+  .data <- ensure_group_vars(.data)
+  source_schema <- .data$.data$schema
+  type_of <- function(expr) expr$type(source_schema)
+
+  if (is.null(.data$aggregations)) {
+    fields <- map(.data$selected_columns, type_of)
+    return(schema(!!!fields))
+  }
+  # With aggregations, group_by_vars come first: summarize can't do that
+  # (they have to be last per the aggregate node signature) and they get
+  # projected to this order after aggregation. Aggregation types are inferred.
+  projected <- map(summarize_projection(.data), type_of)
+  agg_names <- setdiff(names(projected), .data$group_by_vars)
+  agg_fields <- imap(projected[agg_names], function(in_type, name) {
+    output_type(.data$aggregations[[name]][["fun"]], in_type)
+  })
+  fields <- c(projected[.data$group_by_vars], agg_fields)
+  schema(!!!fields)
+}
diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R
@@ -26,7 +26,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) {
     return(.data)
   }
 
-  .data <- arrow_dplyr_query(.data)
+  .data <- as_adq(.data)
   # tidy-eval the filter expressions inside an Arrow data_mask
   filters <- lapply(filts, arrow_eval, arrow_mask(.data))
   bad_filters <- map_lgl(filters, ~ inherits(., "try-error"))

diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
@@ -840,3 +840,18 @@ agg_funcs$n <- function() {
     options = list()
   )
 }
+
+output_type <- function(fun, input_type) {
+  # Quick and dirty heuristics for the type that aggregation `fun` returns
+  if (fun %in% c("any", "all")) {
+    return(bool())
+  }
+  if (fun %in% "sum") {
+    return(input_type) # it may upcast to a bigger type; this is close enough
+  }
+  if (fun %in% c("mean", "stddev", "variance")) {
+    return(float64())
+  }
+  # Just so things don't error, assume the resulting type is the same
+  input_type
+}
diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
@@ -23,7 +23,7 @@ group_by.arrow_dplyr_query <- function(.data,
                                        .add = FALSE,
                                        add = .add,
                                        .drop = dplyr::group_by_drop_default(.data)) {
-  .data <- arrow_dplyr_query(.data)
+  .data <- as_adq(.data)
   new_groups <- enquos(...)
   # ... can contain expressions (i.e. can add (or rename?) columns) and so we
   # need to identify those and add them on to the query with mutate. Specifically,

diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R
@@ -24,7 +24,7 @@ mutate.arrow_dplyr_query <- function(.data,
                                      .before = NULL,
                                      .after = NULL) {
   call <- match.call()
-  exprs <- quos(...)
+  exprs <- ensure_named_exprs(quos(...))
 
   .keep <- match.arg(.keep)
   .before <- enquo(.before)
@@ -35,7 +35,7 @@ mutate.arrow_dplyr_query <- function(.data,
     return(.data)
   }
 
-  .data <- arrow_dplyr_query(.data)
+  .data <- as_adq(.data)
 
   # Restrict the cases we support for now
   if (length(dplyr::group_vars(.data)) > 0) {
@@ -45,11 +45,6 @@ mutate.arrow_dplyr_query <- function(.data,
     return(abandon_ship(call, .data, "mutate() on grouped data not supported in Arrow"))
   }
 
-  # Check for unnamed expressions and fix if any
-  unnamed <- !nzchar(names(exprs))
-  # Deparse and take the first element in case they're long expressions
-  names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label)
-
   mask <- arrow_mask(.data)
   results <- list()
   for (i in seq_along(exprs)) {
@@ -133,3 +128,11 @@ check_transmute_args <- function(..., .keep, .before, .after) {
   }
   enquos(...)
 }
+
+ensure_named_exprs <- function(exprs) {
+  # Name unnamed exprs by label; NULL names() means all of them are unnamed
+  nms <- names(exprs) %||% rep("", length(exprs))
+  unnamed <- !nzchar(nms)
+  names(exprs) <- replace(nms, unnamed, map_chr(exprs[unnamed], as_label))
+  exprs
+}
diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R
@@ -22,13 +22,13 @@ tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)
 
 select.arrow_dplyr_query <- function(.data, ...) {
   check_select_helpers(enexprs(...))
-  column_select(arrow_dplyr_query(.data), !!!enquos(...))
+  column_select(as_adq(.data), !!!enquos(...))
 }
 select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query
 
 rename.arrow_dplyr_query <- function(.data, ...) {
   check_select_helpers(enexprs(...))
-  column_select(arrow_dplyr_query(.data), !!!enquos(...), .FUN = vars_rename)
+  column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename)
 }
 rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query
 
@@ -60,7 +60,7 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL
   # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R
   # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829
 
-  .data <- arrow_dplyr_query(.data)
+  .data <- as_adq(.data)
 
   # Assign the schema to the expressions
   map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))