diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index d8ee21761a52..235b325ae209 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -56,7 +56,8 @@ jobs: sudo apt install -y -V \ pre-commit \ r-base \ - ruby-dev + ruby-dev \ + libuv1-dev - name: Cache pre-commit uses: actions/cache@v5 with: diff --git a/dev/tasks/r/github.linux.arrow.version.back.compat.yml b/dev/tasks/r/github.linux.arrow.version.back.compat.yml index 774c3e09f4c8..3bdd2607b596 100644 --- a/dev/tasks/r/github.linux.arrow.version.back.compat.yml +++ b/dev/tasks/r/github.linux.arrow.version.back.compat.yml @@ -107,6 +107,10 @@ jobs: # for 6.0.0 or later. rspm="https://packagemanager.rstudio.com/cran/__linux__/jammy/latest" echo "RSPM=${rspm}" >> $GITHUB_ENV + else + # testthat requires fs which now requires libuv1-dev. Install it + # when RSPM isn't available + sudo apt update && sudo apt install -y -V libuv1-dev fi - uses: r-lib/actions/setup-r@v2 with: diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 282a8c83f4de..1c46086ed436 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -245,11 +245,12 @@ tasks: - r-libarrow-darwin-arm64-{no_rc_r_version}\.zip - r-libarrow-darwin-x86_64-{no_rc_r_version}\.zip - r-pkg__bin__windows__contrib__4.5__arrow_{no_rc_r_version}\.zip - - r-pkg__bin__windows__contrib__4.4__arrow_{no_rc_r_version}\.zip + - r-pkg__bin__windows__contrib__4.6__arrow_{no_rc_r_version}\.zip - r-pkg__bin__macosx__big-sur-x86_64__contrib__4.5__arrow_{no_rc_r_version}\.tgz - - r-pkg__bin__macosx__big-sur-x86_64__contrib__4.4__arrow_{no_rc_r_version}\.tgz + - r-pkg__bin__macosx__big-sur-x86_64__contrib__4.6__arrow_{no_rc_r_version}\.tgz - r-pkg__bin__macosx__big-sur-arm64__contrib__4.5__arrow_{no_rc_r_version}\.tgz - - r-pkg__bin__macosx__big-sur-arm64__contrib__4.4__arrow_{no_rc_r_version}\.tgz + # TODO: Uncomment once setup-r resolves release to 4.6 + # -r-pkg__bin__macosx__sonoma-arm64__contrib__4.6__arrow_{no_rc_r_version}\.tgz 
- r-pkg__src__contrib__arrow_{no_rc_r_version}\.tar\.gz {% for which in ["strong", "most"] %} diff --git a/r/NAMESPACE b/r/NAMESPACE index 9dbbad9c45cd..f42944fb58b5 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -432,6 +432,7 @@ importFrom(bit64,str.integer64) importFrom(glue,glue) importFrom(methods,as) importFrom(purrr,as_mapper) +importFrom(purrr,compact) importFrom(purrr,flatten) importFrom(purrr,imap) importFrom(purrr,imap_chr) diff --git a/r/NEWS.md b/r/NEWS.md index c4a8b6b6a866..f388a25f4c65 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -18,6 +18,26 @@ --> # arrow 24.0.0 +# arrow 24.0.0.9000 + +# arrow 24.0.0 + +## New features + +- `dplyr::when_any()` and `dplyr::when_all()` helper bindings (#49535). +- `dplyr::filter_out()` binding (@larry77, #49256). +- `dplyr::recode_values()`, `dplyr::replace_values()`, and `dplyr::replace_when()` bindings (#49536). +- `write_dataset()` gains a `preserve_order` argument to preserve row ordering within partitions (@marberts, #49343). + +## Minor improvements and fixes + +- Zero-length `POSIXct` objects with integer storage (as created by `as.POSIXct(NULL)` in R 4.5.2+) are now correctly mapped to timestamp type instead of integer (#49619). +- `all.equal()` S3 method is now correctly registered (@MichaelChirico, #49481). + +## Installation + +- arm64 (aarch64) Linux binaries are now available (#48574). 
+ # arrow 23.0.1.2 ## Minor improvements and fixes diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 3964feb74d0f..9de7afe02254 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -18,7 +18,7 @@ #' @importFrom stats quantile median na.omit na.exclude na.pass na.fail #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dbl map_dfr map_int map_lgl keep imap imap_chr -#' @importFrom purrr flatten reduce walk +#' @importFrom purrr compact flatten reduce walk #' @importFrom assertthat assert_that is.string #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos quo #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec diff --git a/r/R/dplyr-funcs-conditional.R b/r/R/dplyr-funcs-conditional.R index a7df2e8f0d2a..b2d5a6ac7953 100644 --- a/r/R/dplyr-funcs-conditional.R +++ b/r/R/dplyr-funcs-conditional.R @@ -15,6 +15,169 @@ # specific language governing permissions and limitations # under the License. +#' Parse logical condition formulas +#' +#' Converts condition ~ value formulas into Arrow expressions. Unlike +#' [parse_value_mapping()], the LHS must be a logical expression (not a value +#' to match against). +#' +#' @param formulas A list of two-sided formulas where LHS is a logical condition +#' and RHS is the value to use when TRUE (e.g., `x > 5 ~ "high"`). +#' @param mask The data mask for evaluating formula expressions. +#' +#' @return A list with `query` (list of logical expressions) and `value` +#' (list of replacement expressions). 
+#' +#' @keywords internal +#' @noRd +parse_condition_formulas <- function(formulas, mask) { + fn <- call_name(rlang::caller_call()) + # Compact NULL entries (allows conditional formulas like: if (cond) x ~ y) + formulas <- compact(formulas) + n <- length(formulas) + query <- vector("list", n) + value <- vector("list", n) + # Process each formula: condition ~ value + for (i in seq_len(n)) { + f <- formulas[[i]] + if (!is_formula(f, lhs = TRUE)) { + validation_error(paste0("Each argument to ", fn, "() must be a two-sided formula")) + } + # f[[2]] is LHS (logical condition), f[[3]] is RHS (value when TRUE) + query[[i]] <- arrow_eval(f[[2]], mask) + value[[i]] <- arrow_eval(f[[3]], mask) + # Validate LHS is logical (unlike parse_value_mapping which does equality matching) + if (!call_binding("is.logical", query[[i]])) { + validation_error(paste0("Left side of each formula in ", fn, "() must be a logical expression")) + } + } + list(query = query, value = value) +} + +#' Create case_when Expression from query/value lists +#' @param query List of logical Arrow Expressions. +#' @param value List of value Arrow Expressions. +#' @return An Arrow Expression representing the case_when. +#' @keywords internal +#' @noRd +build_case_when_expr <- function(query, value) { + Expression$create( + "case_when", + args = c( + Expression$create( + "make_struct", + args = query, + options = list(field_names = as.character(seq_along(query))) + ), + value + ) + ) +} + +#' Build a match expression for x against a value (scalar, NA, or vector). +#' @param x Arrow Expression for the column to match against. +#' @param match_value Value to match - R scalar, vector, or NA. Expressions +#' are compared with equality. +#' @return Arrow Expression that is TRUE when x matches match_value. 
+#' @keywords internal +#' @noRd +build_match_expr <- function(x, match_value) { + # Expressions or length-1 non-NA: use equality directly + if (inherits(match_value, "Expression") || length(match_value) == 1 && !is.na(match_value)) { + return(x == match_value) + } + + # R scalar NA requires is.na() since x == NA returns NA in Arrow + if (length(match_value) == 1) { + return(call_binding("is.na", x)) + } + + # R vector: use %in%, handling NA separately if present; an empty vector matches nothing + has_na <- any(is.na(match_value)) + non_na_values <- match_value[!is.na(match_value)] + + if (!has_na) { + call_binding("%in%", x, match_value) + } else if (length(non_na_values) == 0) { + call_binding("is.na", x) + } else { + call_binding("%in%", x, non_na_values) | call_binding("is.na", x) + } +} + +#' Build query/value lists from parallel from/to vectors. +#' NA values in `from` use is.na() for matching. +#' @param x Arrow Expression for the column to match against. +#' @param from Vector of values to match. +#' @param to Vector of replacement values (recycled to length of `from`). +#' @return list(query, value) for use with build_case_when_expr(). +#' @keywords internal +#' @noRd +parse_from_to_mapping <- function(x, from, to) { + n <- length(from) + to <- vctrs::vec_recycle(to, n) + query <- map(from, ~ build_match_expr(x, .x)) + value <- map(to, Expression$scalar) + list(query = query, value = value) +} + +#' Build query/value lists from value ~ replacement formulas. +#' NA values on LHS use is.na() for matching. +#' @param x Arrow Expression for the column to match against. +#' @param formulas List of two-sided formulas (value ~ replacement). +#' @param mask Data mask for evaluating formula expressions. +#' @param fn Calling function name (for error messages). +#' @return list(query, value) for use with build_case_when_expr(). 
+#' @keywords internal +#' @noRd +parse_formula_mapping <- function(x, formulas, mask, fn) { + # Compact NULL entries (allows conditional formulas like: if (cond) x ~ y) + formulas <- compact(formulas) + n <- length(formulas) + query <- vector("list", n) + value <- vector("list", n) + for (i in seq_len(n)) { + f <- formulas[[i]] + if (!is_formula(f, lhs = TRUE)) { + validation_error(paste0("Each argument to ", fn, "() must be a two-sided formula")) + } + # f[[2]] is LHS (value to match), f[[3]] is RHS (replacement) + lhs <- arrow_eval(f[[2]], mask) + query[[i]] <- build_match_expr(x, lhs) + value[[i]] <- arrow_eval(f[[3]], mask) + } + list(query = query, value = value) +} + +#' Dispatch to formula or from/to parser based on which args are provided. +#' Returns list(query, value) or NULL if no mappings. +#' @param x Arrow Expression for the column to match against. +#' @param formulas List of two-sided formulas (value ~ replacement). +#' @param from Vector of values to match (alternative to formulas). +#' @param to Vector of replacement values (used with `from`). +#' @param mask The data mask for evaluating formula expressions. 
+#' @keywords internal +#' @noRd +parse_value_mapping <- function(x, formulas = list(), from = NULL, to = NULL, mask) { + fn <- call_name(rlang::caller_call()) + # Mutually exclusive interfaces + if (length(formulas) > 0 && !(is.null(from) && is.null(to))) { + validation_error(paste0("Can't use both `...` and `from`/`to` in ", fn, "()")) + } + + if (length(formulas) > 0) { + parse_formula_mapping(x, formulas, mask, fn) + } else if (!is.null(from)) { + if (is.null(to)) { + validation_error("`to` must be provided when using `from`") + } + parse_from_to_mapping(x, from, to) + } else { + if (!is.null(to)) validation_error("`from` must be provided when using `to`") + NULL + } +} + register_bindings_conditional <- function() { register_binding("%in%", function(x, table) { # We use `is_in` here, unlike with Arrays, which use `is_in_meta_binary` @@ -133,44 +296,79 @@ register_bindings_conditional <- function() { } formulas <- list2(...) - n <- length(formulas) - if (n == 0) { + if (length(formulas) == 0) { validation_error("No cases provided") } - query <- vector("list", n) - value <- vector("list", n) - mask <- caller_env() - for (i in seq_len(n)) { - f <- formulas[[i]] - if (!inherits(f, "formula")) { - validation_error("Each argument to case_when() must be a two-sided formula") - } - query[[i]] <- arrow_eval(f[[2]], mask) - value[[i]] <- arrow_eval(f[[3]], mask) - if (!call_binding("is.logical", query[[i]])) { - validation_error("Left side of each formula in case_when() must be a logical expression") - } - } + parsed <- parse_condition_formulas(formulas, caller_env()) + query <- parsed$query + value <- parsed$value if (!is.null(.default)) { if (length(.default) != 1) { - validation_error(paste0("`.default` must have size 1, not size ", length(.default), ".")) + arrow_not_supported("`.default` must be size 1; vectors of length > 1") } - - query[n + 1] <- TRUE - value[n + 1] <- .default + n <- length(query) + query[[n + 1]] <- TRUE + value[[n + 1]] <- .default } - Expression$create( - "case_when", - args = c( - Expression$create( - 
"make_struct", - args = query, - options = list(field_names = as.character(seq_along(query))) - ), - value - ) - ) + build_case_when_expr(query, value) }, notes = "`.ptype` and `.size` arguments not supported" ) + + register_binding("dplyr::replace_when", function(x, ...) { + formulas <- list2(...) + if (length(formulas) == 0) { + return(x) + } + parsed <- parse_condition_formulas(formulas, caller_env()) + query <- parsed$query + value <- parsed$value + n <- length(query) + query[[n + 1]] <- TRUE + value[[n + 1]] <- x + build_case_when_expr(query, value) + }) + + register_binding("dplyr::replace_values", function(x, ..., from = NULL, to = NULL) { + parsed <- parse_value_mapping(x, list2(...), from, to, caller_env()) + if (is.null(parsed)) { + return(x) + } + query <- parsed$query + value <- parsed$value + n <- length(query) + query[[n + 1]] <- TRUE + value[[n + 1]] <- x + build_case_when_expr(query, value) + }) + + register_binding( + "dplyr::recode_values", + function(x, ..., from = NULL, to = NULL, default = NULL, unmatched = "default", ptype = NULL) { + if (!is.null(ptype)) { + arrow_not_supported("`recode_values()` with `ptype` specified") + } + if (!identical(unmatched, "default")) { + arrow_not_supported('`recode_values()` with `unmatched` other than "default"') + } + + parsed <- parse_value_mapping(x, list2(...), from, to, caller_env()) + if (is.null(parsed)) { + validation_error("`...` can't be empty") + } + query <- parsed$query + value <- parsed$value + + if (!is.null(default)) { + if (length(default) != 1) { + arrow_not_supported("`default` must be size 1; vectors of length > 1") + } + n <- length(query) + query[[n + 1]] <- TRUE + value[[n + 1]] <- Expression$scalar(default) + } + build_case_when_expr(query, value) + }, + notes = "`ptype` argument and `unmatched = \"error\"` not supported" + ) } diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index e0b3dd095c9f..f7ca29833c81 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -21,7 +21,7 
@@ #' #' The `arrow` package contains methods for 38 `dplyr` table functions, many of #' which are "verbs" that do transformations to one or more tables. -#' The package also has mappings of 226 R functions to the corresponding +#' The package also has mappings of 229 R functions to the corresponding #' functions in the Arrow compute library. These allow you to write code inside #' of `dplyr` methods that call R functions, including many in packages like #' `stringr` and `lubridate`, and they will get translated to Arrow and run @@ -214,6 +214,9 @@ #' * [`if_else()`][dplyr::if_else()] #' * [`n()`][dplyr::n()] #' * [`n_distinct()`][dplyr::n_distinct()] +#' * [`recode_values()`][dplyr::recode_values()]: `ptype` argument and `unmatched = "error"` not supported +#' * [`replace_values()`][dplyr::replace_values()] +#' * [`replace_when()`][dplyr::replace_when()] #' * [`when_all()`][dplyr::when_all()] #' * [`when_any()`][dplyr::when_any()] #' diff --git a/r/README.md b/r/README.md index 268ee24bdf00..5e4707df6d36 100644 --- a/r/README.md +++ b/r/README.md @@ -1,14 +1,5 @@ # arrow - - -[![cran](https://www.r-pkg.org/badges/version-last-release/arrow)](https://cran.r-project.org/package=arrow) -[![CI](https://github.com/apache/arrow/actions/workflows/r.yml/badge.svg?branch=main&event=push)](https://github.com/apache/arrow/actions/workflows/r.yml?query=branch%3Amain+event%3Apush) -[![R-universe status badge](https://apache.r-universe.dev/badges/arrow)](https://apache.r-universe.dev) -[![conda-forge](https://img.shields.io/conda/vn/conda-forge/r-arrow.svg)](https://anaconda.org/conda-forge/r-arrow) - - - ## Overview The R `{arrow}` package provides access to many of the features of the [Apache Arrow C++ library](https://arrow.apache.org/docs/cpp/index.html) for R users. The goal of arrow is to provide an Arrow C++ backend to `{dplyr}`, and access to the Arrow C++ library through familiar base R and tidyverse functions, or `{R6}` classes. 
The dedicated R package website is located [here](https://arrow.apache.org/docs/r/index.html). diff --git a/r/man/acero.Rd b/r/man/acero.Rd index a43617493a33..f721aa5d9fcf 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -9,7 +9,7 @@ \description{ The \code{arrow} package contains methods for 38 \code{dplyr} table functions, many of which are "verbs" that do transformations to one or more tables. -The package also has mappings of 226 R functions to the corresponding +The package also has mappings of 229 R functions to the corresponding functions in the Arrow compute library. These allow you to write code inside of \code{dplyr} methods that call R functions, including many in packages like \code{stringr} and \code{lubridate}, and they will get translated to Arrow and run @@ -72,7 +72,7 @@ can assume that the function works in Acero just as it does in R. Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both \code{str_sub()} and \code{stringr::str_sub()} work. -In addition to these functions, you can call any of Arrow's 281 compute +In addition to these functions, you can call any of Arrow's 253 compute functions directly. Arrow has many functions that don't map to an existing R function. In other cases where there is an R function mapping, you can still call the Arrow function directly if you don't want the adaptations that the R @@ -207,6 +207,9 @@ Valid values are "s", "ms" (default), "us", "ns". 
\item \code{\link[dplyr:if_else]{if_else()}} \item \code{\link[dplyr:context]{n()}} \item \code{\link[dplyr:n_distinct]{n_distinct()}} +\item \code{\link[dplyr:recode-and-replace-values]{recode_values()}}: \code{ptype} argument and \code{unmatched = "error"} not supported +\item \code{\link[dplyr:recode-and-replace-values]{replace_values()}} +\item \code{\link[dplyr:case-and-replace-when]{replace_when()}} \item \code{\link[dplyr:when-any-all]{when_all()}} \item \code{\link[dplyr:when-any-all]{when_any()}} } diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index abf6b8fc44a8..b809a63bcc6f 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -54,7 +54,7 @@ If \code{schema} is not provided, Arrow data types are inferred from the data: \item JSON numbers convert to \code{\link[=int64]{int64()}}, falling back to \code{\link[=float64]{float64()}} if a non-integer is encountered. \item JSON strings of the kind "YYYY-MM-DD" and "YYYY-MM-DD hh:mm:ss" convert to \code{\link[=timestamp]{timestamp(unit = "s")}}, falling back to \code{\link[=utf8]{utf8()}} if a conversion error occurs. -\item JSON arrays convert to a \code{\link[vctrs:list_of]{vctrs::list_of()}} type, and inference proceeds recursively on the JSON arrays' values. +\item JSON arrays convert to a \code{\link[=list_of]{list_of()}} type, and inference proceeds recursively on the JSON arrays' values. \item Nested JSON objects convert to a \code{\link[=struct]{struct()}} type, and inference proceeds recursively on the JSON objects' values. } diff --git a/r/man/schema.Rd b/r/man/schema.Rd index ff77a05d84aa..65ab2eea0d27 100644 --- a/r/man/schema.Rd +++ b/r/man/schema.Rd @@ -7,7 +7,7 @@ schema(...) 
} \arguments{ -\item{...}{\link[vctrs:fields]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract +\item{...}{\link[=field]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract a schema} } \description{ diff --git a/r/tests/testthat/test-dplyr-funcs-conditional.R b/r/tests/testthat/test-dplyr-funcs-conditional.R index d99843ab9da9..f7d5b4d6b95f 100644 --- a/r/tests/testthat/test-dplyr-funcs-conditional.R +++ b/r/tests/testthat/test-dplyr-funcs-conditional.R @@ -296,8 +296,8 @@ test_that("case_when()", { ) expect_arrow_eval_error( case_when(int > 5 ~ 1, .default = c(0, 1)), - "`.default` must have size 1, not size 2", - class = "validation_error" + "`.default` must be size 1; vectors of length > 1 not supported in Arrow", + class = "arrow_not_supported" ) expect_arrow_eval_error( @@ -599,3 +599,280 @@ test_that("when_all()", { class = "arrow_not_supported" ) }) + +test_that("replace_when()", { + # replaces matching values, keeps original otherwise + compare_dplyr_binding( + .input |> + mutate(result = replace_when(int, int > 5 ~ 100L)) |> + collect(), + tbl + ) + + # multiple conditions + compare_dplyr_binding( + .input |> + mutate(result = replace_when(int, int > 7 ~ 100L, int < 3 ~ 0L)) |> + collect(), + tbl + ) + + # overlapping conditions - first match wins + compare_dplyr_binding( + .input |> + mutate(result = replace_when(int, int > 3 ~ 100L, int > 5 ~ 200L)) |> + collect(), + tbl + ) + + # no formulas returns x unchanged + compare_dplyr_binding( + .input |> + mutate(result = replace_when(int)) |> + collect(), + tbl + ) + + # Conditions on LHS of formulas are compacted out + condition <- FALSE + compare_dplyr_binding( + .input |> + mutate(result = replace_when(int, if (condition) int > 5 ~ 100L, int < 3 ~ 0L)) |> + collect(), + tbl + ) + + # validation errors + expect_arrow_eval_error( + replace_when(int, TRUE), + "Each argument to replace_when\\(\\) must be a 
two-sided formula", + class = "validation_error" + ) + expect_arrow_eval_error( + replace_when(int, ~100L), + "Each argument to replace_when\\(\\) must be a two-sided formula", + class = "validation_error" + ) + expect_arrow_eval_error( + replace_when(int, 0L ~ 100L), + "Left side of each formula in replace_when\\(\\) must be a logical expression", + class = "validation_error" + ) +}) + +test_that("replace_values()", { + # formula interface + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, "a" ~ "A", "b" ~ "B")) |> + collect(), + tbl + ) + + # from/to interface + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, from = c("a", "b"), to = c("A", "B"))) |> + collect(), + tbl + ) + + # from/to with list of vectors - multiple values map to single replacement + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, from = list(c("a", "b"), "c"), to = c("AB", "C"))) |> + collect(), + tbl + ) + + # unmatched values kept + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, "a" ~ "A")) |> + collect(), + tbl + ) + + # works with numeric values + compare_dplyr_binding( + .input |> + mutate(result = replace_values(int, 1L ~ 100L, 2L ~ 200L)) |> + collect(), + tbl + ) + + # explicit NA matching with formula + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, "a" ~ "A", NA ~ "missing")) |> + collect(), + tbl + ) + + # explicit NA matching with from/to + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, from = c("a", NA), to = c("A", "missing"))) |> + collect(), + tbl + ) + + # multiple values on LHS matches any + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, c("a", "b") ~ "AB")) |> + collect(), + tbl + ) + + # multiple values on LHS including NA matches any including NA + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, c(NA, "a") ~ "matched")) |> + collect(), + tbl + ) + + # from/to with list 
containing NA matches NA too + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr, from = list(c(NA, "a"), "b"), to = c("matched", "B"))) |> + collect(), + tbl + ) + + # no replacements returns x unchanged + compare_dplyr_binding( + .input |> + mutate(result = replace_values(chr)) |> + collect(), + tbl + ) + + # validation errors + expect_arrow_eval_error( + replace_values(chr, "A"), + "Each argument to replace_values\\(\\) must be a two-sided formula", + class = "validation_error" + ) + expect_arrow_eval_error( + replace_values(chr, ~"A"), + "Each argument to replace_values\\(\\) must be a two-sided formula", + class = "validation_error" + ) + expect_arrow_eval_error( + replace_values(chr, "a" ~ "A", from = "b"), + "Can't use both `...` and `from`/`to` in replace_values\\(\\)", + class = "validation_error" + ) + expect_arrow_eval_error( + replace_values(chr, from = "a"), + "`to` must be provided when using `from`", + class = "validation_error" + ) +}) + +test_that("recode_values()", { + # formula interface with default NA + compare_dplyr_binding( + .input |> + mutate(result = recode_values(chr, "a" ~ "A", "b" ~ "B")) |> + collect(), + tbl + ) + + # from/to interface + compare_dplyr_binding( + .input |> + mutate(result = recode_values(chr, from = c("a", "b"), to = c("A", "B"))) |> + collect(), + tbl + ) + + # from/to with list of vectors - multiple values map to single replacement + compare_dplyr_binding( + .input |> + mutate(result = recode_values(chr, from = list(c("a", "b"), "c"), to = c("AB", "C"))) |> + collect(), + tbl + ) + + # custom default + compare_dplyr_binding( + .input |> + mutate(result = recode_values(chr, "a" ~ "A", default = "other")) |> + collect(), + tbl + ) + + # works with numeric values + compare_dplyr_binding( + .input |> + mutate(result = recode_values(int, 1L ~ 100L, 2L ~ 200L)) |> + collect(), + tbl + ) + + # NA input with default - NA also becomes default + compare_dplyr_binding( + .input |> + mutate(result = 
recode_values(chr, "a" ~ "A", "b" ~ "B", default = "other")) |> + collect(), + tbl + ) + + # multiple values on LHS matches any + compare_dplyr_binding( + .input |> + mutate(result = recode_values(chr, c("a", "b") ~ "AB", default = "other")) |> + collect(), + tbl + ) + + # validation errors + expect_arrow_eval_error( + recode_values(chr), + "`\\.\\.\\.` can't be empty", + class = "validation_error" + ) + expect_arrow_eval_error( + recode_values(chr, "A"), + "Each argument to recode_values\\(\\) must be a two-sided formula", + class = "validation_error" + ) + expect_arrow_eval_error( + recode_values(chr, ~"A"), + "Each argument to recode_values\\(\\) must be a two-sided formula", + class = "validation_error" + ) + expect_arrow_eval_error( + recode_values(chr, "a" ~ "A", from = "b"), + "Can't use both `...` and `from`/`to` in recode_values\\(\\)", + class = "validation_error" + ) + expect_arrow_eval_error( + recode_values(chr, from = "a"), + "`to` must be provided when using `from`", + class = "validation_error" + ) + expect_arrow_eval_error( + recode_values(chr, "a" ~ "A", ptype = character()), + "`recode_values\\(\\)` with `ptype` specified not supported in Arrow", + class = "arrow_not_supported" + ) + expect_arrow_eval_error( + recode_values(chr, "a" ~ "A", unmatched = "error"), + "`recode_values\\(\\)` with `unmatched` other than \"default\" not supported in Arrow", + class = "arrow_not_supported" + ) + expect_arrow_eval_error( + recode_values(chr, "a" ~ "A", unmatched = "wat"), + "`recode_values\\(\\)` with `unmatched` other than \"default\" not supported in Arrow", + class = "arrow_not_supported" + ) + expect_arrow_eval_error( + recode_values(chr, "a" ~ "A", default = c("x", "y")), + "`default` must be size 1; vectors of length > 1 not supported in Arrow", + class = "arrow_not_supported" + ) +}) diff --git a/r/tools/checksums/r-libarrow-darwin-arm64-24.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-darwin-arm64-24.0.0.zip.sha512 new file mode 100644 index 
000000000000..59c5c5d709dd --- /dev/null +++ b/r/tools/checksums/r-libarrow-darwin-arm64-24.0.0.zip.sha512 @@ -0,0 +1 @@ +db0f950f78a3badc1cfd9f7c15ae9e03dd8daf06d7a1e1fb4655d155e1c1ed240421918b4271a131915d29ad1b93a3de145617e168e228b4f2880181656e80a1 r-libarrow-darwin-arm64-24.0.0.zip diff --git a/r/tools/checksums/r-libarrow-darwin-x86_64-24.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-darwin-x86_64-24.0.0.zip.sha512 new file mode 100644 index 000000000000..2575c3311874 --- /dev/null +++ b/r/tools/checksums/r-libarrow-darwin-x86_64-24.0.0.zip.sha512 @@ -0,0 +1 @@ +1a3a9665572236ebeee5ec2397977c92e8cd1285513a3dd041cb9ee02fd51828b11b2673934a38a9f008d9cc8a0f00e72a6e4a5f9a0122d24f5eecff823e2923 r-libarrow-darwin-x86_64-24.0.0.zip diff --git a/r/tools/checksums/r-libarrow-linux-x86_64-24.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-linux-x86_64-24.0.0.zip.sha512 new file mode 100644 index 000000000000..79712d2ac94b --- /dev/null +++ b/r/tools/checksums/r-libarrow-linux-x86_64-24.0.0.zip.sha512 @@ -0,0 +1 @@ +298dbbfcb34c291ec7b44a207660e647fbe180d8643c0f59f01a48e2c1561ec999a39ac43ab3e7968885400ec2cdbdd09cdb2a3fcf765454b5cdf61c38bf408e r-libarrow-linux-x86_64-24.0.0.zip diff --git a/r/tools/checksums/r-libarrow-windows-x86_64-24.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-windows-x86_64-24.0.0.zip.sha512 new file mode 100644 index 000000000000..61811c1f79f4 --- /dev/null +++ b/r/tools/checksums/r-libarrow-windows-x86_64-24.0.0.zip.sha512 @@ -0,0 +1 @@ +2d521d2678f05075abd88bb36b965c009fdad112e11dfa3b9fbefe2f639b7e34c39f747995028b3cca6fb92cfa368298d3ce807ad8ba337267ff8b2c41b40f0a r-libarrow-windows-x86_64-24.0.0.zip diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index d50191ac18a1..ebe3acccee9d 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -923,7 +923,10 @@ cmake_find_package <- function(pkg, version = NULL, env_var_list) { td <- tempfile() dir.create(td) cleanup(td) - find_package <- paste0("find_package(", pkg, " ", version, " REQUIRED)") + 
find_package <- paste0( + "cmake_minimum_required(VERSION 3.10)\n", + "find_package(", pkg, " ", version, " REQUIRED)" + ) writeLines(find_package, file.path(td, "CMakeLists.txt")) env_vars <- env_vars_as_string(env_var_list) cmake_cmd <- paste0( diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index d13fc53db1ee..267a431cb332 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -100,7 +100,9 @@ _Special instructions on Linux:_ You will need to set `LD_LIBRARY_PATH` to the ` ```{bash, save=run & ubuntu & !sys_install} export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH +export R_LD_LIBRARY_PATH=$LD_LIBRARY_PATH echo "export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH" >> ~/.bash_profile +echo "export R_LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> ~/.bash_profile ``` Start by navigating in a terminal to the arrow repository. You will need to create a directory into which the C++ build will put its contents. We recommend that you make a `build` directory inside of the `cpp` directory of the Arrow git repository (it is git-ignored, so you won't accidentally check it in). Next, change directories to be inside `cpp/build`: @@ -350,7 +352,7 @@ sudo apt update sudo apt install -y -V libarrow-dev ``` -```{bash, save=run & !sys_install} +```{bash, save=run & !sys_install & macos} MAKEFLAGS="LDFLAGS=" R CMD INSTALL . ```