From ace2bfc160e049b4edc7ffb55081cfb7210d6e43 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 6 Apr 2021 12:18:16 -0700 Subject: [PATCH] ARROW-11478: [R] Consider ways to make arrow.skip_nul option more user-friendly Following a similar approach to ARROW-11766. This also pulls in `withr` as a test dependency and refactors some unrelated tests to use it. Closes #9899 from nealrichardson/skip-nul-message Authored-by: Neal Richardson Signed-off-by: Neal Richardson --- r/DESCRIPTION | 3 +- r/R/arrow-datum.R | 7 ++- r/R/arrow-tabular.R | 5 ++- r/R/util.R | 8 ++++ r/src/array_to_vector.cpp | 9 ++-- r/tests/testthat/helper-arrow.R | 4 +- r/tests/testthat/test-Array.R | 65 +++++++++++++++------------ r/tests/testthat/test-RecordBatch.R | 28 ++++++++++++ r/tests/testthat/test-chunked-array.R | 28 ++++++++++++ r/tests/testthat/test-install-arrow.R | 18 ++++---- r/tests/testthat/test-scalar.R | 26 +++++++++++ 11 files changed, 155 insertions(+), 46 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 3de40f6f9a7e9..a355e790a2d57 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -49,7 +49,8 @@ Suggests: rmarkdown, stringr, testthat, - tibble + tibble, + withr LinkingTo: cpp11 (>= 0.2.0) Collate: 'enums.R' diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index 99940e74cbd7c..dd43307c9cc74 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -39,7 +39,12 @@ is.na.ArrowDatum <- function(x) call_function("is_null", x) is.nan.ArrowDatum <- function(x) call_function("is_nan", x) #' @export -as.vector.ArrowDatum <- function(x, mode) x$as_vector() +as.vector.ArrowDatum <- function(x, mode) { + tryCatch( + x$as_vector(), + error = handle_embedded_nul_error + ) +} filter_rows <- function(x, i, keep_na = TRUE, ...) { # General purpose function for [ row subsetting with R semantics diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R index 157b799f3b6ef..f32111688a2dc 100644 --- a/r/R/arrow-tabular.R +++ b/r/R/arrow-tabular.R @@ -61,7 +61,10 @@ ArrowTabular <- R6Class("ArrowTabular", inherit = ArrowObject, #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { - df <- x$to_data_frame() + tryCatch( + df <- x$to_data_frame(), + error = handle_embedded_nul_error + ) if (!is.null(r_metadata <- x$metadata$r)) { df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata)) } diff --git a/r/R/util.R b/r/R/util.R index 4680381e90970..6d9c91b74aa09 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -86,3 +86,11 @@ all_names <- function(expr) { is_constant <- function(expr) { length(all_vars(expr)) == 0 } + +handle_embedded_nul_error <- function(e) { + msg <- conditionMessage(e) + if (grepl(" nul ", msg)) { + e$message <- paste0(msg, "; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)") + } + stop(e) +} \ No newline at end of file diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index ddcb74946978d..d5fae2951812f 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -88,11 +88,14 @@ class Converter { // for each array, add a task to the task group // // The task group is Finish() in the caller - void IngestParallel(SEXP data, const std::shared_ptr& tg) { + // The converter itself is passed as `self` so that if one of the parallel ops + // hits `stop()`, we don't bail before `tg` is destroyed, which would cause a crash + void IngestParallel(SEXP data, const std::shared_ptr& tg, + std::shared_ptr self) { R_xlen_t k = 0, i = 0; for (const auto& array : arrays_) { auto n_chunk = array->length(); - tg->Append([=] { return IngestOne(data, array, k, n_chunk, i); }); + tg->Append([=] { return self->IngestOne(data, array, k, n_chunk, i); }); k += n_chunk; i++; } @@ -1242,7 +1245,7 @@ cpp11::writable::list to_dataframe_parallel( // add a task to ingest data of that column if that can be done in parallel if (converters[i]->Parallel()) { - converters[i]->IngestParallel(column, tg); + converters[i]->IngestParallel(column, tg, converters[i]); } } diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index a20ec23961e78..89d9bf07ee6ea 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -65,7 +65,5 @@ test_that <- function(what, code) { # Wrapper to run tests that only touch R code even when the C++ library isn't # available (so that at least some tests are run on those platforms) r_only <- function(code) { - old <- options(..skip.tests = FALSE) - on.exit(options(old)) - code + withr::with_options(list(..skip.tests = FALSE), code) } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 18e26207ca3fd..1b92732368d3b 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -262,17 +262,16 @@ test_that("array supports POSIXct (ARROW-3340)", { test_that("array supports POSIXct without timezone", { # Make sure timezone is not set - tz <- Sys.getenv("TZ") - Sys.setenv(TZ = "") - on.exit(Sys.setenv(TZ = tz)) - times <- strptime("2019-02-03 12:34:56", format="%Y-%m-%d %H:%M:%S") + 1:10 - expect_array_roundtrip(times, timestamp("us", "")) + withr::with_envvar(c(TZ = ""), { + times <- strptime("2019-02-03 12:34:56", format="%Y-%m-%d %H:%M:%S") + 1:10 + expect_array_roundtrip(times, timestamp("us", "")) - # Also test the INTSXP code path - skip("Ingest_POSIXct only implemented for REALSXP") - times_int <- as.integer(times) - attributes(times_int) <- attributes(times) - expect_array_roundtrip(times_int, timestamp("us", "")) + # Also test the INTSXP code path + skip("Ingest_POSIXct only implemented for REALSXP") + times_int <- as.integer(times) + attributes(times_int) <- attributes(times) + expect_array_roundtrip(times_int, timestamp("us", "")) + }) }) test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", { @@ -634,18 +633,28 @@ test_that("Handling string data with embedded nuls", { as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), as.raw(c(0x74, 0x76))), class = c("arrow_binary", "vctrs_vctr", "list")) - expect_error(rawToChar(raws[[3]]), "nul") # See? + expect_error( + rawToChar(raws[[3]]), + "embedded nul in string: 'ma\\0n'", # See? + fixed = TRUE + ) array_with_nul <- Array$create(raws)$cast(utf8()) - expect_error(as.vector(array_with_nul), "nul") - - options(arrow.skip_nul = TRUE) - expect_warning( - expect_identical( - as.vector(array_with_nul), - c("person", "woman", "man", "fan", "camera", "tv") - ), - "Stripping '\\\\0' \\(nul\\) from character vector" + expect_error( + as.vector(array_with_nul), + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)", + fixed = TRUE ) + + withr::with_options(list(arrow.skip_nul = TRUE), { + expect_warning( + expect_identical( + as.vector(array_with_nul), + c("person", "woman", "man", "fan", "camera", "tv") + ), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) }) test_that("Array$create() should have helpful error", { @@ -793,14 +802,14 @@ test_that("Array$ApproxEquals", { }) test_that("auto int64 conversion to int can be disabled (ARROW-10093)", { - op <- options(arrow.int64_downcast = FALSE); on.exit(options(op)) - - a <- Array$create(1:10, int64()) - expect_true(inherits(a$as_vector(), "integer64")) + withr::with_options(list(arrow.int64_downcast = FALSE), { + a <- Array$create(1:10, int64()) + expect_true(inherits(a$as_vector(), "integer64")) - batch <- RecordBatch$create(x = a) - expect_true(inherits(as.data.frame(batch)$x, "integer64")) + batch <- RecordBatch$create(x = a) + expect_true(inherits(as.data.frame(batch)$x, "integer64")) - tab <- Table$create(x = a) - expect_true(inherits(as.data.frame(batch)$x, "integer64")) + tab <- Table$create(x = a) + expect_true(inherits(as.data.frame(batch)$x, "integer64")) + }) }) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 96abceb299497..b71c07b78c248 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -471,3 +471,31 @@ test_that("record_batch() with different length arrays", { expect_error(record_batch(a=1:5, b = 42), msg) expect_error(record_batch(a=1:5, b = 1:6), msg) }) + +test_that("Handling string data with embedded nuls", { + raws <- structure(list( + as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), + as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), + as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), + as.raw(c(0x74, 0x76))), + class = c("arrow_binary", "vctrs_vctr", "list")) + batch_with_nul <- record_batch(a = 1:5, b = raws) + batch_with_nul$b <- batch_with_nul$b$cast(utf8()) + expect_error( + as.data.frame(batch_with_nul), + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)", + fixed = TRUE + ) + + withr::with_options(list(arrow.skip_nul = TRUE), { + expect_warning( + expect_equivalent( + as.data.frame(batch_with_nul)$b, + c("person", "woman", "man", "camera", "tv") + ), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index a5ff6ef481280..17a82de810ff6 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -383,3 +383,31 @@ test_that("Converting a chunked array unifies factors (ARROW-8374)", { expect_identical(ca$as_vector(), res) }) + +test_that("Handling string data with embedded nuls", { + raws <- structure(list( + as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), + as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), + as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls + as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), + as.raw(c(0x74, 0x76))), + class = c("arrow_binary", "vctrs_vctr", "list")) + chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8()) + expect_error( + as.vector(chunked_array_with_nul), + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)", + fixed = TRUE + ) + + withr::with_options(list(arrow.skip_nul = TRUE), { + expect_warning( + expect_identical( + as.vector(chunked_array_with_nul), + c("person", "woman", "man", "fan", "camera", "tv") + ), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-install-arrow.R b/r/tests/testthat/test-install-arrow.R index e2b1d771aa3d0..d9d1cc74b0240 100644 --- a/r/tests/testthat/test-install-arrow.R +++ b/r/tests/testthat/test-install-arrow.R @@ -23,17 +23,17 @@ r_only({ ours <- "https://dl.example.com/ursalabs/fake_repo" other <- "https://cran.fiocruz.br/" - old <- options( + opts <- list( repos=c(CRAN = "@CRAN@"), # Restore defaul arrow.dev_repo = ours ) - on.exit(options(old)) - - expect_identical(arrow_repos(), cran) - expect_identical(arrow_repos(c(cran, ours)), cran) - expect_identical(arrow_repos(c(ours, other)), other) - expect_identical(arrow_repos(nightly = TRUE), c(ours, cran)) - expect_identical(arrow_repos(c(cran, ours), nightly = TRUE), c(ours, cran)) - expect_identical(arrow_repos(c(ours, other), nightly = TRUE), c(ours, other)) + withr::with_options(opts, { + expect_identical(arrow_repos(), cran) + expect_identical(arrow_repos(c(cran, ours)), cran) + expect_identical(arrow_repos(c(ours, other)), other) + expect_identical(arrow_repos(nightly = TRUE), c(ours, cran)) + expect_identical(arrow_repos(c(cran, ours), nightly = TRUE), c(ours, cran)) + expect_identical(arrow_repos(c(ours, other), nightly = TRUE), c(ours, other)) + }) }) }) diff --git a/r/tests/testthat/test-scalar.R b/r/tests/testthat/test-scalar.R index e9ef893bbd94c..501298a80211c 100644 --- a/r/tests/testthat/test-scalar.R +++ b/r/tests/testthat/test-scalar.R @@ -76,3 +76,29 @@ test_that("Scalar$ApproxEquals", { expect_false(a$ApproxEquals(d)) expect_false(a$ApproxEquals(aa)) }) + +test_that("Handling string data with embedded nuls", { + raws <- as.raw(c(0x6d, 0x61, 0x00, 0x6e)) + expect_error( + rawToChar(raws), + "embedded nul in string: 'ma\\0n'", # See? + fixed = TRUE + ) + scalar_with_nul <- Scalar$create(raws, binary())$cast(utf8()) + expect_error( + as.vector(scalar_with_nul), + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)", + fixed = TRUE + ) + + withr::with_options(list(arrow.skip_nul = TRUE), { + expect_warning( + expect_identical( + as.vector(scalar_with_nul), + "man" + ), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) +}) \ No newline at end of file