Skip to content

Commit

Permalink
ARROW-11478: [R] Consider ways to make arrow.skip_nul option more use…
Browse files Browse the repository at this point in the history
…r-friendly

Following a similar approach to ARROW-11766. This also pulls in `withr` as a test dependency and refactors some unrelated tests to use it.

Closes #9899 from nealrichardson/skip-nul-message

Authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
  • Loading branch information
nealrichardson committed Apr 6, 2021
1 parent a111fc9 commit ace2bfc
Show file tree
Hide file tree
Showing 11 changed files with 155 additions and 46 deletions.
3 changes: 2 additions & 1 deletion r/DESCRIPTION
Expand Up @@ -49,7 +49,8 @@ Suggests:
rmarkdown,
stringr,
testthat,
tibble
tibble,
withr
LinkingTo: cpp11 (>= 0.2.0)
Collate:
'enums.R'
Expand Down
7 changes: 6 additions & 1 deletion r/R/arrow-datum.R
Expand Up @@ -39,7 +39,12 @@ is.na.ArrowDatum <- function(x) call_function("is_null", x)
is.nan.ArrowDatum <- function(x) call_function("is_nan", x)

#' @export
as.vector.ArrowDatum <- function(x, mode) x$as_vector()
as.vector.ArrowDatum <- function(x, mode) {
tryCatch(
x$as_vector(),
error = handle_embedded_nul_error
)
}

filter_rows <- function(x, i, keep_na = TRUE, ...) {
# General purpose function for [ row subsetting with R semantics
Expand Down
5 changes: 4 additions & 1 deletion r/R/arrow-tabular.R
Expand Up @@ -61,7 +61,10 @@ ArrowTabular <- R6Class("ArrowTabular", inherit = ArrowObject,

#' @export
as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
df <- x$to_data_frame()
tryCatch(
df <- x$to_data_frame(),
error = handle_embedded_nul_error
)
if (!is.null(r_metadata <- x$metadata$r)) {
df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata))
}
Expand Down
8 changes: 8 additions & 0 deletions r/R/util.R
Expand Up @@ -86,3 +86,11 @@ all_names <- function(expr) {
is_constant <- function(expr) {
length(all_vars(expr)) == 0
}

handle_embedded_nul_error <- function(e) {
msg <- conditionMessage(e)
if (grepl(" nul ", msg)) {
e$message <- paste0(msg, "; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)")
}
stop(e)
}
9 changes: 6 additions & 3 deletions r/src/array_to_vector.cpp
Expand Up @@ -88,11 +88,14 @@ class Converter {
// for each array, add a task to the task group
//
// The task group is Finish() in the caller
void IngestParallel(SEXP data, const std::shared_ptr<arrow::internal::TaskGroup>& tg) {
// The converter itself is passed as `self` so that if one of the parallel ops
// hits `stop()`, we don't bail before `tg` is destroyed, which would cause a crash
void IngestParallel(SEXP data, const std::shared_ptr<arrow::internal::TaskGroup>& tg,
std::shared_ptr<Converter> self) {
R_xlen_t k = 0, i = 0;
for (const auto& array : arrays_) {
auto n_chunk = array->length();
tg->Append([=] { return IngestOne(data, array, k, n_chunk, i); });
tg->Append([=] { return self->IngestOne(data, array, k, n_chunk, i); });
k += n_chunk;
i++;
}
Expand Down Expand Up @@ -1242,7 +1245,7 @@ cpp11::writable::list to_dataframe_parallel(

// add a task to ingest data of that column if that can be done in parallel
if (converters[i]->Parallel()) {
converters[i]->IngestParallel(column, tg);
converters[i]->IngestParallel(column, tg, converters[i]);
}
}

Expand Down
4 changes: 1 addition & 3 deletions r/tests/testthat/helper-arrow.R
Expand Up @@ -65,7 +65,5 @@ test_that <- function(what, code) {
# Wrapper to run tests that only touch R code even when the C++ library isn't
# available (so that at least some tests are run on those platforms)
r_only <- function(code) {
old <- options(..skip.tests = FALSE)
on.exit(options(old))
code
withr::with_options(list(..skip.tests = FALSE), code)
}
65 changes: 37 additions & 28 deletions r/tests/testthat/test-Array.R
Expand Up @@ -262,17 +262,16 @@ test_that("array supports POSIXct (ARROW-3340)", {

test_that("array supports POSIXct without timezone", {
# Make sure timezone is not set
tz <- Sys.getenv("TZ")
Sys.setenv(TZ = "")
on.exit(Sys.setenv(TZ = tz))
times <- strptime("2019-02-03 12:34:56", format="%Y-%m-%d %H:%M:%S") + 1:10
expect_array_roundtrip(times, timestamp("us", ""))
withr::with_envvar(c(TZ = ""), {
times <- strptime("2019-02-03 12:34:56", format="%Y-%m-%d %H:%M:%S") + 1:10
expect_array_roundtrip(times, timestamp("us", ""))

# Also test the INTSXP code path
skip("Ingest_POSIXct only implemented for REALSXP")
times_int <- as.integer(times)
attributes(times_int) <- attributes(times)
expect_array_roundtrip(times_int, timestamp("us", ""))
# Also test the INTSXP code path
skip("Ingest_POSIXct only implemented for REALSXP")
times_int <- as.integer(times)
attributes(times_int) <- attributes(times)
expect_array_roundtrip(times_int, timestamp("us", ""))
})
})

test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", {
Expand Down Expand Up @@ -634,18 +633,28 @@ test_that("Handling string data with embedded nuls", {
as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
as.raw(c(0x74, 0x76))),
class = c("arrow_binary", "vctrs_vctr", "list"))
expect_error(rawToChar(raws[[3]]), "nul") # See?
expect_error(
rawToChar(raws[[3]]),
"embedded nul in string: 'ma\\0n'", # See?
fixed = TRUE
)
array_with_nul <- Array$create(raws)$cast(utf8())
expect_error(as.vector(array_with_nul), "nul")

options(arrow.skip_nul = TRUE)
expect_warning(
expect_identical(
as.vector(array_with_nul),
c("person", "woman", "man", "fan", "camera", "tv")
),
"Stripping '\\\\0' \\(nul\\) from character vector"
expect_error(
as.vector(array_with_nul),
"embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)",
fixed = TRUE
)

withr::with_options(list(arrow.skip_nul = TRUE), {
expect_warning(
expect_identical(
as.vector(array_with_nul),
c("person", "woman", "man", "fan", "camera", "tv")
),
"Stripping '\\0' (nul) from character vector",
fixed = TRUE
)
})
})

test_that("Array$create() should have helpful error", {
Expand Down Expand Up @@ -793,14 +802,14 @@ test_that("Array$ApproxEquals", {
})

test_that("auto int64 conversion to int can be disabled (ARROW-10093)", {
op <- options(arrow.int64_downcast = FALSE); on.exit(options(op))

a <- Array$create(1:10, int64())
expect_true(inherits(a$as_vector(), "integer64"))
withr::with_options(list(arrow.int64_downcast = FALSE), {
a <- Array$create(1:10, int64())
expect_true(inherits(a$as_vector(), "integer64"))

batch <- RecordBatch$create(x = a)
expect_true(inherits(as.data.frame(batch)$x, "integer64"))
batch <- RecordBatch$create(x = a)
expect_true(inherits(as.data.frame(batch)$x, "integer64"))

tab <- Table$create(x = a)
expect_true(inherits(as.data.frame(batch)$x, "integer64"))
tab <- Table$create(x = a)
expect_true(inherits(as.data.frame(batch)$x, "integer64"))
})
})
28 changes: 28 additions & 0 deletions r/tests/testthat/test-RecordBatch.R
Expand Up @@ -471,3 +471,31 @@ test_that("record_batch() with different length arrays", {
expect_error(record_batch(a=1:5, b = 42), msg)
expect_error(record_batch(a=1:5, b = 1:6), msg)
})

test_that("Handling string data with embedded nuls", {
raws <- structure(list(
as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
as.raw(c(0x74, 0x76))),
class = c("arrow_binary", "vctrs_vctr", "list"))
batch_with_nul <- record_batch(a = 1:5, b = raws)
batch_with_nul$b <- batch_with_nul$b$cast(utf8())
expect_error(
as.data.frame(batch_with_nul),
"embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)",
fixed = TRUE
)

withr::with_options(list(arrow.skip_nul = TRUE), {
expect_warning(
expect_equivalent(
as.data.frame(batch_with_nul)$b,
c("person", "woman", "man", "camera", "tv")
),
"Stripping '\\0' (nul) from character vector",
fixed = TRUE
)
})
})
28 changes: 28 additions & 0 deletions r/tests/testthat/test-chunked-array.R
Expand Up @@ -383,3 +383,31 @@ test_that("Converting a chunked array unifies factors (ARROW-8374)", {

expect_identical(ca$as_vector(), res)
})

test_that("Handling string data with embedded nuls", {
raws <- structure(list(
as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
as.raw(c(0x74, 0x76))),
class = c("arrow_binary", "vctrs_vctr", "list"))
chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8())
expect_error(
as.vector(chunked_array_with_nul),
"embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)",
fixed = TRUE
)

withr::with_options(list(arrow.skip_nul = TRUE), {
expect_warning(
expect_identical(
as.vector(chunked_array_with_nul),
c("person", "woman", "man", "fan", "camera", "tv")
),
"Stripping '\\0' (nul) from character vector",
fixed = TRUE
)
})
})
18 changes: 9 additions & 9 deletions r/tests/testthat/test-install-arrow.R
Expand Up @@ -23,17 +23,17 @@ r_only({
ours <- "https://dl.example.com/ursalabs/fake_repo"
other <- "https://cran.fiocruz.br/"

old <- options(
opts <- list(
repos=c(CRAN = "@CRAN@"), # Restore defaul
arrow.dev_repo = ours
)
on.exit(options(old))

expect_identical(arrow_repos(), cran)
expect_identical(arrow_repos(c(cran, ours)), cran)
expect_identical(arrow_repos(c(ours, other)), other)
expect_identical(arrow_repos(nightly = TRUE), c(ours, cran))
expect_identical(arrow_repos(c(cran, ours), nightly = TRUE), c(ours, cran))
expect_identical(arrow_repos(c(ours, other), nightly = TRUE), c(ours, other))
withr::with_options(opts, {
expect_identical(arrow_repos(), cran)
expect_identical(arrow_repos(c(cran, ours)), cran)
expect_identical(arrow_repos(c(ours, other)), other)
expect_identical(arrow_repos(nightly = TRUE), c(ours, cran))
expect_identical(arrow_repos(c(cran, ours), nightly = TRUE), c(ours, cran))
expect_identical(arrow_repos(c(ours, other), nightly = TRUE), c(ours, other))
})
})
})
26 changes: 26 additions & 0 deletions r/tests/testthat/test-scalar.R
Expand Up @@ -76,3 +76,29 @@ test_that("Scalar$ApproxEquals", {
expect_false(a$ApproxEquals(d))
expect_false(a$ApproxEquals(aa))
})

test_that("Handling string data with embedded nuls", {
raws <- as.raw(c(0x6d, 0x61, 0x00, 0x6e))
expect_error(
rawToChar(raws),
"embedded nul in string: 'ma\\0n'", # See?
fixed = TRUE
)
scalar_with_nul <- Scalar$create(raws, binary())$cast(utf8())
expect_error(
as.vector(scalar_with_nul),
"embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul = TRUE)",
fixed = TRUE
)

withr::with_options(list(arrow.skip_nul = TRUE), {
expect_warning(
expect_identical(
as.vector(scalar_with_nul),
"man"
),
"Stripping '\\0' (nul) from character vector",
fixed = TRUE
)
})
})

0 comments on commit ace2bfc

Please sign in to comment.