From 73cfd2d0d0e1e5a2192fb73e5262c77953664f81 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 10 Oct 2022 17:08:34 -0300 Subject: [PATCH] ARROW-17885: [R] Return BLOB data as list of raw instead of a list of integers (#14277) This PR adds support for `blob::blob()`, which is common in R database land to denote "binary", and `vctrs::list_of()`, which is similar, easy, and helps a bit with list of things that happen to be all NULL. We have our own infrastructure for binary and lists of things too, which I assume pre-dates the mature vctrs and blob? Should we consider having `as.vector()` output those objects instead of the custom `arrow_list/large_list/binary` classes we implement here? Lead-authored-by: Dewey Dunnington Co-authored-by: Dewey Dunnington Signed-off-by: Neal Richardson --- r/DESCRIPTION | 1 + r/NAMESPACE | 4 ++ r/R/array.R | 20 ++++++++++ r/R/type.R | 14 +++++++ r/src/r_to_arrow.cpp | 2 +- r/src/type_infer.cpp | 29 +++++++++------ r/tests/testthat/_snaps/Array.md | 8 ++++ r/tests/testthat/test-Array.R | 64 +++++++++++++++++++++++++++++++- r/tests/testthat/test-type.R | 32 ++++++++++++++++ 9 files changed, 161 insertions(+), 13 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index cf83f563902e3..4b526e8b8a971 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -45,6 +45,7 @@ RoxygenNote: 7.2.1 Config/testthat/edition: 3 VignetteBuilder: knitr Suggests: + blob, cli, DBI, dbplyr, diff --git a/r/NAMESPACE b/r/NAMESPACE index 8b08b940b3622..24a9e14bb6188 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -41,9 +41,11 @@ S3method(as.vector,ArrowDatum) S3method(as_arrow_array,Array) S3method(as_arrow_array,ChunkedArray) S3method(as_arrow_array,Scalar) +S3method(as_arrow_array,blob) S3method(as_arrow_array,data.frame) S3method(as_arrow_array,default) S3method(as_arrow_array,pyarrow.lib.Array) +S3method(as_arrow_array,vctrs_list_of) S3method(as_arrow_table,RecordBatch) S3method(as_arrow_table,RecordBatchReader) S3method(as_arrow_table,Table) @@ -100,7 +102,9 @@ S3method(head,Scanner) S3method(head,arrow_dplyr_query) S3method(infer_type,ArrowDatum) S3method(infer_type,Expression) +S3method(infer_type,blob) S3method(infer_type,default) +S3method(infer_type,vctrs_list_of) S3method(is.finite,ArrowDatum) S3method(is.infinite,ArrowDatum) S3method(is.na,ArrowDatum) diff --git a/r/R/array.R b/r/R/array.R index 938c8e4b0485f..7c2fb5c783bb1 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -322,6 +322,26 @@ as_arrow_array.data.frame <- function(x, ..., type = NULL) { } } +#' @export +as_arrow_array.vctrs_list_of <- function(x, ..., type = NULL) { + type <- type %||% infer_type(x) + if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) { + stop_cant_convert_array(x, type) + } + + as_arrow_array(unclass(x), type = type) +} + +#' @export +as_arrow_array.blob <- function(x, ..., type = NULL) { + type <- type %||% infer_type(x) + if (!type$Equals(binary()) && !type$Equals(large_binary())) { + stop_cant_convert_array(x, type) + } + + as_arrow_array(unclass(x), type = type) +} + stop_cant_convert_array <- function(x, type) { if (is.null(type)) { abort( diff --git a/r/R/type.R b/r/R/type.R index d4d7d52ad580d..5089789f6c16a 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -111,6 +111,20 @@ infer_type.default <- function(x, ..., from_array_infer_type = FALSE) { } } +#' @export +infer_type.vctrs_list_of <- function(x, ...) { + list_of(infer_type(attr(x, "ptype"))) +} + +#' @export +infer_type.blob <- function(x, ...) { + if (sum(lengths(x)) > .Machine$integer.max) { + large_binary() + } else { + binary() + } +} + #' @export infer_type.ArrowDatum <- function(x, ...) x$type diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index aa51799585673..c472d8286f4a4 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -743,7 +743,7 @@ Status check_binary(SEXP x, int64_t size) { // check this is a list of raw vectors const SEXP* p_x = VECTOR_PTR_RO(x); for (R_xlen_t i = 0; i < size; i++, ++p_x) { - if (TYPEOF(*p_x) != RAWSXP) { + if (TYPEOF(*p_x) != RAWSXP && (*p_x != R_NilValue)) { return Status::Invalid("invalid R type to convert to binary"); } } diff --git a/r/src/type_infer.cpp b/r/src/type_infer.cpp index e30d0e1288717..e668918ac7923 100644 --- a/r/src/type_infer.cpp +++ b/r/src/type_infer.cpp @@ -159,22 +159,29 @@ std::shared_ptr InferArrowTypeFromVector(SEXP x) { return arrow::large_binary(); } + // Check attr(x, "ptype") for an appropriate R prototype SEXP ptype = Rf_getAttrib(x, symbols::ptype); - if (Rf_isNull(ptype)) { - if (XLENGTH(x) == 0) { - cpp11::stop( - "Requires at least one element to infer the values' type of a list vector"); + if (!Rf_isNull(ptype)) { + arrow::list(InferArrowType(ptype)); + } + + // If unspecified, iterate through the vector until we get a non-null result + // special case raw() vectors, since we want list(raw()) to result in + // a binary() array + for (R_xlen_t i = 0; i < XLENGTH(x); i++) { + ptype = VECTOR_ELT(x, i); + if (Rf_isNull(ptype)) { + continue; } - // Iterate through the vector until we get a non-null result - for (R_xlen_t i = 0; i < XLENGTH(x); i++) { - ptype = VECTOR_ELT(x, i); - if (!Rf_isNull(ptype)) { - break; - } + + if (!Rf_isObject(ptype) && TYPEOF(ptype) == RAWSXP) { + return arrow::binary(); + } else { + return arrow::list(InferArrowType(ptype)); } } - return arrow::list(InferArrowType(ptype)); + return arrow::list(arrow::null()); } } diff --git a/r/tests/testthat/_snaps/Array.md b/r/tests/testthat/_snaps/Array.md index f6ec523510470..fbcee7a15ce82 100644 --- a/r/tests/testthat/_snaps/Array.md +++ b/r/tests/testthat/_snaps/Array.md @@ -18,6 +18,14 @@ Can't create Array from object of type class_not_supported +# as_arrow_array() works for blob::blob() + + Can't create Array from object of type blob / vctrs_list_of / vctrs_vctr / list + +# as_arrow_array() works for vctrs::list_of() + + Can't create Array from object of type vctrs_list_of / vctrs_vctr / list + # Array doesn't support c() Use `concat_arrays()` or `ChunkedArray$create()` instead. diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index a2299326e400f..37b56bbc4378e 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -849,7 +849,6 @@ test_that("Array$create() should have helpful error", { int <- integer(0) num <- numeric(0) char <- character(0) - expect_error(Array$create(list()), "Requires at least one element to infer") expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector") expect_error(Array$create(list(char, num, char)), "Expecting a character vector") @@ -1172,6 +1171,69 @@ test_that("as_arrow_array() default method errors", { ) }) +test_that("as_arrow_array() works for blob::blob()", { + skip_if_not_installed("blob") + + # empty + expect_r6_class(as_arrow_array(blob::blob()), "Array") + expect_equal( + as_arrow_array(blob::blob()), + as_arrow_array(list(), type = binary()) + ) + + # all null + expect_equal( + as_arrow_array(blob::blob(NULL, NULL)), + as_arrow_array(list(NULL, NULL), type = binary()) + ) + + expect_equal( + as_arrow_array(blob::blob(as.raw(1:5), NULL)), + as_arrow_array(list(as.raw(1:5), NULL), type = binary()) + ) + + expect_equal( + as_arrow_array(blob::blob(as.raw(1:5)), type = large_binary()), + as_arrow_array(list(as.raw(1:5)), type = large_binary()) + ) + + expect_snapshot_error( + as_arrow_array(blob::blob(as.raw(1:5)), type = int32()) + ) +}) + +test_that("as_arrow_array() works for vctrs::list_of()", { + # empty + expect_r6_class(as_arrow_array(vctrs::list_of(.ptype = integer())), "Array") + expect_equal( + as_arrow_array(vctrs::list_of(.ptype = integer())), + as_arrow_array(list(), type = list_of(int32())) + ) + + # all NULL + expect_equal( + as_arrow_array(vctrs::list_of(NULL, NULL, .ptype = integer())), + as_arrow_array(list(NULL, NULL), type = list_of(int32())) + ) + + expect_equal( + as_arrow_array(vctrs::list_of(1:5, NULL, .ptype = integer())), + as_arrow_array(list(1:5, NULL), type = list_of(int32())) + ) + + expect_equal( + as_arrow_array( + vctrs::list_of(1:5, .ptype = integer()), + type = large_list_of(int32()) + ), + as_arrow_array(list(1:5), type = large_list_of(int32())) + ) + + expect_snapshot_error( + as_arrow_array(vctrs::list_of(1:5, .ptype = integer()), type = int32()) + ) +}) + test_that("concat_arrays works", { concat_empty <- concat_arrays() expect_true(concat_empty$type == null()) diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R index 14f0ea7a8d580..0fbeec0a49117 100644 --- a/r/tests/testthat/test-type.R +++ b/r/tests/testthat/test-type.R @@ -85,6 +85,20 @@ test_that("infer_type() can infer nested extension types", { ) }) +test_that("infer_type() can infer vctrs::list_of() types", { + expect_equal(infer_type(vctrs::list_of(.ptype = integer())), list_of(int32())) +}) + +test_that("infer_type() can infer blob type", { + skip_if_not_installed("blob") + + expect_equal(infer_type(blob::blob()), binary()) + + big_ish_raw <- raw(2 ^ 20) + big_ish_blob <- blob::new_blob(rep(list(big_ish_raw), 2049)) + expect_equal(infer_type(big_ish_blob), large_binary()) +}) + test_that("DataType$Equals", { a <- int32() b <- int32() @@ -294,6 +308,18 @@ test_that("type() is deprecated", { expect_equal(a_type, a$type) }) +test_that("infer_type() infers type for lists of raw() as binary()", { + expect_equal( + infer_type(list(raw())), + binary() + ) + + expect_equal( + infer_type(list(NULL, raw(), raw())), + binary() + ) +}) + test_that("infer_type() infers type for lists starting with NULL - ARROW-17639", { null_start_list <- list(NULL, c(2, 3), c(4, 5)) @@ -308,4 +334,10 @@ test_that("infer_type() infers type for lists starting with NULL - ARROW-17639", infer_type(totally_null_list), list_of(null()) ) + + empty_list <- list() + expect_equal( + infer_type(empty_list), + list_of(null()) + ) })