Skip to content

Commit

Permalink
ARROW-17885: [R] Return BLOB data as list of raw instead of a list of…
Browse files Browse the repository at this point in the history
… integers (#14277)

This PR adds support for `blob::blob()`, which is common in R database land to denote "binary", and `vctrs::list_of()`, which is similar, easy, and helps a bit with list of things that happen to be all NULL.

We have our own infrastructure for binary and lists of things too, which I assume pre-dates the mature vctrs and blob? Should we consider having `as.vector()` output those objects instead of the custom `arrow_list/large_list/binary` classes we implement here?

Lead-authored-by: Dewey Dunnington <dewey@fishandwhistle.net>
Co-authored-by: Dewey Dunnington <dewey@voltrondata.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
  • Loading branch information
paleolimbot and paleolimbot committed Oct 10, 2022
1 parent 76d6cbb commit 73cfd2d
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 13 deletions.
1 change: 1 addition & 0 deletions r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ RoxygenNote: 7.2.1
Config/testthat/edition: 3
VignetteBuilder: knitr
Suggests:
blob,
cli,
DBI,
dbplyr,
Expand Down
4 changes: 4 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,11 @@ S3method(as.vector,ArrowDatum)
S3method(as_arrow_array,Array)
S3method(as_arrow_array,ChunkedArray)
S3method(as_arrow_array,Scalar)
S3method(as_arrow_array,blob)
S3method(as_arrow_array,data.frame)
S3method(as_arrow_array,default)
S3method(as_arrow_array,pyarrow.lib.Array)
S3method(as_arrow_array,vctrs_list_of)
S3method(as_arrow_table,RecordBatch)
S3method(as_arrow_table,RecordBatchReader)
S3method(as_arrow_table,Table)
Expand Down Expand Up @@ -100,7 +102,9 @@ S3method(head,Scanner)
S3method(head,arrow_dplyr_query)
S3method(infer_type,ArrowDatum)
S3method(infer_type,Expression)
S3method(infer_type,blob)
S3method(infer_type,default)
S3method(infer_type,vctrs_list_of)
S3method(is.finite,ArrowDatum)
S3method(is.infinite,ArrowDatum)
S3method(is.na,ArrowDatum)
Expand Down
20 changes: 20 additions & 0 deletions r/R/array.R
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,26 @@ as_arrow_array.data.frame <- function(x, ..., type = NULL) {
}
}

#' @export
as_arrow_array.vctrs_list_of <- function(x, ..., type = NULL) {
type <- type %||% infer_type(x)
if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) {
stop_cant_convert_array(x, type)
}

as_arrow_array(unclass(x), type = type)
}

#' @export
as_arrow_array.blob <- function(x, ..., type = NULL) {
type <- type %||% infer_type(x)
if (!type$Equals(binary()) && !type$Equals(large_binary())) {
stop_cant_convert_array(x, type)
}

as_arrow_array(unclass(x), type = type)
}

stop_cant_convert_array <- function(x, type) {
if (is.null(type)) {
abort(
Expand Down
14 changes: 14 additions & 0 deletions r/R/type.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,20 @@ infer_type.default <- function(x, ..., from_array_infer_type = FALSE) {
}
}

#' @export
infer_type.vctrs_list_of <- function(x, ...) {
list_of(infer_type(attr(x, "ptype")))
}

#' @export
infer_type.blob <- function(x, ...) {
if (sum(lengths(x)) > .Machine$integer.max) {
large_binary()
} else {
binary()
}
}

#' @export
infer_type.ArrowDatum <- function(x, ...) x$type

Expand Down
2 changes: 1 addition & 1 deletion r/src/r_to_arrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ Status check_binary(SEXP x, int64_t size) {
// check this is a list of raw vectors
const SEXP* p_x = VECTOR_PTR_RO(x);
for (R_xlen_t i = 0; i < size; i++, ++p_x) {
if (TYPEOF(*p_x) != RAWSXP) {
if (TYPEOF(*p_x) != RAWSXP && (*p_x != R_NilValue)) {
return Status::Invalid("invalid R type to convert to binary");
}
}
Expand Down
29 changes: 18 additions & 11 deletions r/src/type_infer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,22 +159,29 @@ std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<VECSXP>(SEXP x) {
return arrow::large_binary();
}

// Check attr(x, "ptype") for an appropriate R prototype
SEXP ptype = Rf_getAttrib(x, symbols::ptype);
if (Rf_isNull(ptype)) {
if (XLENGTH(x) == 0) {
cpp11::stop(
"Requires at least one element to infer the values' type of a list vector");
if (!Rf_isNull(ptype)) {
arrow::list(InferArrowType(ptype));
}

// If unspecified, iterate through the vector until we get a non-null result
// special case raw() vectors, since we want list(raw()) to result in
// a binary() array
for (R_xlen_t i = 0; i < XLENGTH(x); i++) {
ptype = VECTOR_ELT(x, i);
if (Rf_isNull(ptype)) {
continue;
}
// Iterate through the vector until we get a non-null result
for (R_xlen_t i = 0; i < XLENGTH(x); i++) {
ptype = VECTOR_ELT(x, i);
if (!Rf_isNull(ptype)) {
break;
}

if (!Rf_isObject(ptype) && TYPEOF(ptype) == RAWSXP) {
return arrow::binary();
} else {
return arrow::list(InferArrowType(ptype));
}
}

return arrow::list(InferArrowType(ptype));
return arrow::list(arrow::null());
}
}

Expand Down
8 changes: 8 additions & 0 deletions r/tests/testthat/_snaps/Array.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@

Can't create Array<float64()> from object of type class_not_supported

# as_arrow_array() works for blob::blob()

Can't create Array<int32()> from object of type blob / vctrs_list_of / vctrs_vctr / list

# as_arrow_array() works for vctrs::list_of()

Can't create Array<int32()> from object of type vctrs_list_of / vctrs_vctr / list

# Array doesn't support c()

Use `concat_arrays()` or `ChunkedArray$create()` instead.
Expand Down
64 changes: 63 additions & 1 deletion r/tests/testthat/test-Array.R
Original file line number Diff line number Diff line change
Expand Up @@ -849,7 +849,6 @@ test_that("Array$create() should have helpful error", {
int <- integer(0)
num <- numeric(0)
char <- character(0)
expect_error(Array$create(list()), "Requires at least one element to infer")
expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector")
expect_error(Array$create(list(char, num, char)), "Expecting a character vector")

Expand Down Expand Up @@ -1172,6 +1171,69 @@ test_that("as_arrow_array() default method errors", {
)
})

test_that("as_arrow_array() works for blob::blob()", {
skip_if_not_installed("blob")

# empty
expect_r6_class(as_arrow_array(blob::blob()), "Array")
expect_equal(
as_arrow_array(blob::blob()),
as_arrow_array(list(), type = binary())
)

# all null
expect_equal(
as_arrow_array(blob::blob(NULL, NULL)),
as_arrow_array(list(NULL, NULL), type = binary())
)

expect_equal(
as_arrow_array(blob::blob(as.raw(1:5), NULL)),
as_arrow_array(list(as.raw(1:5), NULL), type = binary())
)

expect_equal(
as_arrow_array(blob::blob(as.raw(1:5)), type = large_binary()),
as_arrow_array(list(as.raw(1:5)), type = large_binary())
)

expect_snapshot_error(
as_arrow_array(blob::blob(as.raw(1:5)), type = int32())
)
})

test_that("as_arrow_array() works for vctrs::list_of()", {
# empty
expect_r6_class(as_arrow_array(vctrs::list_of(.ptype = integer())), "Array")
expect_equal(
as_arrow_array(vctrs::list_of(.ptype = integer())),
as_arrow_array(list(), type = list_of(int32()))
)

# all NULL
expect_equal(
as_arrow_array(vctrs::list_of(NULL, NULL, .ptype = integer())),
as_arrow_array(list(NULL, NULL), type = list_of(int32()))
)

expect_equal(
as_arrow_array(vctrs::list_of(1:5, NULL, .ptype = integer())),
as_arrow_array(list(1:5, NULL), type = list_of(int32()))
)

expect_equal(
as_arrow_array(
vctrs::list_of(1:5, .ptype = integer()),
type = large_list_of(int32())
),
as_arrow_array(list(1:5), type = large_list_of(int32()))
)

expect_snapshot_error(
as_arrow_array(vctrs::list_of(1:5, .ptype = integer()), type = int32())
)
})

test_that("concat_arrays works", {
concat_empty <- concat_arrays()
expect_true(concat_empty$type == null())
Expand Down
32 changes: 32 additions & 0 deletions r/tests/testthat/test-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,20 @@ test_that("infer_type() can infer nested extension types", {
)
})

test_that("infer_type() can infer vctrs::list_of() types", {
expect_equal(infer_type(vctrs::list_of(.ptype = integer())), list_of(int32()))
})

test_that("infer_type() can infer blob type", {
skip_if_not_installed("blob")

expect_equal(infer_type(blob::blob()), binary())

big_ish_raw <- raw(2 ^ 20)
big_ish_blob <- blob::new_blob(rep(list(big_ish_raw), 2049))
expect_equal(infer_type(big_ish_blob), large_binary())
})

test_that("DataType$Equals", {
a <- int32()
b <- int32()
Expand Down Expand Up @@ -294,6 +308,18 @@ test_that("type() is deprecated", {
expect_equal(a_type, a$type)
})

test_that("infer_type() infers type for lists of raw() as binary()", {
expect_equal(
infer_type(list(raw())),
binary()
)

expect_equal(
infer_type(list(NULL, raw(), raw())),
binary()
)
})

test_that("infer_type() infers type for lists starting with NULL - ARROW-17639", {
null_start_list <- list(NULL, c(2, 3), c(4, 5))

Expand All @@ -308,4 +334,10 @@ test_that("infer_type() infers type for lists starting with NULL - ARROW-17639",
infer_type(totally_null_list),
list_of(null())
)

empty_list <- list()
expect_equal(
infer_type(empty_list),
list_of(null())
)
})

0 comments on commit 73cfd2d

Please sign in to comment.