Skip to content

Commit

Permalink
GH-34775: [R] arrow_table: as.data.frame() sometimes returns a tbl an…
Browse files Browse the repository at this point in the history
…d sometimes a data.frame (#35173)

Features of this PR:
* Ensures that calling `as.data.frame()` on Arrow objects returns base R `data.frame` objects.
* Drops the `class` attribute metadata of input objects of `data.frame` class (i.e. that don't have inherit from any additional classes other than `data.frame`).  This results in us sacrificing roundtrip class fidelity for `data.frame` objects (i.e. if we input a base R data.frame, convert it to an Arrow Table, and then convert it back to R, we get a tibble).  However, we now have consistency in the type of returned objects, retain roundtrip fidelity for other (non-class) metadata, and guarantee that `as.data.frame()` returns a base R data.frame.  Users who wish to input and return a `data.frame` object can call `as.data.frame()` on the returned object.
* Implements `dplyr::collect()` for StructArrays so that these objects can still be returned as tibbles if needed.
* Renames `expect_data_frame()` to `expect_equal_data_frame()` for clarity, and updates it to convert both the object and expected object to data.frames.

* Closes: #34775

Authored-by: Nic Crane <thisisnic@gmail.com>
Signed-off-by: Nic Crane <thisisnic@gmail.com>
  • Loading branch information
thisisnic committed May 3, 2023
1 parent 2ee0345 commit 205ceb9
Show file tree
Hide file tree
Showing 27 changed files with 229 additions and 184 deletions.
2 changes: 1 addition & 1 deletion r/R/array.R
Expand Up @@ -474,7 +474,7 @@ dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields)

#' @export
as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) {
as.vector(x)
as.data.frame(collect.StructArray(x), row.names = row.names, optional = optional, ...)
}

#' @rdname array
Expand Down
3 changes: 2 additions & 1 deletion r/R/arrow-tabular.R
Expand Up @@ -94,7 +94,8 @@ ArrowTabular <- R6Class("ArrowTabular",
#' @export
as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
df <- x$to_data_frame()
apply_arrow_r_metadata(df, x$metadata$r)
out <- apply_arrow_r_metadata(df, x$metadata$r)
as.data.frame(out, row.names = row.names, optional = optional, ...)
}

#' @export
Expand Down
2 changes: 1 addition & 1 deletion r/R/csv.R
Expand Up @@ -248,7 +248,7 @@ read_delim_arrow <- function(file,
}

if (isTRUE(as_data_frame)) {
tab <- as.data.frame(tab)
tab <- collect.ArrowTabular(tab)
}

tab
Expand Down
7 changes: 6 additions & 1 deletion r/R/dplyr-collect.R
Expand Up @@ -24,7 +24,8 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
}
collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
if (as_data_frame) {
as.data.frame(x, ...)
df <- x$to_data_frame()
apply_arrow_r_metadata(df, x$metadata$r)
} else {
x
}
Expand All @@ -34,6 +35,10 @@ collect.Dataset <- function(x, as_data_frame = TRUE, ...) {
}
collect.RecordBatchReader <- collect.Dataset

collect.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) {
as.vector(x)
}

compute.ArrowTabular <- function(x, ...) x
compute.arrow_dplyr_query <- function(x, ...) {
# TODO: should this tryCatch move down into as_arrow_table()?
Expand Down
3 changes: 2 additions & 1 deletion r/R/dplyr.R
Expand Up @@ -216,7 +216,8 @@ unique.RecordBatchReader <- unique.arrow_dplyr_query

#' @export
as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) {
collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
out <- collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
as.data.frame(out)
}

#' @export
Expand Down
3 changes: 2 additions & 1 deletion r/R/feather.R
Expand Up @@ -196,7 +196,8 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T
)

if (isTRUE(as_data_frame)) {
out <- as.data.frame(out)
df <- out$to_data_frame()
out <- apply_arrow_r_metadata(df, out$metadata$r)
}
out
}
Expand Down
2 changes: 1 addition & 1 deletion r/R/ipc-stream.R
Expand Up @@ -106,7 +106,7 @@ read_ipc_stream <- function(file, as_data_frame = TRUE, ...) {
# https://issues.apache.org/jira/browse/ARROW-6830
out <- RecordBatchStreamReader$create(file)$read_table()
if (as_data_frame) {
out <- as.data.frame(out)
out <- collect.ArrowTabular(out)
}
out
}
2 changes: 1 addition & 1 deletion r/R/json.R
Expand Up @@ -84,7 +84,7 @@ read_json_arrow <- function(file,
}

if (isTRUE(as_data_frame)) {
tab <- as.data.frame(tab)
tab <- collect.ArrowTabular(tab)
}
tab
}
Expand Down
9 changes: 9 additions & 0 deletions r/R/metadata.R
Expand Up @@ -22,6 +22,14 @@
# drop problems attributes (most likely from readr)
x[["attributes"]][["problems"]] <- NULL

# remove the class if it's just data.frame
if (identical(x$attributes$class, "data.frame")) {
x$attributes <- x$attributes[names(x$attributes) != "class"]
if (is_empty(x$attributes)) {
x <- x[names(x) != "attributes"]
}
}

out <- serialize(x, NULL, ascii = TRUE)

# if the metadata is over 100 kB, compress
Expand Down Expand Up @@ -62,6 +70,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
expr = {
columns_metadata <- r_metadata$columns
if (is.data.frame(x)) {
# if columns metadata exists, apply it here
if (length(names(x)) && !is.null(columns_metadata)) {
for (name in intersect(names(columns_metadata), names(x))) {
x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
Expand Down
2 changes: 1 addition & 1 deletion r/R/parquet.R
Expand Up @@ -70,7 +70,7 @@ read_parquet <- function(file,
}

if (as_data_frame) {
tab <- as.data.frame(tab)
tab <- collect.ArrowTabular(tab)
}
tab
}
Expand Down
5 changes: 3 additions & 2 deletions r/tests/testthat/helper-expectation.R
Expand Up @@ -19,8 +19,9 @@ expect_as_vector <- function(x, y, ...) {
expect_equal(as.vector(x), y, ...)
}

expect_data_frame <- function(x, y, ...) {
expect_equal(as.data.frame(x), y, ...)
# expect both objects to contain equal values when converted to data.frame objects
expect_equal_data_frame <- function(x, y, ...) {
expect_equal(as.data.frame(x), as.data.frame(y), ...)
}

expect_r6_class <- function(object, class) {
Expand Down
83 changes: 41 additions & 42 deletions r/tests/testthat/test-RecordBatch.R
Expand Up @@ -89,7 +89,7 @@ test_that("RecordBatch", {
schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8()))
)
expect_equal(batch2$column(0), batch$column(1))
expect_data_frame(batch2, tbl[, -1])
expect_equal_data_frame(batch2, tbl[, -1])

# input validation
expect_error(batch$RemoveColumn(NA), "'i' cannot be NA")
Expand All @@ -109,10 +109,10 @@ test_that("RecordBatch S3 methods", {

test_that("RecordBatch$Slice", {
batch3 <- batch$Slice(5)
expect_data_frame(batch3, tbl[6:10, ])
expect_equal_data_frame(batch3, tbl[6:10, ])

batch4 <- batch$Slice(5, 2)
expect_data_frame(batch4, tbl[6:7, ])
expect_equal_data_frame(batch4, tbl[6:7, ])

# Input validation
expect_error(batch$Slice("ten"))
Expand All @@ -131,20 +131,20 @@ test_that("RecordBatch$Slice", {
})

test_that("[ on RecordBatch", {
expect_data_frame(batch[6:7, ], tbl[6:7, ])
expect_data_frame(batch[c(6, 7), ], tbl[6:7, ])
expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4])
expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)])
expect_equal_data_frame(batch[6:7, ], tbl[6:7, ])
expect_equal_data_frame(batch[c(6, 7), ], tbl[6:7, ])
expect_equal_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4])
expect_equal_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)])
expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr)
expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4])
expect_data_frame(
expect_equal_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4])
expect_equal_data_frame(
batch[rep(c(FALSE, TRUE), 5), ],
tbl[c(2, 4, 6, 8, 10), ]
)
# bool Array
expect_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ])
expect_equal_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ])
# int Array
expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4])
expect_equal_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4])

# input validation
expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"')
Expand Down Expand Up @@ -176,15 +176,15 @@ test_that("[[<- assignment", {

# can remove a column
batch[["chr"]] <- NULL
expect_data_frame(batch, tbl[-4])
expect_equal_data_frame(batch, tbl[-4])

# can remove a column by index
batch[[4]] <- NULL
expect_data_frame(batch, tbl[1:3])
expect_equal_data_frame(batch, tbl[1:3])

# can add a named column
batch[["new"]] <- letters[10:1]
expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1]))
expect_equal_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1]))

# can replace a column by index
batch[[2]] <- as.numeric(10:1)
Expand Down Expand Up @@ -239,16 +239,16 @@ test_that("head and tail on RecordBatch", {
fct = factor(letters[1:10])
)
batch <- RecordBatch$create(tbl)
expect_data_frame(head(batch), head(tbl))
expect_data_frame(head(batch, 4), head(tbl, 4))
expect_data_frame(head(batch, 40), head(tbl, 40))
expect_data_frame(head(batch, -4), head(tbl, -4))
expect_data_frame(head(batch, -40), head(tbl, -40))
expect_data_frame(tail(batch), tail(tbl))
expect_data_frame(tail(batch, 4), tail(tbl, 4))
expect_data_frame(tail(batch, 40), tail(tbl, 40))
expect_data_frame(tail(batch, -4), tail(tbl, -4))
expect_data_frame(tail(batch, -40), tail(tbl, -40))
expect_equal_data_frame(head(batch), head(tbl))
expect_equal_data_frame(head(batch, 4), head(tbl, 4))
expect_equal_data_frame(head(batch, 40), head(tbl, 40))
expect_equal_data_frame(head(batch, -4), head(tbl, -4))
expect_equal_data_frame(head(batch, -40), head(tbl, -40))
expect_equal_data_frame(tail(batch), tail(tbl))
expect_equal_data_frame(tail(batch, 4), tail(tbl, 4))
expect_equal_data_frame(tail(batch, 40), tail(tbl, 40))
expect_equal_data_frame(tail(batch, -4), tail(tbl, -4))
expect_equal_data_frame(tail(batch, -40), tail(tbl, -40))
})

test_that("RecordBatch print method", {
Expand Down Expand Up @@ -346,17 +346,17 @@ test_that("record_batch() handles data frame columns", {
b = struct(x = int32(), y = int32())
)
)
out <- as.data.frame(batch)
expect_equal(out, tibble::tibble(a = 1:10, b = tib))

expect_equal_data_frame(batch, tibble::tibble(a = 1:10, b = tib))

# if not named, columns from tib are auto spliced
batch2 <- record_batch(a = 1:10, tib)
expect_equal(
batch2$schema,
schema(a = int32(), x = int32(), y = int32())
)
out <- as.data.frame(batch2)
expect_equal(out, tibble::tibble(a = 1:10, !!!tib))

expect_equal_data_frame(batch2, tibble::tibble(a = 1:10, !!!tib))
})

test_that("record_batch() handles data frame columns with schema spec", {
Expand All @@ -366,8 +366,7 @@ test_that("record_batch() handles data frame columns with schema spec", {
schema <- schema(a = int32(), b = struct(x = int16(), y = float64()))
batch <- record_batch(a = 1:10, b = tib, schema = schema)
expect_equal(batch$schema, schema)
out <- as.data.frame(batch)
expect_equal(out, tibble::tibble(a = 1:10, b = tib_float))
expect_equal_data_frame(batch, tibble::tibble(a = 1:10, b = tib_float))

schema <- schema(a = int32(), b = struct(x = int16(), y = utf8()))
expect_error(record_batch(a = 1:10, b = tib, schema = schema))
Expand All @@ -379,32 +378,32 @@ test_that("record_batch() auto splices (ARROW-5718)", {
batch2 <- record_batch(!!!df)
expect_equal(batch1, batch2)
expect_equal(batch1$schema, schema(x = int32(), y = utf8()))
expect_data_frame(batch1, df)
expect_equal_data_frame(batch1, df)

batch3 <- record_batch(df, z = 1:10)
batch4 <- record_batch(!!!df, z = 1:10)
expect_equal(batch3, batch4)
expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32()))
expect_equal(
as.data.frame(batch3),
tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
expect_equal_data_frame(
batch3,
cbind(df, data.frame(z = 1:10))
)

s <- schema(x = float64(), y = utf8())
batch5 <- record_batch(df, schema = s)
batch6 <- record_batch(!!!df, schema = s)
expect_equal(batch5, batch6)
expect_equal(batch5$schema, s)
expect_equal(as.data.frame(batch5), df)
expect_equal_data_frame(batch5, df)

s2 <- schema(x = float64(), y = utf8(), z = int16())
batch7 <- record_batch(df, z = 1:10, schema = s2)
batch8 <- record_batch(!!!df, z = 1:10, schema = s2)
expect_equal(batch7, batch8)
expect_equal(batch7$schema, s2)
expect_equal(
as.data.frame(batch7),
tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
expect_equal_data_frame(
batch7,
cbind(df, data.frame(z = 1:10))
)
})

Expand All @@ -425,24 +424,24 @@ test_that("record_batch() handles null type (ARROW-7064)", {
})

test_that("record_batch() scalar recycling with vectors", {
expect_data_frame(
expect_equal_data_frame(
record_batch(a = 1:10, b = 5),
tibble::tibble(a = 1:10, b = 5)
)
})

test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", {
expect_data_frame(
expect_equal_data_frame(
record_batch(a = Array$create(1:10), b = Scalar$create(5)),
tibble::tibble(a = 1:10, b = 5)
)

expect_data_frame(
expect_equal_data_frame(
record_batch(a = Array$create(1:10), b = Array$create(5)),
tibble::tibble(a = 1:10, b = 5)
)

expect_data_frame(
expect_equal_data_frame(
record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)),
tibble::tibble(a = 1:10, b = 5)
)
Expand Down

0 comments on commit 205ceb9

Please sign in to comment.