-
Notifications
You must be signed in to change notification settings - Fork 4.1k
Description
After ARROW-15168 we will use ExtensionType in more cases to handle R vector types for which we don't natively implement a conversion. However, roundtripping a Table through pyarrow (via reticulate) results in a Table with a slightly inconsistent state, where the type of the ChunkedArray doesn't line up with the type in the schema:
# remotes::install_github("apache/arrow/r")
library(arrow, warn.conflicts = FALSE)
pa <- reticulate::import("pyarrow", convert = FALSE)
table <- arrow_table(
ext_col = chunked_array(vctrs_extension_array(1:10))
)
table$ext_col$type
#> VctrsExtensionType
#> integer(0)
table$schema$ext_col$type
#> VctrsExtensionType
#> integer(0)
table_py <- pa$Table$from_arrays(table$columns, schema = table$schema)
table_py$column("ext_col")$type
#> int32
table_py$schema$field("ext_col")$type
#> int32
cols <- reticulate::py_to_r(table_py$columns)
names(cols) <- reticulate::py_to_r(table_py$column_names)
table2 <- Table$create(!!! cols, schema = table$schema)
table2$ext_col$type
#> Int32
#> int32
table2$schema$ext_col$type
#> VctrsExtensionType
#> integer(0)

The workaround in ARROW-15168 is to go through RecordBatchReader, which is probably fine but in some cases might result in ChunkedArray columns getting re-chunked to the intersection of the chunk boundaries of all columns. This doesn't copy any data, but isn't ideal (we should be able to roundtrip column-wise and avoid any re-chunking).
# remotes::install_github("apache/arrow/r#12817")
library(arrow, warn.conflicts = FALSE)
table <- arrow_table(
c1 = chunked_array(1:2, 3:4, 5:6),
c2 = chunked_array(1:6)
)
table$c1
#> ChunkedArray
#> [
#> [
#> 1,
#> 2
#> ],
#> [
#> 3,
#> 4
#> ],
#> [
#> 5,
#> 6
#> ]
#> ]
table$c2
#> ChunkedArray
#> [
#> [
#> 1,
#> 2,
#> 3,
#> 4,
#> 5,
#> 6
#> ]
#> ]
rbr <- as_record_batch_reader(table)
table2 <- rbr$read_table()
table2$c1
#> ChunkedArray
#> [
#> [
#> 1,
#> 2
#> ],
#> [
#> 3,
#> 4
#> ],
#> [
#> 5,
#> 6
#> ]
#> ]
table2$c2
#> ChunkedArray
#> [
#> [
#> 1,
#> 2
#> ],
#> [
#> 3,
#> 4
#> ],
#> [
#> 5,
#> 6
#> ]
#> ]

Reporter: Dewey Dunnington / @paleolimbot
Note: This issue was originally created as ARROW-16269. Please see the migration documentation for further details.