Skip to content

Commit

Permalink
ARROW-8118: [R] dim method for FileSystemDataset
Browse files Browse the repository at this point in the history
- Adds `dim` methods for both `Dataset` and `arrow_dplyr_query` classes.
- Add unit tests for both methods.

Closes #6635 from boshek/dim_method

Lead-authored-by: Sam Albers <sam.albers@gov.bc.ca>
Co-authored-by: Sam Albers <sam.albers@gmail.com>
Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
  • Loading branch information
3 people authored and bkietz committed Mar 19, 2020
1 parent eb0dd86 commit 91603d8
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 0 deletions.
2 changes: 2 additions & 0 deletions r/NAMESPACE
Expand Up @@ -21,8 +21,10 @@ S3method(as.raw,Buffer)
S3method(as.vector,Array)
S3method(as.vector,ChunkedArray)
S3method(as.vector,array_expression)
S3method(dim,Dataset)
S3method(dim,RecordBatch)
S3method(dim,Table)
S3method(dim,arrow_dplyr_query)
S3method(head,Array)
S3method(head,ChunkedArray)
S3method(head,RecordBatch)
Expand Down
4 changes: 4 additions & 0 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions r/R/dataset.R
Expand Up @@ -126,6 +126,11 @@ Dataset <- R6Class("Dataset", inherit = ArrowObject,
#' Return the Dataset's `Schema`
schema = function() {
  shared_ptr(Schema, dataset___Dataset__schema(self))
},
metadata = function() {
  # Metadata is read off the Dataset's schema
  self$schema$metadata
},
num_rows = function() {
  # This implementation does not compute a row count: it always warns and
  # yields a missing integer. FileSystemDataset supplies its own num_rows.
  unknown_msg <- "Number of rows unknown; returning NA"
  warning(unknown_msg, call. = FALSE)
  NA_integer_
},
num_cols = function() {
  # Column count is the length of the Dataset's schema
  length(self$schema)
},
#' @description
#' Return the Dataset's type.
type = function() {
  dataset___Dataset__type_name(self)
}
Expand All @@ -141,6 +146,9 @@ Dataset$create <- function(children, schema) {
#' @export
names.Dataset <- function(x) {
  # Column names of a Dataset are the names of its schema
  dataset_schema <- x$schema
  names(dataset_schema)
}

#' @export
dim.Dataset <- function(x) {
  # Dimensions are reported as c(rows, columns), read from the object's
  # num_rows and num_cols members (num_rows may be NA and may warn).
  n_rows <- x$num_rows
  n_cols <- x$num_cols
  c(n_rows, n_cols)
}

#' @name FileSystemDataset
#' @rdname Dataset
#' @export
Expand All @@ -153,6 +161,15 @@ FileSystemDataset <- R6Class("FileSystemDataset", inherit = Dataset,
#' Return the format of files in this `Dataset`
format = function() {
  # Wrap the pointer as a FileFormat, then let ..dispatch() produce the
  # object that is returned to the caller.
  file_format <- shared_ptr(FileFormat, dataset___FileSystemDataset__format(self))
  file_format$..dispatch()
},
num_rows = function() {
  # For Parquet-format datasets, add up the row count reported by a
  # ParquetFileReader created for each entry in self$files.
  if (inherits(self$format, "ParquetFileFormat")) {
    rows_per_file <- map_int(
      self$files,
      function(path) ParquetFileReader$create(path)$num_rows
    )
    return(sum(rows_per_file))
  }
  # TODO: implement for other file formats
  warning("Number of rows unknown; returning NA", call. = FALSE)
  NA_integer_
}
)
)
Expand Down
17 changes: 17 additions & 0 deletions r/R/dplyr.R
Expand Up @@ -72,6 +72,23 @@ print.arrow_dplyr_query <- function(x, ...) {
#' @export
names.arrow_dplyr_query <- function(x) {
  # A query's names are the names of its selected_columns element
  selected <- x$selected_columns
  names(selected)
}

#' @export
dim.arrow_dplyr_query <- function(x) {
  # Returns c(rows, columns) for a query object.
  # The row count is taken from the underlying data only when the query's
  # `filtered` element is exactly TRUE; otherwise a warning is raised and
  # the row count is NA.
  # NOTE(review): `$` on a list partially matches element names -- confirm
  # that `filtered` is the full name of the field being read here.
  rows_known <- isTRUE(x$filtered)
  if (!rows_known) {
    warning(
      "For arrow dplyr queries that call filter(), ",
      "dim() returns NA for the number of rows.\n",
      "Call collect() to pull data into R to access the number of rows.",
      call. = FALSE
    )
  }
  n_rows <- if (rows_known) x$.data$num_rows else NA_integer_
  # The column count follows names(x)
  n_cols <- length(names(x))
  c(n_rows, n_cols)
}

# The following S3 methods are registered on load if dplyr is present
select.arrow_dplyr_query <- function(.data, ...) {
column_select(arrow_dplyr_query(.data), !!!enquos(...))
Expand Down
5 changes: 5 additions & 0 deletions r/R/parquet.R
Expand Up @@ -443,6 +443,11 @@ ParquetFileWriter$create <- function(
#' @include arrow-package.R
ParquetFileReader <- R6Class("ParquetFileReader",
inherit = ArrowObject,
active = list(
num_rows = function() {
  # Row count from the exported C++ helper, coerced to an R integer.
  # NOTE(review): as.integer() yields NA (with a warning) for values outside
  # R's integer range -- confirm that is acceptable for very large files.
  n_rows <- parquet___arrow___FileReader__num_rows(self)
  as.integer(n_rows)
}
),
public = list(
ReadTable = function(col_select = NULL) {
col_select <- enquo(col_select)
Expand Down
16 changes: 16 additions & 0 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions r/src/parquet.cpp
Expand Up @@ -82,6 +82,12 @@ std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable2(
return table;
}

// [[arrow::export]]
int64_t parquet___arrow___FileReader__num_rows(
    const std::unique_ptr<parquet::arrow::FileReader>& reader) {
  // The row count is read from the metadata object exposed by the
  // underlying parquet reader.
  const auto file_metadata = reader->parquet_reader()->metadata();
  return file_metadata->num_rows();
}

// [[arrow::export]]
std::shared_ptr<parquet::ArrowWriterProperties::Builder>
parquet___ArrowWriterProperties___Builder__create() {
Expand Down
24 changes: 24 additions & 0 deletions r/tests/testthat/test-dataset.R
Expand Up @@ -98,6 +98,26 @@ test_that("Simple interface for datasets", {
)
})

test_that("dim method returns the correct number of rows and columns", {
  # Open the dataset fixture with an explicit partitioning schema
  part_schema <- schema(part = uint8())
  ds <- open_dataset(dataset_dir, partitioning = part_schema)

  expected_dim <- c(20L, 7L)
  expect_identical(dim(ds), expected_dim)
})


test_that("dim() correctly determine numbers of rows and columns on arrow_dplyr_query object", {
  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))

  # Pin the specific warning raised by dim() on a filtered query, so that an
  # unrelated warning cannot make these expectations pass by accident.
  na_rows_warning <- "returns NA for the number of rows"

  # filter() only: row count is unknown, every column is kept
  expect_warning(
    dim_fil <- dim(filter(ds, chr == "A")),
    na_rows_warning
  )
  expect_identical(dim_fil, c(NA_integer_, 7L))

  # select() only: row count is known, column count follows the selection
  dim_sel <- dim(select(ds, chr, fct))
  expect_identical(dim_sel, c(20L, 2L))

  # select() then filter(): row count is unknown, two columns selected
  expect_warning(
    dim_sel_fil <- dim(select(ds, chr, fct) %>% filter(chr == "A")),
    na_rows_warning
  )
  expect_identical(dim_sel_fil, c(NA_integer_, 2L))
})

test_that("Simple interface for datasets (custom ParquetFileFormat)", {
ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()),
format = FileFormat$create("parquet", dict_columns = c("chr")))
Expand Down Expand Up @@ -150,6 +170,10 @@ test_that("Partitioning inference", {
test_that("IPC/Arrow format data", {
ds <- open_dataset(ipc_dir, partitioning = "part", format = "arrow")
expect_identical(names(ds), c(names(df1), "part"))
expect_warning(
dim(ds),
"Number of rows unknown; returning NA"
)
expect_equivalent(
ds %>%
select(string = chr, integer = int, part) %>%
Expand Down

0 comments on commit 91603d8

Please sign in to comment.