ARROW-8118: [R] dim method for FileSystemDataset

- Add `dim` methods for both the `Dataset` and `arrow_dplyr_query` classes.
- Add unit tests for both methods.

Closes #6635 from boshek/dim_method

Lead-authored-by: Sam Albers <sam.albers@gov.bc.ca>
Co-authored-by: Sam Albers <sam.albers@gmail.com>
Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
apache · Mar 19, 2020 · 91603d8 · 91603d8
1 parent eb0dd86
commit 91603d8
Show file tree

Hide file tree

Showing 8 changed files with 91 additions and 0 deletions.
diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -21,8 +21,10 @@ S3method(as.raw,Buffer)
 S3method(as.vector,Array)
 S3method(as.vector,ChunkedArray)
 S3method(as.vector,array_expression)
+S3method(dim,Dataset)
 S3method(dim,RecordBatch)
 S3method(dim,Table)
+S3method(dim,arrow_dplyr_query)
 S3method(head,Array)
 S3method(head,ChunkedArray)
 S3method(head,RecordBatch)

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
diff --git a/r/R/dataset.R b/r/R/dataset.R
@@ -126,6 +126,11 @@ Dataset <- R6Class("Dataset", inherit = ArrowObject,
     #' Return the Dataset's `Schema`
     schema = function() shared_ptr(Schema, dataset___Dataset__schema(self)),
     metadata = function() self$schema$metadata,
+    num_rows = function() {
+      warning("Number of rows unknown; returning NA", call. = FALSE)
+      NA_integer_
+    },
+    num_cols = function() length(self$schema),
     #' @description
     #' Return the Dataset's type.
     type = function() dataset___Dataset__type_name(self)
@@ -141,6 +146,9 @@ Dataset$create <- function(children, schema) {
 #' @export
 names.Dataset <- function(x) {
   # Column names come straight from the Dataset's schema.
   dataset_schema <- x$schema
   names(dataset_schema)
 }
 
+#' @export
+dim.Dataset <- function(x) {
+  # Result is ordered rows first, then columns; both values are read from
+  # the object's own `num_rows` / `num_cols` fields.
+  n_rows <- x$num_rows
+  n_cols <- x$num_cols
+  c(n_rows, n_cols)
+}
+
 #' @name FileSystemDataset
 #' @rdname Dataset
 #' @export
@@ -153,6 +161,15 @@ FileSystemDataset <- R6Class("FileSystemDataset", inherit = Dataset,
     #' Return the format of files in this `Dataset`
     format = function() {
       shared_ptr(FileFormat, dataset___FileSystemDataset__format(self))$..dispatch()
+    },
+    num_rows = function() {
+      if (!inherits(self$format, "ParquetFileFormat")) {
+        # TODO: implement for other file formats
+        warning("Number of rows unknown; returning NA", call. = FALSE)
+        NA_integer_
+      } else {
+        sum(map_int(self$files, ~ParquetFileReader$create(.x)$num_rows))
+      }
     }
   )
 )

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
@@ -72,6 +72,23 @@ print.arrow_dplyr_query <- function(x, ...) {
 #' @export
 names.arrow_dplyr_query <- function(x) {
   # Report the names of the query's `selected_columns` element.
   selected <- x$selected_columns
   names(selected)
 }
 
+#' @export
+dim.arrow_dplyr_query <- function(x) {
+  # NOTE(review): assuming `x` is a list, `$` partial-matches element names,
+  # so `x$filtered` may be resolving to a longer element name -- confirm and
+  # spell out the full name if so.
+  rows_known <- isTRUE(x$filtered)
+  if (!rows_known) {
+    warning(
+      "For arrow dplyr queries that call filter(), ",
+      "dim() returns NA for the number of rows.\n",
+      "Call collect() to pull data into R to access the number of rows.",
+      call. = FALSE
+    )
+  }
+  # The underlying data's row count is only reported when the flag is TRUE;
+  # otherwise the row dimension is NA.
+  n_rows <- if (rows_known) x$.data$num_rows else NA_integer_
+  # Column count is the number of names the query exposes.
+  n_cols <- length(names(x))
+  c(n_rows, n_cols)
+}
+
 # The following S3 methods are registered on load if dplyr is present
 select.arrow_dplyr_query <- function(.data, ...) {
   column_select(arrow_dplyr_query(.data), !!!enquos(...))

diff --git a/r/R/parquet.R b/r/R/parquet.R
@@ -443,6 +443,11 @@ ParquetFileWriter$create <- function(
 #' @include arrow-package.R
 ParquetFileReader <- R6Class("ParquetFileReader",
   inherit = ArrowObject,
+  active = list(
+    num_rows = function() {
+      as.integer(parquet___arrow___FileReader__num_rows(self))
+    }
+  ),
   public = list(
     ReadTable = function(col_select = NULL) {
       col_select <- enquo(col_select)

diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp
@@ -82,6 +82,12 @@ std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable2(
   return table;
 }
 
+// [[arrow::export]]
+int64_t parquet___arrow___FileReader__num_rows(
+    const std::unique_ptr<parquet::arrow::FileReader>& reader) {
+  // Walk from the Arrow-level reader to the underlying Parquet reader's file
+  // metadata and report the row count recorded there.
+  const auto file_metadata = reader->parquet_reader()->metadata();
+  return file_metadata->num_rows();
+}
+
 // [[arrow::export]]
 std::shared_ptr<parquet::ArrowWriterProperties::Builder>
 parquet___ArrowWriterProperties___Builder__create() {

diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
@@ -98,6 +98,26 @@ test_that("Simple interface for datasets", {
   )
 })
 
+test_that("dim method returns the correct number of rows and columns", {
+  # dim() on the opened dataset should yield integer (rows, columns).
+  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+  expected_dim <- c(20L, 7L)
+  expect_identical(dim(ds), expected_dim)
+})
+
+
+test_that("dim() correctly determine numbers of rows and columns on arrow_dplyr_query object", {
+  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+  # Match the specific filter() warning so that an unrelated warning cannot
+  # satisfy the expectation.
+  filter_warning <- "dim() returns NA for the number of rows"
+
+  # Filtered query: row count is NA (with a warning), columns unchanged.
+  expect_warning(
+    dim_fil <- dim(filter(ds, chr == "A")),
+    filter_warning,
+    fixed = TRUE
+  )
+  expect_identical(dim_fil, c(NA_integer_, 7L))
+
+  # Select only: row count is still available, columns reduced.
+  dim_sel <- dim(select(ds, chr, fct))
+  expect_identical(dim_sel, c(20L, 2L))
+
+  # Select followed by filter: NA rows (with a warning), reduced columns.
+  expect_warning(
+    dim_sel_fil <- dim(select(ds, chr, fct) %>% filter(chr == "A")),
+    filter_warning,
+    fixed = TRUE
+  )
+  expect_identical(dim_sel_fil, c(NA_integer_, 2L))
+})
+
 test_that("Simple interface for datasets (custom ParquetFileFormat)", {
   ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()),
                      format = FileFormat$create("parquet", dict_columns = c("chr")))
@@ -150,6 +170,10 @@ test_that("Partitioning inference", {
 test_that("IPC/Arrow format data", {
   ds <- open_dataset(ipc_dir, partitioning = "part", format = "arrow")
   expect_identical(names(ds), c(names(df1), "part"))
+  expect_warning(
+    dim(ds),
+    "Number of rows unknown; returning NA"
+  )
   expect_equivalent(
     ds %>%
       select(string = chr, integer = int, part) %>%