ARROW-7962: [R][Dataset] Followup to "Consolidate Source and Dataset …

…classes" This commit was pushed to #6470 along with my "approval" review, but a force push overwrote it. cc @bkietz Closes #6502 from nealrichardson/dataset-source-followup and squashes the following commits: c1c5e75 <Neal Richardson> Some R edits Authored-by: Neal Richardson <neal.p.richardson@gmail.com> Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
apache · Feb 28, 2020 · 5096803 · 5096803
1 parent 5e3255b
commit 5096803
Show file tree

Hide file tree

Showing 9 changed files with 93 additions and 82 deletions.
diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -92,6 +92,7 @@ export(FileOutputStream)
 export(FileSelector)
 export(FileStats)
 export(FileSystem)
+export(FileSystemDataset)
 export(FileSystemDatasetFactory)
 export(FileType)
 export(FixedSizeBufferWriter)
@@ -133,8 +134,8 @@ export(StructArray)
 export(SubTreeFileSystem)
 export(Table)
 export(TimeUnit)
-export(UnionDatasetFactory)
 export(Type)
+export(UnionDataset)
 export(arrow_available)
 export(bool)
 export(boolean)
@@ -143,6 +144,7 @@ export(cast_options)
 export(chunked_array)
 export(codec_is_available)
 export(contains)
+export(dataset_factory)
 export(date32)
 export(date64)
 export(decimal)
@@ -171,7 +173,6 @@ export(null)
 export(num_range)
 export(one_of)
 export(open_dataset)
-export(open_dataset_factory)
 export(read_arrow)
 export(read_csv_arrow)
 export(read_delim_arrow)

diff --git a/r/R/dataset.R b/r/R/dataset.R
@@ -24,7 +24,7 @@
 #' `Dataset`, then use `dplyr` methods to query it.
 #'
 #' @param sources Either a string path to a directory containing data files,
-#' or a list of `DatasetFactory` objects as created by [open_dataset_factory()].
+#' or a list of `DatasetFactory` objects as created by [dataset_factory()].
 #' @param schema [Schema] for the dataset. If `NULL` (the default), the schema
 #' will be inferred from the data sources.
 #' @param partitioning When `sources` is a file path, one of
@@ -39,19 +39,15 @@
 #'    by [hive_partition()] which parses explicit or autodetected fields from
 #'    Hive-style path segments
 #'   * `NULL` for no partitioning
-#' @param ... additional arguments passed to `open_dataset_factory()` when
+#' @param ... additional arguments passed to `dataset_factory()` when
 #' `sources` is a file path, otherwise ignored.
 #' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data,
 #' or call [`$NewScan()`][Scanner] to construct a query directly.
 #' @export
 #' @seealso `vignette("dataset", package = "arrow")`
 #' @include arrow-package.R
 open_dataset <- function(sources, schema = NULL, partitioning = hive_partition(), ...) {
-  if (is.character(sources)) {
-    factory <- open_dataset_factory(sources, partitioning = partitioning, ...)
-  } else {
-    factory <- open_dataset_factory(children = sources)
-  }
+  factory <- DatasetFactory$create(sources, partitioning = partitioning, ...)
   factory$Finish(schema)
 }
 
@@ -79,7 +75,7 @@ open_dataset <- function(sources, schema = NULL, partitioning = hive_partition()
 #' discovering files in the local file system, the only currently supported
 #' file system.
 #'
-#' For the `DatasetFactory$create()` factory method, see [open_dataset_factory()], an
+#' For the `DatasetFactory$create()` factory method, see [dataset_factory()], an
 #' alias for it. A `DatasetFactory` has:
 #'
 #' - `$Inspect()`: Returns a common [Schema] for all data discovered by the factory.
@@ -92,12 +88,6 @@ open_dataset <- function(sources, schema = NULL, partitioning = hive_partition()
 #' * `format`: A string identifier of the format of the files in `path`.
 #'   Currently supported options are "parquet", "arrow", and "ipc" (an alias for
 #'   the Arrow file format)
-#'
-#' `UnionDatasetFactory$create()` can be used to unify child `DatasetFactory`s into
-#' a single `DatasetFactory`. Use it when (for example) your data is in multiple
-#' file systems or formats.
-#' * `children`: child `DatasetFactory`s to be unified
-#'
 #' @section Methods:
 #'
 #' A `Dataset` has the following methods:
@@ -197,22 +187,21 @@ DatasetFactory <- R6Class("DatasetFactory", inherit = Object,
     Inspect = function() shared_ptr(Schema, dataset___DatasetFactory__Inspect(self))
   )
 )
-DatasetFactory$create <- function(path,
-                                  children = NULL,
+DatasetFactory$create <- function(x,
                                   filesystem = c("auto", "local"),
                                   format = c("parquet", "arrow", "ipc"),
                                   partitioning = NULL,
                                   allow_non_existent = FALSE,
                                   recursive = TRUE,
                                   ...) {
-  if (!is.null(children)) {
-    return(shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(children)))
+  if (is.list(x) && all(map_lgl(x, ~inherits(., "DatasetFactory")))) {
+    return(shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(x)))
   }
 
   if (!inherits(filesystem, "FileSystem")) {
     filesystem <- match.arg(filesystem)
     if (filesystem == "auto") {
-      # When there are other FileSystems supported, detect e.g. S3 from path
+      # When there are other FileSystems supported, detect e.g. S3 from x
       filesystem <- "local"
     }
     filesystem <- list(
@@ -221,7 +210,7 @@ DatasetFactory$create <- function(path,
     )[[filesystem]]$create(...)
   }
   selector <- FileSelector$create(
-    path,
+    x,
     allow_non_existent = allow_non_existent,
     recursive = recursive
   )
@@ -252,16 +241,16 @@ DatasetFactory$create <- function(path,
 #'
 #' If you would only have a single `DatasetFactory` (for example, you have a
 #' single directory containing Parquet files), you can call `open_dataset()`
-#' directly. Use `open_dataset_factory()` when you
+#' directly. Use `dataset_factory()` when you
 #' want to combine different directories, file systems, or file formats.
 #'
-#' @param path A string file path containing data files
-#' @param children A list of `DatasetFactory` objects whose datasets should be
-#' unified. If this argument is specified it will be used to construct a
+#' @param x A string file x containing data files, or
+#' a list of `DatasetFactory` objects whose datasets should be
+#' grouped. If this argument is specified it will be used to construct a
 #' `UnionDatasetFactory` and other arguments will be ignored.
 #' @param filesystem A string identifier for the filesystem corresponding to
-#' `path`. Currently only "local" is supported.
-#' @param format A string identifier of the format of the files in `path`.
+#' `x`. Currently only "local" is supported.
+#' @param format A string identifier of the format of the files in `x`.
 #' Currently supported options are "parquet", "arrow", and "ipc" (an alias for
 #' the Arrow file format)
 #' @param partitioning One of
@@ -276,26 +265,16 @@ DatasetFactory$create <- function(path,
 #'    by [hive_partition()] which parses explicit or autodetected fields from
 #'    Hive-style path segments
 #'   * `NULL` for no partitioning
-#' @param allow_non_existent logical: is `path` allowed to not exist? Default
+#' @param allow_non_existent logical: is `x` allowed to not exist? Default
 #' `FALSE`. See [FileSelector].
 #' @param recursive logical: should files be discovered in subdirectories of
-#' `path`? Default `TRUE`.
+#' `x`? Default `TRUE`.
 #' @param ... Additional arguments passed to the [FileSystem] `$create()` method
 #' @return A `DatasetFactory` object. Pass this to [open_dataset()],
 #' in a list potentially with other `DatasetFactory` objects, to create
 #' a `Dataset`.
 #' @export
-open_dataset_factory <- DatasetFactory$create
-
-#' @usage NULL
-#' @format NULL
-#' @rdname Dataset
-#' @export
-UnionDatasetFactory <- R6Class("UnionDatasetFactory", inherit = DatasetFactory)
-UnionDatasetFactory$create <- function(children) {
-  assert_is_list_of(children, "DatasetFactory")
-  shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(children))
-}
+dataset_factory <- DatasetFactory$create
 
 #' @usage NULL
 #' @format NULL
@@ -305,9 +284,9 @@ FileSystemDatasetFactory <- R6Class("FileSystemDatasetFactory",
   inherit = DatasetFactory
 )
 FileSystemDatasetFactory$create <- function(filesystem,
-                                           selector,
-                                           format,
-                                           partitioning = NULL) {
+                                            selector,
+                                            format,
+                                            partitioning = NULL) {
   assert_is(filesystem, "FileSystem")
   assert_is(selector, "FileSelector")
   assert_is(format, "FileFormat")

diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
@@ -61,7 +61,7 @@ reference:
 - title: Multi-file datasets
   contents:
   - open_dataset
-  - open_dataset_factory
+  - dataset_factory
   - hive_partition
   - Dataset
   - Partitioning

diff --git a/r/man/Dataset.Rd b/r/man/Dataset.Rd
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
diff --git a/r/man/Scanner.Rd b/r/man/Scanner.Rd
diff --git a/r/man/open_dataset_factory.Rd → r/man/dataset_factory.Rd b/r/man/open_dataset_factory.Rd → r/man/dataset_factory.Rd
diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd