Skip to content

Commit

Permalink
ARROW-7962: [R][Dataset] Followup to "Consolidate Source and Dataset …
Browse files Browse the repository at this point in the history
…classes"

This commit was pushed to #6470 along with my "approval" review, but a force push overwrote it.

cc @bkietz

Closes #6502 from nealrichardson/dataset-source-followup and squashes the following commits:

c1c5e75 <Neal Richardson> Some R edits

Authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
  • Loading branch information
nealrichardson committed Feb 28, 2020
1 parent 5e3255b commit 5096803
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 82 deletions.
5 changes: 3 additions & 2 deletions r/NAMESPACE
Expand Up @@ -92,6 +92,7 @@ export(FileOutputStream)
export(FileSelector)
export(FileStats)
export(FileSystem)
export(FileSystemDataset)
export(FileSystemDatasetFactory)
export(FileType)
export(FixedSizeBufferWriter)
Expand Down Expand Up @@ -133,8 +134,8 @@ export(StructArray)
export(SubTreeFileSystem)
export(Table)
export(TimeUnit)
export(UnionDatasetFactory)
export(Type)
export(UnionDataset)
export(arrow_available)
export(bool)
export(boolean)
Expand All @@ -143,6 +144,7 @@ export(cast_options)
export(chunked_array)
export(codec_is_available)
export(contains)
export(dataset_factory)
export(date32)
export(date64)
export(decimal)
Expand Down Expand Up @@ -171,7 +173,6 @@ export(null)
export(num_range)
export(one_of)
export(open_dataset)
export(open_dataset_factory)
export(read_arrow)
export(read_csv_arrow)
export(read_delim_arrow)
Expand Down
63 changes: 21 additions & 42 deletions r/R/dataset.R
Expand Up @@ -24,7 +24,7 @@
#' `Dataset`, then use `dplyr` methods to query it.
#'
#' @param sources Either a string path to a directory containing data files,
#' or a list of `DatasetFactory` objects as created by [open_dataset_factory()].
#' or a list of `DatasetFactory` objects as created by [dataset_factory()].
#' @param schema [Schema] for the dataset. If `NULL` (the default), the schema
#' will be inferred from the data sources.
#' @param partitioning When `sources` is a file path, one of
Expand All @@ -39,19 +39,15 @@
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
#' @param ... additional arguments passed to `open_dataset_factory()` when
#' @param ... additional arguments passed to `dataset_factory()` when
#' `sources` is a file path, otherwise ignored.
#' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data,
#' or call [`$NewScan()`][Scanner] to construct a query directly.
#' @export
#' @seealso `vignette("dataset", package = "arrow")`
#' @include arrow-package.R
open_dataset <- function(sources, schema = NULL, partitioning = hive_partition(), ...) {
if (is.character(sources)) {
factory <- open_dataset_factory(sources, partitioning = partitioning, ...)
} else {
factory <- open_dataset_factory(children = sources)
}
factory <- DatasetFactory$create(sources, partitioning = partitioning, ...)
factory$Finish(schema)
}

Expand Down Expand Up @@ -79,7 +75,7 @@ open_dataset <- function(sources, schema = NULL, partitioning = hive_partition()
#' discovering files in the local file system, the only currently supported
#' file system.
#'
#' For the `DatasetFactory$create()` factory method, see [open_dataset_factory()], an
#' For the `DatasetFactory$create()` factory method, see [dataset_factory()], an
#' alias for it. A `DatasetFactory` has:
#'
#' - `$Inspect()`: Returns a common [Schema] for all data discovered by the factory.
Expand All @@ -92,12 +88,6 @@ open_dataset <- function(sources, schema = NULL, partitioning = hive_partition()
#' * `format`: A string identifier of the format of the files in `path`.
#' Currently supported options are "parquet", "arrow", and "ipc" (an alias for
#' the Arrow file format)
#'
#' `UnionDatasetFactory$create()` can be used to unify child `DatasetFactory`s into
#' a single `DatasetFactory`. Use it when (for example) your data is in multiple
#' file systems or formats.
#' * `children`: child `DatasetFactory`s to be unified
#'
#' @section Methods:
#'
#' A `Dataset` has the following methods:
Expand Down Expand Up @@ -197,22 +187,21 @@ DatasetFactory <- R6Class("DatasetFactory", inherit = Object,
Inspect = function() shared_ptr(Schema, dataset___DatasetFactory__Inspect(self))
)
)
DatasetFactory$create <- function(path,
children = NULL,
DatasetFactory$create <- function(x,
filesystem = c("auto", "local"),
format = c("parquet", "arrow", "ipc"),
partitioning = NULL,
allow_non_existent = FALSE,
recursive = TRUE,
...) {
if (!is.null(children)) {
return(shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(children)))
if (is.list(x) && all(map_lgl(x, ~inherits(., "DatasetFactory")))) {
return(shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(x)))
}

if (!inherits(filesystem, "FileSystem")) {
filesystem <- match.arg(filesystem)
if (filesystem == "auto") {
# When there are other FileSystems supported, detect e.g. S3 from path
# When there are other FileSystems supported, detect e.g. S3 from x
filesystem <- "local"
}
filesystem <- list(
Expand All @@ -221,7 +210,7 @@ DatasetFactory$create <- function(path,
)[[filesystem]]$create(...)
}
selector <- FileSelector$create(
path,
x,
allow_non_existent = allow_non_existent,
recursive = recursive
)
Expand Down Expand Up @@ -252,16 +241,16 @@ DatasetFactory$create <- function(path,
#'
#' If you would only have a single `DatasetFactory` (for example, you have a
#' single directory containing Parquet files), you can call `open_dataset()`
#' directly. Use `open_dataset_factory()` when you
#' directly. Use `dataset_factory()` when you
#' want to combine different directories, file systems, or file formats.
#'
#' @param path A string file path containing data files
#' @param children A list of `DatasetFactory` objects whose datasets should be
#' unified. If this argument is specified it will be used to construct a
#' @param x A string file x containing data files, or
#' a list of `DatasetFactory` objects whose datasets should be
#' grouped. If this argument is specified it will be used to construct a
#' `UnionDatasetFactory` and other arguments will be ignored.
#' @param filesystem A string identifier for the filesystem corresponding to
#' `path`. Currently only "local" is supported.
#' @param format A string identifier of the format of the files in `path`.
#' `x`. Currently only "local" is supported.
#' @param format A string identifier of the format of the files in `x`.
#' Currently supported options are "parquet", "arrow", and "ipc" (an alias for
#' the Arrow file format)
#' @param partitioning One of
Expand All @@ -276,26 +265,16 @@ DatasetFactory$create <- function(path,
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
#' @param allow_non_existent logical: is `path` allowed to not exist? Default
#' @param allow_non_existent logical: is `x` allowed to not exist? Default
#' `FALSE`. See [FileSelector].
#' @param recursive logical: should files be discovered in subdirectories of
#' `path`? Default `TRUE`.
#' `x`? Default `TRUE`.
#' @param ... Additional arguments passed to the [FileSystem] `$create()` method
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
#' in a list potentially with other `DatasetFactory` objects, to create
#' a `Dataset`.
#' @export
open_dataset_factory <- DatasetFactory$create

#' @usage NULL
#' @format NULL
#' @rdname Dataset
#' @export
UnionDatasetFactory <- R6Class("UnionDatasetFactory", inherit = DatasetFactory)
UnionDatasetFactory$create <- function(children) {
assert_is_list_of(children, "DatasetFactory")
shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(children))
}
dataset_factory <- DatasetFactory$create

#' @usage NULL
#' @format NULL
Expand All @@ -305,9 +284,9 @@ FileSystemDatasetFactory <- R6Class("FileSystemDatasetFactory",
inherit = DatasetFactory
)
FileSystemDatasetFactory$create <- function(filesystem,
selector,
format,
partitioning = NULL) {
selector,
format,
partitioning = NULL) {
assert_is(filesystem, "FileSystem")
assert_is(selector, "FileSelector")
assert_is(format, "FileFormat")
Expand Down
2 changes: 1 addition & 1 deletion r/_pkgdown.yml
Expand Up @@ -61,7 +61,7 @@ reference:
- title: Multi-file datasets
contents:
- open_dataset
- open_dataset_factory
- dataset_factory
- hive_partition
- Dataset
- Partitioning
Expand Down
41 changes: 30 additions & 11 deletions r/man/Dataset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions r/man/FileFormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions r/man/Scanner.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 13 additions & 14 deletions r/man/open_dataset_factory.Rd → r/man/dataset_factory.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions r/man/open_dataset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 5096803

Please sign in to comment.