From 8e2fa2d9ac12f9c0dfb9715e8b1d9dcd0467125a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 25 Jun 2019 16:27:20 -0700 Subject: [PATCH 1/5] Some cleanup of pkgdown site prep and DESCRIPTION. Start on implementing readr::read_csv arguments --- r/DESCRIPTION | 26 ++++---- r/NEWS.md | 3 + r/R/csv.R | 143 ++++++++++++++++++++++++++++++---------- r/_pkgdown.yml | 13 +++- r/man/read_csv_arrow.Rd | 15 +++-- 5 files changed, 146 insertions(+), 54 deletions(-) create mode 100644 r/NEWS.md diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 45edda18a15d..d0009eced89e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -3,10 +3,10 @@ Title: Integration to 'Apache' 'Arrow' Version: 0.13.0.9000 Authors@R: c( person("Romain", "Fran\u00e7ois", email = "romain@rstudio.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2444-4226")), - person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), - person("Jeffrey", "Wong", email = "jeffreyw@netflix.com", role = c("ctb")), person("Jeroen", "Ooms", email = "jeroen@berkeley.edu", role = c("aut")), person("Neal", "Richardson", email = "neal@ursalabs.org", role = c("aut")), + person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), + person("Jeffrey", "Wong", email = "jeffreyw@netflix.com", role = c("ctb")), person("Apache Arrow", email = "dev@arrow.apache.org", role = c("aut", "cph")) ) Description: 'Apache' 'Arrow' is a cross-language @@ -24,27 +24,27 @@ SystemRequirements: C++11 LinkingTo: Rcpp (>= 1.0.1) Imports: - utils, - Rcpp (>= 1.0.1), - rlang, - purrr, assertthat, - R6, - fs, bit64, - tidyselect + fs, + purrr, + R6, + Rcpp (>= 1.0.1), + rlang, + tidyselect, + utils Roxygen: list(markdown = TRUE) RoxygenNote: 6.1.1 Suggests: - tibble, covr, + hms, + lubridate, pkgdown, rmarkdown, roxygen2, testthat, - lubridate, - vctrs, - hms + tibble, + vctrs Collate: 'enums.R' 'R6.R' diff --git a/r/NEWS.md b/r/NEWS.md new file mode 100644 index 000000000000..bbad0aefcd12 --- /dev/null +++ 
b/r/NEWS.md @@ -0,0 +1,3 @@ +# arrow 0.13.0.9000 + +Initial CRAN release of `arrow` package diff --git a/r/R/csv.R b/r/R/csv.R index 03a4b7de77dc..c494fa27ef98 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -15,6 +15,100 @@ # specific language governing permissions and limitations # under the License. +#' Read a CSV or other delimited file with Arrow +#' +#' Use arrow::csv::TableReader from [csv_table_reader()] +#' +#' @inheritParams csv_table_reader +#' +#' @param col_select [tidy selection specification][tidyselect::vars_select] of columns +#' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a data frame. +#' +#' @return +#' @export +read_csv_arrow <- function(file, + delim = ",", + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + col_names = TRUE, + # col_types = TRUE, + col_select = NULL, + # na = c("", "NA"), + # quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = csv_read_options(), + as_tibble = TRUE) { + + if (is.null(parse_options)) { + if (isTRUE(col_names)) { + # Add one row to skip, to match arrow's header_rows + skip <- skip + 1L + } + parse_options <- readr_to_csv_parse_options( + delim, + quote, + escape_double, + escape_backslash, + skip_empty_rows, + skip + ) + } + + if (is.null(convert_options)) { + # TODO: + # * na strings (needs wiring in csv_convert_options) + # * col_types (needs wiring in csv_convert_options). Note that we can't do + # col_types if col_names is strings because the column type specification + # requires a map of name: type, but the CSV reader doesn't handle user- + # provided names--they're renamed after the fact. 
+ convert_options <- csv_convert_options() + } + + reader <- csv_table_reader( + file, + read_options = read_options, + parse_options = parse_options, + convert_options = convert_options + ) + + tab <- reader$Read()$select(!!enquo(col_select)) + if (is.character(col_names)) { + # TODO: Rename `tab`'s columns + # See https://github.com/apache/arrow/pull/4557 + } + + if (isTRUE(as_tibble)) { + tab <- as.data.frame(tab) + } + + tab +} + +readr_to_csv_parse_options <- function(delim = ",", + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + skip_empty_rows = TRUE, + skip = 0L) { + # This function translates from the readr argument list to the arrow arg names + # TODO: validate inputs + csv_parse_options( + delimiter = delim, + quoting = nzchar(quote), + quote_char = quote, + double_quote = escape_double, + escaping = escape_backslash, + escape_char = '\\', + newlines_in_values = escape_backslash, + ignore_empty_lines = skip_empty_rows, + header_rows = skip + ) +} + #' @include R6.R `arrow::csv::TableReader` <- R6Class("arrow::csv::TableReader", inherit = `arrow::Object`, @@ -80,7 +174,22 @@ csv_parse_options <- function( #' @param check_utf8 Whether to check UTF8 validity of string columns #' #' @export -csv_convert_options <- function(check_utf8 = TRUE){ +csv_convert_options <- function(check_utf8 = TRUE) { + # TODO: there are more conversion options available: + # // Whether to check UTF8 validity of string columns + # bool check_utf8 = true; + # // Optional per-column types (disabling type inference on those columns) + # std::unordered_map<std::string, std::shared_ptr<DataType>> column_types; + # // Recognized spellings for null values + # std::vector<std::string> null_values; + # // Recognized spellings for boolean values + # std::vector<std::string> true_values; + # std::vector<std::string> false_values; + # // Whether string / binary columns can have null values. + # // If true, then strings in "null_values" are considered null for string columns. + # // If false, then all strings are valid string values. 
+ # bool strings_can_be_null = false; + shared_ptr(`arrow::csv::ConvertOptions`, csv___ConvertOptions__initialize( list( check_utf8 = check_utf8 @@ -167,35 +276,3 @@ csv_table_reader.default <- function(file, ){ file } - -#' Read csv file into an arrow::Table -#' -#' Use arrow::csv::TableReader from [csv_table_reader()] -#' -#' @inheritParams csv_table_reader -#' -#' @param col_select [tidy selection specification][tidyselect::vars_select] of columns -#' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a data frame. -#' -#' @export -read_csv_arrow <- function(file, - read_options = csv_read_options(), - parse_options = csv_parse_options(), - convert_options = csv_convert_options(), - col_select = NULL, - as_tibble = TRUE - ) -{ - reader <- csv_table_reader(file, - read_options = read_options, - parse_options = parse_options, - convert_options = convert_options) - - tab <- reader$Read()$select(!!enquo(col_select)) - - if (isTRUE(as_tibble)) { - tab <- as.data.frame(tab) - } - - tab -} diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 69c02e0e4641..1eadc75c1900 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -39,17 +39,28 @@ navbar: text: Reference href: reference/index.html reference: +- title: Installation helpers + contents: + - arrow_available + - install_arrow - title: Reading and writing files contents: - read_csv_arrow + - read_json_arrow - read_feather - read_parquet - write_arrow - write_feather + - write_parquet - csv_convert_options - csv_parse_options - csv_read_options - csv_table_reader + - json_parse_options + - json_read_options + - parquet_arrow_reader_properties + - json_table_reader + - parquet_file_reader - title: Arrow data containers contents: - buffer @@ -92,6 +103,7 @@ reference: - arrow__io__RandomAccessFile - arrow__io__Readable - arrow__io__ReadableFile + - arrow__json__TableReader - arrow__ipc__Message - arrow__ipc__MessageReader - arrow__ipc__RecordBatchFileReader @@ -117,7 +129,6 @@ reference: - 
RecordBatchFileWriter - RecordBatchStreamReader - RecordBatchStreamWriter - - threadpool - cast_options - compression_codec - default_memory_pool diff --git a/r/man/read_csv_arrow.Rd b/r/man/read_csv_arrow.Rd index 47e5158488a7..42c9478d8994 100644 --- a/r/man/read_csv_arrow.Rd +++ b/r/man/read_csv_arrow.Rd @@ -2,23 +2,24 @@ % Please edit documentation in R/csv.R \name{read_csv_arrow} \alias{read_csv_arrow} -\title{Read csv file into an arrow::Table} +\title{Read a CSV or other delimited file with Arrow} \usage{ -read_csv_arrow(file, read_options = csv_read_options(), - parse_options = csv_parse_options(), - convert_options = csv_convert_options(), col_select = NULL, - as_tibble = TRUE) +read_csv_arrow(file, delim = ",", quote = "\\"", + escape_double = TRUE, escape_backslash = FALSE, col_names = TRUE, + col_select = NULL, skip_empty_rows = TRUE, skip = 0L, + parse_options = NULL, convert_options = NULL, + read_options = csv_read_options(), as_tibble = TRUE) } \arguments{ \item{file}{file} -\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}} +\item{col_select}{\link[tidyselect:vars_select]{tidy selection specification} of columns} \item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}} \item{convert_options}{see \code{\link[=csv_convert_options]{csv_convert_options()}}} -\item{col_select}{\link[tidyselect:vars_select]{tidy selection specification} of columns} +\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}} \item{as_tibble}{Should the \link[=arrow__Table]{arrow::Table} be converted to a data frame.} } From fb75af1fa0732530e935e214d355d08d0b3a85aa Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 26 Jun 2019 12:59:24 -0700 Subject: [PATCH 2/5] More docs and tests for csv parse options; skip a few that aren't supported --- r/DESCRIPTION | 2 +- r/NEWS.md | 6 +- r/R/csv.R | 121 ++++++++++++++++++++---------- r/man/arrow-package.Rd | 1 - r/man/csv_parse_options.Rd | 4 +- 
r/man/csv_table_reader.Rd | 12 ++- r/man/read_csv_arrow.Rd | 55 +++++++++++--- r/tests/testthat/test-arrow-csv.R | 85 +++++++++++++++++++-- 8 files changed, 221 insertions(+), 65 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index d0009eced89e..47eccc8834f0 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -16,7 +16,7 @@ Description: 'Apache' 'Arrow' is a cross-language package provides an interface to the Arrow C++ library. Depends: R (>= 3.1) License: Apache License (>= 2.0) -URL: https://arrow.apache.org/docs/r/, https://github.com/apache/arrow/ +URL: https://github.com/apache/arrow/ BugReports: https://issues.apache.org/jira/projects/ARROW/issues Encoding: UTF-8 LazyData: true diff --git a/r/NEWS.md b/r/NEWS.md index bbad0aefcd12..123d1d6a73a7 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -1,3 +1,7 @@ # arrow 0.13.0.9000 -Initial CRAN release of `arrow` package +Initial CRAN release of the `arrow` package. Key features include: + +* Read and write support for various file formats, including Parquet, Feather/Arrow, CSV, and JSON. +* API bindings to the C++ library for Arrow data types and objects, as well as mapping between Arrow types and R data types. +* Tools for helping with C++ library configuration and installation. diff --git a/r/R/csv.R b/r/R/csv.R index c494fa27ef98..7d7913e97261 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -17,32 +17,66 @@ #' Read a CSV or other delimited file with Arrow #' -#' Use arrow::csv::TableReader from [csv_table_reader()] +#' This function uses the Arrow C++ CSV reader to read into a `data.frame`. +#' Arrow C++ options have been mapped to argument names that follow those of +#' [readr::read_delim()], and `col_select` was inspired by [vroom::vroom()]. #' -#' @inheritParams csv_table_reader +#' Note that not all `readr` options are currently implemented here. Please file +#' an issue if you encounter one that `arrow` should support. 
#' -#' @param col_select [tidy selection specification][tidyselect::vars_select] of columns -#' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a data frame. +#' If you need to control Arrow-specific reader parameters that don't have an +#' equivalent in `readr::read_csv()`, you can either provide them in the +#' `parse_options`, `convert_options`, or `read_options` arguments, or you can +#' call [csv_table_reader()] directly for lower-level access. #' -#' @return +#' @param file A character path to a local file, or an Arrow input stream +#' @param delim Single character used to separate fields within a record. +#' @param quote Single character used to quote strings. +#' @param escape_double Does the file escape quotes by doubling them? +#' i.e. If this option is `TRUE`, the value `""""` represents +#' a single quote, `\"`. +#' @param escape_backslash Does the file use backslashes to escape special +#' characters? This is more general than `escape_double` as backslashes +#' can be used to escape the delimiter character, the quote character, or +#' to add special characters like `\\n`. +# #' @param col_names If `TRUE`, the first row of the input will be used as the +# #' column names and will not be included in the data frame. Note that `FALSE` +# #' is not currently supported, nor is specifying a character vector of column +# #' names. +#' @param col_select A [tidy selection specification][tidyselect::vars_select] +#' of columns, as used in `dplyr::select()`. +#' @param skip_empty_rows Should blank rows be ignored altogether? If +#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be +#' filled with missings. +# #' @param skip Number of lines to skip before reading data. +#' @param parse_options see [csv_parse_options()]. If given, this overrides any +#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.). 
+#' @param convert_options see [csv_convert_options()] +#' @param read_options see [csv_read_options()] +#' @param as_tibble Should the function return a `data.frame` or an +#' [arrow::Table][arrow__Table]? +#' +#' @return A `data.frame`, or an `arrow::Table` if `as_tibble = FALSE`. #' @export read_csv_arrow <- function(file, delim = ",", quote = '"', escape_double = TRUE, escape_backslash = FALSE, - col_names = TRUE, + # col_names = TRUE, # col_types = TRUE, col_select = NULL, # na = c("", "NA"), # quoted_na = TRUE, skip_empty_rows = TRUE, - skip = 0L, + # skip = 0L, parse_options = NULL, convert_options = NULL, read_options = csv_read_options(), as_tibble = TRUE) { + col_names <- TRUE # Hardcoded pending fix + skip <- 0L # Hardcoded pending fix if (is.null(parse_options)) { if (isTRUE(col_names)) { # Add one row to skip, to match arrow's header_rows @@ -88,27 +122,6 @@ read_csv_arrow <- function(file, tab } -readr_to_csv_parse_options <- function(delim = ",", - quote = '"', - escape_double = TRUE, - escape_backslash = FALSE, - skip_empty_rows = TRUE, - skip = 0L) { - # This function translates from the readr argument list to the arrow arg names - # TODO: validate inputs - csv_parse_options( - delimiter = delim, - quoting = nzchar(quote), - quote_char = quote, - double_quote = escape_double, - escaping = escape_backslash, - escape_char = '\\', - newlines_in_values = escape_backslash, - ignore_empty_lines = skip_empty_rows, - header_rows = skip - ) -} - #' @include R6.R `arrow::csv::TableReader` <- R6Class("arrow::csv::TableReader", inherit = `arrow::Object`, @@ -135,7 +148,29 @@ csv_read_options <- function(block_size = 1048576L) { )) } -#' Parsing options +readr_to_csv_parse_options <- function(delim = ",", + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + skip_empty_rows = TRUE, + skip = 0L) { + # This function translates from the readr argument list to the arrow arg names + # TODO: validate inputs + csv_parse_options( + delimiter = delim, + 
quoting = nzchar(quote), + quote_char = quote, + double_quote = escape_double, + escaping = escape_backslash, + escape_char = '\\', + newlines_in_values = escape_backslash, + ignore_empty_lines = skip_empty_rows, + header_rows = skip + ) +} + +#' CSV parsing options +#' #' #' @param delimiter Field delimiter #' @param quoting Whether quoting is used @@ -148,12 +183,16 @@ csv_read_options <- function(block_size = 1048576L) { #' @param header_rows Number of header rows to skip (including the first row containing column names) #' #' @export -csv_parse_options <- function( - delimiter = ",", quoting = TRUE, quote_char = '"', - double_quote = TRUE, escaping = FALSE, escape_char = '\\', - newlines_in_values = FALSE, ignore_empty_lines = TRUE, - header_rows = 1L -){ +csv_parse_options <- function(delimiter = ",", + quoting = TRUE, + quote_char = '"', + double_quote = TRUE, + escaping = FALSE, + escape_char = '\\', + newlines_in_values = FALSE, + ignore_empty_lines = TRUE, + header_rows = 1L) { + shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize( list( delimiter = delimiter, @@ -176,8 +215,6 @@ csv_parse_options <- function( #' @export csv_convert_options <- function(check_utf8 = TRUE) { # TODO: there are more conversion options available: - # // Whether to check UTF8 validity of string columns - # bool check_utf8 = true; # // Optional per-column types (disabling type inference on those columns) # std::unordered_map<std::string, std::shared_ptr<DataType>> column_types; # // Recognized spellings for null values @@ -197,14 +234,20 @@ csv_convert_options <- function(check_utf8 = TRUE) { )) } -#' CSV table reader +#' Arrow CSV table reader +#' +#' These methods wrap the Arrow C++ CSV table reader. 
+#' For an interface to the CSV reader that's more familiar for R users, see +#' [read_csv_arrow()] #' -#' @param file file +#' @param file A character path to a local file, or an Arrow input stream #' @param read_options, see [csv_read_options()] #' @param parse_options, see [csv_parse_options()] #' @param convert_options, see [csv_convert_options()] #' @param ... additional parameters. #' +#' @return An `arrow::csv::TableReader` R6 object. Call `$Read()` on it to get +#' an Arrow Table. #' @export csv_table_reader <- function(file, read_options = csv_read_options(), diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd index 1f4b5fbcd04e..c3da92dc00f4 100644 --- a/r/man/arrow-package.Rd +++ b/r/man/arrow-package.Rd @@ -15,7 +15,6 @@ \seealso{ Useful links: \itemize{ - \item \url{https://arrow.apache.org/docs/r/} \item \url{https://github.com/apache/arrow/} \item Report bugs at \url{https://issues.apache.org/jira/projects/ARROW/issues} } diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd index 7e6ab77d4395..ac9826232b43 100644 --- a/r/man/csv_parse_options.Rd +++ b/r/man/csv_parse_options.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/csv.R \name{csv_parse_options} \alias{csv_parse_options} -\title{Parsing options} +\title{CSV parsing options} \usage{ csv_parse_options(delimiter = ",", quoting = TRUE, quote_char = "\\"", double_quote = TRUE, escaping = FALSE, @@ -29,5 +29,5 @@ csv_parse_options(delimiter = ",", quoting = TRUE, \item{header_rows}{Number of header rows to skip (including the first row containing column names)} } \description{ -Parsing options +CSV parsing options } diff --git a/r/man/csv_table_reader.Rd b/r/man/csv_table_reader.Rd index 029cd0b5923c..862aefbcd7ac 100644 --- a/r/man/csv_table_reader.Rd +++ b/r/man/csv_table_reader.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/csv.R \name{csv_table_reader} \alias{csv_table_reader} -\title{CSV table reader} +\title{Arrow CSV table reader} \usage{ 
csv_table_reader(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...) } \arguments{ -\item{file}{file} +\item{file}{A character path to a local file, or an Arrow input stream} \item{read_options, }{see \code{\link[=csv_read_options]{csv_read_options()}}} @@ -19,6 +19,12 @@ csv_table_reader(file, read_options = csv_read_options(), \item{...}{additional parameters.} } +\value{ +An \code{arrow::csv::TableReader} R6 object. Call \code{$Read()} on it to get +an Arrow Table. +} \description{ -CSV table reader +These methods wrap the Arrow C++ CSV table reader. +For an interface to the CSV reader that's more familiar for R users, see +\code{\link[=read_csv_arrow]{read_csv_arrow()}} } diff --git a/r/man/read_csv_arrow.Rd b/r/man/read_csv_arrow.Rd index 42c9478d8994..c7315a8903f5 100644 --- a/r/man/read_csv_arrow.Rd +++ b/r/man/read_csv_arrow.Rd @@ -5,24 +5,57 @@ \title{Read a CSV or other delimited file with Arrow} \usage{ read_csv_arrow(file, delim = ",", quote = "\\"", - escape_double = TRUE, escape_backslash = FALSE, col_names = TRUE, - col_select = NULL, skip_empty_rows = TRUE, skip = 0L, - parse_options = NULL, convert_options = NULL, - read_options = csv_read_options(), as_tibble = TRUE) + escape_double = TRUE, escape_backslash = FALSE, col_select = NULL, + skip_empty_rows = TRUE, parse_options = NULL, + convert_options = NULL, read_options = csv_read_options(), + as_tibble = TRUE) } \arguments{ -\item{file}{file} +\item{file}{A character path to a local file, or an Arrow input stream} -\item{col_select}{\link[tidyselect:vars_select]{tidy selection specification} of columns} +\item{delim}{Single character used to separate fields within a record.} -\item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}} +\item{quote}{Single character used to quote strings.} -\item{convert_options}{see \code{\link[=csv_convert_options]{csv_convert_options()}}} +\item{escape_double}{Does the 
file escape quotes by doubling them? +i.e. If this option is \code{TRUE}, the value \code{""""} represents +a single quote, \code{\"}.} -\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}} +\item{escape_backslash}{Does the file use backslashes to escape special +characters? This is more general than \code{escape_double} as backslashes +can be used to escape the delimiter character, the quote character, or +to add special characters like \code{\\n}.} -\item{as_tibble}{Should the \link[=arrow__Table]{arrow::Table} be converted to a data frame.} +\item{col_select}{A \link[tidyselect:vars_select]{tidy selection specification} +of columns, as used in \code{dplyr::select()}.} + +\item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this +option is \code{TRUE} then blank rows will not be represented at all.} + +\item{parse_options, }{see \code{\link[=csv_parse_options]{csv_parse_options()}}. If given, this overrides any +parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).} + +\item{convert_options, }{see \code{\link[=csv_convert_options]{csv_convert_options()}}} + +\item{read_options, }{see \code{\link[=csv_read_options]{csv_read_options()}}} + +\item{as_tibble}{Should the function return a \code{data.frame} or an +\link[=arrow__Table]{arrow::Table}?} +} +\value{ +A \code{data.frame}, or an \code{arrow::Table} if \code{as_tibble = FALSE}. } \description{ -Use arrow::csv::TableReader from \code{\link[=csv_table_reader]{csv_table_reader()}} +This function uses the Arrow C++ CSV reader to read into a \code{data.frame}. +Arrow C++ options have been mapped to argument names that follow those of +\code{\link[readr:read_delim]{readr::read_delim()}}, and \code{col_select} was inspired by \code{\link[vroom:vroom]{vroom::vroom()}}. +} +\details{ +Note that not all \code{readr} options are currently implemented here. Please file +an issue if you encounter one that \code{arrow} should support. 
+ +If you need to control Arrow-specific reader parameters that don't have an +equivalent in \code{readr::read_csv()}, you can either provide them in the +\code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can +call \code{\link[=csv_table_reader]{csv_table_reader()}} directly for lower-level access. } diff --git a/r/tests/testthat/test-arrow-csv.R b/r/tests/testthat/test-arrow-csv.R index 7f0c1ae497d9..330e17b5a9a7 100644 --- a/r/tests/testthat/test-arrow-csv.R +++ b/r/tests/testthat/test-arrow-csv.R @@ -19,8 +19,9 @@ context("arrow::csv::TableReader") test_that("Can read csv file", { tf <- tempfile() + on.exit(unlink(tf)) - write.csv(iris, tf, row.names = FALSE, quote = FALSE) + write.csv(iris, tf, row.names = FALSE) tab1 <- read_csv_arrow(tf, as_tibble = FALSE) tab2 <- read_csv_arrow(mmap_open(tf), as_tibble = FALSE) @@ -31,14 +32,13 @@ test_that("Can read csv file", { expect_equal(tab0, tab1) expect_equal(tab0, tab2) expect_equal(tab0, tab3) - - unlink(tf) }) test_that("read_csv_arrow(as_tibble=TRUE)", { tf <- tempfile() + on.exit(unlink(tf)) - write.csv(iris, tf, row.names = FALSE, quote = FALSE) + write.csv(iris, tf, row.names = FALSE) tab1 <- read_csv_arrow(tf, as_tibble = TRUE) tab2 <- read_csv_arrow(mmap_open(tf), as_tibble = TRUE) @@ -48,12 +48,85 @@ test_that("read_csv_arrow(as_tibble=TRUE)", { expect_equivalent(iris, tab1) expect_equivalent(iris, tab2) expect_equivalent(iris, tab3) +}) + +test_that("read_csv_arrow parsing options: delim", { + tf <- tempfile() + on.exit(unlink(tf)) + + write.table(iris, tf, sep = "\t", row.names = FALSE) + tab1 <- read_csv_arrow(tf, delim = "\t") + + iris$Species <- as.character(iris$Species) + expect_equivalent(iris, tab1) +}) - unlink(tf) +test_that("read_csv_arrow parsing options: quote", { + tf <- tempfile() + on.exit(unlink(tf)) + + df <- data.frame(a=c(1, 2), b=c("'abc'", "'def'")) + write.table(df, sep=";", tf, row.names = FALSE, quote = FALSE) + tab1 <- read_csv_arrow(tf, delim 
= ";", quote = "'") + + # Is this a problem? + # Component “a”: target is integer64, current is numeric + tab1$a <- as.numeric(tab1$a) + expect_equivalent( + tab1, + data.frame(a=c(1, 2), b=c("abc", "def"), stringsAsFactors = FALSE) + ) }) +test_that("read_csv_arrow parsing options: col_names", { + skip("Invalid: Empty CSV file") + tf <- tempfile() + on.exit(unlink(tf)) + + write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE) + tab1 <- read_csv_arrow(tf, col_names = FALSE) + + expect_identical(names(tab1), names(iris)) + iris$Species <- as.character(iris$Species) + expect_equivalent(iris, tab1) +}) + +test_that("read_csv_arrow parsing options: skip", { + skip("Invalid: Empty CSV file") + tf <- tempfile() + on.exit(unlink(tf)) + + cat("asdf\nqwer\n", file = tf) + suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append = TRUE)) + # This works: + # print(head(readr::read_csv(tf, skip = 2))) + + # This errors: + tab1 <- read_csv_arrow(tf, skip = 2) + + expect_identical(names(tab1), names(iris)) + iris$Species <- as.character(iris$Species) + expect_equivalent(iris, tab1) +}) + +test_that("read_csv_arrow parsing options: skip_empty_rows", { + skip("Invalid: Empty CSV file") + tf <- tempfile() + on.exit(unlink(tf)) + + write.csv(iris, tf, row.names = FALSE) + cat("\n\n", file = tf, append = TRUE) + + tab1 <- read_csv_arrow(tf, skip_empty_rows = FALSE) + + expect_equal(nrow(tab1), nrow(iris) + 2) + expect_true(is.na(tail(iris, 1)[[1]])) +}) + + test_that("read_csv_arrow() respects col_select", { tf <- tempfile() + on.exit(unlink(tf)) write.csv(iris, tf, row.names = FALSE, quote = FALSE) @@ -62,6 +135,4 @@ test_that("read_csv_arrow() respects col_select", { tib <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_tibble = TRUE) expect_equal(tib, tibble::tibble(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width)) - - unlink(tf) }) From fc156e3e8f2907bbcb9f30ee1c575b267561fbd5 Mon Sep 17 00:00:00 2001 From: Neal 
Richardson Date: Wed, 26 Jun 2019 15:00:20 -0700 Subject: [PATCH 3/5] Doc :nailcare:, add read_delim_arrow and read_tsv_arrow --- r/NAMESPACE | 2 + r/R/csv.R | 93 +++++++++++++++---- ...{read_csv_arrow.Rd => read_delim_arrow.Rd} | 34 +++++-- r/tests/testthat/test-arrow-csv.R | 10 +- 4 files changed, 108 insertions(+), 31 deletions(-) rename r/man/{read_csv_arrow.Rd => read_delim_arrow.Rd} (65%) diff --git a/r/NAMESPACE b/r/NAMESPACE index e82b30a4fea2..e4b367d0eaf5 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -162,6 +162,7 @@ export(parquet_arrow_reader_properties) export(parquet_file_reader) export(read_arrow) export(read_csv_arrow) +export(read_delim_arrow) export(read_feather) export(read_json_arrow) export(read_message) @@ -169,6 +170,7 @@ export(read_parquet) export(read_record_batch) export(read_schema) export(read_table) +export(read_tsv_arrow) export(record_batch) export(schema) export(starts_with) diff --git a/r/R/csv.R b/r/R/csv.R index 7d7913e97261..8f4370ab7d31 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -17,10 +17,13 @@ #' Read a CSV or other delimited file with Arrow #' -#' This function uses the Arrow C++ CSV reader to read into a `data.frame`. +#' These functions use the Arrow C++ CSV reader to read into a `data.frame`. #' Arrow C++ options have been mapped to argument names that follow those of #' [readr::read_delim()], and `col_select` was inspired by [vroom::vroom()]. #' +#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around +#' `read_delim_arrow()` that specify a delimiter. +#' #' Note that not all `readr` options are currently implemented here. Please file #' an issue if you encounter one that `arrow` should support. #' @@ -58,29 +61,33 @@ #' #' @return A `data.frame`, or an `arrow::Table` if `as_tibble = FALSE`. 
#' @export -read_csv_arrow <- function(file, - delim = ",", - quote = '"', - escape_double = TRUE, - escape_backslash = FALSE, - # col_names = TRUE, - # col_types = TRUE, - col_select = NULL, - # na = c("", "NA"), - # quoted_na = TRUE, - skip_empty_rows = TRUE, - # skip = 0L, - parse_options = NULL, - convert_options = NULL, - read_options = csv_read_options(), - as_tibble = TRUE) { +read_delim_arrow <- function(file, + delim = ",", + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + # col_names = TRUE, + # col_types = TRUE, + col_select = NULL, + # na = c("", "NA"), + # quoted_na = TRUE, + skip_empty_rows = TRUE, + # skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = csv_read_options(), + as_tibble = TRUE) { + + # These are hardcoded pending https://issues.apache.org/jira/browse/ARROW-5747 + col_names <- TRUE + skip <- 0L - col_names <- TRUE # Hardcoded pending fix - skip <- 0L # Hardcoded pending fix if (is.null(parse_options)) { if (isTRUE(col_names)) { # Add one row to skip, to match arrow's header_rows skip <- skip + 1L + # Note that with the hardcoding, header_rows is always 1, which + # turns out to be the only value that works meaningfully } parse_options <- readr_to_csv_parse_options( delim, @@ -122,6 +129,54 @@ read_csv_arrow <- function(file, tab } +#' @rdname read_delim_arrow +#' @export +read_csv_arrow <- function(file, + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + # col_names = TRUE, + # col_types = TRUE, + col_select = NULL, + # na = c("", "NA"), + # quoted_na = TRUE, + skip_empty_rows = TRUE, + # skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = csv_read_options(), + as_tibble = TRUE) { + + mc <- match.call() + mc$delim <- "," + mc[[1]] <- as.name("read_delim_arrow") + eval.parent(mc) +} + +#' @rdname read_delim_arrow +#' @export +read_tsv_arrow <- function(file, + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + # col_names = TRUE, + # 
col_types = TRUE, + col_select = NULL, + # na = c("", "NA"), + # quoted_na = TRUE, + skip_empty_rows = TRUE, + # skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = csv_read_options(), + as_tibble = TRUE) { + + mc <- match.call() + mc$delim <- "\t" + mc[[1]] <- as.name("read_delim_arrow") + eval.parent(mc) +} + #' @include R6.R `arrow::csv::TableReader` <- R6Class("arrow::csv::TableReader", inherit = `arrow::Object`, diff --git a/r/man/read_csv_arrow.Rd b/r/man/read_delim_arrow.Rd similarity index 65% rename from r/man/read_csv_arrow.Rd rename to r/man/read_delim_arrow.Rd index c7315a8903f5..e1ca16f0d776 100644 --- a/r/man/read_csv_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -1,14 +1,28 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csv.R -\name{read_csv_arrow} +\name{read_delim_arrow} +\alias{read_delim_arrow} \alias{read_csv_arrow} +\alias{read_tsv_arrow} \title{Read a CSV or other delimited file with Arrow} \usage{ -read_csv_arrow(file, delim = ",", quote = "\\"", +read_delim_arrow(file, delim = ",", quote = "\\"", escape_double = TRUE, escape_backslash = FALSE, col_select = NULL, skip_empty_rows = TRUE, parse_options = NULL, convert_options = NULL, read_options = csv_read_options(), as_tibble = TRUE) + +read_csv_arrow(file, quote = "\\"", escape_double = TRUE, + escape_backslash = FALSE, col_select = NULL, + skip_empty_rows = TRUE, parse_options = NULL, + convert_options = NULL, read_options = csv_read_options(), + as_tibble = TRUE) + +read_tsv_arrow(file, quote = "\\"", escape_double = TRUE, + escape_backslash = FALSE, col_select = NULL, + skip_empty_rows = TRUE, parse_options = NULL, + convert_options = NULL, read_options = csv_read_options(), + as_tibble = TRUE) } \arguments{ \item{file}{A character path to a local file, or an Arrow input stream} @@ -29,15 +43,16 @@ to add special characters like \code{\\n}.} \item{col_select}{A \link[tidyselect:vars_select]{tidy selection specification} of columns, 
as used in \code{dplyr::select()}.} -\item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this -option is \code{TRUE} then blank rows will not be represented at all.} +\item{skip_empty_rows}{Should blank rows be ignored altogether? If +\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be +filled with missings.} -\item{parse_options, }{see \code{\link[=csv_parse_options]{csv_parse_options()}}. If given, this overrides any +\item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}. If given, this overrides any parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).} -\item{convert_options, }{see \code{\link[=csv_convert_options]{csv_convert_options()}}} +\item{convert_options}{see \code{\link[=csv_convert_options]{csv_convert_options()}}} -\item{read_options, }{see \code{\link[=csv_read_options]{csv_read_options()}}} +\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}} \item{as_tibble}{Should the function return a \code{data.frame} or an \link[=arrow__Table]{arrow::Table}?} @@ -46,11 +61,14 @@ parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, et A \code{data.frame}, or an \code{arrow::Table} if \code{as_tibble = FALSE}. } \description{ -This function uses the Arrow C++ CSV reader to read into a \code{data.frame}. +These functions use the Arrow C++ CSV reader to read into a \code{data.frame}. Arrow C++ options have been mapped to argument names that follow those of \code{\link[readr:read_delim]{readr::read_delim()}}, and \code{col_select} was inspired by \code{\link[vroom:vroom]{vroom::vroom()}}. } \details{ +\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around +\code{read_delim_arrow()} that specify a delimiter. + Note that not all \code{readr} options are currently implemented here. Please file an issue if you encounter one that \code{arrow} should support. 
diff --git a/r/tests/testthat/test-arrow-csv.R b/r/tests/testthat/test-arrow-csv.R index 330e17b5a9a7..aed96387a829 100644 --- a/r/tests/testthat/test-arrow-csv.R +++ b/r/tests/testthat/test-arrow-csv.R @@ -50,24 +50,26 @@ test_that("read_csv_arrow(as_tibble=TRUE)", { expect_equivalent(iris, tab3) }) -test_that("read_csv_arrow parsing options: delim", { +test_that("read_delim_arrow parsing options: delim", { tf <- tempfile() on.exit(unlink(tf)) write.table(iris, tf, sep = "\t", row.names = FALSE) - tab1 <- read_csv_arrow(tf, delim = "\t") + tab1 <- read_tsv_arrow(tf) + tab2 <- read_delim_arrow(tf, delim = "\t") + expect_equivalent(tab1, tab2) iris$Species <- as.character(iris$Species) expect_equivalent(iris, tab1) }) -test_that("read_csv_arrow parsing options: quote", { +test_that("read_delim_arrow parsing options: quote", { tf <- tempfile() on.exit(unlink(tf)) df <- data.frame(a=c(1, 2), b=c("'abc'", "'def'")) write.table(df, sep=";", tf, row.names = FALSE, quote = FALSE) - tab1 <- read_csv_arrow(tf, delim = ";", quote = "'") + tab1 <- read_delim_arrow(tf, delim = ";", quote = "'") # Is this a problem? 
# Component “a”: target is integer64, current is numeric From 22268d960cc2799c30bdd831868dea880dfddd00 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 26 Jun 2019 15:20:11 -0700 Subject: [PATCH 4/5] Rename man topic in pkgdown.yml --- r/_pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 1eadc75c1900..648085bb78ac 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -45,7 +45,7 @@ reference: - install_arrow - title: Reading and writing files contents: - - read_csv_arrow + - read_delim_arrow - read_json_arrow - read_feather - read_parquet From 92b0a2788c98c7bae23c238438d8846502d55f19 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 27 Jun 2019 09:50:55 -0700 Subject: [PATCH 5/5] :rat: --- r/NEWS.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index 123d1d6a73a7..fa6b25a472f0 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -1,3 +1,22 @@ + + # arrow 0.13.0.9000 Initial CRAN release of the `arrow` package. Key features include: