ARROW-11735: [R] Allow Parquet and Arrow Dataset to be optional components

Not implemented for Windows yet (that's ARROW-11884)

Closes #9610 from ianmcook/ARROW-11735

Lead-authored-by: Ian Cook <ianmcook@gmail.com>
Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
ianmcook and nealrichardson committed Mar 6, 2021
1 parent d6ba4f8 commit b07027e
Showing 32 changed files with 945 additions and 219 deletions.
2 changes: 1 addition & 1 deletion dev/tasks/conda-recipes/r-arrow/configure.win
@@ -2,7 +2,7 @@

set -euxo pipefail

echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ARROW -DARROW_R_WITH_S3" > src/Makevars.win
echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3" > src/Makevars.win
echo "PKG_CXXFLAGS=\$(CXX_VISIBILITY)" >> src/Makevars.win
echo 'CXX_STD=CXX11' >> src/Makevars.win
echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -lparquet -larrow" >> src/Makevars.win
7 changes: 6 additions & 1 deletion dev/tasks/r/azure.linux.yml
@@ -50,7 +50,12 @@ jobs:
# we have to export this (right?) because we need it in the build env
export ARROW_R_DEV={{ not_cran }}
# Note that ci/scripts/r_test.sh sets NOT_CRAN=true if ARROW_R_DEV=TRUE
-docker-compose run r
+docker-compose run \
+  -e ARROW_DATASET={{ arrow_dataset|default("") }} \
+  -e ARROW_PARQUET={{ arrow_parquet|default("") }} \
+  -e ARROW_S3={{ arrow_s3|default("") }} \
+  -e LIBARROW_MINIMAL={{ libarrow_minimal|default("") }} \
+  r
displayName: Docker run
- script: |
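When a task does not set these parameters, `{{ arrow_dataset|default("") }}` renders an empty string; the `${VAR:-default}` expansions in the build scripts treat an empty value the same as unset, so the usual defaults still apply inside the container.
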
13 changes: 13 additions & 0 deletions dev/tasks/tasks.yml
@@ -1789,6 +1789,19 @@ tasks:
r_image: r-base
r_tag: 3.6-opensuse42
not_cran: "TRUE"

+  test-r-minimal-build:
+    ci: azure
+    template: r/azure.linux.yml
+    params:
+      r_org: rocker
+      r_image: r-base
+      r_tag: latest
+      not_cran: "TRUE"
+      arrow_dataset: "OFF"
+      arrow_parquet: "OFF"
+      arrow_s3: "OFF"
+      libarrow_minimal: "TRUE"

test-ubuntu-18.04-r-sanitizer:
ci: azure
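The new crossbow task exercises a build with every optional component switched off, so CI catches regressions in the feature-gated code paths that a full-featured build would never compile.
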
2 changes: 2 additions & 0 deletions r/NAMESPACE
@@ -167,6 +167,8 @@ export(Type)
export(UnionDataset)
export(arrow_available)
export(arrow_info)
+export(arrow_with_dataset)
+export(arrow_with_parquet)
export(arrow_with_s3)
export(binary)
export(bool)
25 changes: 22 additions & 3 deletions r/R/arrow-package.R
@@ -53,19 +53,36 @@
#' You won't generally need to call these function, but they're made available
#' for diagnostic purposes.
#' @return `TRUE` or `FALSE` depending on whether the package was installed
-#' with the Arrow C++ library (check with `arrow_available()`) or with S3
-#' support enabled (check with `arrow_with_s3()`).
+#' with:
+#' * The Arrow C++ library (check with `arrow_available()`)
+#' * Arrow Dataset support enabled (check with `arrow_with_dataset()`)
+#' * Parquet support enabled (check with `arrow_with_parquet()`)
+#' * Amazon S3 support enabled (check with `arrow_with_s3()`)
#' @export
#' @examples
#' arrow_available()
+#' arrow_with_dataset()
+#' arrow_with_parquet()
#' arrow_with_s3()
-#' @seealso If either of these are `FALSE`, see
+#' @seealso If any of these are `FALSE`, see
#' `vignette("install", package = "arrow")` for guidance on reinstalling the
#' package.
arrow_available <- function() {
.Call(`_arrow_available`)
}

+#' @rdname arrow_available
+#' @export
+arrow_with_dataset <- function() {
+  .Call(`_dataset_available`)
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_parquet <- function() {
+  .Call(`_parquet_available`)
+}
+
#' @rdname arrow_available
#' @export
arrow_with_s3 <- function() {
@@ -95,6 +112,8 @@ arrow_info <- function() {
pool <- default_memory_pool()
out <- c(out, list(
capabilities = c(
+      dataset = arrow_with_dataset(),
+      parquet = arrow_with_parquet(),
s3 = arrow_with_s3(),
vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1))
),
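What this buys users, sketched as a minimal R example (the capability-check functions are the ones exported above; the guard pattern and the stop() message are illustrative, not part of this commit):

library(arrow)

# Fail fast, pointing at the install vignette, if this build lacks Parquet
if (!arrow_with_parquet()) {
  stop("arrow was built without Parquet support; ",
       "see vignette('install', package = 'arrow')")
}

tf <- tempfile(fileext = ".parquet")
write_parquet(mtcars, tf)
head(read_parquet(tf))

# arrow_info()$capabilities now also reports dataset and parquet
arrow_info()
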
2 changes: 1 addition & 1 deletion r/R/dataset-partition.R
@@ -89,7 +89,7 @@ HivePartitioning$create <- function(schm, null_fallback = NULL) {
#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if
#' calling `hive_partition()` with no arguments.
#' @examples
-#' \donttest{
+#' \dontrun{
#' hive_partition(year = int16(), month = int8())
#' }
#' @export
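Here, and in r/R/parquet.R below, the examples move from \donttest{} to \dontrun{} so that R CMD check never executes them; on a build with Dataset or Parquet compiled out, they would otherwise fail.
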
6 changes: 3 additions & 3 deletions r/R/parquet.R
@@ -27,7 +27,7 @@
#' @return A [arrow::Table][Table], or a `data.frame` if `as_data_frame` is
#' `TRUE` (the default).
#' @examples
-#' \donttest{
+#' \dontrun{
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_parquet(mtcars, tf)
@@ -122,7 +122,7 @@ read_parquet <- function(file,
#' @return the input `x` invisibly.
#'
#' @examples
-#' \donttest{
+#' \dontrun{
#' tf1 <- tempfile(fileext = ".parquet")
#' write_parquet(data.frame(x = 1:5), tf1)
#'
@@ -449,7 +449,7 @@ ParquetFileWriter$create <- function(schema,
#'
#' @export
#' @examples
-#' \donttest{
+#' \dontrun{
#' f <- system.file("v0.7.1.parquet", package="arrow")
#' pq <- ParquetFileReader$create(f)
#' pq$GetSchema()
49 changes: 36 additions & 13 deletions r/configure
@@ -26,13 +26,12 @@
# R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib'

# Library settings
-PKG_CONFIG_NAME="arrow parquet arrow-dataset"
+PKG_CONFIG_NAME="arrow"
PKG_DEB_NAME="(unsuppored)"
PKG_RPM_NAME="(unsuppored)"
PKG_BREW_NAME="apache-arrow"
PKG_TEST_HEADER="<arrow/api.h>"
-# These must be the same order as $(pkg-config --libs arrow-dataset)
-PKG_LIBS="-larrow_dataset -lparquet -larrow"
+PKG_LIBS="-larrow"

# Make some env vars case-insensitive
ARROW_R_DEV=`echo $ARROW_R_DEV | tr '[:upper:]' '[:lower:]'`
@@ -69,19 +68,22 @@ fi
if [ "$INCLUDE_DIR" ] || [ "$LIB_DIR" ]; then
echo "*** Using INCLUDE_DIR/LIB_DIR"
PKG_CFLAGS="-I$INCLUDE_DIR $PKG_CFLAGS"
PKG_LIBS="-L$LIB_DIR $PKG_LIBS"
PKG_DIRS="-L$LIB_DIR"
else
# Use pkg-config if available and allowed
pkg-config --version >/dev/null 2>&1
if [ "$ARROW_USE_PKG_CONFIG" != "false" ] && [ $? -eq 0 ]; then
PKGCONFIG_CFLAGS=`pkg-config --cflags --silence-errors ${PKG_CONFIG_NAME}`
-    PKGCONFIG_LIBS=`pkg-config --libs --silence-errors ${PKG_CONFIG_NAME}`
+    PKGCONFIG_LIBS=`pkg-config --libs-only-l --silence-errors ${PKG_CONFIG_NAME}`
+    PKGCONFIG_DIRS=`pkg-config --libs-only-L --silence-errors ${PKG_CONFIG_NAME}`
+    # TODO: what about --libs-only-other?
fi

if [ "$PKGCONFIG_CFLAGS" ] || [ "$PKGCONFIG_LIBS" ]; then
echo "*** Arrow C++ libraries found via pkg-config"
PKG_CFLAGS="$PKGCONFIG_CFLAGS"
PKG_LIBS=${PKGCONFIG_LIBS}
+    PKG_DIRS=${PKGCONFIG_DIRS}

# Check for version mismatch
PC_LIB_VERSION=`pkg-config --modversion arrow`
@@ -98,7 +100,8 @@ else
if [ "$FORCE_AUTOBREW" != "true" ] && [ "`command -v brew`" ] && [ "`brew ls --versions ${PKG_BREW_NAME}`" != "" ]; then
echo "*** Using Homebrew ${PKG_BREW_NAME}"
BREWDIR=`brew --prefix`
PKG_LIBS="-L$BREWDIR/opt/$PKG_BREW_NAME/lib $PKG_LIBS -larrow_bundled_dependencies"
PKG_LIBS="$PKG_LIBS -larrow_bundled_dependencies"
PKG_DIRS="-L$BREWDIR/opt/$PKG_BREW_NAME/lib $PKG_DIRS"
PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include"
else
echo "*** Downloading ${PKG_BREW_NAME}"
@@ -114,7 +117,7 @@ else
if [ $? -ne 0 ]; then
echo "Failed to retrieve binary for ${PKG_BREW_NAME}"
fi
-      # autobrew sets `PKG_LIBS` and `PKG_CFLAGS`
+      # autobrew sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS`
fi
else
# Set some default values/backwards compatibility
Expand All @@ -135,8 +138,8 @@ else

LIB_DIR="libarrow/arrow-${VERSION}/lib"
if [ -d "$LIB_DIR" ]; then
-      # Enumerate the static libs and add to PKG_LIBS
-      # (technically repeating arrow libs so they're in the right order)
+      # Enumerate the static libs, put their -l flags in BUNDLED_LIBS,
+      # and put their -L location in PKG_DIRS
#
# If tools/linuxlibs.R fails to produce libs, this dir won't exist
# so don't try (the error message from `ls` would be misleading)
@@ -145,7 +148,7 @@ else
# TODO: what about non-bundled deps?
BUNDLED_LIBS=`cd $LIB_DIR && ls *.a`
BUNDLED_LIBS=`echo $BUNDLED_LIBS | sed -E "s/lib(.*)\.a/-l\1/" | sed -e "s/\\.a lib/ -l/g"`
PKG_LIBS="-L$(pwd)/$LIB_DIR $PKG_LIBS $BUNDLED_LIBS"
PKG_DIRS="-L$(pwd)/$LIB_DIR"
fi
fi
fi
@@ -182,15 +185,35 @@ if [ $? -eq 0 ] || [ "$UNAME" = "Darwin" ]; then
# Always build with arrow on macOS
PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_ARROW"
# Check for features
-  LIB_DIR=`echo $PKG_LIBS | sed -e 's/ -l.*//' | sed -e 's/^-L//'`
-  grep 'set(ARROW_S3 "ON")' $LIB_DIR/cmake/arrow/ArrowOptions.cmake >/dev/null 2>&1
+  LIB_DIR=`echo $PKG_DIRS | sed -e 's/^-L//'`
+  ARROW_OPTS_CMAKE="$LIB_DIR/cmake/arrow/ArrowOptions.cmake"
+  # Check for Parquet
+  grep 'set(ARROW_PARQUET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1
+  if [ $? -eq 0 ]; then
+    PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_PARQUET"
+    PKG_LIBS="-lparquet $PKG_LIBS"
+    # NOTE: parquet is assumed to have the same -L flag as arrow
+    # so there is no need to add its location to PKG_DIRS
+  fi
+  # Check for Arrow Dataset subcomponent
+  grep 'set(ARROW_DATASET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1
+  if [ $? -eq 0 ]; then
+    PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_DATASET"
+    PKG_LIBS="-larrow_dataset $PKG_LIBS"
+    # NOTE: arrow-dataset is assumed to have the same -L flag as arrow
+    # so there is no need to add its location to PKG_DIRS
+  fi
+  # Check for S3
+  grep 'set(ARROW_S3 "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1
if [ $? -eq 0 ]; then
PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_S3"
if [ "$BUNDLED_LIBS" != "" ]; then
# We're depending on openssl/curl from the system, so they're not in the bundled deps
PKG_LIBS="$PKG_LIBS -lssl -lcrypto -lcurl"
BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl"
fi
fi
+  # prepend PKG_DIRS and append BUNDLED_LIBS to PKG_LIBS
+  PKG_LIBS="$PKG_DIRS $PKG_LIBS $BUNDLED_LIBS"
echo "PKG_CFLAGS=$PKG_CFLAGS"
echo "PKG_LIBS=$PKG_LIBS"
else
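A hypothetical end-to-end use of these knobs from R, assuming a Linux source install that builds the bundled C++ library (the environment variable names are the ones read by this configure script and inst/build_arrow_static.sh; exact behavior may vary by version):

# Build a minimal arrow: no Dataset, no Parquet, minimal optional features
Sys.setenv(
  LIBARROW_MINIMAL = "true",   # default optional C++ features to off
  ARROW_DATASET = "OFF",       # skip the Arrow Dataset component
  ARROW_PARQUET = "OFF"        # skip Parquet support
)
install.packages("arrow", type = "source")

arrow::arrow_with_dataset()   # expected FALSE for such a build
arrow::arrow_with_parquet()   # expected FALSE for such a build
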
2 changes: 1 addition & 1 deletion r/configure.win
@@ -49,7 +49,7 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man

# NOTE: If you make changes to the libraries below, you should also change
# ci/scripts/r_windows_build.sh and ci/scripts/PKGBUILD
PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_DS_STATIC -DARROW_R_WITH_ARROW"
PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_DS_STATIC -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET"
PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lutf8proc -lre2 -lthrift -lsnappy -lz -lzstd -llz4 ${MIMALLOC_LIBS} ${OPENSSL_LIBS}"

# S3 support only for Rtools40 (i.e. R >= 4.0)
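Windows keeps all three features unconditionally enabled here because the prebuilt rwinlib libraries always include them; making them optional on Windows is deferred to ARROW-11884, per the commit message.
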
30 changes: 17 additions & 13 deletions r/data-raw/codegen.R
@@ -41,7 +41,7 @@
# #if defined(ARROW_R_WITH_FEATURE)
# and each feature is written to its own set of export files.

features <- c("arrow", "s3")
features <- c("arrow", "dataset", "parquet", "s3")

suppressPackageStartupMessages({
library(decor)
@@ -187,27 +187,31 @@ cpp_file_header <- '// Generated by using data-raw/codegen.R -> do not edit by h
#include "./arrow_types.h"
'

-arrow_exports_cpp <- glue::glue('
+arrow_exports_cpp <- paste0(
+  glue::glue('
{cpp_file_header}
{cpp_functions_definitions}
{cpp_classes_finalizers}
-{feature_available("arrow")}
-{feature_available("s3")}
-static const R_CallMethodDef CallEntries[] = {{
-\t\t{{ "_arrow_available", (DL_FUNC)& _arrow_available, 0 }},
-\t\t{{ "_s3_available", (DL_FUNC)& _s3_available, 0 }},
+\n'),
+  glue::glue_collapse(glue::glue('
+{feature_available({features})}
+'), sep = '\n'),
+  '
+static const R_CallMethodDef CallEntries[] = {
+',
+  glue::glue_collapse(glue::glue('
+\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},
+'), sep = '\n'),
+  glue::glue('\n
{cpp_functions_registration}
{classes_finalizers_registration}
\t\t{{NULL, NULL, 0}}
}};
-extern "C" void R_init_arrow(DllInfo* dll){{
+\n'),
+  'extern "C" void R_init_arrow(DllInfo* dll){
  R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
  R_useDynamicSymbols(dll, FALSE);
-}}
+}
+\n')

write_if_modified(arrow_exports_cpp, "src/arrowExports.cpp")
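To see what the template-driven registration expands to, the glue calls can be run on their own (a sketch using the same features vector as the script; output abbreviated):

library(glue)

features <- c("arrow", "dataset", "parquet", "s3")

# Expands to one CallEntries row per feature, as written to arrowExports.cpp
cat(glue_collapse(
  glue('\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},'),
  sep = "\n"
))
#> 		{ "_arrow_available", (DL_FUNC)& _arrow_available, 0 },
#> 		{ "_dataset_available", (DL_FUNC)& _dataset_available, 0 },
#> 		{ "_parquet_available", (DL_FUNC)& _parquet_available, 0 },
#> 		{ "_s3_available", (DL_FUNC)& _s3_available, 0 },
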
4 changes: 2 additions & 2 deletions r/inst/build_arrow_static.sh
@@ -50,13 +50,13 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \
-DARROW_BUILD_STATIC=ON \
-DARROW_COMPUTE=ON \
-DARROW_CSV=ON \
-  -DARROW_DATASET=ON \
+  -DARROW_DATASET=${ARROW_DATASET:-ON} \
-DARROW_DEPENDENCY_SOURCE=BUNDLED \
-DARROW_FILESYSTEM=ON \
-DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \
-DARROW_MIMALLOC=${ARROW_MIMALLOC:-$ARROW_DEFAULT_PARAM} \
-DARROW_JSON=ON \
-  -DARROW_PARQUET=ON \
+  -DARROW_PARQUET=${ARROW_PARQUET:-ON} \
-DARROW_S3=${ARROW_S3:-$ARROW_DEFAULT_PARAM} \
-DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-$ARROW_DEFAULT_PARAM} \
-DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-$ARROW_DEFAULT_PARAM} \
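The `${ARROW_DATASET:-ON}` form is plain shell parameter expansion: use the value of `ARROW_DATASET` if it is set and non-empty, otherwise fall back to `ON`, so existing builds keep both components unless a caller explicitly opts out.
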
2 changes: 1 addition & 1 deletion r/man/ParquetFileReader.Rd

(Generated file; diff not rendered.)

19 changes: 16 additions & 3 deletions r/man/arrow_available.Rd

(Generated file; diff not rendered.)

2 changes: 1 addition & 1 deletion r/man/hive_partition.Rd

(Generated file; diff not rendered.)

(Diffs for the remaining changed files were not loaded in this view.)
