Skip to content

Commit

Permalink
GH-41480: [Python] Building PyArrow: enable/disable python components…
Browse files Browse the repository at this point in the history
… by default based on availability in Arrow C++ (#41494)

### Rationale for this change

Currently, when building pyarrow from source, one needs to manually enable the optional components by setting `PYARROW_WITH_...` environment variables. However, we could also make a default choice of components based on which ones were enabled in the Arrow C++ build.

### What changes are included in this PR?

Set defaults for the various `PYARROW_BUILD_<component>` based on the `ARROW_<component>` setting. Keep the current `PYARROW_WITH_<component>` environment variables working so that users can still override this default.

### Are there any user-facing changes?

No
* GitHub Issue: #41480

Lead-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
jorisvandenbossche and kou committed May 16, 2024
1 parent 07a30d9 commit 1c546fb
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 127 deletions.
1 change: 0 additions & 1 deletion ci/appveyor-cpp-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC%
set PYARROW_WITH_PARQUET=ON
set PYARROW_WITH_PARQUET_ENCRYPTION=ON
set PYARROW_WITH_S3=%ARROW_S3%
set PYARROW_WITH_STATIC_BOOST=ON
set PYARROW_WITH_SUBSTRAIT=ON

set ARROW_HOME=%CONDA_PREFIX%\Library
Expand Down
115 changes: 86 additions & 29 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,25 +108,6 @@ if(UNIX)
endif()
endif()

# Top level cmake dir
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF)
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF)
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF)
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF)
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
option(PYARROW_BUILD_PARQUET_ENCRYPTION
"Build the PyArrow Parquet encryption integration" OFF)
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling Arrow")
endif()

find_program(CCACHE_FOUND ccache)
if(CCACHE_FOUND
AND NOT CMAKE_C_COMPILER_LAUNCHER
Expand Down Expand Up @@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}")

include(UseCython)

# PyArrow C++
# Arrow C++ and set default PyArrow build options
include(GNUInstallDirs)

find_package(Arrow REQUIRED)

# define_option(<name> <description> <arrow_option>)
#
# Declare the cache variable PYARROW_<name> (a string defaulting to "AUTO")
# and derive the boolean PYARROW_BUILD_<name> from it.
#
#   name          component suffix, e.g. ACERO
#   description   docstring stored with the PYARROW_<name> cache entry
#   arrow_option  name of the variable holding the Arrow C++ setting that is
#                 used as the fallback default
#
# Resolution order for PYARROW_BUILD_<name>:
#   1. PYARROW_<name> set to anything other than AUTO: its truth value.
#   2. The PYARROW_WITH_<name> environment variable, if it is defined.
#   3. The truth value of the variable named by <arrow_option>.
#
# This is a macro, so PYARROW_BUILD_<name> and the helper variable
# env_variable are set in the caller's scope.
macro(define_option name description arrow_option)
  # The description is quoted so that a ";" inside it cannot split the
  # docstring into several arguments and break the CACHE signature of set().
  set("PYARROW_${name}"
      "AUTO"
      CACHE STRING "${description}")

  if("${PYARROW_${name}}" STREQUAL "AUTO")
    # by default, first check if env variable exists, otherwise use Arrow C++ config
    set(env_variable "PYARROW_WITH_${name}")
    if(DEFINED ENV{${env_variable}})
      # Quoted so the environment value is evaluated as one string: a value
      # containing ";" no longer expands to several if() arguments, and
      # (with policy CMP0054 NEW) a value that happens to match a variable
      # name is not dereferenced.
      if("$ENV{${env_variable}}")
        set("PYARROW_BUILD_${name}" ON)
      else()
        set("PYARROW_BUILD_${name}" OFF)
      endif()
    else()
      # ${arrow_option} expands to a variable name, which if() dereferences.
      if(${arrow_option})
        set("PYARROW_BUILD_${name}" ON)
      else()
        set("PYARROW_BUILD_${name}" OFF)
      endif()
    endif()
  else()
    # Explicit user choice (e.g. -DPYARROW_<name>=ON) takes precedence.
    if("${PYARROW_${name}}")
      set("PYARROW_BUILD_${name}" ON)
    else()
      set("PYARROW_BUILD_${name}" OFF)
    endif()
  endif()
endmacro()

# Optional PyArrow components. Each define_option() call declares a
# PYARROW_<name> cache variable (default "AUTO") and sets PYARROW_BUILD_<name>.
# While the cache variable is left at AUTO, the PYARROW_WITH_<name> environment
# variable is consulted first, then the variable named by the third argument.
define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO)
define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA)
define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET)
define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT)
define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA)
define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC)
define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET)
# Parquet encryption falls back to PARQUET_REQUIRE_ENCRYPTION rather than an
# ARROW_-prefixed variable.
define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration"
PARQUET_REQUIRE_ENCRYPTION)
define_option(SUBSTRAIT "Build the PyArrow Substrait integration" ARROW_SUBSTRAIT)
define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE)
define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS)
define_option(S3 "Build the PyArrow S3 integration" ARROW_S3)
define_option(HDFS "Build the PyArrow HDFS integration" ARROW_HDFS)
# Plain boolean options: these default to OFF and are not derived from any
# Arrow C++ setting.
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
# Extra compiler flags as a cache string, empty by default.
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling PyArrow C++")

# Module dependencies: Substrait pulls in Dataset, and Dataset pulls in Acero,
# so enabling Substrait switches on both of the others.
if(PYARROW_BUILD_SUBSTRAIT)
  set(PYARROW_BUILD_DATASET ON)
  set(PYARROW_BUILD_ACERO ON)
elseif(PYARROW_BUILD_DATASET)
  set(PYARROW_BUILD_ACERO ON)
endif()

# PyArrow C++
set(PYARROW_CPP_ROOT_DIR pyarrow/src)
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python)
set(PYARROW_CPP_SRCS
Expand Down Expand Up @@ -305,6 +345,7 @@ set(PYARROW_CPP_LINK_LIBS "")

# Check all the options from Arrow and PyArrow C++ to be in line
if(PYARROW_BUILD_DATASET)
message(STATUS "Building PyArrow with Dataset")
if(NOT ARROW_DATASET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON")
endif()
Expand All @@ -317,6 +358,7 @@ if(PYARROW_BUILD_DATASET)
endif()

if(PYARROW_BUILD_ACERO)
message(STATUS "Building PyArrow with Acero")
if(NOT ARROW_ACERO)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON")
endif()
Expand All @@ -329,18 +371,13 @@ if(PYARROW_BUILD_ACERO)
endif()

# Parquet (also needed when only Parquet encryption is requested): Arrow C++
# must have been built with Parquet before the Parquet package is located.
if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION)
  message(STATUS "Building PyArrow with Parquet")
  if(ARROW_PARQUET)
    find_package(Parquet REQUIRED)
  else()
    message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON")
  endif()
endif()

if(PYARROW_BUILD_HDFS)
if(NOT ARROW_HDFS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
endif()
endif()

# Check for only Arrow C++ options
if(ARROW_CSV)
list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc)
Expand Down Expand Up @@ -400,6 +437,7 @@ endif()

set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc)
if(PYARROW_BUILD_FLIGHT)
message(STATUS "Building PyArrow with Flight")
if(NOT ARROW_FLIGHT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON")
endif()
Expand Down Expand Up @@ -555,23 +593,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE)
set(LINK_LIBS arrow_python)

# Azure filesystem: requires Arrow C++ built with Azure support; adds the
# _azurefs Cython extension.
if(PYARROW_BUILD_AZURE)
  message(STATUS "Building PyArrow with Azure")
  if(ARROW_AZURE)
    list(APPEND CYTHON_EXTENSIONS _azurefs)
  else()
    message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON")
  endif()
endif()

# GCS filesystem: requires Arrow C++ built with GCS support; adds the
# _gcsfs Cython extension.
if(PYARROW_BUILD_GCS)
  message(STATUS "Building PyArrow with GCS")
  if(ARROW_GCS)
    list(APPEND CYTHON_EXTENSIONS _gcsfs)
  else()
    message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON")
  endif()
endif()

# S3 filesystem: requires Arrow C++ built with S3 support; adds the
# _s3fs Cython extension.
if(PYARROW_BUILD_S3)
  message(STATUS "Building PyArrow with S3")
  if(ARROW_S3)
    list(APPEND CYTHON_EXTENSIONS _s3fs)
  else()
    message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON")
  endif()
endif()

# HDFS filesystem: requires Arrow C++ built with HDFS support; adds the
# _hdfs Cython extension.
if(PYARROW_BUILD_HDFS)
  message(STATUS "Building PyArrow with HDFS")
  if(ARROW_HDFS)
    list(APPEND CYTHON_EXTENSIONS _hdfs)
  else()
    message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
  endif()
endif()

if(PYARROW_BUILD_CUDA)
# Arrow CUDA
message(STATUS "Building PyArrow with CUDA")
if(NOT ARROW_CUDA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON")
endif()
Expand Down Expand Up @@ -646,8 +700,9 @@ if(PYARROW_BUILD_PARQUET)
endif()
endif()

# ORC
if(PYARROW_BUILD_ORC)
# ORC
message(STATUS "Building PyArrow with ORC")
if(NOT ARROW_ORC)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON")
endif()
Expand Down Expand Up @@ -679,6 +734,7 @@ endif()

# Substrait
if(PYARROW_BUILD_SUBSTRAIT)
message(STATUS "Building PyArrow with Substrait")
if(NOT ARROW_SUBSTRAIT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON")
endif()
Expand All @@ -696,6 +752,7 @@ endif()

# Gandiva
if(PYARROW_BUILD_GANDIVA)
message(STATUS "Building PyArrow with Gandiva")
if(NOT ARROW_GANDIVA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON")
endif()
Expand Down
134 changes: 37 additions & 97 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,48 +152,27 @@ def initialize_options(self):
if not hasattr(sys, 'gettotalrefcount'):
self.build_type = 'release'

self.with_azure = strtobool(
os.environ.get('PYARROW_WITH_AZURE', '0'))
self.with_gcs = strtobool(
os.environ.get('PYARROW_WITH_GCS', '0'))
self.with_s3 = strtobool(
os.environ.get('PYARROW_WITH_S3', '0'))
self.with_hdfs = strtobool(
os.environ.get('PYARROW_WITH_HDFS', '0'))
self.with_cuda = strtobool(
os.environ.get('PYARROW_WITH_CUDA', '0'))
self.with_substrait = strtobool(
os.environ.get('PYARROW_WITH_SUBSTRAIT', '0'))
self.with_flight = strtobool(
os.environ.get('PYARROW_WITH_FLIGHT', '0'))
self.with_acero = strtobool(
os.environ.get('PYARROW_WITH_ACERO', '0'))
self.with_dataset = strtobool(
os.environ.get('PYARROW_WITH_DATASET', '0'))
self.with_parquet = strtobool(
os.environ.get('PYARROW_WITH_PARQUET', '0'))
self.with_parquet_encryption = strtobool(
os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0'))
self.with_orc = strtobool(
os.environ.get('PYARROW_WITH_ORC', '0'))
self.with_gandiva = strtobool(
os.environ.get('PYARROW_WITH_GANDIVA', '0'))
self.with_azure = None
self.with_gcs = None
self.with_s3 = None
self.with_hdfs = None
self.with_cuda = None
self.with_substrait = None
self.with_flight = None
self.with_acero = None
self.with_dataset = None
self.with_parquet = None
self.with_parquet_encryption = None
self.with_orc = None
self.with_gandiva = None

self.generate_coverage = strtobool(
os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
self.bundle_arrow_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0'))
self.bundle_cython_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0'))

self.with_parquet_encryption = (self.with_parquet_encryption and
self.with_parquet)

# enforce module dependencies
if self.with_substrait:
self.with_dataset = True
if self.with_dataset:
self.with_acero = True

CYTHON_MODULE_NAMES = [
'lib',
'_fs',
Expand Down Expand Up @@ -270,23 +249,30 @@ def append_cmake_bool(value, varname):
cmake_options.append('-D{0}={1}'.format(
varname, 'on' if value else 'off'))

def append_cmake_component(flag, varname):
# only pass this to cmake if the user passes the --with-component
# flag to setup.py build_ext
if flag is not None:
append_cmake_bool(flag, varname)

if self.cmake_generator:
cmake_options += ['-G', self.cmake_generator]

append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA')
append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT')
append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT')
append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA')
append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO')
append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET')
append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC')
append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET')
append_cmake_bool(self.with_parquet_encryption,
'PYARROW_BUILD_PARQUET_ENCRYPTION')
append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS')
append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3')
append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS')
append_cmake_component(self.with_cuda, 'PYARROW_CUDA')
append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT')
append_cmake_component(self.with_flight, 'PYARROW_FLIGHT')
append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA')
append_cmake_component(self.with_acero, 'PYARROW_ACERO')
append_cmake_component(self.with_dataset, 'PYARROW_DATASET')
append_cmake_component(self.with_orc, 'PYARROW_ORC')
append_cmake_component(self.with_parquet, 'PYARROW_PARQUET')
append_cmake_component(self.with_parquet_encryption,
'PYARROW_PARQUET_ENCRYPTION')
append_cmake_component(self.with_azure, 'PYARROW_AZURE')
append_cmake_component(self.with_gcs, 'PYARROW_GCS')
append_cmake_component(self.with_s3, 'PYARROW_S3')
append_cmake_component(self.with_hdfs, 'PYARROW_HDFS')

append_cmake_bool(self.bundle_arrow_cpp,
'PYARROW_BUNDLE_ARROW_CPP')
append_cmake_bool(self.bundle_cython_cpp,
Expand Down Expand Up @@ -329,54 +315,8 @@ def append_cmake_bool(value, varname):
self._found_names = []
for name in self.CYTHON_MODULE_NAMES:
built_path = pjoin(install_prefix, name + ext_suffix)
if not os.path.exists(built_path):
print(f'Did not find {built_path}')
if self._failure_permitted(name):
print(f'Cython module {name} failure permitted')
continue
raise RuntimeError('PyArrow C-extension failed to build:',
os.path.abspath(built_path))

self._found_names.append(name)

def _failure_permitted(self, name):
if name == '_parquet' and not self.with_parquet:
return True
if name == '_parquet_encryption' and not self.with_parquet_encryption:
return True
if name == '_orc' and not self.with_orc:
return True
if name == '_flight' and not self.with_flight:
return True
if name == '_substrait' and not self.with_substrait:
return True
if name == '_azurefs' and not self.with_azure:
return True
if name == '_gcsfs' and not self.with_gcs:
return True
if name == '_s3fs' and not self.with_s3:
return True
if name == '_hdfs' and not self.with_hdfs:
return True
if name == '_dataset' and not self.with_dataset:
return True
if name == '_acero' and not self.with_acero:
return True
if name == '_exec_plan' and not self.with_acero:
return True
if name == '_dataset_orc' and not (
self.with_orc and self.with_dataset
):
return True
if name == '_dataset_parquet' and not (
self.with_parquet and self.with_dataset
):
return True
if name == '_cuda' and not self.with_cuda:
return True
if name == 'gandiva' and not self.with_gandiva:
return True
return False
if os.path.exists(built_path):
self._found_names.append(name)

def _get_build_dir(self):
# Get the package directory from build_py
Expand Down

0 comments on commit 1c546fb

Please sign in to comment.