diff --git a/.env b/.env
index 6746892fd4ed8..eb87dc62bdd8c 100644
--- a/.env
+++ b/.env
@@ -65,7 +65,7 @@ JDK=8
 KARTOTHEK=latest
 # LLVM 12 and GCC 11 reports -Wmismatched-new-delete.
 LLVM=14
-MAVEN=3.5.4
+MAVEN=3.6.3
 NODE=18
 NUMBA=latest
 NUMPY=latest
@@ -92,13 +92,13 @@ DEVTOOLSET_VERSION=
 # Used through docker-compose.yml and serves as the default version for the
 # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the
 # docker tags more readable.
-VCPKG="501db0f17ef6df184fcdbfbe0f87cde2313b6ab1" # 2023.04.15 Release
+VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release

 # This must be updated when we update
-# ci/docker/python-wheel-windows-vs2017.dockerfile.
+# ci/docker/python-wheel-windows-vs2019.dockerfile.
 # This is a workaround for our CI problem that "archery docker build" doesn't
 # use pulled built images in dev/tasks/python-wheels/github.windows.yml.
-PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2023-08-02
+PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-02-05

 # Use conanio/${CONAN} for "docker-compose run --rm conan". See
 # https://github.com/conan-io/conan-docker-tools#readme for available
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 41a075b1c0bcb..e7e544c2b0e62 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -30,15 +30,10 @@
 # /cpp/
 /cpp/src/arrow/acero @westonpace
 /cpp/src/arrow/adapters/orc @wgtmac
-/cpp/src/arrow/dataset @westonpace
 /cpp/src/arrow/engine @westonpace
 /cpp/src/arrow/flight/ @lidavidm
-/cpp/src/arrow/util/async* @westonpace
-/cpp/src/arrow/util/future* @westonpace
-/cpp/src/arrow/util/thread* @westonpace
 /cpp/src/parquet @wgtmac
-/cpp/src/skyhook @westonpace
-/csharp/ @westonpace
+/csharp/ @curthagenlocher
 /go/ @zeroshade
 /java/ @lidavidm
 /js/ @domoritz @trxcllnt
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index bd14f1b895bf6..e9409f1cd6248 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -57,37 +57,65 @@ env:
   DOCKER_VOLUME_PREFIX: ".docker/"

 jobs:
+  docker-targets:
+    name: Docker targets
+    runs-on: ubuntu-latest
+    outputs:
+      targets: ${{ steps.detect-targets.outputs.targets }}
+    steps:
+      - name: Detect targets
+        id: detect-targets
+        run: |
+          echo "targets<<JSON" >> "$GITHUB_OUTPUT"
+          echo "[" >> "$GITHUB_OUTPUT"
+          cat <<JSON >> "$GITHUB_OUTPUT"
+            {
+              "arch": "amd64",
+              "clang-tools": "14",
+              "image": "conda-cpp",
+              "llvm": "14",
+              "runs-on": "ubuntu-latest",
+              "simd-level": "AVX2",
+              "title": "AMD64 Conda C++ AVX2",
+              "ubuntu": "22.04"
+            },
+            {
+              "arch": "amd64",
+              "clang-tools": "14",
+              "image": "ubuntu-cpp-sanitizer",
+              "llvm": "14",
+              "runs-on": "ubuntu-latest",
+              "title": "AMD64 Ubuntu 22.04 C++ ASAN UBSAN",
+              "ubuntu": "22.04"
+            }
+          JSON
+          if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then
+            echo "," >> "$GITHUB_OUTPUT"
+            cat <<JSON >> "$GITHUB_OUTPUT"
+              {
+                "arch": "arm64v8",
+                "clang-tools": "10",
+                "image": "ubuntu-cpp",
+                "llvm": "10",
+                "runs-on": ["self-hosted", "arm", "linux"],
+                "title": "ARM64 Ubuntu 20.04 C++",
+                "ubuntu": "20.04"
+              }
+          JSON
+          fi
+          echo "]" >> "$GITHUB_OUTPUT"
+          echo "JSON" >> "$GITHUB_OUTPUT"
+
   docker:
     name: ${{ matrix.title }}
+    needs: docker-targets
     runs-on: ${{ matrix.runs-on }}
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     timeout-minutes: 75
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - arch: amd64
-            clang-tools: "14"
-            image: conda-cpp
-            llvm: "14"
-            runs-on: ubuntu-latest
-            simd-level: AVX2
-            title: AMD64 Conda C++ AVX2
-            ubuntu: "22.04"
-          - arch: amd64
-            clang-tools: "14"
-            image: ubuntu-cpp-sanitizer
-            llvm: "14"
-            runs-on: ubuntu-latest
-            title: AMD64 Ubuntu 22.04 C++ ASAN UBSAN
-            ubuntu: "22.04"
-          - arch: arm64v8
-            clang-tools: "10"
-            image: ubuntu-cpp
-            llvm: "10"
-            runs-on: ["self-hosted", "arm", "linux"]
-            title: ARM64 Ubuntu 20.04 C++
-            ubuntu: "20.04"
+        include: ${{ fromJson(needs.docker-targets.outputs.targets) }}
     env:
       ARCH: ${{ matrix.arch }}
       ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
@@ -284,10 +312,6 @@ jobs:
             /t REG_DWORD `
             /d 1 `
             /f
-      - name: Installed Packages
-        run: choco list
-      - name: Install Dependencies
-        run: choco install -y --no-progress openssl
       - name: Checkout Arrow
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index cd44e65e8811b..bbffab6704087 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -43,31 +43,62 @@ permissions:

 jobs:
+  docker-targets:
+    name: Docker targets
+    runs-on: ubuntu-latest
+    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+    outputs:
+      targets: ${{ steps.detect-targets.outputs.targets }}
+    steps:
+      - name: Detect targets
+        id: detect-targets
+        run: |
+          echo "targets<<JSON" >> "$GITHUB_OUTPUT"
+          echo "[" >> "$GITHUB_OUTPUT"
+          cat <<JSON >> "$GITHUB_OUTPUT"
+            {
+              "arch-label": "AMD64",
+              "arch": "amd64",
+              "go": "1.19",
+              "runs-on": "ubuntu-latest"
+            },
+            {
+              "arch-label": "AMD64",
+              "arch": "amd64",
+              "go": "1.20",
+              "runs-on": "ubuntu-latest"
+            }
+          JSON
+          if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then
+            echo "," >> "$GITHUB_OUTPUT"
+            cat <<JSON >> "$GITHUB_OUTPUT"
+              {
+                "arch-label": "ARM64",
+                "arch": "arm64v8",
+                "go": "1.19",
+                "runs-on": ["self-hosted", "arm", "linux"]
+              },
+              {
+                "arch-label": "ARM64",
+                "arch": "arm64v8",
+                "go": "1.20",
+                "runs-on": ["self-hosted", "arm", "linux"]
+              }
+          JSON
+          fi
+          echo "]" >> "$GITHUB_OUTPUT"
+          echo "JSON" >> "$GITHUB_OUTPUT"
+
   docker:
     name: ${{ matrix.arch-label }} Debian 11 Go ${{ matrix.go }}
+    needs: docker-targets
     runs-on: ${{ matrix.runs-on }}
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     timeout-minutes: 60
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - arch-label: AMD64
-            arch: amd64
-            go: 1.19
-            runs-on: ubuntu-latest
-          - arch-label: AMD64
-            arch: amd64
-            go: '1.20'
-            runs-on: ubuntu-latest
-          - arch-label: ARM64
-            arch: arm64v8
-            go: 1.19
-            runs-on: ["self-hosted", "arm", "linux"]
-          - arch-label: ARM64
-            arch: arm64v8
-            go: '1.20'
-            runs-on: ["self-hosted", "arm", "linux"]
+        include: ${{ fromJson(needs.docker-targets.outputs.targets) }}
     env:
       ARCH: ${{ matrix.arch }}
       GO: ${{ matrix.go }}
diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml
index 512ff2bb929b3..eceeb551a0653 100644
--- a/.github/workflows/matlab.yml
+++ b/.github/workflows/matlab.yml
@@ -52,7 +52,7 @@ jobs:
       - name: Install ninja-build
         run: sudo apt-get install ninja-build
       - name: Install MATLAB
-        uses: matlab-actions/setup-matlab@v1
+        uses: matlab-actions/setup-matlab@v2
         with:
           release: R2023a
       - name: Install ccache
@@ -85,7 +85,7 @@ jobs:
           # Add the installation directory to the MATLAB Search Path by
           # setting the MATLABPATH environment variable.
MATLABPATH: matlab/install/arrow_matlab - uses: matlab-actions/run-tests@v1 + uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test macos: @@ -100,7 +100,7 @@ jobs: - name: Install ninja-build run: brew install ninja - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2023a - name: Install ccache @@ -125,7 +125,7 @@ jobs: # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. MATLABPATH: matlab/install/arrow_matlab - uses: matlab-actions/run-tests@v1 + uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test windows: @@ -138,7 +138,7 @@ jobs: with: fetch-depth: 0 - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2023a - name: Download Timezone Database @@ -171,6 +171,6 @@ jobs: # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. MATLABPATH: matlab/install/arrow_matlab - uses: matlab-actions/run-tests@v1 + uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 2a801b6040ec8..8c47915b7b6d3 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -21,24 +21,26 @@ on: push: paths: - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - "ci/docker/**" + - "ci/etc/rprofile" + - "ci/scripts/PKGBUILD" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/install_minio.sh" + - "ci/scripts/r_*.sh" - "cpp/**" - - 'docker-compose.yml' + - "docker-compose.yml" - "r/**" pull_request: paths: - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - "ci/docker/**" + - "ci/etc/rprofile" + - "ci/scripts/PKGBUILD" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/install_minio.sh" + - "ci/scripts/r_*.sh" - "cpp/**" - - 'docker-compose.yml' + - "docker-compose.yml" - "r/**" concurrency: @@ -52,6 +54,63 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: + ubuntu-minimum-cpp-version: + name: Check minimum supported Arrow C++ Version (${{ matrix.cpp_version }}) + runs-on: ubuntu-latest + strategy: + matrix: + include: + - cpp_version: "13.0.0" + steps: + - name: Checkout Arrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + path: src + submodules: recursive + + - name: Install Arrow C++ (${{ matrix.cpp_version }}) + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + # We have to list all packages to avoid version conflicts. 
+ sudo apt install -y -V libarrow-dev=${{ matrix.cpp_version }}-1 \ + libarrow-acero-dev=${{ matrix.cpp_version }}-1 \ + libparquet-dev=${{ matrix.cpp_version }}-1 \ + libarrow-dataset-dev=${{ matrix.cpp_version }}-1 + + - name: Install checkbashisms + run: | + sudo apt-get install devscripts + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + working-directory: src/r + + - uses: r-lib/actions/check-r-package@v2 + with: + working-directory: src/r + env: + LIBARROW_BINARY: "false" + LIBARROW_BUILD: "false" + ARROW_R_VERBOSE_TEST: "true" + ARROW_R_ALLOW_CPP_VERSION_MISMATCH: "true" + + - name: Show install output + if: always() + run: find src/r/check -name '00install.out*' -exec cat '{}' \; || true + shell: bash + + ubuntu: name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }} runs-on: ubuntu-latest @@ -256,6 +315,16 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + # This must be done before r-lib/actions/setup-r because curl in + # Rtools doesn't work on non Rtools' MSYS2 environment. If we + # use "shell: bash" after r-lib/actions/setup-r, bash in Rtools + # is used on non Rtools' MSYS2 environment. + - name: Install MinIO + shell: bash + run: | + mkdir -p "$HOME/.local/bin" + ci/scripts/install_minio.sh latest "$HOME/.local" + echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts uses: actions/download-artifact@v3 @@ -282,15 +351,6 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck - - name: Install MinIO - shell: bash - run: | - mkdir -p "$HOME/.local/bin" - curl \ - --output "$HOME/.local/bin/minio.exe" \ - https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z - chmod +x "$HOME/.local/bin/minio.exe" - echo "$HOME/.local/bin" >> $GITHUB_PATH # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows # - name: Install Google Cloud Storage Testbench # shell: bash diff --git a/c_glib/README.md b/c_glib/README.md index 2a4d6b8a6628c..24e69eff65055 100644 --- a/c_glib/README.md +++ b/c_glib/README.md @@ -101,7 +101,7 @@ $ sudo meson install -C c_glib.build You need to install Arrow C++ before you install Arrow GLib. See Arrow C++ document about how to install Arrow C++. -You need [GTK-Doc](https://www.gtk.org/gtk-doc/) and +You need [GTK-Doc](https://gitlab.gnome.org/GNOME/gtk-doc) and [GObject Introspection](https://wiki.gnome.org/Projects/GObjectIntrospection) to build Arrow GLib. You can install them by the followings: diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 0de9466eee456..98b2c92104507 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1212,7 +1212,8 @@ garrow_timestamp_data_type_class_init(GArrowTimestampDataTypeClass *klass) /** * garrow_timestamp_data_type_new: * @unit: The unit of the timestamp data. - * @time_zone: (nullable): The time zone of the timestamp data. + * @time_zone: (nullable): The time zone of the timestamp data. If based GLib + * is less than 2.58, this is ignored. 
* * Returns: A newly created the number of * seconds/milliseconds/microseconds/nanoseconds since UNIX epoch in @@ -1226,9 +1227,11 @@ garrow_timestamp_data_type_new(GArrowTimeUnit unit, { auto arrow_unit = garrow_time_unit_to_raw(unit); std::string arrow_timezone; +#if GLIB_CHECK_VERSION(2, 58, 0) if (time_zone) { arrow_timezone = g_time_zone_get_identifier(time_zone); } +#endif auto arrow_data_type = arrow::timestamp(arrow_unit, arrow_timezone); auto data_type = GARROW_TIMESTAMP_DATA_TYPE(g_object_new(GARROW_TYPE_TIMESTAMP_DATA_TYPE, diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 5e561a0461ea3..ab85032fe9924 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -26,7 +26,10 @@ git submodule update --init || exit /B set ARROW_TEST_DATA=%CD%\testing\data set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data -set ARROW_DEBUG_MEMORY_POOL=trap +@rem Enable memory debug checks if the env is not set already +IF "%ARROW_DEBUG_MEMORY_POOL%"=="" ( + set ARROW_DEBUG_MEMORY_POOL=trap +) set CMAKE_BUILD_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% set CTEST_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 5fdd21d2bd1f9..59e2def1bf339 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -23,9 +23,8 @@ cloudpickle fsspec hypothesis numpy>=1.16.6 -pytest<8 # pytest-lazy-fixture broken on pytest 8.0.0 +pytest<8 pytest-faulthandler -pytest-lazy-fixture s3fs>=2023.10.0 setuptools setuptools_scm<8.0.0 diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index d0f494d2e085d..0e50875fc1ef8 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme=0.14.1 +pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index c51600a1e5920..3d102796b8c00 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -60,7 +60,7 @@ RUN apt-get update -y && \ ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 -ARG maven=3.5.4 +ARG maven=3.6.3 COPY ci/scripts/util_download_apache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/util_download_apache.sh \ "maven/maven-3/${maven}/binaries/apache-maven-${maven}-bin.tar.gz" /opt diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 0a50d450c225a..b1d9ed5ab88d9 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -28,7 +28,7 @@ ENV MANYLINUX_VERSION=${manylinux} RUN yum install -y dnf # Install basic dependencies -RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget kernel-headers +RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget # A system Python is required for ninja and vcpkg in this Dockerfile. 
# On manylinux2014 base images, system Python is 2.7.5, while @@ -62,15 +62,16 @@ COPY ci/vcpkg/*.patch \ COPY ci/scripts/install_vcpkg.sh \ arrow/ci/scripts/ ENV VCPKG_ROOT=/opt/vcpkg -RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} -ENV PATH="${PATH}:${VCPKG_ROOT}" - ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_FORCE_SYSTEM_BINARIES=1 \ VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg \ VCPKG_DEFAULT_TRIPLET=${arch_short}-linux-static-${build_type} \ VCPKG_FEATURE_FLAGS="manifests" + +RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} +ENV PATH="${PATH}:${VCPKG_ROOT}" + COPY ci/vcpkg/vcpkg.json arrow/ci/vcpkg/ # cannot use the S3 feature here because while aws-sdk-cpp=1.9.160 contains # ssl related fixes as well as we can patch the vcpkg portfile to support @@ -81,6 +82,7 @@ RUN vcpkg install \ --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=/arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ @@ -97,4 +99,5 @@ SHELL ["/bin/bash", "-i", "-c"] ENTRYPOINT ["/bin/bash", "-i", "-c"] COPY python/requirements-wheel-build.txt /arrow/python/ -RUN pip install -r /arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN pip install -r /arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" diff --git a/ci/docker/python-wheel-windows-test-vs2017.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile similarity index 96% rename from ci/docker/python-wheel-windows-test-vs2017.dockerfile rename to ci/docker/python-wheel-windows-test-vs2019.dockerfile index e842ede18454b..67d99fa9c5724 100644 --- a/ci/docker/python-wheel-windows-test-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -19,8 +19,8 @@ # when you update this file. # based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2017 preinstalled -FROM abrarov/msvc-2017:2.11.0 +# contains choco and vs2019 preinstalled +FROM abrarov/msvc-2019:2.11.0 # Add unix tools to path RUN setx path "%path%;C:\Program Files\Git\usr\bin" diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile similarity index 94% rename from ci/docker/python-wheel-windows-vs2017.dockerfile rename to ci/docker/python-wheel-windows-vs2019.dockerfile index faf07800c956a..b8e8aad952b1c 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -19,8 +19,8 @@ # when you update this file. 
# based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2017 preinstalled -FROM abrarov/msvc-2017:2.11.0 +# contains choco and vs2019 preinstalled +FROM abrarov/msvc-2019:2.11.0 # Install CMake and Ninja ARG cmake=3.21.4 @@ -88,7 +88,8 @@ RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ -RUN python -m pip install -r arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN python -m pip install -r arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" # ENV CLCACHE_DIR="C:\clcache" # ENV CLCACHE_COMPRESS=1 diff --git a/ci/scripts/c_glib_test.sh b/ci/scripts/c_glib_test.sh index cea600191ae05..f8083c7759d8a 100755 --- a/ci/scripts/c_glib_test.sh +++ b/ci/scripts/c_glib_test.sh @@ -28,8 +28,10 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi pushd ${source_dir} diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 69d86e871ac5f..60cab1a9feaba 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -54,6 +54,7 @@ if [ "${GITHUB_ACTIONS:-false}" = "true" ]; then fi if [ "${ARROW_ENABLE_THREADING:-ON}" = "OFF" ]; then + ARROW_AZURE=OFF ARROW_FLIGHT=OFF ARROW_FLIGHT_SQL=OFF ARROW_GCS=OFF diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 0c6e1c6ef7057..1d685c51a9326 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -37,8 +37,10 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib}:${LD_LIBRARY_P # to retrieve metadata. Disable this so that S3FileSystem tests run faster. export AWS_EC2_METADATA_DISABLED=TRUE -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi ctest_options=() case "$(uname)" in diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index 8d712a88a6ab1..478c1d5997906 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -35,4 +35,5 @@ else fi # additional dependencies needed for dask's s3 tests -pip install moto[server] flask requests +# Moto 5 results in timeouts in s3 tests: https://github.com/dask/dask/issues/10869 +pip install "moto[server]<5" flask requests diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 6ea8e1a095c39..e493a183b4543 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -17,7 +17,15 @@ # specific language governing permissions and limitations # under the License. 
-set -e
+set -eu
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <version> <prefix>"
+  exit 1
+fi
+
+version=$1
+prefix=$2

 declare -A archs
 archs=([x86_64]=amd64
@@ -25,45 +33,60 @@ archs=([x86_64]=amd64
        [aarch64]=arm64
        [s390x]=s390x)

-declare -A platforms
-platforms=([Linux]=linux
-           [Darwin]=darwin)
-
 arch=$(uname -m)
-platform=$(uname)
-version=$1
-prefix=$2
-
-if [ "$#" -ne 2 ]; then
-  echo "Usage: $0 <version> <prefix>"
-  exit 1
-elif [ -z ${archs[$arch]} ]; then
+if [ -z ${archs[$arch]} ]; then
   echo "Unsupported architecture: ${arch}"
   exit 0
-elif [ -z ${platforms[$platform]} ]; then
-  echo "Unsupported platform: ${platform}"
-  exit 0
-elif [ "${version}" != "latest" ]; then
+fi
+arch=${archs[$arch]}
+
+platform=$(uname)
+case ${platform} in
+  Linux)
+    platform=linux
+    ;;
+  Darwin)
+    platform=darwin
+    ;;
+  MSYS_NT*|MINGW64_NT*)
+    platform=windows
+    ;;
+  *)
+    echo "Unsupported platform: ${platform}"
+    exit 0
+    ;;
+esac
+
+if [ "${version}" != "latest" ]; then
   echo "Cannot fetch specific versions of minio, only latest is supported."
   exit 1
 fi

-arch=${archs[$arch]}
-platform=${platforms[$platform]}
-
 # Use specific versions for minio server and client to avoid CI failures on new releases.
 minio_version="minio.RELEASE.2022-05-26T05-48-41Z"
 mc_version="mc.RELEASE.2022-05-09T04-08-26Z"

+download()
+{
+  local output=$1
+  local url=$2
+
+  if type wget > /dev/null 2>&1; then
+    wget -nv --output-document ${output} ${url}
+  else
+    curl --fail --location --output ${output} ${url}
+  fi
+}
+
 if [[ ! -x ${prefix}/bin/minio ]]; then
   url="https://dl.min.io/server/minio/release/${platform}-${arch}/archive/${minio_version}"
   echo "Fetching ${url}..."
-  wget -nv --output-document ${prefix}/bin/minio ${url}
+  download ${prefix}/bin/minio ${url}
   chmod +x ${prefix}/bin/minio
 fi

 if [[ ! -x ${prefix}/bin/mc ]]; then
   url="https://dl.min.io/client/mc/release/${platform}-${arch}/archive/${mc_version}"
   echo "Fetching ${url}..."
-  wget -nv --output-document ${prefix}/bin/mc ${url}
+  download ${prefix}/bin/mc ${url}
   chmod +x ${prefix}/bin/mc
 fi
diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh
index c0a27e6e705e9..9bdcc4d687584 100755
--- a/ci/scripts/python_build.sh
+++ b/ci/scripts/python_build.sh
@@ -55,6 +55,7 @@ export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja}
 export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug}

 export PYARROW_WITH_ACERO=${ARROW_ACERO:-OFF}
+export PYARROW_WITH_AZURE=${ARROW_AZURE:-OFF}
 export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF}
 export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON}
 export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF}
diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh
index 341c2dd0577ef..20ca3300c0538 100755
--- a/ci/scripts/python_test.sh
+++ b/ci/scripts/python_test.sh
@@ -32,11 +32,14 @@ export ARROW_GDB_SCRIPT=${arrow_dir}/cpp/gdb_arrow.py
 # Enable some checks inside Python itself
 export PYTHONDEVMODE=1

-# Enable memory debug checks.
-export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi # By default, force-test all optional components : ${PYARROW_TEST_ACERO:=${ARROW_ACERO:-ON}} +: ${PYARROW_TEST_AZURE:=${ARROW_AZURE:-ON}} : ${PYARROW_TEST_CUDA:=${ARROW_CUDA:-ON}} : ${PYARROW_TEST_DATASET:=${ARROW_DATASET:-ON}} : ${PYARROW_TEST_FLIGHT:=${ARROW_FLIGHT:-ON}} diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index fd845c512dcdb..bea5409100770 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -50,16 +50,20 @@ echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release pip install \ --upgrade \ --only-binary=:all: \ --target $PIP_SITE_PACKAGES \ --platform $PIP_TARGET_PLATFORM \ - -r ${source_dir}/python/requirements-wheel-build.txt + -r ${source_dir}/python/requirements-wheel-build.txt \ + --pre \ + --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" pip install "delocate>=0.10.3" echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_ACERO:=ON} +: ${ARROW_AZURE:=ON} : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} @@ -92,6 +96,7 @@ pushd ${build_dir}/build cmake \ -DARROW_ACERO=${ARROW_ACERO} \ + -DARROW_AZURE=${ARROW_AZURE} \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ @@ -145,6 +150,7 @@ export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} +export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 58e42fea88088..4d4d4fb694e0b 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -49,6 +49,7 @@ rm -rf /arrow/python/pyarrow/*.so.* echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_ACERO:=ON} +: ${ARROW_AZURE:=ON} : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} @@ -87,6 +88,7 @@ pushd /tmp/arrow-build cmake \ -DARROW_ACERO=${ARROW_ACERO} \ + -DARROW_AZURE=${ARROW_AZURE} \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ @@ -141,6 +143,7 @@ export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} +export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 01250ff7ef40c..a25e5c51bddbc 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -28,15 +28,17 @@ fi source_dir=${1} +: ${ARROW_AZURE:=ON} : ${ARROW_FLIGHT:=ON} -: ${ARROW_SUBSTRAIT:=ON} -: ${ARROW_S3:=ON} : ${ARROW_GCS:=ON} +: ${ARROW_S3:=ON} +: ${ARROW_SUBSTRAIT:=ON} : 
${CHECK_IMPORTS:=ON} : ${CHECK_UNITTESTS:=ON} : ${INSTALL_PYARROW:=ON} export PYARROW_TEST_ACERO=ON +export PYARROW_TEST_AZURE=${ARROW_AZURE} export PYARROW_TEST_CYTHON=OFF export PYARROW_TEST_DATASET=ON export PYARROW_TEST_FLIGHT=${ARROW_FLIGHT} diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index ffb43b3481e55..73b0192d9bc97 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -19,7 +19,7 @@ echo "Building windows wheel..." -call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" +call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" echo "=== (%PYTHON_VERSION%) Clear output directories and leftovers ===" del /s /q C:\arrow-build @@ -50,7 +50,8 @@ set ARROW_WITH_SNAPPY=ON set ARROW_WITH_ZLIB=ON set ARROW_WITH_ZSTD=ON set CMAKE_UNITY_BUILD=ON -set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +set CMAKE_GENERATOR=Visual Studio 16 2019 +set CMAKE_PLATFORM=x64 set VCPKG_ROOT=C:\vcpkg set VCPKG_FEATURE_FLAGS=-manifests set VCGPK_TARGET_TRIPLET=amd64-windows-static-md-%CMAKE_BUILD_TYPE% @@ -96,6 +97,7 @@ cmake ^ -DVCPKG_MANIFEST_MODE=OFF ^ -DVCPKG_TARGET_TRIPLET=%VCGPK_TARGET_TRIPLET% ^ -G "%CMAKE_GENERATOR%" ^ + -A "%CMAKE_PLATFORM%" ^ C:\arrow\cpp || exit /B 1 cmake --build . --config %CMAKE_BUILD_TYPE% --target install || exit /B 1 popd @@ -121,6 +123,6 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python @REM bundle the msvc runtime -cp "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Redist\MSVC\14.16.27012\x64\Microsoft.VC141.CRT\msvcp140.dll" pyarrow\ +cp "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Redist\MSVC\14.28.29325\x64\Microsoft.VC142.CRT\msvcp140.dll" pyarrow\ python setup.py bdist_wheel || exit /B 1 popd diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 22ec551edb9fa..72078ab3c06c2 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -72,8 +72,10 @@ export _R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_=TRUE # to retrieve metadata. Disable this so that S3FileSystem tests run faster. export AWS_EC2_METADATA_DISABLED=TRUE -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi # Hack so that texlive2020 doesn't pollute the home dir export TEXMFCONFIG=/tmp/texmf-config diff --git a/ci/scripts/ruby_test.sh b/ci/scripts/ruby_test.sh index 4fd6a85fe3966..56c33a4d6378a 100755 --- a/ci/scripts/ruby_test.sh +++ b/ci/scripts/ruby_test.sh @@ -26,7 +26,9 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 -# Enable memory debug checks. 
-export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi rake -f ${source_dir}/Rakefile BUILD_DIR=${build_dir} USE_BUNDLER=yes diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 68f6cae5addc9..0d4fb540a2003 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,13 +1,14 @@ diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index 5a14562..924b1b7 100644 +index bdc544e9e..53f6bbc3b 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -87,8 +87,11 @@ vcpkg_cmake_configure( +@@ -74,9 +74,12 @@ vcpkg_cmake_configure( -DENABLE_MANUAL=OFF -DCURL_CA_FALLBACK=ON -DCURL_USE_LIBPSL=OFF + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none + -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON OPTIONS_DEBUG -DENABLE_DEBUG=ON + ${EXTRA_ARGS_DEBUG} @@ -15,29 +16,29 @@ index 5a14562..924b1b7 100644 vcpkg_cmake_install() vcpkg_copy_pdbs() diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 8f3f3f9..745b0fb 100644 +index 0c7098082..c603c3653 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake -@@ -9,6 +9,7 @@ vcpkg_from_github( - HEAD_REF master +@@ -10,6 +10,7 @@ vcpkg_from_github( PATCHES fix_clang-cl_build.patch + no-werror.patch + "snappy-disable-bmi.patch" ) vcpkg_cmake_configure( diff --git a/ports/snappy/snappy-disable-bmi.patch b/ports/snappy/snappy-disable-bmi.patch new file mode 100644 -index 0000000..a57ce0c +index 000000000..e839c93a4 --- /dev/null +++ b/ports/snappy/snappy-disable-bmi.patch @@ -0,0 +1,19 @@ +diff --git a/snappy.cc b/snappy.cc -+index 79dc0e8..f3153ea 100644 ++index d414718..7b49d2a 100644 +--- a/snappy.cc ++++ b/snappy.cc -+@@ -965,14 +965,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, -+ static inline uint32_t ExtractLowBytes(uint32_t v, int n) { ++@@ -1014,14 +1014,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, ++ static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) { + assert(n >= 0); + assert(n <= 4); +-#if SNAPPY_HAVE_BMI2 @@ -52,13 +53,13 @@ index 0000000..a57ce0c + + static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { diff --git a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake -index 4d7e26a..1f054a2 100644 +index bf9397b66..c3112b673 100644 --- a/ports/llvm/portfile.cmake +++ b/ports/llvm/portfile.cmake -@@ -274,6 +274,8 @@ vcpkg_cmake_configure( +@@ -293,6 +293,8 @@ vcpkg_cmake_configure( + ${FEATURE_OPTIONS} + MAYBE_UNUSED_VARIABLES COMPILER_RT_ENABLE_IOS - OPENMP_TOOLS_INSTALL_DIR - MLIR_TOOLS_INSTALL_DIR + BOLT_TOOLS_INSTALL_DIR + LIBOMP_INSTALL_ALIASES ) diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index 71c23165e61f0..e86479a7c32fc 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -81,8 +81,11 @@ "default-features": false, "features": [ "clang", - "default-options", "default-targets", + "enable-bindings", + "enable-terminfo", + "enable-zlib", + "enable-zstd", "enable-rtti", "lld", "tools" @@ -102,6 +105,16 @@ } ] }, + "azure": { + "description": "Azure blob storage support", + "dependencies": [ + "azure-core-cpp", + "azure-identity-cpp", + "azure-storage-blobs-cpp", + "azure-storage-common-cpp", + "azure-storage-files-datalake-cpp" + ] + }, "orc": { "description": "ORC support", "dependencies": [ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 016cd8a1b9ec8..50a85b33d5489 100644 --- a/cpp/CMakeLists.txt +++ 
b/cpp/CMakeLists.txt
@@ -152,6 +152,7 @@ set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}")
 set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support")

 set(ARROW_LLVM_VERSIONS
+    "18.1"
     "17.0"
     "16.0"
     "15.0"
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 6bb9c0f6af2ca..b16ee07756013 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -2594,16 +2594,11 @@ macro(build_re2)
 endmacro()

 if(ARROW_WITH_RE2)
-  # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may
-  # include -std=c++11. It's not compatible with C source and C++
-  # source not uses C++ 11.
-  resolve_dependency(re2 HAVE_ALT TRUE)
-  if(${re2_SOURCE} STREQUAL "SYSTEM" AND ARROW_BUILD_STATIC)
-    get_target_property(RE2_TYPE re2::re2 TYPE)
-    if(NOT RE2_TYPE STREQUAL "INTERFACE_LIBRARY")
-      string(APPEND ARROW_PC_LIBS_PRIVATE " $<TARGET_FILE:re2::re2>")
-    endif()
-  endif()
+  resolve_dependency(re2
+                     HAVE_ALT
+                     TRUE
+                     PC_PACKAGE_NAMES
+                     re2)
   add_definitions(-DARROW_WITH_RE2)
 endif()

@@ -2634,7 +2629,7 @@ macro(build_bzip2)
                       BUILD_IN_SOURCE 1
                       BUILD_COMMAND ${MAKE} libbz2.a ${MAKE_BUILD_ARGS}
                                     ${BZIP2_EXTRA_ARGS}
-                      INSTALL_COMMAND ${MAKE} install PREFIX=${BZIP2_PREFIX}
+                      INSTALL_COMMAND ${MAKE} install -j1 PREFIX=${BZIP2_PREFIX}
                                       ${BZIP2_EXTRA_ARGS}
                       INSTALL_DIR ${BZIP2_PREFIX}
                       URL ${ARROW_BZIP2_SOURCE_URL}
diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc
index b483ec420cc3c..6927f51283eb7 100644
--- a/cpp/src/arrow/array/array_base.cc
+++ b/cpp/src/arrow/array/array_base.cc
@@ -307,6 +307,18 @@ Result<std::shared_ptr<Array>> Array::View(
   return MakeArray(result);
 }

+Result<std::shared_ptr<Array>> Array::CopyTo(
+    const std::shared_ptr<MemoryManager>& to) const {
+  ARROW_ASSIGN_OR_RAISE(auto copied_data, data()->CopyTo(to));
+  return MakeArray(copied_data);
+}
+
+Result<std::shared_ptr<Array>> Array::ViewOrCopyTo(
+    const std::shared_ptr<MemoryManager>& to) const {
+  ARROW_ASSIGN_OR_RAISE(auto new_data, data()->ViewOrCopyTo(to));
+  return MakeArray(new_data);
+}
+
 // ----------------------------------------------------------------------
 // NullArray
diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h
index 7e857bf20568e..6411aebf80442 100644
--- a/cpp/src/arrow/array/array_base.h
+++ b/cpp/src/arrow/array/array_base.h
@@ -165,6 +165,22 @@ class ARROW_EXPORT Array {
   /// An error is returned if the types are not layout-compatible.
   Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;

+  /// \brief Construct a copy of the array with all buffers on destination
+  /// Memory Manager
+  ///
+  /// This method recursively copies the array's buffers and those of its children
+  /// onto the destination MemoryManager device and returns the new Array.
+  Result<std::shared_ptr<Array>> CopyTo(const std::shared_ptr<MemoryManager>& to) const;
+
+  /// \brief Construct a new array attempting to zero-copy view if possible.
+  ///
+  /// Like CopyTo this method recursively goes through all of the array's buffers
+  /// and those of its children and first attempts to create zero-copy
+  /// views on the destination MemoryManager device. If it can't, it falls back
+  /// to performing a copy. See Buffer::ViewOrCopy.
+ Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + /// Construct a zero-copy slice of the array with the indicated offset and /// length /// diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index e9d478f108584..21ac1a09f56e7 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -905,6 +905,29 @@ TEST_F(TestArray, TestAppendArraySlice) { } } +// GH-39976: Test out-of-line data size calculation in +// BinaryViewBuilder::AppendArraySlice. +TEST_F(TestArray, TestBinaryViewAppendArraySlice) { + BinaryViewBuilder src_builder(pool_); + ASSERT_OK(src_builder.AppendNull()); + ASSERT_OK(src_builder.Append("long string; not inlined")); + ASSERT_EQ(2, src_builder.length()); + ASSERT_OK_AND_ASSIGN(auto src, src_builder.Finish()); + ASSERT_OK(src->ValidateFull()); + + ArraySpan span; + span.SetMembers(*src->data()); + BinaryViewBuilder dst_builder(pool_); + ASSERT_OK(dst_builder.AppendArraySlice(span, 0, 1)); + ASSERT_EQ(1, dst_builder.length()); + ASSERT_OK(dst_builder.AppendArraySlice(span, 1, 1)); + ASSERT_EQ(2, dst_builder.length()); + ASSERT_OK_AND_ASSIGN(auto dst, dst_builder.Finish()); + ASSERT_OK(dst->ValidateFull()); + + AssertArraysEqual(*src, *dst); +} + TEST_F(TestArray, ValidateBuffersPrimitive) { auto empty_buffer = std::make_shared(""); auto null_buffer = Buffer::FromString("\xff"); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index f85852fa0eda6..7e5721917f3a0 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -54,7 +54,7 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse int64_t out_of_line_total = 0, i = 0; VisitNullBitmapInline( - array.buffers[0].data, array.offset, array.length, array.null_count, + array.buffers[0].data, array.offset + offset, length, array.null_count, [&] { if (!values[i].is_inline()) { out_of_line_total += static_cast(values[i].size()); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 8065752f3e278..429aa5c0488cd 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -515,10 +515,9 @@ class ARROW_EXPORT LargeListViewBuilder final /// \class MapBuilder /// \brief Builder class for arrays of variable-size maps /// -/// To use this class, you must append values to the key and item array builders -/// and use the Append function to delimit each distinct map (once the keys and items -/// have been appended) or use the bulk API to append a sequence of offsets and null -/// maps. +/// To use this class, you must use the Append function to delimit each distinct +/// map before appending values to the key and item array builders, or use the +/// bulk API to append a sequence of offsets and null maps. /// /// Key uniqueness and ordering are not validated. 
class ARROW_EXPORT MapBuilder : public ArrayBuilder { diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 8454ac8f1d5fb..80c411dfa6a6d 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -27,6 +27,7 @@ #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/device.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -36,6 +37,7 @@ #include "arrow/util/dict_util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" #include "arrow/util/ree_util.h" #include "arrow/util/slice_util_internal.h" #include "arrow/util/union_util.h" @@ -140,6 +142,43 @@ std::shared_ptr ArrayData::Make(std::shared_ptr type, int64 return std::make_shared(std::move(type), length, null_count, offset); } +namespace { +template +Result> CopyToImpl(const ArrayData& data, + const std::shared_ptr& to, + Fn&& copy_fn) { + auto output = ArrayData::Make(data.type, data.length, data.null_count, data.offset); + output->buffers.resize(data.buffers.size()); + for (auto&& [buf, out_buf] : internal::Zip(data.buffers, output->buffers)) { + if (buf) { + ARROW_ASSIGN_OR_RAISE(out_buf, copy_fn(buf, to)); + } + } + + output->child_data.reserve(data.child_data.size()); + for (const auto& child : data.child_data) { + ARROW_ASSIGN_OR_RAISE(auto copied, CopyToImpl(*child, to, copy_fn)); + output->child_data.push_back(std::move(copied)); + } + + if (data.dictionary) { + ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn)); + } + + return output; +} +} // namespace + +Result> ArrayData::CopyTo( + const std::shared_ptr& to) const { + return CopyToImpl(*this, to, MemoryManager::CopyBuffer); +} + +Result> ArrayData::ViewOrCopyTo( + const std::shared_ptr& to) const { + return CopyToImpl(*this, to, Buffer::ViewOrCopy); +} + std::shared_ptr ArrayData::Slice(int64_t off, int64_t len) const { ARROW_CHECK_LE(off, length) << "Slice offset (" << off << ") greater than array length (" << length << ")"; diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index edd443adc43c4..d8a6663cec580 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -27,6 +27,7 @@ #include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_util.h" #include "arrow/util/macros.h" #include "arrow/util/span.h" @@ -34,9 +35,6 @@ namespace arrow { -class Array; -struct ArrayData; - namespace internal { // ---------------------------------------------------------------------- // Null handling for types without a validity bitmap and the dictionary type @@ -183,6 +181,21 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr Copy() const { return std::make_shared(*this); } + /// \brief Copy all buffers and children recursively to destination MemoryManager + /// + /// This utilizes MemoryManager::CopyBuffer to create a new ArrayData object + /// recursively copying the buffers and all child buffers to the destination + /// memory manager. This includes dictionaries if applicable. + Result> CopyTo( + const std::shared_ptr& to) const; + /// \brief View or Copy this ArrayData to destination memory manager. + /// + /// Tries to view the buffer contents on the given memory manager's device + /// if possible (to avoid a copy) but falls back to copying if a no-copy view + /// isn't supported. 
+ Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + bool IsNull(int64_t i) const { return !IsValid(i); } bool IsValid(int64_t i) const { diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 52fd94ec1f7d4..258a9faac7361 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -359,7 +359,7 @@ class ARROW_EXPORT Buffer { static Result> ViewOrCopy( std::shared_ptr source, const std::shared_ptr& to); - virtual std::shared_ptr device_sync_event() { return NULLPTR; } + virtual std::shared_ptr device_sync_event() const { return NULLPTR; } protected: bool is_mutable_; diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 238afb0328672..022fce72f59b8 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -914,6 +914,8 @@ struct DecodedMetadata { std::shared_ptr metadata; std::string extension_name; std::string extension_serialized; + int extension_name_index = -1; // index of extension_name in metadata + int extension_serialized_index = -1; // index of extension_serialized in metadata }; Result DecodeMetadata(const char* metadata) { @@ -956,8 +958,10 @@ Result DecodeMetadata(const char* metadata) { RETURN_NOT_OK(read_string(&values[i])); if (keys[i] == kExtensionTypeKeyName) { decoded.extension_name = values[i]; + decoded.extension_name_index = i; } else if (keys[i] == kExtensionMetadataKeyName) { decoded.extension_serialized = values[i]; + decoded.extension_serialized_index = i; } } decoded.metadata = key_value_metadata(std::move(keys), std::move(values)); @@ -1046,6 +1050,8 @@ struct SchemaImporter { ARROW_ASSIGN_OR_RAISE( type_, registered_ext_type->Deserialize(std::move(type_), metadata_.extension_serialized)); + RETURN_NOT_OK(metadata_.metadata->DeleteMany( + {metadata_.extension_name_index, metadata_.extension_serialized_index})); } } @@ -1466,7 +1472,7 @@ class ImportedBuffer : public Buffer { ~ImportedBuffer() override = default; - std::shared_ptr device_sync_event() override { + std::shared_ptr device_sync_event() const override { return import_->device_sync_; } @@ -1537,6 +1543,8 @@ struct ArrayImporter { if (recursion_level_ >= kMaxImportRecursionLevel) { return Status::Invalid("Recursion level in ArrowArray struct exceeded"); } + device_type_ = parent->device_type_; + memory_mgr_ = parent->memory_mgr_; // Child buffers will keep the entire parent import alive. // Perhaps we can move the child structs to an owned area // when the parent ImportedArrayData::Release() gets called, @@ -1851,10 +1859,25 @@ struct ArrayImporter { template Status ImportStringValuesBuffer(int32_t offsets_buffer_id, int32_t buffer_id, int64_t byte_width = 1) { - auto offsets = data_->GetValues(offsets_buffer_id); + if (device_type_ == DeviceAllocationType::kCPU) { + auto offsets = data_->GetValues(offsets_buffer_id); + // Compute visible size of buffer + int64_t buffer_size = + (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0; + return ImportBuffer(buffer_id, buffer_size); + } + + // we only need the value of the last offset so let's just copy that + // one value from device to host. + auto single_value_buf = + SliceBuffer(data_->buffers[offsets_buffer_id], + c_struct_->length * sizeof(OffsetType), sizeof(OffsetType)); + ARROW_ASSIGN_OR_RAISE( + auto cpubuf, Buffer::ViewOrCopy(single_value_buf, default_cpu_memory_manager())); + auto offsets = cpubuf->data_as(); // Compute visible size of buffer - int64_t buffer_size = - (c_struct_->length > 0) ? 
byte_width * offsets[c_struct_->length] : 0; + int64_t buffer_size = (c_struct_->length > 0) ? byte_width * offsets[0] : 0; + return ImportBuffer(buffer_id, buffer_size); } @@ -1979,13 +2002,49 @@ Result> ImportDeviceRecordBatch( namespace { +Status ExportStreamSchema(const std::shared_ptr& src, + struct ArrowSchema* out_schema) { + return ExportSchema(*src->schema(), out_schema); +} + +Status ExportStreamSchema(const std::shared_ptr& src, + struct ArrowSchema* out_schema) { + return ExportType(*src->type(), out_schema); +} + +Status ExportStreamNext(const std::shared_ptr& src, int64_t i, + struct ArrowArray* out_array) { + std::shared_ptr batch; + RETURN_NOT_OK(src->ReadNext(&batch)); + if (batch == nullptr) { + // End of stream + ArrowArrayMarkReleased(out_array); + return Status::OK(); + } else { + return ExportRecordBatch(*batch, out_array); + } +} + +Status ExportStreamNext(const std::shared_ptr& src, int64_t i, + struct ArrowArray* out_array) { + if (i >= src->num_chunks()) { + // End of stream + ArrowArrayMarkReleased(out_array); + return Status::OK(); + } else { + return ExportArray(*src->chunk(static_cast(i)), out_array); + } +} + +template class ExportedArrayStream { public: struct PrivateData { - explicit PrivateData(std::shared_ptr reader) - : reader_(std::move(reader)) {} + explicit PrivateData(std::shared_ptr reader) + : reader_(std::move(reader)), batch_num_(0) {} - std::shared_ptr reader_; + std::shared_ptr reader_; + int64_t batch_num_; std::string last_error_; PrivateData() = default; @@ -1995,19 +2054,11 @@ class ExportedArrayStream { explicit ExportedArrayStream(struct ArrowArrayStream* stream) : stream_(stream) {} Status GetSchema(struct ArrowSchema* out_schema) { - return ExportSchema(*reader()->schema(), out_schema); + return ExportStreamSchema(reader(), out_schema); } Status GetNext(struct ArrowArray* out_array) { - std::shared_ptr batch; - RETURN_NOT_OK(reader()->ReadNext(&batch)); - if (batch == nullptr) { - // End of stream - ArrowArrayMarkReleased(out_array); - return Status::OK(); - } else { - return ExportRecordBatch(*batch, out_array); - } + return ExportStreamNext(reader(), next_batch_num(), out_array); } const char* GetLastError() { @@ -2047,6 +2098,15 @@ class ExportedArrayStream { return ExportedArrayStream{stream}.GetLastError(); } + static Status Make(std::shared_ptr reader, struct ArrowArrayStream* out) { + out->get_schema = ExportedArrayStream::StaticGetSchema; + out->get_next = ExportedArrayStream::StaticGetNext; + out->get_last_error = ExportedArrayStream::StaticGetLastError; + out->release = ExportedArrayStream::StaticRelease; + out->private_data = new ExportedArrayStream::PrivateData{std::move(reader)}; + return Status::OK(); + } + private: int ToCError(const Status& status) { if (ARROW_PREDICT_TRUE(status.ok())) { @@ -2070,7 +2130,9 @@ class ExportedArrayStream { return reinterpret_cast(stream_->private_data); } - const std::shared_ptr& reader() { return private_data()->reader_; } + const std::shared_ptr& reader() { return private_data()->reader_; } + + int64_t next_batch_num() { return private_data()->batch_num_++; } struct ArrowArrayStream* stream_; }; @@ -2079,12 +2141,12 @@ class ExportedArrayStream { Status ExportRecordBatchReader(std::shared_ptr reader, struct ArrowArrayStream* out) { - out->get_schema = ExportedArrayStream::StaticGetSchema; - out->get_next = ExportedArrayStream::StaticGetNext; - out->get_last_error = ExportedArrayStream::StaticGetLastError; - out->release = ExportedArrayStream::StaticRelease; - out->private_data = new 
ExportedArrayStream::PrivateData{std::move(reader)}; - return Status::OK(); + return ExportedArrayStream::Make(std::move(reader), out); +} + +Status ExportChunkedArray(std::shared_ptr chunked_array, + struct ArrowArrayStream* out) { + return ExportedArrayStream::Make(std::move(chunked_array), out); } ////////////////////////////////////////////////////////////////////////// @@ -2092,66 +2154,58 @@ Status ExportRecordBatchReader(std::shared_ptr reader, namespace { -class ArrayStreamBatchReader : public RecordBatchReader { +class ArrayStreamReader { public: - explicit ArrayStreamBatchReader(std::shared_ptr schema, - struct ArrowArrayStream* stream) - : schema_(std::move(schema)) { + explicit ArrayStreamReader(struct ArrowArrayStream* stream) { ArrowArrayStreamMove(stream, &stream_); DCHECK(!ArrowArrayStreamIsReleased(&stream_)); } - ~ArrayStreamBatchReader() override { + ~ArrayStreamReader() { ReleaseStream(); } + + void ReleaseStream() { if (!ArrowArrayStreamIsReleased(&stream_)) { ArrowArrayStreamRelease(&stream_); } DCHECK(ArrowArrayStreamIsReleased(&stream_)); } - std::shared_ptr schema() const override { return schema_; } - - Status ReadNext(std::shared_ptr* batch) override { - struct ArrowArray c_array; - if (ArrowArrayStreamIsReleased(&stream_)) { - return Status::Invalid( - "Attempt to read from a reader that has already been closed"); - } - RETURN_NOT_OK(StatusFromCError(stream_.get_next(&stream_, &c_array))); - if (ArrowArrayIsReleased(&c_array)) { - // End of stream - batch->reset(); - return Status::OK(); - } else { - return ImportRecordBatch(&c_array, schema_).Value(batch); + protected: + Status ReadNextArrayInternal(struct ArrowArray* array) { + ArrowArrayMarkReleased(array); + Status status = StatusFromCError(stream_.get_next(&stream_, array)); + if (!status.ok() && !ArrowArrayIsReleased(array)) { + ArrowArrayRelease(array); } + + return status; } - Status Close() override { - if (!ArrowArrayStreamIsReleased(&stream_)) { - ArrowArrayStreamRelease(&stream_); - } - return Status::OK(); + Result> ReadSchema() { + struct ArrowSchema c_schema = {}; + ARROW_RETURN_NOT_OK( + StatusFromCError(&stream_, stream_.get_schema(&stream_, &c_schema))); + ARROW_ASSIGN_OR_RAISE(auto schema, ImportSchema(&c_schema)); + return schema; } - static Result> Make( - struct ArrowArrayStream* stream) { - if (ArrowArrayStreamIsReleased(stream)) { - return Status::Invalid("Cannot import released ArrowArrayStream"); - } - std::shared_ptr schema; + Result> ReadField() { struct ArrowSchema c_schema = {}; - auto status = StatusFromCError(stream, stream->get_schema(stream, &c_schema)); - if (status.ok()) { - status = ImportSchema(&c_schema).Value(&schema); - } - if (!status.ok()) { - ArrowArrayStreamRelease(stream); - return status; + ARROW_RETURN_NOT_OK( + StatusFromCError(&stream_, stream_.get_schema(&stream_, &c_schema))); + ARROW_ASSIGN_OR_RAISE(auto schema, ImportField(&c_schema)); + return schema; + } + + Status CheckNotReleased() { + if (ArrowArrayStreamIsReleased(&stream_)) { + return Status::Invalid( + "Attempt to read from a stream that has already been closed"); + } else { + return Status::OK(); } - return std::make_shared(std::move(schema), stream); } - private: Status StatusFromCError(int errno_like) const { return StatusFromCError(&stream_, errno_like); } @@ -2180,15 +2234,114 @@ class ArrayStreamBatchReader : public RecordBatchReader { return {code, last_error ? 
std::string(last_error) : ""}; } + private: mutable struct ArrowArrayStream stream_; +}; + +class ArrayStreamBatchReader : public RecordBatchReader, public ArrayStreamReader { + public: + explicit ArrayStreamBatchReader(struct ArrowArrayStream* stream) + : ArrayStreamReader(stream) {} + + Status Init() { + ARROW_ASSIGN_OR_RAISE(schema_, ReadSchema()); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + Status ReadNext(std::shared_ptr* batch) override { + ARROW_RETURN_NOT_OK(CheckNotReleased()); + + struct ArrowArray c_array; + ARROW_RETURN_NOT_OK(ReadNextArrayInternal(&c_array)); + + if (ArrowArrayIsReleased(&c_array)) { + // End of stream + batch->reset(); + return Status::OK(); + } else { + return ImportRecordBatch(&c_array, schema_).Value(batch); + } + } + + Status Close() override { + ReleaseStream(); + return Status::OK(); + } + + private: std::shared_ptr schema_; }; +class ArrayStreamArrayReader : public ArrayStreamReader { + public: + explicit ArrayStreamArrayReader(struct ArrowArrayStream* stream) + : ArrayStreamReader(stream) {} + + Status Init() { + ARROW_ASSIGN_OR_RAISE(field_, ReadField()); + return Status::OK(); + } + + std::shared_ptr data_type() const { return field_->type(); } + + Status ReadNext(std::shared_ptr* array) { + ARROW_RETURN_NOT_OK(CheckNotReleased()); + + struct ArrowArray c_array; + ARROW_RETURN_NOT_OK(ReadNextArrayInternal(&c_array)); + + if (ArrowArrayIsReleased(&c_array)) { + // End of stream + array->reset(); + return Status::OK(); + } else { + return ImportArray(&c_array, field_->type()).Value(array); + } + } + + private: + std::shared_ptr field_; +}; + } // namespace Result> ImportRecordBatchReader( struct ArrowArrayStream* stream) { - return ArrayStreamBatchReader::Make(stream); + if (ArrowArrayStreamIsReleased(stream)) { + return Status::Invalid("Cannot import released ArrowArrayStream"); + } + + auto reader = std::make_shared(stream); + ARROW_RETURN_NOT_OK(reader->Init()); + return reader; +} + +Result> ImportChunkedArray( + struct ArrowArrayStream* stream) { + if (ArrowArrayStreamIsReleased(stream)) { + return Status::Invalid("Cannot import released ArrowArrayStream"); + } + + auto reader = std::make_shared(stream); + ARROW_RETURN_NOT_OK(reader->Init()); + + std::shared_ptr data_type = reader->data_type(); + + ArrayVector chunks; + std::shared_ptr chunk; + while (true) { + ARROW_RETURN_NOT_OK(reader->ReadNext(&chunk)); + if (!chunk) { + break; + } + + chunks.push_back(std::move(chunk)); + } + + reader->ReleaseStream(); + return ChunkedArray::Make(std::move(chunks), std::move(data_type)); } } // namespace arrow diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h index 45583109a761f..e98a42818f628 100644 --- a/cpp/src/arrow/c/bridge.h +++ b/cpp/src/arrow/c/bridge.h @@ -302,6 +302,17 @@ ARROW_EXPORT Status ExportRecordBatchReader(std::shared_ptr reader, struct ArrowArrayStream* out); +/// \brief Export C++ ChunkedArray using the C data interface format. +/// +/// The resulting ArrowArrayStream struct keeps the chunked array data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] chunked_array ChunkedArray object to export +/// \param[out] out C struct where to export the stream +ARROW_EXPORT +Status ExportChunkedArray(std::shared_ptr chunked_array, + struct ArrowArrayStream* out); + /// \brief Import C++ RecordBatchReader from the C stream interface. 
/// /// The ArrowArrayStream struct has its contents moved to a private object @@ -313,6 +324,17 @@ ARROW_EXPORT Result> ImportRecordBatchReader( struct ArrowArrayStream* stream); +/// \brief Import C++ ChunkedArray from the C stream interface +/// +/// The ArrowArrayStream struct has its contents moved to a private object, +/// is consumed in its entirity, and released before returning all chunks +/// as a ChunkedArray. +/// +/// \param[in,out] stream C stream interface struct +/// \return Imported ChunkedArray object +ARROW_EXPORT +Result> ImportChunkedArray(struct ArrowArrayStream* stream); + /// @} } // namespace arrow diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 58bbc9282c204..dba6e4736b673 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -1282,7 +1282,9 @@ class MyBuffer final : public MutableBuffer { default_memory_pool()->Free(const_cast(data_), size_); } - std::shared_ptr device_sync_event() override { return device_sync_; } + std::shared_ptr device_sync_event() const override { + return device_sync_; + } protected: std::shared_ptr device_sync_; @@ -1870,7 +1872,7 @@ class TestSchemaImport : public ::testing::Test, public SchemaStructBuilder { ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); Reset(); // for further tests cb.AssertCalled(); // was released - AssertTypeEqual(*expected, *type); + AssertTypeEqual(*expected, *type, /*check_metadata=*/true); } void CheckImport(const std::shared_ptr& expected) { @@ -1890,7 +1892,7 @@ class TestSchemaImport : public ::testing::Test, public SchemaStructBuilder { ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); Reset(); // for further tests cb.AssertCalled(); // was released - AssertSchemaEqual(*expected, *schema); + AssertSchemaEqual(*expected, *schema, /*check_metadata=*/true); } void CheckImportError() { @@ -3569,7 +3571,7 @@ class TestSchemaRoundtrip : public ::testing::Test { // Recreate the type ASSERT_OK_AND_ASSIGN(actual, ImportType(&c_schema)); type = factory_expected(); - AssertTypeEqual(*type, *actual); + AssertTypeEqual(*type, *actual, /*check_metadata=*/true); type.reset(); actual.reset(); @@ -3600,7 +3602,7 @@ class TestSchemaRoundtrip : public ::testing::Test { // Recreate the schema ASSERT_OK_AND_ASSIGN(actual, ImportSchema(&c_schema)); schema = factory(); - AssertSchemaEqual(*schema, *actual); + AssertSchemaEqual(*schema, *actual, /*check_metadata=*/true); schema.reset(); actual.reset(); @@ -3693,13 +3695,27 @@ TEST_F(TestSchemaRoundtrip, Dictionary) { } } +// Given an extension type, return a field of its storage type + the +// serialized extension metadata. +std::shared_ptr GetStorageWithMetadata(const std::string& field_name, + const std::shared_ptr& type) { + const auto& ext_type = checked_cast(*type); + auto storage_type = ext_type.storage_type(); + auto md = KeyValueMetadata::Make({kExtensionTypeKeyName, kExtensionMetadataKeyName}, + {ext_type.extension_name(), ext_type.Serialize()}); + return field(field_name, storage_type, /*nullable=*/true, md); +} + TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { TestWithTypeFactory(uuid, []() { return fixed_size_binary(16); }); TestWithTypeFactory(dict_extension_type, []() { return dictionary(int8(), utf8()); }); - // Inside nested type - TestWithTypeFactory([]() { return list(dict_extension_type()); }, - []() { return list(dictionary(int8(), utf8())); }); + // Inside nested type. 
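// Illustrative sketch (not part of this patch): what an importer that does not
// recognize a given extension type is expected to produce. The extension name and its
// serialized metadata stay attached to the storage field as key/value metadata, so a
// later consumer (or re-export) can still reconstruct the extension type. The helper
// name is hypothetical; it mirrors GetStorageWithMetadata from the test below.
#include "arrow/api.h"
#include "arrow/extension_type.h"

std::shared_ptr<arrow::Field> StorageFieldFor(const arrow::ExtensionType& ext_type,
                                              const std::string& name) {
  auto metadata = arrow::KeyValueMetadata::Make(
      {arrow::kExtensionTypeKeyName, arrow::kExtensionMetadataKeyName},
      {ext_type.extension_name(), ext_type.Serialize()});
  return arrow::field(name, ext_type.storage_type(), /*nullable=*/true, metadata);
}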
+ // When an extension type is not known by the importer, it is imported + // as its storage type and the extension metadata is preserved on the field. + TestWithTypeFactory( + []() { return list(dict_extension_type()); }, + []() { return list(GetStorageWithMetadata("item", dict_extension_type())); }); } TEST_F(TestSchemaRoundtrip, RegisteredExtension) { @@ -3708,7 +3724,9 @@ TEST_F(TestSchemaRoundtrip, RegisteredExtension) { TestWithTypeFactory(dict_extension_type); TestWithTypeFactory(complex128); - // Inside nested type + // Inside nested type. + // When the extension type is registered, the extension metadata is removed + // from the storage type's field to ensure roundtripping (GH-39865). TestWithTypeFactory([]() { return list(uuid()); }); TestWithTypeFactory([]() { return list(dict_extension_type()); }); TestWithTypeFactory([]() { return list(complex128()); }); @@ -3808,7 +3826,7 @@ class TestArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory_expected())); - AssertTypeEqual(*expected->type(), *array->type()); + AssertTypeEqual(*expected->type(), *array->type(), /*check_metadata=*/true); AssertArraysEqual(*expected, *array, true); } array.reset(); @@ -3848,7 +3866,7 @@ class TestArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory())); - AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertSchemaEqual(*expected->schema(), *batch->schema(), /*check_metadata=*/true); AssertBatchesEqual(*expected, *batch); } batch.reset(); @@ -4228,7 +4246,7 @@ class TestDeviceArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory_expected())); - AssertTypeEqual(*expected->type(), *array->type()); + AssertTypeEqual(*expected->type(), *array->type(), /*check_metadata=*/true); AssertArraysEqual(*expected, *array, true); } array.reset(); @@ -4274,7 +4292,7 @@ class TestDeviceArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory())); - AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertSchemaEqual(*expected->schema(), *batch->schema(), /*check_metadata=*/true); AssertBatchesEqual(*expected, *batch); } batch.reset(); @@ -4302,6 +4320,16 @@ TEST_F(TestDeviceArrayRoundtrip, Primitive) { TestWithJSON(mm, int32(), "[4, 5, null]"); } +TEST_F(TestDeviceArrayRoundtrip, Struct) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + auto type = struct_({field("ints", int16()), field("strs", utf8())}); + + TestWithJSON(mm, type, "[]"); + TestWithJSON(mm, type, R"([[4, "foo"], [5, "bar"]])"); + TestWithJSON(mm, type, R"([[4, null], null, [5, "foo"]])"); +} + //////////////////////////////////////////////////////////////////////////// // Array stream export tests @@ -4351,7 +4379,7 @@ class TestArrayStreamExport : public BaseArrayStreamTest { SchemaExportGuard schema_guard(&c_schema); ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); - AssertSchemaEqual(expected, *schema); + AssertSchemaEqual(expected, *schema, /*check_metadata=*/true); } void AssertStreamEnd(struct ArrowArrayStream* c_stream) { @@ -4372,6 +4400,17 @@ class TestArrayStreamExport : public BaseArrayStreamTest { ASSERT_OK_AND_ASSIGN(auto batch, ImportRecordBatch(&c_array, expected.schema())); AssertBatchesEqual(expected, *batch); } + + void AssertStreamNext(struct 
ArrowArrayStream* c_stream, const Array& expected) { + struct ArrowArray c_array; + ASSERT_EQ(0, c_stream->get_next(c_stream, &c_array)); + + ArrayExportGuard guard(&c_array); + ASSERT_FALSE(ArrowArrayIsReleased(&c_array)); + + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_array, expected.type())); + AssertArraysEqual(expected, *array); + } }; TEST_F(TestArrayStreamExport, Empty) { @@ -4435,7 +4474,7 @@ TEST_F(TestArrayStreamExport, ArrayLifetime) { { SchemaExportGuard schema_guard(&c_schema); ASSERT_OK_AND_ASSIGN(auto got_schema, ImportSchema(&c_schema)); - AssertSchemaEqual(*schema, *got_schema); + AssertSchemaEqual(*schema, *got_schema, /*check_metadata=*/true); } ASSERT_GT(pool_->bytes_allocated(), orig_allocated_); @@ -4460,13 +4499,74 @@ TEST_F(TestArrayStreamExport, Errors) { { SchemaExportGuard schema_guard(&c_schema); ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); - AssertSchemaEqual(schema, arrow::schema({})); + AssertSchemaEqual(schema, arrow::schema({}), /*check_metadata=*/true); } struct ArrowArray c_array; ASSERT_EQ(EINVAL, c_stream.get_next(&c_stream, &c_array)); } +TEST_F(TestArrayStreamExport, ChunkedArrayExportEmpty) { + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({}, int32())); + + struct ArrowArrayStream c_stream; + struct ArrowSchema c_schema; + + ASSERT_OK(ExportChunkedArray(chunked_array, &c_stream)); + ArrayStreamExportGuard guard(&c_stream); + + { + ArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + AssertStreamEnd(&c_stream); + } + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_type, ImportType(&c_schema)); + AssertTypeEqual(*chunked_array->type(), *got_type); + } +} + +TEST_F(TestArrayStreamExport, ChunkedArrayExport) { + ASSERT_OK_AND_ASSIGN(auto chunked_array, + ChunkedArray::Make({ArrayFromJSON(int32(), "[1, 2]"), + ArrayFromJSON(int32(), "[4, 5, null]")})); + + struct ArrowArrayStream c_stream; + struct ArrowSchema c_schema; + struct ArrowArray c_array0, c_array1; + + ASSERT_OK(ExportChunkedArray(chunked_array, &c_stream)); + ArrayStreamExportGuard guard(&c_stream); + + { + ArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array0)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array1)); + AssertStreamEnd(&c_stream); + } + + ArrayExportGuard guard0(&c_array0), guard1(&c_array1); + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_type, ImportType(&c_schema)); + AssertTypeEqual(*chunked_array->type(), *got_type); + } + + ASSERT_GT(pool_->bytes_allocated(), orig_allocated_); + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_array0, chunked_array->type())); + AssertArraysEqual(*chunked_array->chunk(0), *array); + ASSERT_OK_AND_ASSIGN(array, ImportArray(&c_array1, chunked_array->type())); + AssertArraysEqual(*chunked_array->chunk(1), *array); +} + //////////////////////////////////////////////////////////////////////////// // Array stream roundtrip tests @@ -4506,6 +4606,29 @@ class TestArrayStreamRoundtrip : public BaseArrayStreamTest { ASSERT_TRUE(weak_reader.expired()); } + void Roundtrip(std::shared_ptr src, + std::function&)> check_func) { + ArrowArrayStream c_stream; + + // One original copy which to compare the result, one copy held by the stream + std::weak_ptr weak_src(src); + int64_t initial_use_count 
= weak_src.use_count(); + + ASSERT_OK(ExportChunkedArray(std::move(src), &c_stream)); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + { + ASSERT_OK_AND_ASSIGN(auto dst, ImportChunkedArray(&c_stream)); + // Stream was moved, consumed, and released + ASSERT_TRUE(ArrowArrayStreamIsReleased(&c_stream)); + + // Stream was released by ImportChunkedArray but original copy remains + ASSERT_EQ(weak_src.use_count(), initial_use_count - 1); + + check_func(dst); + } + } + void AssertReaderNext(const std::shared_ptr& reader, const RecordBatch& expected) { ASSERT_OK_AND_ASSIGN(auto batch, reader->Next()); @@ -4537,7 +4660,7 @@ TEST_F(TestArrayStreamRoundtrip, Simple) { ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make(batches, orig_schema)); Roundtrip(std::move(reader), [&](const std::shared_ptr& reader) { - AssertSchemaEqual(*orig_schema, *reader->schema()); + AssertSchemaEqual(*orig_schema, *reader->schema(), /*check_metadata=*/true); AssertReaderNext(reader, *batches[0]); AssertReaderNext(reader, *batches[1]); AssertReaderEnd(reader); @@ -4603,4 +4726,24 @@ TEST_F(TestArrayStreamRoundtrip, SchemaError) { ASSERT_TRUE(state.released); } +TEST_F(TestArrayStreamRoundtrip, ChunkedArrayRoundtrip) { + ASSERT_OK_AND_ASSIGN(auto src, + ChunkedArray::Make({ArrayFromJSON(int32(), "[1, 2]"), + ArrayFromJSON(int32(), "[4, 5, null]")})); + + Roundtrip(src, [&](const std::shared_ptr& dst) { + AssertTypeEqual(*dst->type(), *src->type()); + AssertChunkedEqual(*dst, *src); + }); +} + +TEST_F(TestArrayStreamRoundtrip, ChunkedArrayRoundtripEmpty) { + ASSERT_OK_AND_ASSIGN(auto src, ChunkedArray::Make({}, int32())); + + Roundtrip(src, [&](const std::shared_ptr& dst) { + AssertTypeEqual(*dst->type(), *src->type()); + AssertChunkedEqual(*dst, *src); + }); +} + } // namespace arrow diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index b47e0a35525c5..8c59ad1df86f2 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -761,6 +761,15 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i } } + int64_t input_length; + if (!arguments.empty() && all_scalar) { + // all inputs are scalar, so use a 1-long batch to avoid + // computing input.length equivalent outputs + input_length = 1; + } else { + input_length = input.length; + } + auto executor = compute::detail::KernelExecutor::MakeScalar(); compute::KernelContext kernel_context(exec_context, call->kernel); @@ -772,8 +781,8 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, types, options})); compute::detail::DatumAccumulator listener; - RETURN_NOT_OK(executor->Execute( - ExecBatch(std::move(arguments), all_scalar ? 
1 : input.length), &listener)); + RETURN_NOT_OK( + executor->Execute(ExecBatch(std::move(arguments), input_length), &listener)); const auto out = executor->WrapResults(arguments, listener.values()); #ifndef NDEBUG DCHECK_OK(executor->CheckResultType(out, call->function_name.c_str())); diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index 44159e76600fb..d33c348cd77da 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -863,6 +863,25 @@ TEST(Expression, ExecuteCall) { ])")); } +TEST(Expression, ExecuteCallWithNoArguments) { + const int kCount = 10; + auto random_options = RandomOptions::FromSeed(/*seed=*/0); + ExecBatch input({}, kCount); + + Expression random_expr = call("random", {}, random_options); + ASSERT_OK_AND_ASSIGN(random_expr, random_expr.Bind(float64())); + + ASSERT_OK_AND_ASSIGN(Datum actual, ExecuteScalarExpression(random_expr, input)); + compute::ExecContext* exec_context = default_exec_context(); + ASSERT_OK_AND_ASSIGN(auto function, + exec_context->func_registry()->GetFunction("random")); + ASSERT_OK_AND_ASSIGN(Datum expected, + function->Execute(input, &random_options, exec_context)); + AssertDatumsEqual(actual, expected, /*verbose=*/true); + + EXPECT_EQ(actual.length(), kCount); +} + TEST(Expression, ExecuteDictionaryTransparent) { ExpectExecute( equal(field_ref("a"), field_ref("b")), diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc index 960a69c59db5d..dd3d025202018 100644 --- a/cpp/src/arrow/csv/parser_test.cc +++ b/cpp/src/arrow/csv/parser_test.cc @@ -175,6 +175,13 @@ void AssertParsePartial(BlockParser& parser, const std::string& str, ASSERT_EQ(parsed_size, expected_size); } +void AssertParsePartial(BlockParser& parser, const std::vector& data, + uint32_t expected_size) { + uint32_t parsed_size = static_cast(-1); + ASSERT_OK(parser.Parse(data, &parsed_size)); + ASSERT_EQ(parsed_size, expected_size); +} + void AssertLastRowEq(const BlockParser& parser, const std::vector& expected) { std::vector values; @@ -376,6 +383,21 @@ TEST(BlockParser, TruncatedData) { } } +TEST(BlockParser, TruncatedDataViews) { + // The BlockParser API mandates that, when passing a vector of views, + // only the last view may be a truncated CSV block. + // In the current implementation, receiving a truncated non-last view + // simply stops parsing after that view. 
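// Illustrative sketch (not part of this patch): the user-facing remedy that the new
// "CSV parser got out of sync with chunker" message suggests. When cell values contain
// line breaks inside quotes, enabling ParseOptions::newlines_in_values keeps the
// chunker and parser in agreement. Error handling is abbreviated.
#include <memory>
#include <string>

#include "arrow/api.h"
#include "arrow/csv/api.h"
#include "arrow/io/api.h"

arrow::Result<std::shared_ptr<arrow::Table>> ReadCsvWithEmbeddedNewlines() {
  std::string data = "a,b,c\n1,\"multi\nline\",3\n";
  auto input =
      std::make_shared<arrow::io::BufferReader>(arrow::Buffer::FromString(data));

  auto read_options = arrow::csv::ReadOptions::Defaults();
  auto parse_options = arrow::csv::ParseOptions::Defaults();
  // Without this, quoted newlines can leave the parser out of sync with the chunker
  // on inputs split across blocks.
  parse_options.newlines_in_values = true;
  auto convert_options = arrow::csv::ConvertOptions::Defaults();

  ARROW_ASSIGN_OR_RAISE(
      auto reader,
      arrow::csv::TableReader::Make(arrow::io::default_io_context(), input,
                                    read_options, parse_options, convert_options));
  return reader->Read();
}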
+ BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/3); + AssertParsePartial(parser, Views({"a,b,", "c\n"}), 0); + AssertParsePartial(parser, Views({"a,b,c\nd,", "e,f\n"}), 6); + + // More sophisticated: non-last block ends on some newline inside a quoted string + // (terse reproducer of gh-39857) + AssertParsePartial(parser, Views({"a,b,\"c\n", "\"\n"}), 0); + AssertParsePartial(parser, Views({"a,b,c\n\"d\n", "\",e,f\n"}), 6); +} + TEST(BlockParser, Final) { // Tests for ParseFinal() BlockParser parser(ParseOptions::Defaults()); diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 332fad054fea3..e981fafe8e780 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -261,11 +261,10 @@ class SerialBlockReader : public BlockReader { auto consume_bytes = [this, bytes_before_buffer, next_buffer](int64_t nbytes) -> Status { DCHECK_GE(nbytes, 0); - auto offset = nbytes - bytes_before_buffer; - if (offset < 0) { - // Should not happen - return Status::Invalid("CSV parser got out of sync with chunker"); - } + int64_t offset = nbytes - bytes_before_buffer; + // All data before the buffer should have been consumed. + // This is checked in Parse() and BlockParsingOperator::operator(). + DCHECK_GE(offset, 0); partial_ = SliceBuffer(buffer_, offset); buffer_ = next_buffer; return Status::OK(); @@ -400,6 +399,7 @@ class BlockParsingOperator { count_rows_(first_row >= 0), num_rows_seen_(first_row) {} + // TODO: this is almost entirely the same as ReaderMixin::Parse(). Refactor? Result operator()(const CSVBlock& block) { constexpr int32_t max_num_rows = std::numeric_limits::max(); auto parser = std::make_shared( @@ -427,19 +427,38 @@ class BlockParsingOperator { } else { RETURN_NOT_OK(parser->Parse(views, &parsed_size)); } + + // `partial + completion` should have been entirely consumed. + const int64_t bytes_before_buffer = block.partial->size() + block.completion->size(); + if (static_cast(parsed_size) < bytes_before_buffer) { + // This can happen if `newlines_in_values` is not enabled and + // `partial + completion` ends with a newline inside a quoted string. + // In this case, the BlockParser stops at the truncated data in the first + // block (see gh-39857). + return Status::Invalid( + "CSV parser got out of sync with chunker. This can mean the data file " + "contains cell values spanning multiple lines; please consider enabling " + "the option 'newlines_in_values'."); + } + if (count_rows_) { num_rows_seen_ += parser->total_num_rows(); } - RETURN_NOT_OK(block.consume_bytes(parsed_size)); + + if (block.consume_bytes) { + RETURN_NOT_OK(block.consume_bytes(parsed_size)); + } return ParsedBlock{std::move(parser), block.block_index, static_cast(parsed_size) + block.bytes_skipped}; } + int num_csv_cols() const { return num_csv_cols_; } + private: io::IOContext io_context_; - ParseOptions parse_options_; - int num_csv_cols_; - bool count_rows_; + const ParseOptions parse_options_; + const int num_csv_cols_; + const bool count_rows_; int64_t num_rows_seen_; }; @@ -555,7 +574,6 @@ class ReaderMixin { parse_options_(parse_options), convert_options_(convert_options), count_rows_(count_rows), - num_rows_seen_(count_rows_ ? 
1 : -1), input_(std::move(input)) {} protected: @@ -566,6 +584,7 @@ class ReaderMixin { const uint8_t* data = buf->data(); const auto data_end = data + buf->size(); DCHECK_GT(data_end - data, 0); + int64_t num_rows_seen = 1; if (read_options_.skip_rows) { // Skip initial rows (potentially invalid CSV data) @@ -578,14 +597,14 @@ class ReaderMixin { "either file is too short or header is larger than block size"); } if (count_rows_) { - num_rows_seen_ += num_skipped_rows; + num_rows_seen += num_skipped_rows; } } if (read_options_.column_names.empty()) { // Parse one row (either to read column names or to know the number of columns) - BlockParser parser(io_context_.pool(), parse_options_, num_csv_cols_, - num_rows_seen_, 1); + BlockParser parser(io_context_.pool(), parse_options_, /*num_cols=*/-1, + /*first_row=*/num_rows_seen, /*max_num_rows=*/1); uint32_t parsed_size = 0; RETURN_NOT_OK(parser.Parse( std::string_view(reinterpret_cast(data), data_end - data), @@ -612,7 +631,7 @@ class ReaderMixin { // Skip parsed header row data += parsed_size; if (count_rows_) { - ++num_rows_seen_; + ++num_rows_seen; } } } else { @@ -621,14 +640,17 @@ class ReaderMixin { if (count_rows_) { // increase rows seen to skip past rows which will be skipped - num_rows_seen_ += read_options_.skip_rows_after_names; + num_rows_seen += read_options_.skip_rows_after_names; } auto bytes_consumed = data - buf->data(); *rest = SliceBuffer(buf, bytes_consumed); - num_csv_cols_ = static_cast(column_names_.size()); - DCHECK_GT(num_csv_cols_, 0); + int32_t num_csv_cols = static_cast(column_names_.size()); + DCHECK_GT(num_csv_cols, 0); + // Since we know the number of columns, we can instantiate the BlockParsingOperator + parsing_operator_.emplace(io_context_, parse_options_, num_csv_cols, + count_rows_ ? 
num_rows_seen : -1); RETURN_NOT_OK(MakeConversionSchema()); return bytes_consumed; @@ -676,7 +698,7 @@ class ReaderMixin { if (convert_options_.include_columns.empty()) { // Include all columns in CSV file order - for (int32_t col_index = 0; col_index < num_csv_cols_; ++col_index) { + for (int32_t col_index = 0; col_index < num_csv_cols(); ++col_index) { append_csv_column(column_names_[col_index], col_index); } } else { @@ -704,57 +726,25 @@ class ReaderMixin { return Status::OK(); } - struct ParseResult { - std::shared_ptr parser; - int64_t parsed_bytes; - }; - - Result Parse(const std::shared_ptr& partial, - const std::shared_ptr& completion, - const std::shared_ptr& block, int64_t block_index, - bool is_final) { - static constexpr int32_t max_num_rows = std::numeric_limits::max(); - auto parser = std::make_shared( - io_context_.pool(), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows); + Result Parse(const CSVBlock& block) { + DCHECK(parsing_operator_.has_value()); + return (*parsing_operator_)(block); + } - std::shared_ptr straddling; - std::vector views; - if (partial->size() != 0 || completion->size() != 0) { - if (partial->size() == 0) { - straddling = completion; - } else if (completion->size() == 0) { - straddling = partial; - } else { - ARROW_ASSIGN_OR_RAISE( - straddling, ConcatenateBuffers({partial, completion}, io_context_.pool())); - } - views = {std::string_view(*straddling), std::string_view(*block)}; - } else { - views = {std::string_view(*block)}; - } - uint32_t parsed_size; - if (is_final) { - RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size)); - } else { - RETURN_NOT_OK(parser->Parse(views, &parsed_size)); - } - if (count_rows_) { - num_rows_seen_ += parser->total_num_rows(); - } - return ParseResult{std::move(parser), static_cast(parsed_size)}; + int num_csv_cols() const { + DCHECK(parsing_operator_.has_value()); + return parsing_operator_->num_csv_cols(); } io::IOContext io_context_; - ReadOptions read_options_; - ParseOptions parse_options_; - ConvertOptions convert_options_; - - // Number of columns in the CSV file - int32_t num_csv_cols_ = -1; - // Whether num_rows_seen_ tracks the number of rows seen in the CSV being parsed - bool count_rows_; - // Number of rows seen in the csv. 
Not used if count_rows is false - int64_t num_rows_seen_; + const ReadOptions read_options_; + const ParseOptions parse_options_; + const ConvertOptions convert_options_; + // Whether to track the number of rows seen in the CSV being parsed + const bool count_rows_; + + std::optional parsing_operator_; + // Column names in the CSV file std::vector column_names_; ConversionSchema conversion_schema_; @@ -798,14 +788,10 @@ class BaseTableReader : public ReaderMixin, public csv::TableReader { return Status::OK(); } - Result ParseAndInsert(const std::shared_ptr& partial, - const std::shared_ptr& completion, - const std::shared_ptr& block, - int64_t block_index, bool is_final) { - ARROW_ASSIGN_OR_RAISE(auto result, - Parse(partial, completion, block, block_index, is_final)); - RETURN_NOT_OK(ProcessData(result.parser, block_index)); - return result.parsed_bytes; + Status ParseAndInsert(const CSVBlock& block) { + ARROW_ASSIGN_OR_RAISE(auto result, Parse(block)); + RETURN_NOT_OK(ProcessData(result.parser, result.block_index)); + return Status::OK(); } // Trigger conversion of parsed block data @@ -897,8 +883,6 @@ class StreamingReaderImpl : public ReaderMixin, ProcessHeader(first_buffer, &after_header)); bytes_decoded_->fetch_add(header_bytes_consumed); - auto parser_op = - BlockParsingOperator(io_context_, parse_options_, num_csv_cols_, num_rows_seen_); ARROW_ASSIGN_OR_RAISE( auto decoder_op, BlockDecodingOperator::Make(io_context_, convert_options_, conversion_schema_)); @@ -906,8 +890,7 @@ class StreamingReaderImpl : public ReaderMixin, auto block_gen = SerialBlockReader::MakeAsyncIterator( std::move(buffer_generator), MakeChunker(parse_options_), std::move(after_header), read_options_.skip_rows_after_names); - auto parsed_block_gen = - MakeMappedGenerator(std::move(block_gen), std::move(parser_op)); + auto parsed_block_gen = MakeMappedGenerator(std::move(block_gen), *parsing_operator_); auto rb_gen = MakeMappedGenerator(std::move(parsed_block_gen), std::move(decoder_op)); auto self = shared_from_this(); @@ -1011,11 +994,7 @@ class SerialTableReader : public BaseTableReader { // EOF break; } - ARROW_ASSIGN_OR_RAISE( - int64_t parsed_bytes, - ParseAndInsert(maybe_block.partial, maybe_block.completion, maybe_block.buffer, - maybe_block.block_index, maybe_block.is_final)); - RETURN_NOT_OK(maybe_block.consume_bytes(parsed_bytes)); + RETURN_NOT_OK(ParseAndInsert(maybe_block)); } // Finish conversion, create schema and table RETURN_NOT_OK(task_group_->Finish()); @@ -1086,13 +1065,8 @@ class AsyncThreadedTableReader DCHECK(!maybe_block.consume_bytes); // Launch parse task - self->task_group_->Append([self, maybe_block] { - return self - ->ParseAndInsert(maybe_block.partial, maybe_block.completion, - maybe_block.buffer, maybe_block.block_index, - maybe_block.is_final) - .status(); - }); + self->task_group_->Append( + [self, maybe_block] { return self->ParseAndInsert(maybe_block); }); return Status::OK(); }; @@ -1215,12 +1189,8 @@ class CSVRowCounter : public ReaderMixin, // IterationEnd. 
std::function>(const CSVBlock&)> count_cb = [self](const CSVBlock& maybe_block) -> Result> { - ARROW_ASSIGN_OR_RAISE( - auto parser, - self->Parse(maybe_block.partial, maybe_block.completion, maybe_block.buffer, - maybe_block.block_index, maybe_block.is_final)); - RETURN_NOT_OK(maybe_block.consume_bytes(parser.parsed_bytes)); - int32_t total_row_count = parser.parser->total_num_rows(); + ARROW_ASSIGN_OR_RAISE(auto parsed_block, self->Parse(maybe_block)); + int32_t total_row_count = parsed_block.parser->total_num_rows(); self->row_count_ += total_row_count; return total_row_count; }; diff --git a/cpp/src/arrow/device.cc b/cpp/src/arrow/device.cc index de709923dc44e..3736a4e018c33 100644 --- a/cpp/src/arrow/device.cc +++ b/cpp/src/arrow/device.cc @@ -20,8 +20,10 @@ #include #include +#include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/io/memory.h" +#include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/util/logging.h" @@ -193,6 +195,13 @@ Result> CPUMemoryManager::ViewBufferFrom( if (!from->is_cpu()) { return nullptr; } + // in this case the memory manager we're coming from is visible on the CPU, + // but uses an allocation type other than CPU. Since we know the data is visible + // to the CPU a "View" of this should use the CPUMemoryManager as the listed memory + // manager. + if (buf->device_type() != DeviceAllocationType::kCPU) { + return std::make_shared(buf->address(), buf->size(), shared_from_this(), buf); + } return buf; } @@ -218,6 +227,13 @@ Result> CPUMemoryManager::ViewBufferTo( if (!to->is_cpu()) { return nullptr; } + // in this case the memory manager we're coming from is visible on the CPU, + // but uses an allocation type other than CPU. Since we know the data is visible + // to the CPU a "View" of this should use the CPUMemoryManager as the listed memory + // manager. 
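// Illustrative sketch (not part of this patch): the consumer-visible effect of the
// CPUMemoryManager change above. For a buffer that is CPU-accessible but was allocated
// on another device (for example CUDA host memory), requesting a CPU view now yields a
// buffer whose memory manager is the CPU one, without copying the bytes.
#include <memory>

#include "arrow/api.h"
#include "arrow/device.h"

arrow::Result<std::shared_ptr<arrow::Buffer>> AsCpuView(
    const std::shared_ptr<arrow::Buffer>& buffer) {
  // Zero-copy when the source is CPU-visible, otherwise falls back to a copy.
  return arrow::Buffer::ViewOrCopy(buffer, arrow::default_cpu_memory_manager());
}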
+ if (buf->device_type() != DeviceAllocationType::kCPU) { + return std::make_shared(buf->address(), buf->size(), to, buf); + } return buf; } diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index af8305a025291..02e0a890e4b3d 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -19,6 +19,8 @@ #include #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/tensor_internal.h" +#include "arrow/scalar.h" #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" @@ -86,7 +88,7 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { return false; } - const auto& other_ext = static_cast(other); + const auto& other_ext = internal::checked_cast(other); auto is_permutation_trivial = [](const std::vector& permutation) { for (size_t i = 1; i < permutation.size(); ++i) { @@ -143,7 +145,7 @@ std::string FixedShapeTensorType::Serialize() const { if (!dim_names_.empty()) { rj::Value dim_names(rj::kArrayType); - for (std::string v : dim_names_) { + for (const std::string& v : dim_names_) { dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator); } document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); @@ -199,10 +201,52 @@ std::shared_ptr FixedShapeTensorType::MakeArray( std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("arrow.fixed_shape_tensor", - static_cast(*data->type).extension_name()); + internal::checked_cast(*data->type).extension_name()); return std::make_shared(data); } +Result> FixedShapeTensorType::MakeTensor( + const std::shared_ptr& scalar) { + const auto ext_scalar = internal::checked_pointer_cast(scalar); + const auto ext_type = + internal::checked_pointer_cast(scalar->type); + if (!is_fixed_width(*ext_type->value_type())) { + return Status::TypeError("Cannot convert non-fixed-width values to Tensor."); + } + const auto array = + internal::checked_pointer_cast(ext_scalar->value)->value; + if (array->null_count() > 0) { + return Status::Invalid("Cannot convert data with nulls to Tensor."); + } + const auto value_type = + internal::checked_pointer_cast(ext_type->value_type()); + const auto byte_width = value_type->byte_width(); + + std::vector permutation = ext_type->permutation(); + if (permutation.empty()) { + permutation.resize(ext_type->ndim()); + std::iota(permutation.begin(), permutation.end(), 0); + } + + std::vector shape = ext_type->shape(); + internal::Permute(permutation, &shape); + + std::vector dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + internal::Permute(permutation, &dim_names); + } + + std::vector strides; + RETURN_NOT_OK(ComputeStrides(*value_type.get(), shape, permutation, &strides)); + const auto start_position = array->offset() * byte_width; + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + const auto buffer = + SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); + + return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names); +} + Result> FixedShapeTensorArray::FromTensor( const std::shared_ptr& tensor) { auto permutation = internal::ArgSort(tensor->strides(), std::greater<>()); @@ -293,53 +337,71 @@ const Result> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret 
the array's length as the first dimension the new tensor. - auto ext_arr = std::static_pointer_cast(this->storage()); - auto ext_type = internal::checked_pointer_cast(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - auto permutation = ext_type->permutation(); - - std::vector dim_names; - if (!ext_type->dim_names().empty()) { - for (auto i : permutation) { - dim_names.emplace_back(ext_type->dim_names()[i]); - } - dim_names.insert(dim_names.begin(), 1, ""); + const auto ext_type = + internal::checked_pointer_cast(this->type()); + const auto value_type = ext_type->value_type(); + ARROW_RETURN_IF( + !is_fixed_width(*value_type), + Status::TypeError(value_type->ToString(), " is not valid data type for a tensor")); + + // ext_type->permutation() gives us permutation for a single row with values in + // range [0, ndim). Here want to create a ndim + 1 dimensional tensor from the entire + // array and we assume the first dimension will always have the greatest stride, so it + // will get permutation index 0 and remaining values from ext_type->permutation() need + // to be shifted to fill the [1, ndim+1) range. Computed permutation will be used to + // generate the new tensor's shape, strides and dim_names. + std::vector permutation = ext_type->permutation(); + if (permutation.empty()) { + permutation.resize(ext_type->ndim() + 1); + std::iota(permutation.begin(), permutation.end(), 0); } else { - dim_names = {}; + for (auto i = 0; i < static_cast(ext_type->ndim()); i++) { + permutation[i] += 1; + } + permutation.insert(permutation.begin(), 1, 0); } - std::vector shape; - for (int64_t& i : permutation) { - shape.emplace_back(ext_type->shape()[i]); - ++i; + std::vector dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + dim_names.insert(dim_names.begin(), 1, ""); + internal::Permute(permutation, &dim_names); } + + std::vector shape = ext_type->shape(); + auto cell_size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); shape.insert(shape.begin(), 1, this->length()); - permutation.insert(permutation.begin(), 1, 0); + internal::Permute(permutation, &shape); std::vector tensor_strides; - auto value_type = internal::checked_pointer_cast(ext_arr->value_type()); + const auto fw_value_type = internal::checked_pointer_cast(value_type); ARROW_RETURN_NOT_OK( - ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); - ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); + ComputeStrides(*fw_value_type.get(), shape, permutation, &tensor_strides)); + + const auto raw_buffer = this->storage()->data()->child_data[0]->buffers[1]; ARROW_ASSIGN_OR_RAISE( - auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, - tensor_strides, dim_names)); - return tensor; + const auto buffer, + SliceBufferSafe(raw_buffer, this->offset() * cell_size * value_type->byte_width())); + + return Tensor::Make(value_type, buffer, shape, tensor_strides, dim_names); } Result> FixedShapeTensorType::Make( const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { - if (!permutation.empty() && shape.size() != permutation.size()) { - return Status::Invalid("permutation size must match shape size. 
Expected: ", - shape.size(), " Got: ", permutation.size()); + const auto ndim = shape.size(); + if (!permutation.empty() && ndim != permutation.size()) { + return Status::Invalid("permutation size must match shape size. Expected: ", ndim, + " Got: ", permutation.size()); + } + if (!dim_names.empty() && ndim != dim_names.size()) { + return Status::Invalid("dim_names size must match shape size. Expected: ", ndim, + " Got: ", dim_names.size()); } - if (!dim_names.empty() && shape.size() != dim_names.size()) { - return Status::Invalid("dim_names size must match shape size. Expected: ", - shape.size(), " Got: ", dim_names.size()); + if (!permutation.empty()) { + RETURN_NOT_OK(internal::IsPermutationValid(permutation)); } + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); return std::make_shared(value_type, static_cast(size), diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index fcfb1ebbab96a..591a7cee32a34 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -64,7 +64,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { std::string ToString() const override; /// Number of dimensions of tensor elements - size_t ndim() { return shape_.size(); } + size_t ndim() const { return shape_.size(); } /// Shape of tensor elements const std::vector shape() const { return shape_; } @@ -94,6 +94,15 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Create a FixedShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; + /// \brief Create a Tensor from an ExtensionScalar from a FixedShapeTensorArray + /// + /// This method will return a Tensor from ExtensionScalar with strides + /// derived from shape and permutation of FixedShapeTensorType. Shape and + /// dim_names will be permuted according to permutation stored in the + /// FixedShapeTensorType metadata. 
+ static Result> MakeTensor( + const std::shared_ptr& scalar); + /// \brief Create a FixedShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const std::vector& shape, diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 2b8e703d3c66e..3fd39a11ff50d 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -28,6 +28,7 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/sort.h" namespace arrow { @@ -39,34 +40,34 @@ class TestExtensionType : public ::testing::Test { public: void SetUp() override { shape_ = {3, 3, 4}; - cell_shape_ = {3, 4}; + element_shape_ = {3, 4}; value_type_ = int64(); - cell_type_ = fixed_size_list(value_type_, 12); + element_type_ = fixed_size_list(value_type_, 12); dim_names_ = {"x", "y"}; ext_type_ = internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_)); + fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_)); values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; shape_partial_ = {2, 3, 4}; tensor_strides_ = {96, 32, 8}; - cell_strides_ = {32, 8}; + element_strides_ = {32, 8}; serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } protected: std::vector shape_; std::vector shape_partial_; - std::vector cell_shape_; + std::vector element_shape_; std::shared_ptr value_type_; - std::shared_ptr cell_type_; + std::shared_ptr element_type_; std::vector dim_names_; std::shared_ptr ext_type_; std::vector values_; std::vector values_partial_; std::vector tensor_strides_; - std::vector cell_strides_; + std::vector element_strides_; std::string serialized_; }; @@ -96,8 +97,8 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test ExtensionType methods ASSERT_EQ(ext_type_->extension_name(), "arrow.fixed_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); - ASSERT_FALSE(ext_type_->Equals(*cell_type_)); - ASSERT_TRUE(ext_type_->storage_type()->Equals(*cell_type_)); + ASSERT_FALSE(ext_type_->Equals(*element_type_)); + ASSERT_TRUE(ext_type_->storage_type()->Equals(*element_type_)); ASSERT_EQ(ext_type_->Serialize(), serialized_); ASSERT_OK_AND_ASSIGN(auto ds, ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); @@ -106,18 +107,28 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test FixedShapeTensorType methods ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); - ASSERT_EQ(exact_ext_type->shape(), cell_shape_); + ASSERT_EQ(exact_ext_type->ndim(), element_shape_.size()); + ASSERT_EQ(exact_ext_type->shape(), element_shape_); ASSERT_EQ(exact_ext_type->value_type(), value_type_); - ASSERT_EQ(exact_ext_type->strides(), cell_strides_); + ASSERT_EQ(exact_ext_type->strides(), element_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: permutation size must match shape size."), - FixedShapeTensorType::Make(value_type_, cell_shape_, {0})); + FixedShapeTensorType::Make(value_type_, element_shape_, {0})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: dim_names size must match shape size."), - 
FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); + FixedShapeTensorType::Make(value_type_, element_shape_, {}, {"x"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 2 dimensional tensors must be " + "unique and within [0, 1] range. Got: [3,0]"), + FixedShapeTensorType::Make(value_type_, {5, 6}, {3, 0})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " + "unique and within [0, 2] range. Got: [0,1,1]"), + FixedShapeTensorType::Make(value_type_, {1, 2, 3}, {0, 1, 1})); } TEST_F(TestExtensionType, EqualsCases) { @@ -148,7 +159,7 @@ TEST_F(TestExtensionType, CreateFromArray) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); ASSERT_EQ(ext_arr->length(), shape_[0]); ASSERT_EQ(ext_arr->null_count(), 0); @@ -200,7 +211,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); // Pass extension array, expect getting back extension array @@ -215,7 +226,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { auto ext_metadata = key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, {"ARROW:extension:metadata", serialized_}}); - ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, + ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); RoundtripBatch(batch2, &read_batch2); @@ -270,7 +281,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto ext_arr_5 = std::static_pointer_cast( ExtensionType::WrapArray(ext_type_5, fsla_arr)); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), + TypeError, testing::HasSubstr("binary is not valid data type for a tensor"), ext_arr_5->ToTensor()); auto ext_type_6 = internal::checked_pointer_cast( @@ -278,6 +289,10 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto arr_with_null = ArrayFromJSON(int64(), "[1, 0, null, null, 1, 2]"); ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( arr_with_null, fixed_size_list(int64(), 2))); + + auto ext_type_7 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4}, {})); + ASSERT_OK_AND_ASSIGN(auto ext_arr_7, FixedShapeTensorArray::FromTensor(tensor)); } void CheckFromTensorType(const std::shared_ptr& tensor, @@ -308,7 +323,7 @@ TEST_F(TestExtensionType, TestFromTensorType) { auto dim_names = std::vector>{ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}}; - auto cell_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; + auto element_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; auto permutations = 
std::vector>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; for (size_t i = 0; i < shapes.size(); i++) { @@ -316,11 +331,82 @@ TEST_F(TestExtensionType, TestFromTensorType) { strides[i], tensor_dim_names[i])); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto ext_type = - fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i], dim_names[i]); + fixed_shape_tensor(value_type_, element_shapes[i], permutations[i], dim_names[i]); CheckFromTensorType(tensor, ext_type); } } +template +void CheckToTensor(const std::vector& values, const std::shared_ptr typ, + const int32_t& element_size, const std::vector& element_shape, + const std::vector& element_permutation, + const std::vector& element_dim_names, + const std::vector& tensor_shape, + const std::vector& tensor_dim_names, + const std::vector& tensor_strides) { + auto buffer = Buffer::Wrap(values); + const std::shared_ptr element_type = fixed_size_list(typ, element_size); + std::vector> buffers = {nullptr, buffer}; + auto arr_data = std::make_shared(typ, values.size(), buffers); + auto arr = std::make_shared(arr_data); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type)); + + ASSERT_OK_AND_ASSIGN( + auto expected_tensor, + Tensor::Make(typ, buffer, tensor_shape, tensor_strides, tensor_dim_names)); + const auto ext_type = + fixed_shape_tensor(typ, element_shape, element_permutation, element_dim_names); + + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + const auto tensor_array = std::static_pointer_cast(ext_arr); + ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); + ASSERT_OK(actual_tensor->Validate()); + + ASSERT_EQ(actual_tensor->type(), expected_tensor->type()); + ASSERT_EQ(actual_tensor->shape(), expected_tensor->shape()); + ASSERT_EQ(actual_tensor->strides(), expected_tensor->strides()); + ASSERT_EQ(actual_tensor->dim_names(), expected_tensor->dim_names()); + ASSERT_TRUE(actual_tensor->data()->Equals(*expected_tensor->data())); + ASSERT_TRUE(actual_tensor->Equals(*expected_tensor)); +} + +TEST_F(TestExtensionType, ToTensor) { + std::vector float_values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + + auto element_sizes = std::vector{6, 6, 18, 18, 18, 18}; + + auto element_shapes = std::vector>{{2, 3}, {3, 2}, {3, 6}, + {6, 3}, {3, 2, 3}, {3, 2, 3}}; + auto tensor_shapes = std::vector>{ + {6, 2, 3}, {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; + + auto element_permutations = std::vector>{ + {0, 1}, {1, 0}, {0, 1}, {1, 0}, {0, 1, 2}, {2, 1, 0}}; + auto tensor_strides_32 = + std::vector>{{24, 12, 4}, {24, 4, 8}, {72, 24, 4}, + {72, 4, 12}, {72, 24, 12, 4}, {72, 4, 12, 24}}; + auto tensor_strides_64 = + std::vector>{{48, 24, 8}, {48, 8, 16}, {144, 48, 8}, + {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; + + auto element_dim_names = std::vector>{ + {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"H", "W", "C"}, {"H", "W", "C"}}; + auto tensor_dim_names = std::vector>{ + {"", "y", "z"}, {"", "y", "z"}, {"", "y", "z"}, + {"", "y", "z"}, {"", "H", "W", "C"}, {"", "C", "W", "H"}}; + + for (size_t i = 0; i < element_shapes.size(); i++) { + CheckToTensor(float_values, float32(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_32[i]); + CheckToTensor(values_, int64(), element_sizes[i], element_shapes[i], + element_permutations[i], 
element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_64[i]); + } +} + void CheckTensorRoundtrip(const std::shared_ptr& tensor) { ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); @@ -364,7 +450,7 @@ TEST_F(TestExtensionType, SliceTensor) { Tensor::Make(value_type_, Buffer::Wrap(values_partial_), shape_partial_)); ASSERT_EQ(tensor->strides(), tensor_strides_); ASSERT_EQ(tensor_partial->strides(), tensor_strides_); - auto ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); @@ -404,11 +490,11 @@ TEST_F(TestExtensionType, ComputeStrides) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_2 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_3 = internal::checked_pointer_cast( - fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int32(), element_shape_, {}, dim_names_)); ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); @@ -462,4 +548,96 @@ TEST_F(TestExtensionType, ToString) { ASSERT_EQ(expected_3, result_3); } +TEST_F(TestExtensionType, GetTensor) { + auto arr = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto element_values = + std::vector>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}}; + + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {"x", "y"}); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_permuted_ext_type = + internal::checked_pointer_cast(permuted_ext_type); + + auto array = std::static_pointer_cast( + ExtensionType::WrapArray(ext_type, arr)); + auto permuted_array = std::static_pointer_cast( + ExtensionType::WrapArray(permuted_ext_type, arr)); + + for (size_t i = 0; i < element_values.size(); i++) { + // Get tensor from extension array with trivial permutation + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i)); + auto actual_ext_scalar = internal::checked_pointer_cast(scalar); + ASSERT_OK_AND_ASSIGN(auto actual_tensor, + exact_ext_type->MakeTensor(actual_ext_scalar)); + ASSERT_OK(actual_tensor->Validate()); + ASSERT_OK_AND_ASSIGN(auto expected_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {3, 4}, {}, {"x", "y"})); + ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape()); + ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names()); + ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides()); + ASSERT_EQ(actual_tensor->strides(), std::vector({32, 8})); + ASSERT_EQ(expected_tensor->type(), actual_tensor->type()); + ASSERT_TRUE(expected_tensor->Equals(*actual_tensor)); + + // Get tensor from extension array with non-trivial permutation + ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, + 
Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {4, 3}, {8, 24}, {"y", "x"})); + ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); + ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, + exact_permuted_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + ASSERT_OK(actual_permuted_tensor->Validate()); + ASSERT_EQ(expected_permuted_tensor->strides(), actual_permuted_tensor->strides()); + ASSERT_EQ(expected_permuted_tensor->shape(), actual_permuted_tensor->shape()); + ASSERT_EQ(expected_permuted_tensor->dim_names(), actual_permuted_tensor->dim_names()); + ASSERT_EQ(expected_permuted_tensor->type(), actual_permuted_tensor->type()); + ASSERT_EQ(expected_permuted_tensor->is_contiguous(), + actual_permuted_tensor->is_contiguous()); + ASSERT_EQ(expected_permuted_tensor->is_column_major(), + actual_permuted_tensor->is_column_major()); + ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor)); + } + + // Test null values fail + auto element_type = fixed_size_list(int64(), 1); + auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]"); + ext_type = fixed_shape_tensor(int64(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + auto tensor_array = internal::checked_pointer_cast(ext_arr); + + ASSERT_OK_AND_ASSIGN(auto scalar, tensor_array->GetScalar(0)); + ASSERT_OK_AND_ASSIGN(auto tensor, + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(1)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(2)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + element_type = list(utf8()); + ext_type = fixed_shape_tensor(utf8(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + scalar = std::make_shared(ArrayFromJSON(element_type, R"([["a", "b"]])")); + auto ext_scalar = std::make_shared(scalar, ext_type); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + testing::HasSubstr("Type error: Cannot convert non-fixed-width values to Tensor."), + exact_ext_type->MakeTensor(ext_scalar)); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h new file mode 100644 index 0000000000000..069880cb17c85 --- /dev/null +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
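// Illustrative sketch (not part of this patch): the permutation check implemented by
// the internal helper declared in the new header below, as seen through the public
// FixedShapeTensorType::Make factory. Permutation indices must be a permutation of
// [0, ndim); otherwise Make() now returns an Invalid status.
#include <cassert>

#include "arrow/api.h"
#include "arrow/extension/fixed_shape_tensor.h"

void PermutationExamples() {
  using arrow::extension::FixedShapeTensorType;
  // OK: {1, 0} is a valid permutation for a 2-D shape.
  auto ok = FixedShapeTensorType::Make(arrow::int64(), {5, 6}, {1, 0});
  assert(ok.ok());
  // Error: index 3 is out of range for a 2-D shape.
  auto out_of_range = FixedShapeTensorType::Make(arrow::int64(), {5, 6}, {3, 0});
  assert(!out_of_range.ok());
  // Error: index 1 is repeated.
  auto duplicated = FixedShapeTensorType::Make(arrow::int64(), {1, 2, 3}, {0, 1, 1});
  assert(!duplicated.ok());
}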
+ +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/print.h" + +namespace arrow::internal { + +ARROW_EXPORT +Status IsPermutationValid(const std::vector& permutation) { + const auto size = static_cast(permutation.size()); + std::vector dim_seen(size, 0); + + for (const auto p : permutation) { + if (p < 0 || p >= size || dim_seen[p] != 0) { + return Status::Invalid( + "Permutation indices for ", size, + " dimensional tensors must be unique and within [0, ", size - 1, + "] range. Got: ", ::arrow::internal::PrintVector{permutation, ","}); + } + dim_seen[p] = 1; + } + return Status::OK(); +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 81fd813962792..d4bb445701444 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1924,6 +1924,26 @@ class AzureFileSystem::Impl { } } + Status DeleteFile(const AzureLocation& location) { + RETURN_NOT_OK(ValidateFileLocation(location)); + auto file_client = datalake_service_client_->GetFileSystemClient(location.container) + .GetFileClient(location.path); + try { + auto response = file_client.Delete(); + // Only the "*IfExists" functions ever set Deleted to false. + // All the others either succeed or throw an exception. + DCHECK(response.Value.Deleted); + } catch (const Storage::StorageException& exception) { + if (exception.ErrorCode == "FilesystemNotFound" || + exception.ErrorCode == "PathNotFound") { + return PathNotFound(location); + } + return ExceptionToStatus(exception, "Failed to delete a file: ", location.path, + ": ", file_client.GetUrl()); + } + return Status::OK(); + } + private: /// \brief Create a BlobLeaseClient and acquire a lease on the container. 
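// Illustrative sketch (not part of this patch): how the newly wired-up
// AzureFileSystem::DeleteFile behaves from a caller's perspective. Filesystem
// construction (account, credentials) is elided; `fs` stands for an already configured
// arrow::fs::FileSystem pointing at Azure storage, and the paths are placeholders.
#include <memory>

#include "arrow/filesystem/filesystem.h"
#include "arrow/status.h"

arrow::Status DeleteOneBlob(const std::shared_ptr<arrow::fs::FileSystem>& fs) {
  // Deleting a regular file succeeds.
  ARROW_RETURN_NOT_OK(fs->DeleteFile("my-container/data/part-0.parquet"));
  // Deleting a missing path, a container, or a directory is an IOError, matching the
  // tests added below.
  arrow::Status st = fs->DeleteFile("my-container/does-not-exist");
  return st.IsIOError() ? arrow::Status::OK() : st;
}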
/// @@ -2523,7 +2543,8 @@ Status AzureFileSystem::DeleteRootDirContents() { } Status AzureFileSystem::DeleteFile(const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->DeleteFile(location); } Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index df9e6c4aac6a4..a8623093c0141 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1816,6 +1816,38 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { this->TestDeleteDirContentsFailureNonexistent(); } +TEST_F(TestAzuriteFileSystem, DeleteFileSuccess) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + ASSERT_OK(fs()->CreateDir(container_name)); + const auto file_name = ConcatAbstractPath(container_name, "abc"); + CreateFile(fs(), file_name, "data"); + arrow::fs::AssertFileInfo(fs(), file_name, FileType::File); + ASSERT_OK(fs()->DeleteFile(file_name)); + arrow::fs::AssertFileInfo(fs(), file_name, FileType::NotFound); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureNonexistent) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + ASSERT_OK(fs()->CreateDir(container_name)); + const auto nonexistent_file_name = ConcatAbstractPath(container_name, "nonexistent"); + ASSERT_RAISES(IOError, fs()->DeleteFile(nonexistent_file_name)); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureContainer) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + ASSERT_OK(fs()->CreateDir(container_name)); + arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); + ASSERT_RAISES(IOError, fs()->DeleteFile(container_name)); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureDirectory) { + const auto directory_name = + ConcatAbstractPath(PreexistingData::RandomContainerName(rng_), "directory"); + ASSERT_OK(fs()->CreateDir(directory_name)); + arrow::fs::AssertFileInfo(fs(), directory_name, FileType::Directory); + ASSERT_RAISES(IOError, fs()->DeleteFile(directory_name)); +} + TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); @@ -2302,6 +2334,5 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); ASSERT_RAISES(Invalid, stream->Seek(2)); } - } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index 81542d339bd70..988cc1f25b91c 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -433,6 +433,11 @@ Result> CudaMemoryManager::CopyBufferTo( Result> CudaMemoryManager::CopyNonOwnedTo( const Buffer& buf, const std::shared_ptr& to) { if (to->is_cpu()) { + auto sync_event = buf.device_sync_event(); + if (sync_event) { + RETURN_NOT_OK(sync_event->Wait()); + } + // Device-to-CPU copy std::unique_ptr dest; ARROW_ASSIGN_OR_RAISE(auto from_context, cuda_device()->GetContext()); diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index bd2c2b716d502..c5075299a3e35 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1336,30 +1336,11 @@ class CopyCollectListener : public CollectListener { Status 
OnRecordBatchWithMetadataDecoded( RecordBatchWithMetadata record_batch_with_metadata) override { - auto& record_batch = record_batch_with_metadata.batch; - for (auto column_data : record_batch->column_data()) { - ARROW_RETURN_NOT_OK(CopyArrayData(column_data)); - } - return CollectListener::OnRecordBatchWithMetadataDecoded(record_batch_with_metadata); - } + ARROW_ASSIGN_OR_RAISE( + record_batch_with_metadata.batch, + record_batch_with_metadata.batch->CopyTo(default_cpu_memory_manager())); - private: - Status CopyArrayData(std::shared_ptr data) { - auto& buffers = data->buffers; - for (size_t i = 0; i < buffers.size(); ++i) { - auto& buffer = buffers[i]; - if (!buffer) { - continue; - } - ARROW_ASSIGN_OR_RAISE(buffers[i], Buffer::Copy(buffer, buffer->memory_manager())); - } - for (auto child_data : data->child_data) { - ARROW_RETURN_NOT_OK(CopyArrayData(child_data)); - } - if (data->dictionary) { - ARROW_RETURN_NOT_OK(CopyArrayData(data->dictionary)); - } - return Status::OK(); + return CollectListener::OnRecordBatchWithMetadataDecoded(record_batch_with_metadata); } }; diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 843329c17bc28..d58c203d2ae27 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -195,7 +195,7 @@ bool IsDebugEnabled() { return false; } auto env_value = *std::move(maybe_env_value); - if (env_value.empty()) { + if (env_value.empty() || env_value == "none") { return false; } auto debug_state = DebugState::Instance(); @@ -212,7 +212,7 @@ bool IsDebugEnabled() { return true; } ARROW_LOG(WARNING) << "Invalid value for " << kDebugMemoryEnvVar << ": '" << env_value - << "'. Valid values are 'abort', 'trap', 'warn'."; + << "'. Valid values are 'abort', 'trap', 'warn', 'none'."; return false; }(); diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 457135fa400d5..ca6b45af3d6b4 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -357,6 +357,30 @@ Status ValidateBatch(const RecordBatch& batch, bool full_validation) { } // namespace +Result> RecordBatch::CopyTo( + const std::shared_ptr& to) const { + ArrayVector copied_columns; + copied_columns.reserve(num_columns()); + for (const auto& col : columns()) { + ARROW_ASSIGN_OR_RAISE(auto c, col->CopyTo(to)); + copied_columns.push_back(std::move(c)); + } + + return Make(schema_, num_rows(), std::move(copied_columns)); +} + +Result> RecordBatch::ViewOrCopyTo( + const std::shared_ptr& to) const { + ArrayVector copied_columns; + copied_columns.reserve(num_columns()); + for (const auto& col : columns()) { + ARROW_ASSIGN_OR_RAISE(auto c, col->ViewOrCopyTo(to)); + copied_columns.push_back(std::move(c)); + } + + return Make(schema_, num_rows(), std::move(copied_columns)); +} + Status RecordBatch::Validate() const { return ValidateBatch(*this, /*full_validation=*/false); } diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 1a66fc3fb5629..79f93a7b5997f 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -186,6 +186,25 @@ class ARROW_EXPORT RecordBatch { /// \return the number of rows (the corresponding length of each column) int64_t num_rows() const { return num_rows_; } + /// \brief Copy the entire RecordBatch to destination MemoryManager + /// + /// This uses Array::CopyTo on each column of the record batch to create + /// a new record batch where all underlying buffers for the columns have + /// been copied to the destination MemoryManager. 
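A short usage sketch for the CopyTo and ViewOrCopyTo additions documented here, assuming `batch` already holds columns whose buffers may live on a non-CPU device:

#include <memory>

#include "arrow/device.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"

// Copy every column's buffers into CPU memory. ViewOrCopyTo would first try a
// zero-copy view and only fall back to copying when a view is not possible.
arrow::Result<std::shared_ptr<arrow::RecordBatch>> ToCpu(
    const std::shared_ptr<arrow::RecordBatch>& batch) {
  return batch->CopyTo(arrow::default_cpu_memory_manager());
}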
This uses + /// MemoryManager::CopyBuffer under the hood. + Result> CopyTo( + const std::shared_ptr& to) const; + + /// \brief View or Copy the entire RecordBatch to destination MemoryManager + /// + /// This uses Array::ViewOrCopyTo on each column of the record batch to create + /// a new record batch where all underlying buffers for the columns have + /// been zero-copy viewed on the destination MemoryManager, falling back + /// to performing a copy if it can't be viewed as a zero-copy buffer. This uses + /// Buffer::ViewOrCopy under the hood. + Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + /// \brief Slice each of the arrays in the record batch /// \param[in] offset the starting offset to slice, through end of batch /// \return new record batch diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index bc48ae76c2a2f..002e8b0975094 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -90,7 +90,7 @@ void KeyValueMetadata::Append(std::string key, std::string value) { values_.push_back(std::move(value)); } -Result KeyValueMetadata::Get(const std::string& key) const { +Result KeyValueMetadata::Get(std::string_view key) const { auto index = FindKey(key); if (index < 0) { return Status::KeyError(key); @@ -129,7 +129,7 @@ Status KeyValueMetadata::DeleteMany(std::vector indices) { return Status::OK(); } -Status KeyValueMetadata::Delete(const std::string& key) { +Status KeyValueMetadata::Delete(std::string_view key) { auto index = FindKey(key); if (index < 0) { return Status::KeyError(key); @@ -138,20 +138,18 @@ Status KeyValueMetadata::Delete(const std::string& key) { } } -Status KeyValueMetadata::Set(const std::string& key, const std::string& value) { +Status KeyValueMetadata::Set(std::string key, std::string value) { auto index = FindKey(key); if (index < 0) { - Append(key, value); + Append(std::move(key), std::move(value)); } else { - keys_[index] = key; - values_[index] = value; + keys_[index] = std::move(key); + values_[index] = std::move(value); } return Status::OK(); } -bool KeyValueMetadata::Contains(const std::string& key) const { - return FindKey(key) >= 0; -} +bool KeyValueMetadata::Contains(std::string_view key) const { return FindKey(key) >= 0; } void KeyValueMetadata::reserve(int64_t n) { DCHECK_GE(n, 0); @@ -188,7 +186,7 @@ std::vector> KeyValueMetadata::sorted_pairs( return pairs; } -int KeyValueMetadata::FindKey(const std::string& key) const { +int KeyValueMetadata::FindKey(std::string_view key) const { for (size_t i = 0; i < keys_.size(); ++i) { if (keys_[i] == key) { return static_cast(i); diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h index 8702ce73a639a..57ade11e75868 100644 --- a/cpp/src/arrow/util/key_value_metadata.h +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,13 +45,13 @@ class ARROW_EXPORT KeyValueMetadata { void ToUnorderedMap(std::unordered_map* out) const; void Append(std::string key, std::string value); - Result Get(const std::string& key) const; - bool Contains(const std::string& key) const; + Result Get(std::string_view key) const; + bool Contains(std::string_view key) const; // Note that deleting may invalidate known indices - Status Delete(const std::string& key); + Status Delete(std::string_view key); Status Delete(int64_t index); Status DeleteMany(std::vector indices); - Status Set(const std::string& key, const 
std::string& value); + Status Set(std::string key, std::string value); void reserve(int64_t n); @@ -63,7 +64,7 @@ class ARROW_EXPORT KeyValueMetadata { std::vector> sorted_pairs() const; /// \brief Perform linear search for key, returning -1 if not found - int FindKey(const std::string& key) const; + int FindKey(std::string_view key) const; std::shared_ptr Copy() const; diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 3f038f54a7b27..d773fb5ff5895 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -229,6 +229,15 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(TEST_NAME gandiva-${REL_TEST_NAME}) string(REPLACE "_" "-" TEST_NAME ${TEST_NAME}) + + if(ARG_USE_STATIC_LINKING OR ARROW_TEST_LINKAGE STREQUAL "static") + # LLVM 17 or later requires that an executable exports + # "llvm_orc_registerEHFrameSectionWrapper()" and + # "llvm_orc_unregisterEHFrameSectionWrapper()". We need to do + # nothing when we use libLLVM.so. But we need to export symbols + # explicitly when we use libLLVM*.a. + set_target_properties(${TEST_NAME} PROPERTIES ENABLE_EXPORTS TRUE) + endif() endfunction() add_gandiva_test(internals-test diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index fc047f2ac0763..bfce72cefc630 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -62,7 +62,11 @@ #endif #include #include +#if LLVM_VERSION_MAJOR >= 18 +#include +#else #include +#endif #include #include #if LLVM_VERSION_MAJOR >= 14 @@ -86,7 +90,9 @@ #include #include #include +#if LLVM_VERSION_MAJOR <= 17 #include +#endif // JITLink is available in LLVM 9+ // but the `InProcessMemoryManager::Create` API was added since LLVM 14 @@ -132,8 +138,13 @@ Result MakeTargetMachineBuilder( jtmb.setCPU(cpu_name.str()); jtmb.addFeatures(cpu_attrs); } +#if LLVM_VERSION_MAJOR >= 18 + using CodeGenOptLevel = llvm::CodeGenOptLevel; +#else + using CodeGenOptLevel = llvm::CodeGenOpt::Level; +#endif auto const opt_level = - conf.optimize() ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None; + conf.optimize() ? 
CodeGenOptLevel::Aggressive : CodeGenOptLevel::None; jtmb.setCodeGenOptLevel(opt_level); return jtmb; } diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 55292ac35ab9c..f56ba0958ae2d 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -34,10 +34,6 @@ Result> FromFLBA(const LogicalType& logical_t Result> FromInt32(const LogicalType& logical_type); Result> FromInt64(const LogicalType& logical_type); -Result> GetArrowType(Type::type physical_type, - const LogicalType& logical_type, - int type_length); - Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); diff --git a/cpp/src/parquet/column_reader_benchmark.cc b/cpp/src/parquet/column_reader_benchmark.cc index 49b2317ede187..61fe397cf1c30 100644 --- a/cpp/src/parquet/column_reader_benchmark.cc +++ b/cpp/src/parquet/column_reader_benchmark.cc @@ -219,5 +219,103 @@ BENCHMARK(RecordReaderReadRecords) ->Args({2, 1000, true}) ->Args({2, 1000, false}); +void GenerateLevels(int level_repeats, int max_level, int num_levels, + std::vector* levels) { + // Generate random levels + std::default_random_engine gen(/*seed=*/1943); + std::uniform_int_distribution d(0, max_level); + for (int i = 0; i < num_levels;) { + int16_t current_level = d(gen); // level repeat `level_repeats` times + const int current_repeated = std::min(level_repeats, num_levels - i); + levels->insert(levels->end(), current_repeated, current_level); + i += current_repeated; + } +} + +void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels, + const int16_t* input_levels, std::vector* bytes) { + LevelEncoder encoder; + // encode levels + if (encoding == Encoding::RLE) { + int rle_size = LevelEncoder::MaxBufferSize(encoding, max_level, num_levels); + bytes->resize(rle_size + sizeof(int32_t)); + // leave space to write the rle length value + encoder.Init(encoding, max_level, num_levels, bytes->data() + sizeof(int32_t), + rle_size); + encoder.Encode(num_levels, input_levels); + int data_length = encoder.len(); + memcpy(bytes->data(), &data_length, sizeof(int32_t)); + } else { + int bitpack_size = + LevelEncoder::MaxBufferSize(encoding, max_level, num_levels) + sizeof(int32_t); + bytes->resize(bitpack_size); + encoder.Init(encoding, max_level, num_levels, bytes->data(), + static_cast(bytes->size())); + encoder.Encode(num_levels, input_levels); + } +} + +static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int num_levels, + int batch_size, int level_repeat_count, + ::benchmark::State& state) { + std::vector bytes; + { + std::vector input_levels; + GenerateLevels(/*level_repeats=*/level_repeat_count, /*max_repeat_factor=*/max_level, + num_levels, &input_levels); + EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), &bytes); + } + + LevelDecoder decoder; + std::vector output_levels(batch_size); + for (auto _ : state) { + state.PauseTiming(); + decoder.SetData(level_encoding, max_level, num_levels, bytes.data(), + static_cast(bytes.size())); + state.ResumeTiming(); + // Decode multiple times with batch_size + while (true) { + int levels_decoded = decoder.Decode(batch_size, output_levels.data()); + if (levels_decoded == 0) { + break; + } + } + } + state.SetBytesProcessed(state.iterations() * num_levels * sizeof(int16_t)); + state.SetItemsProcessed(state.iterations() * num_levels); +} + +static void 
ReadLevels_Rle(::benchmark::State& state) { + int16_t max_level = static_cast(state.range(0)); + int num_levels = static_cast(state.range(1)); + int batch_size = static_cast(state.range(2)); + int level_repeat_count = static_cast(state.range(3)); + DecodeLevels(Encoding::RLE, max_level, num_levels, batch_size, level_repeat_count, + state); +} + +static void ReadLevels_BitPack(::benchmark::State& state) { + int16_t max_level = static_cast(state.range(0)); + int num_levels = static_cast(state.range(1)); + int batch_size = static_cast(state.range(2)); + int level_repeat_count = static_cast(state.range(3)); + DecodeLevels(Encoding::BIT_PACKED, max_level, num_levels, batch_size, + level_repeat_count, state); +} + +static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) { + b->ArgNames({"MaxLevel", "NumLevels", "BatchSize", "LevelRepeatCount"}) + ->Args({1, 8096, 1024, 1}) + ->Args({1, 8096, 1024, 7}) + ->Args({1, 8096, 1024, 1024}) + ->Args({1, 8096, 2048, 1}) + ->Args({3, 8096, 1024, 1}) + ->Args({3, 8096, 2048, 1}) + ->Args({3, 8096, 1024, 7}); +} + +BENCHMARK(ReadLevels_Rle)->Apply(ReadLevelsArguments); +BENCHMARK(ReadLevels_BitPack)->Apply(ReadLevelsArguments); + } // namespace benchmark } // namespace parquet diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 23366b2daafd5..eae8fc6125499 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -442,7 +442,8 @@ class SerializedPageWriter : public PageWriter { if (offset_index_builder_ != nullptr) { const int64_t compressed_size = output_data_len + header_size; if (compressed_size > std::numeric_limits::max()) { - throw ParquetException("Compressed page size overflows INT32_MAX."); + throw ParquetException("Compressed page size ", compressed_size, + " overflows INT32_MAX."); } if (!page.first_row_index().has_value()) { throw ParquetException("First row index is not set in data page."); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 97421629d2ca6..a40e71ce30aec 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1021,7 +1021,7 @@ void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels, } void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level, - std::vector& input_levels, + const std::vector& input_levels, std::vector& bytes) { LevelDecoder decoder; int levels_count = 0; @@ -1060,7 +1060,7 @@ void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level, } void VerifyDecodingMultipleSetData(Encoding::type encoding, int16_t max_level, - std::vector& input_levels, + const std::vector& input_levels, std::vector>& bytes) { LevelDecoder decoder; int levels_count = 0; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b801b5ab11bb9..a3d1746536647 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -160,7 +160,8 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { *array.data(), [&](::std::string_view view) { if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } UnsafePutByteArray(view.data(), static_cast(view.size())); return Status::OK(); @@ -571,7 +572,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { *array.data(), [&](::std::string_view view) { 
if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } PutByteArray(view.data(), static_cast(view.size())); return Status::OK(); @@ -585,7 +587,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { for (int64_t i = 0; i < array.length(); i++) { auto v = array.GetView(i); if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) { - throw ParquetException("Parquet cannot store strings with size 2GB or more"); + throw ParquetException( + "Parquet cannot store strings with size 2GB or more, got: ", v.size()); } dict_encoded_size_ += static_cast(v.size() + sizeof(uint32_t)); int32_t unused_memo_index; @@ -2411,7 +2414,11 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecodernum_values_ = num_values; - decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + if (decoder_ == nullptr) { + decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + } else { + decoder_->Reset(data, len); + } InitHeader(); } @@ -2667,7 +2674,8 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, *array.data(), [&](::std::string_view view) { if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } length_encoder_.Put({static_cast(view.length())}, 1); PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length())); @@ -2769,7 +2777,11 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, void SetData(int num_values, const uint8_t* data, int len) override { DecoderImpl::SetData(num_values, data, len); - decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + if (decoder_ == nullptr) { + decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + } else { + decoder_->Reset(data, len); + } DecodeLengths(); } @@ -3192,7 +3204,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } const ByteArray src{view}; @@ -3238,7 +3251,8 @@ struct ByteArrayVisitor { std::string_view operator[](int i) const { if (ARROW_PREDICT_FALSE(src[i].len >= kMaxByteArraySize)) { - throw ParquetException("Parquet cannot store strings with size 2GB or more"); + throw ParquetException("Parquet cannot store strings with size 2GB or more, got: ", + src[i].len); } return std::string_view{src[i]}; } diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 2664775c0fbf4..dd3f5da84f777 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -115,8 +115,8 @@ ARROW_UTF8PROC_BUILD_VERSION=v2.7.0 ARROW_UTF8PROC_BUILD_SHA256_CHECKSUM=4bb121e297293c0fd55f08f83afab6d35d48f0af4ecc07523ad8ec99aa2b12a1 ARROW_XSIMD_BUILD_VERSION=9.0.1 ARROW_XSIMD_BUILD_SHA256_CHECKSUM=b1bb5f92167fd3a4f25749db0be7e61ed37e0a5d943490f3accdcd2cd2918cc0 -ARROW_ZLIB_BUILD_VERSION=1.3 -ARROW_ZLIB_BUILD_SHA256_CHECKSUM=ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e +ARROW_ZLIB_BUILD_VERSION=1.3.1 +ARROW_ZLIB_BUILD_SHA256_CHECKSUM=9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23 ARROW_ZSTD_BUILD_VERSION=1.5.5 ARROW_ZSTD_BUILD_SHA256_CHECKSUM=9c4396cc829cfae319a6e2615202e82aad41372073482fce286fac78646d3ee4 diff --git a/csharp/README.md b/csharp/README.md index 6e6ed9c756873..b36eb899db2d5 100644 --- a/csharp/README.md +++ b/csharp/README.md @@ -115,10 +115,10 @@ for currently available features. ### Compression -- Buffer compression is not supported when writing IPC files or streams -- Buffer decompression is supported, but requires installing the `Apache.Arrow.Compression` package, - and passing an `Apache.Arrow.Compression.CompressionCodecFactory` instance to the - `ArrowFileReader` or `ArrowStreamReader` constructor. +- Buffer compression and decompression is supported, but requires installing the `Apache.Arrow.Compression` package. + When reading compressed data, you must pass an `Apache.Arrow.Compression.CompressionCodecFactory` instance to the + `ArrowFileReader` or `ArrowStreamReader` constructor, and when writing compressed data a + `CompressionCodecFactory` must be set in the `IpcOptions`. Alternatively, a custom implementation of `ICompressionCodecFactory` can be used. 
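As a concrete illustration of the workflow described above, a small round-trip sketch that writes a Zstd-compressed IPC file and reads it back; the `batch` argument and the chosen compression level are assumptions, not fixed requirements:

using System.IO;
using Apache.Arrow;
using Apache.Arrow.Compression;
using Apache.Arrow.Ipc;

static RecordBatch RoundTripCompressed(RecordBatch batch)
{
    var options = new IpcOptions
    {
        CompressionCodec = CompressionCodecType.Zstd,
        CompressionCodecFactory = new CompressionCodecFactory(),
        CompressionLevel = 2,  // optional; the codec default is used when omitted
    };

    using var stream = new MemoryStream();
    using (var writer = new ArrowFileWriter(stream, batch.Schema, leaveOpen: true, options))
    {
        writer.WriteRecordBatch(batch);
        writer.WriteEnd();
    }

    // Reading compressed data still requires a codec factory.
    stream.Position = 0;
    using var reader = new ArrowFileReader(stream, new CompressionCodecFactory());
    return reader.ReadNextRecordBatch();
}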
## Not Implemented diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index fded62911262c..6988567193db4 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -1,10 +1,16 @@ - netstandard2.0 Provides decompression support for the Arrow IPC format + + netstandard2.0;net462 + + + netstandard2.0 + + diff --git a/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs index 3e0a537a89a8f..4bfcdf6544f9d 100644 --- a/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs +++ b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs @@ -24,11 +24,16 @@ namespace Apache.Arrow.Compression public sealed class CompressionCodecFactory : ICompressionCodecFactory { public ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType) + { + return CreateCodec(compressionCodecType, null); + } + + public ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType, int? compressionLevel) { return compressionCodecType switch { - CompressionCodecType.Lz4Frame => Lz4CompressionCodec.Instance, - CompressionCodecType.Zstd => new ZstdCompressionCodec(), + CompressionCodecType.Lz4Frame => new Lz4CompressionCodec(compressionLevel), + CompressionCodecType.Zstd => new ZstdCompressionCodec(compressionLevel), _ => throw new NotImplementedException($"Compression type {compressionCodecType} is not supported") }; } diff --git a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs index ebbcfbc3e095f..df19c16a30213 100644 --- a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs @@ -14,17 +14,35 @@ // limitations under the License. using System; +using System.IO; using Apache.Arrow.Ipc; +using K4os.Compression.LZ4; using K4os.Compression.LZ4.Streams; namespace Apache.Arrow.Compression { internal sealed class Lz4CompressionCodec : ICompressionCodec { - /// - /// Singleton instance, used as this class doesn't need to be disposed and has no state - /// - public static readonly Lz4CompressionCodec Instance = new Lz4CompressionCodec(); + private readonly LZ4EncoderSettings _settings = null; + + public Lz4CompressionCodec(int? 
compressionLevel = null) + { + if (compressionLevel.HasValue) + { + if (Enum.IsDefined(typeof(LZ4Level), compressionLevel)) + { + _settings = new LZ4EncoderSettings + { + CompressionLevel = (LZ4Level) compressionLevel, + }; + } + else + { + throw new ArgumentException( + $"Invalid LZ4 compression level ({compressionLevel})", nameof(compressionLevel)); + } + } + } public int Decompress(ReadOnlyMemory source, Memory destination) { @@ -32,6 +50,12 @@ public int Decompress(ReadOnlyMemory source, Memory destination) return decoder.ReadManyBytes(destination.Span); } + public void Compress(ReadOnlyMemory source, Stream destination) + { + using var encoder = LZ4Frame.Encode(destination, _settings, leaveOpen: true); + encoder.WriteManyBytes(source.Span); + } + public void Dispose() { } diff --git a/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs index 92c2e65371612..cc340a7cd1b9f 100644 --- a/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs @@ -14,6 +14,7 @@ // limitations under the License. using System; +using System.IO; using Apache.Arrow.Ipc; using ZstdSharp; @@ -22,10 +23,21 @@ namespace Apache.Arrow.Compression internal sealed class ZstdCompressionCodec : ICompressionCodec { private readonly Decompressor _decompressor; + private readonly Compressor _compressor; - public ZstdCompressionCodec() + public ZstdCompressionCodec(int? compressionLevel = null) { + if (compressionLevel.HasValue && + (compressionLevel.Value < Compressor.MinCompressionLevel || + compressionLevel.Value > Compressor.MaxCompressionLevel)) + { + throw new ArgumentException( + $"Zstd compression level must be between {Compressor.MinCompressionLevel} and {Compressor.MaxCompressionLevel}", + nameof(compressionLevel)); + } + _decompressor = new Decompressor(); + _compressor = new Compressor(compressionLevel ?? Compressor.DefaultCompressionLevel); } public int Decompress(ReadOnlyMemory source, Memory destination) @@ -33,9 +45,17 @@ public int Decompress(ReadOnlyMemory source, Memory destination) return _decompressor.Unwrap(source.Span, destination.Span); } + public void Compress(ReadOnlyMemory source, Stream destination) + { + using var compressor = new CompressionStream( + destination, _compressor, preserveCompressor: true, leaveOpen: true); + compressor.Write(source.Span); + } + public void Dispose() { _decompressor.Dispose(); + _compressor.Dispose(); } } } diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 68c3e47e01902..3a6ae28b390d2 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 3a229f4ffcaf8..c4bb64b73a9ed 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -1,14 +1,20 @@ - netstandard2.0;net6.0 true $(DefineConstants);UNSAFE_BYTEBUFFER;BYTEBUFFER_NO_BOUNDS_CHECK;ENABLE_SPAN_T Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. 
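Building on the codec-level plumbing above, a hedged sketch of using the factory directly with an explicit LZ4 level and compressing a buffer into a stream; the payload bytes are purely illustrative:

using System;
using System.IO;
using Apache.Arrow.Compression;
using Apache.Arrow.Ipc;
using K4os.Compression.LZ4;

ICompressionCodecFactory factory = new CompressionCodecFactory();
// LZ4 levels come from K4os.Compression.LZ4; Zstd levels are range-checked against
// Compressor.MinCompressionLevel/MaxCompressionLevel by ZstdCompressionCodec.
using ICompressionCodec codec = factory.CreateCodec(
    CompressionCodecType.Lz4Frame, (int)LZ4Level.L03_HC);

ReadOnlyMemory<byte> source = new byte[] { 1, 2, 3, 4 };  // illustrative payload
using var destination = new MemoryStream();
codec.Compress(source, destination);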
- + + netstandard2.0;net6.0;net462 + + + netstandard2.0;net6.0 + + + @@ -34,7 +40,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs b/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs index fe42075f14f73..e0e0f5707086b 100644 --- a/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs +++ b/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs @@ -25,5 +25,12 @@ public static void Deconstruct(this Tuple value, out T1 item1, o item1 = value.Item1; item2 = value.Item2; } + + public static void Deconstruct(this Tuple value, out T1 item1, out T2 item2, out T3 item3) + { + item1 = value.Item1; + item2 = value.Item2; + item3 = value.Item3; + } } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs index 547fa800ec71e..a643012bab1a2 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs @@ -20,6 +20,7 @@ using System.IO; using System.Threading; using System.Threading.Tasks; +using Apache.Arrow.Memory; namespace Apache.Arrow.Ipc { @@ -37,12 +38,17 @@ public ArrowFileWriter(Stream stream, Schema schema) } public ArrowFileWriter(Stream stream, Schema schema, bool leaveOpen) - : this(stream, schema, leaveOpen, options: null) + : this(stream, schema, leaveOpen, options: null, allocator: null) { } public ArrowFileWriter(Stream stream, Schema schema, bool leaveOpen, IpcOptions options) - : base(stream, schema, leaveOpen, options) + : this(stream, schema, leaveOpen, options, allocator: null) + { + } + + public ArrowFileWriter(Stream stream, Schema schema, bool leaveOpen, IpcOptions options, MemoryAllocator allocator) + : base(stream, schema, leaveOpen, options, allocator) { if (!stream.CanWrite) { diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 07d1dcfdb171d..b002f8c8b1578 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -22,6 +22,7 @@ using System.Threading; using System.Threading.Tasks; using Apache.Arrow.Arrays; +using Apache.Arrow.Memory; using Apache.Arrow.Types; using Google.FlatBuffers; @@ -29,7 +30,7 @@ namespace Apache.Arrow.Ipc { public class ArrowStreamWriter : IDisposable { - internal class ArrowRecordBatchFlatBufferBuilder : + private class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -81,14 +82,21 @@ public Buffer(ArrowBuffer buffer, int offset) } private readonly List _buffers; + private readonly ICompressionCodec _compressionCodec; + private readonly MemoryAllocator _allocator; + private readonly MemoryStream _compressionStream; public IReadOnlyList Buffers => _buffers; public List VariadicCounts { get; private set; } public int TotalLength { get; private set; } - public ArrowRecordBatchFlatBufferBuilder() + public ArrowRecordBatchFlatBufferBuilder( + ICompressionCodec compressionCodec, MemoryAllocator allocator, MemoryStream compressionStream) { + _compressionCodec = compressionCodec; + _compressionStream = compressionStream; + _allocator = allocator; _buffers = new List(); TotalLength = 0; } @@ -238,11 +246,50 @@ private void CreateBuffers(PrimitiveArray array) private Buffer CreateBuffer(ArrowBuffer buffer) { int offset = TotalLength; + const int UncompressedLengthSize = 8; - int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(buffer.Length)); + ArrowBuffer bufferToWrite; + if 
(_compressionCodec == null) + { + bufferToWrite = buffer; + } + else if (buffer.Length == 0) + { + // Write zero length and skip compression + var uncompressedLengthBytes = _allocator.Allocate(UncompressedLengthSize); + BinaryPrimitives.WriteInt64LittleEndian(uncompressedLengthBytes.Memory.Span, 0); + bufferToWrite = new ArrowBuffer(uncompressedLengthBytes); + } + else + { + // See format/Message.fbs, and the BUFFER BodyCompressionMethod for documentation on how + // compressed buffers are stored. + _compressionStream.Seek(0, SeekOrigin.Begin); + _compressionStream.SetLength(0); + _compressionCodec.Compress(buffer.Memory, _compressionStream); + if (_compressionStream.Length < buffer.Length) + { + var newBuffer = _allocator.Allocate((int) _compressionStream.Length + UncompressedLengthSize); + BinaryPrimitives.WriteInt64LittleEndian(newBuffer.Memory.Span, buffer.Length); + _compressionStream.Seek(0, SeekOrigin.Begin); + _compressionStream.ReadFullBuffer(newBuffer.Memory.Slice(UncompressedLengthSize)); + bufferToWrite = new ArrowBuffer(newBuffer); + } + else + { + // If the compressed buffer is larger than the uncompressed buffer, use the uncompressed + // buffer instead, and indicate this by setting the uncompressed length to -1 + var newBuffer = _allocator.Allocate(buffer.Length + UncompressedLengthSize); + BinaryPrimitives.WriteInt64LittleEndian(newBuffer.Memory.Span, -1); + buffer.Memory.CopyTo(newBuffer.Memory.Slice(UncompressedLengthSize)); + bufferToWrite = new ArrowBuffer(newBuffer); + } + } + + int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(bufferToWrite.Length)); TotalLength += paddedLength; - return new Buffer(buffer, offset); + return new Buffer(bufferToWrite, offset); } public void Visit(IArrowArray array) @@ -269,6 +316,9 @@ public void Visit(IArrowArray array) private readonly bool _leaveOpen; private readonly IpcOptions _options; + private readonly MemoryAllocator _allocator; + // Reuse a single memory stream for writing compressed data to, to reduce memory allocations + private readonly MemoryStream _compressionStream = new MemoryStream(); private protected const Flatbuf.MetadataVersion CurrentMetadataVersion = Flatbuf.MetadataVersion.V5; @@ -285,15 +335,21 @@ public ArrowStreamWriter(Stream baseStream, Schema schema) } public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen) - : this(baseStream, schema, leaveOpen, options: null) + : this(baseStream, schema, leaveOpen, options: null, allocator: null) { } public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOptions options) + : this(baseStream, schema, leaveOpen, options, allocator: null) + { + } + + public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOptions options, MemoryAllocator allocator) { BaseStream = baseStream ?? throw new ArgumentNullException(nameof(baseStream)); Schema = schema ?? throw new ArgumentNullException(nameof(schema)); _leaveOpen = leaveOpen; + _allocator = allocator ?? MemoryAllocator.Default.Value; Buffers = ArrayPool.Create(); Builder = new FlatBufferBuilder(1024); @@ -301,6 +357,13 @@ public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOp _fieldTypeBuilder = new ArrowTypeFlatbufferBuilder(Builder); _options = options ?? 
IpcOptions.Default; + + if (_options.CompressionCodec.HasValue && _options.CompressionCodecFactory == null) + { + throw new ArgumentException( + $"A {nameof(_options.CompressionCodecFactory)} must be provided when a {nameof(_options.CompressionCodec)} is specified", + nameof(options)); + } } private void CreateSelfAndChildrenFieldNodes(ArrayData data) @@ -326,6 +389,23 @@ private static int CountAllNodes(IReadOnlyList fields) return count; } + private Offset GetBodyCompression() + { + if (_options.CompressionCodec == null) + { + return default; + } + + var compressionType = _options.CompressionCodec.Value switch + { + CompressionCodecType.Lz4Frame => Flatbuf.CompressionType.LZ4_FRAME, + CompressionCodecType.Zstd => Flatbuf.CompressionType.ZSTD, + _ => throw new ArgumentOutOfRangeException() + }; + return Flatbuf.BodyCompression.CreateBodyCompression( + Builder, compressionType, Flatbuf.BodyCompressionMethod.BUFFER); + } + private static void CountSelfAndChildrenNodes(IArrowType type, ref int count) { if (type is NestedType nestedType) @@ -356,7 +436,7 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) } (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = - PreparingWritingRecordBatch(recordBatch); + PrepareWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -367,7 +447,7 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, buffersVectorOffset, - default, + GetBodyCompression(), variadicCountsOffset); long metadataLength = WriteMessage(Flatbuf.MessageHeader.RecordBatch, @@ -397,7 +477,7 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat } (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = - PreparingWritingRecordBatch(recordBatch); + PrepareWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -408,7 +488,7 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, buffersVectorOffset, - default, + GetBodyCompression(), variadicCountsOffset); long metadataLength = await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch, @@ -482,12 +562,12 @@ private async ValueTask WriteBufferDataAsync(IReadOnlyList PreparingWritingRecordBatch(RecordBatch recordBatch) + private Tuple PrepareWritingRecordBatch(RecordBatch recordBatch) { - return PreparingWritingRecordBatch(recordBatch.Schema.FieldsList, recordBatch.ArrayList); + return PrepareWritingRecordBatch(recordBatch.Schema.FieldsList, recordBatch.ArrayList); } - private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) + private Tuple PrepareWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) { Builder.Clear(); @@ -507,7 +587,13 @@ private Tuple Pre // Serialize buffers - var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder(); + // CompressionCodec can be disposed after all data is visited by the builder, + // and doesn't need to be alive for the full lifetime of the ArrowRecordBatchFlatBufferBuilder + using var compressionCodec = _options.CompressionCodec.HasValue + ? 
_options.CompressionCodecFactory.CreateCodec(_options.CompressionCodec.Value, _options.CompressionLevel) + : null; + + var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder(compressionCodec, _allocator, _compressionStream); for (int i = 0; i < fieldCount; i++) { IArrowArray fieldArray = arrays[i]; @@ -599,7 +685,7 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, var arrays = new List { dictionary }; (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = - PreparingWritingRecordBatch(fields, arrays); + PrepareWritingRecordBatch(fields, arrays); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -607,7 +693,7 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, dictionary.Length, fieldNodesVectorOffset, buffersVectorOffset, - default, + GetBodyCompression(), variadicCountsOffset); // TODO: Support delta. @@ -994,6 +1080,7 @@ public virtual void Dispose() { BaseStream.Dispose(); } + _compressionStream.Dispose(); } } diff --git a/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs b/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs index b18ca3a5e4190..16c01d7130fb5 100644 --- a/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs +++ b/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs @@ -14,6 +14,7 @@ // limitations under the License. using System; +using System.IO; namespace Apache.Arrow.Ipc { @@ -29,5 +30,19 @@ public interface ICompressionCodec : IDisposable /// Data buffer to write decompressed data to /// The number of decompressed bytes written into the destination int Decompress(ReadOnlyMemory source, Memory destination); + + /// + /// Write compressed data + /// + /// The data to compress + /// The stream to write compressed data to + void Compress(ReadOnlyMemory source, Stream destination) +#if NET6_0_OR_GREATER + { + throw new NotImplementedException("This codec does not support compression"); + } +#else + ; +#endif } } diff --git a/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs b/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs index 5422a033bd6d2..f367b15574b6e 100644 --- a/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs +++ b/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs @@ -20,6 +20,27 @@ namespace Apache.Arrow.Ipc /// public interface ICompressionCodecFactory { + /// + /// Create a new compression codec + /// + /// The type of codec to create + /// The created codec ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType); + + /// + /// Create a new compression codec with a specified compression level + /// + /// The type of codec to create + /// The compression level to use when compressing data + /// The created codec + ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType, int? compressionLevel) +#if NET6_0_OR_GREATER + { + // Default implementation ignores the compression level + return CreateCodec(compressionCodecType); + } +#else + ; +#endif } } diff --git a/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs b/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs index b6cc3a1cb4b51..8484c9a04ab2d 100644 --- a/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs +++ b/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs @@ -25,6 +25,23 @@ public class IpcOptions /// public bool WriteLegacyIpcFormat { get; set; } + /// + /// The compression codec to use to compress data buffers. 
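A brief configuration sketch for these options with the streaming writer, this time using LZ4 with an explicit level; the record batch and output stream are assumed to come from elsewhere, and the writer constructor throws if CompressionCodec is set without a CompressionCodecFactory:

using System.IO;
using Apache.Arrow;
using Apache.Arrow.Compression;
using Apache.Arrow.Ipc;
using K4os.Compression.LZ4;

static void WriteCompressedStream(RecordBatch batch, Stream output)
{
    var options = new IpcOptions
    {
        CompressionCodec = CompressionCodecType.Lz4Frame,
        CompressionCodecFactory = new CompressionCodecFactory(),  // required when CompressionCodec is set
        CompressionLevel = (int)LZ4Level.L03_HC,                  // optional
    };

    using var writer = new ArrowStreamWriter(output, batch.Schema, leaveOpen: true, options);
    writer.WriteRecordBatch(batch);
    writer.WriteEnd();
}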
+ /// If null (the default value), no compression is used. + /// + public CompressionCodecType? CompressionCodec { get; set; } + + /// + /// The compression codec factory used to create compression codecs. + /// Must be provided if a CompressionCodec is specified. + /// + public ICompressionCodecFactory CompressionCodecFactory { get; set; } + + /// + /// Sets the compression level to use for codecs that support this. + /// + public int? CompressionLevel { get; set; } + public IpcOptions() { } diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 8ed7a93bdcf27..f5e2a0ef8e16e 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -15,6 +15,7 @@ + diff --git a/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileWriterTests.cs b/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileWriterTests.cs new file mode 100644 index 0000000000000..a237f9c1d0660 --- /dev/null +++ b/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileWriterTests.cs @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using Apache.Arrow.Ipc; +using Apache.Arrow.Tests; +using K4os.Compression.LZ4; +using Xunit; + +namespace Apache.Arrow.Compression.Tests +{ + public class ArrowFileWriterTests + { + [Fact] + public void ThrowsWhenNoCompressionFactoryProvided() + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var options = new IpcOptions + { + CompressionCodec = CompressionCodecType.Zstd, + }; + + using var stream = new MemoryStream(); + var exception = Assert.Throws( + () => new ArrowFileWriter(stream, batch.Schema, leaveOpen: true, options)); + + Assert.Contains("A CompressionCodecFactory must be provided", exception.Message); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd, null)] + [InlineData(CompressionCodecType.Zstd, 2)] + [InlineData(CompressionCodecType.Lz4Frame, null)] + [InlineData(CompressionCodecType.Lz4Frame, (int)LZ4Level.L03_HC)] + public void CanWriteCompressedIpcFile(CompressionCodecType codec, int? 
compressionLevel) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + CompressionLevel = compressionLevel, + }; + TestRoundTripRecordBatches(new [] {batch}, options, codecFactory); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd)] + [InlineData(CompressionCodecType.Lz4Frame)] + public async Task CanWriteCompressedIpcFileAsync(CompressionCodecType codec) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + }; + await TestRoundTripRecordBatchesAsync(new [] {batch}, options, codecFactory); + } + + private static void TestRoundTripRecordBatches( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowFileWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + writer.WriteRecordBatch(originalBatch); + } + writer.WriteEnd(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = Assert.Throws(() => + { + using var reader = new ArrowFileReader(stream, leaveOpen: true); + reader.ReadNextRecordBatch(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowFileReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = reader.ReadNextRecordBatch(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + + private static async Task TestRoundTripRecordBatchesAsync( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowFileWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + await writer.WriteRecordBatchAsync(originalBatch); + } + await writer.WriteEndAsync(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = await Assert.ThrowsAsync(async () => + { + using var reader = new ArrowFileReader(stream, leaveOpen: true); + await reader.ReadNextRecordBatchAsync(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowFileReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = await reader.ReadNextRecordBatchAsync(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + } +} + diff --git a/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamWriterTests.cs b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamWriterTests.cs new file mode 100644 index 0000000000000..3b09dc26a343f --- /dev/null +++ b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamWriterTests.cs @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using Apache.Arrow.Ipc; +using Apache.Arrow.Tests; +using K4os.Compression.LZ4; +using Xunit; + +namespace Apache.Arrow.Compression.Tests +{ + public class ArrowStreamWriterTests + { + [Fact] + public void ThrowsWhenNoCompressionFactoryProvided() + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var options = new IpcOptions + { + CompressionCodec = CompressionCodecType.Zstd, + }; + + using var stream = new MemoryStream(); + var exception = Assert.Throws( + () => new ArrowStreamWriter(stream, batch.Schema, leaveOpen: true, options)); + + Assert.Contains("A CompressionCodecFactory must be provided", exception.Message); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd, null)] + [InlineData(CompressionCodecType.Zstd, 2)] + [InlineData(CompressionCodecType.Lz4Frame, null)] + [InlineData(CompressionCodecType.Lz4Frame, (int)LZ4Level.L03_HC)] + public void CanWriteCompressedIpcStream(CompressionCodecType codec, int? compressionLevel) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + CompressionLevel = compressionLevel, + }; + TestRoundTripRecordBatches(new [] {batch}, options, codecFactory); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd)] + [InlineData(CompressionCodecType.Lz4Frame)] + public async Task CanWriteCompressedIpcStreamAsync(CompressionCodecType codec) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + }; + await TestRoundTripRecordBatchesAsync(new [] {batch}, options, codecFactory); + } + + [Fact] + public void CanWriteEmptyBatches() + { + var batch = TestData.CreateSampleRecordBatch(length: 0); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = CompressionCodecType.Lz4Frame, + }; + TestRoundTripRecordBatches(new [] {batch}, options, codecFactory); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd)] + [InlineData(CompressionCodecType.Lz4Frame)] + public void ThrowsForInvalidCompressionLevel(CompressionCodecType codec) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + CompressionLevel = 12345, + }; + + using var stream = new MemoryStream(); + + Assert.Throws(() => + { + using var writer = new ArrowStreamWriter(stream, batch.Schema, leaveOpen: false, options); + writer.WriteRecordBatch(batch); + writer.WriteEnd(); + }); + } + + private static void 
TestRoundTripRecordBatches( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + writer.WriteRecordBatch(originalBatch); + } + writer.WriteEnd(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = Assert.Throws(() => + { + using var reader = new ArrowStreamReader(stream, leaveOpen: true); + reader.ReadNextRecordBatch(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowStreamReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = reader.ReadNextRecordBatch(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + + private static async Task TestRoundTripRecordBatchesAsync( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + await writer.WriteRecordBatchAsync(originalBatch); + } + await writer.WriteEndAsync(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = await Assert.ThrowsAsync(async () => + { + using var reader = new ArrowStreamReader(stream, leaveOpen: true); + await reader.ReadNextRecordBatchAsync(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowStreamReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = await reader.ReadNextRecordBatchAsync(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + } +} + diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj index cb7f7ae896ee2..e77f329bf2a15 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj +++ b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj @@ -10,6 +10,7 @@ + diff --git a/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs b/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs index 6a1e91240989b..3886846833c27 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs @@ -16,6 +16,7 @@ using System; using System.IO; using System.Threading.Tasks; +using Apache.Arrow.Compression; using Apache.Arrow.Ipc; using Apache.Arrow.Tests; using Apache.Arrow.Types; @@ -65,8 +66,9 @@ private async Task Validate() { JsonFile jsonFile = await ParseJsonFile(); + var compressionFactory = new CompressionCodecFactory(); using FileStream arrowFileStream = ArrowFileInfo.OpenRead(); - using ArrowFileReader reader = new ArrowFileReader(arrowFileStream); + using ArrowFileReader reader = new ArrowFileReader(arrowFileStream, compressionCodecFactory: compressionFactory); int batchCount = await reader.RecordBatchCountAsync(); if (batchCount != jsonFile.Batches.Count) @@ -122,7 +124,8 @@ private async Task JsonToArrow() private async 
Task StreamToFile() { - using ArrowStreamReader reader = new ArrowStreamReader(Console.OpenStandardInput()); + var compressionFactory = new CompressionCodecFactory(); + using ArrowStreamReader reader = new ArrowStreamReader(Console.OpenStandardInput(), compressionCodecFactory: compressionFactory); RecordBatch batch = await reader.ReadNextRecordBatchAsync(); @@ -145,7 +148,8 @@ private async Task StreamToFile() private async Task FileToStream() { using FileStream fileStream = ArrowFileInfo.OpenRead(); - using ArrowFileReader fileReader = new ArrowFileReader(fileStream); + var compressionFactory = new CompressionCodecFactory(); + using ArrowFileReader fileReader = new ArrowFileReader(fileStream, compressionCodecFactory: compressionFactory); // read the record batch count to initialize the Schema await fileReader.RecordBatchCountAsync(); diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index d8a92ff756751..c422da56b4cef 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -7,7 +7,7 @@ - net7.0;net472 + net7.0;net472;net462 net7.0 diff --git a/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs index 4c2b050d0c8ba..447572dda0eea 100644 --- a/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs @@ -83,7 +83,7 @@ public void AppendSingleByte(byte[][] initialContents, byte singleByte) builder.AppendRange(initialContents); int initialLength = builder.Length; int expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(new[] { singleByte }); + var expectedArrayContents = initialContents.Concat(new[] { new[] { singleByte } }); // Act var actualReturnValue = builder.Append(singleByte); @@ -130,7 +130,7 @@ public void AppendNull(byte[][] initialContents) builder.AppendRange(initialContents); int initialLength = builder.Length; int expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(null); + var expectedArrayContents = initialContents.Concat(new byte[][] { null }); // Act var actualReturnValue = builder.AppendNull(); @@ -180,7 +180,7 @@ public void AppendReadOnlySpan(byte[][] initialContents, byte[] bytes) int initialLength = builder.Length; var span = (ReadOnlySpan)bytes; int expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(bytes); + var expectedArrayContents = initialContents.Concat(new[] { bytes }); // Act var actualReturnValue = builder.Append(span); @@ -230,7 +230,7 @@ public void AppendEnumerable(byte[][] initialContents, byte[] bytes) int initialLength = builder.Length; int expectedLength = initialLength + 1; var enumerable = (IEnumerable)bytes; - var expectedArrayContents = initialContents.Append(bytes); + var expectedArrayContents = initialContents.Concat(new[] { bytes }); // Act var actualReturnValue = builder.Append(enumerable); diff --git a/dev/archery/archery/bot.py b/dev/archery/archery/bot.py index 4e5104362254c..caab824aeb38f 100644 --- a/dev/archery/archery/bot.py +++ b/dev/archery/archery/bot.py @@ -324,7 +324,8 @@ def crossbow(obj, crossbow): obj['crossbow_repo'] = crossbow -def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): +def _clone_arrow_and_crossbow(dest, crossbow_repo, arrow_repo_url, + pr_number, pr_branch): """ Clone the repositories and initialize crossbow 
objects. @@ -338,22 +339,25 @@ def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): Object containing information about the pull request the comment bot was triggered from. """ + bare_arrow_path = dest / 'arrow_bare' arrow_path = dest / 'arrow' queue_path = dest / 'crossbow' - # clone arrow and checkout the pull request's branch - pull_request_ref = 'pull/{}/head:{}'.format( - pull_request.number, pull_request.head.ref - ) - git.clone(pull_request.base.repo.clone_url, str(arrow_path)) - git.fetch('origin', pull_request_ref, git_dir=arrow_path) - git.checkout(pull_request.head.ref, git_dir=arrow_path) - - # clone crossbow repository + # 1. clone arrow and checkout the PR's branch + pr_ref = f'pull/{pr_number}/head:{pr_branch}' + # we do a bare clone of upstream arrow to avoid issues when the PR is + # submitted from a fork's main branch (GH-39996) + git.clone('--bare', arrow_repo_url, str(bare_arrow_path)) + # fetch the PR's branch into the bare clone + git.fetch('origin', pr_ref, git_dir=bare_arrow_path) + # clone and checkout the PR's branch into a full local repo + git.clone(f'--branch={pr_branch}', bare_arrow_path, arrow_path) + + # 2. clone crossbow repository crossbow_url = 'https://github.com/{}'.format(crossbow_repo) git.clone(crossbow_url, str(queue_path)) - # initialize crossbow objects + # 3. initialize crossbow objects github_token = os.environ['CROSSBOW_GITHUB_TOKEN'] arrow = Repo(arrow_path) queue = Queue(queue_path, github_token=github_token, require_https=True) @@ -385,7 +389,9 @@ def submit(obj, tasks, groups, params, arrow_version, wait): arrow, queue = _clone_arrow_and_crossbow( dest=Path(tmpdir), crossbow_repo=crossbow_repo, - pull_request=pull_request, + arrow_repo_url=pull_request.base.repo.clone_url, + pr_number=pull_request.number, + pr_branch=pull_request.head.ref, ) # load available tasks configuration and groups from yaml config = Config.load_yaml(arrow.path / "dev" / "tasks" / "tasks.yml") diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 7fadb7e47cf93..299983f62f283 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -158,7 +158,6 @@ def _gold_tests(self, gold_dir): skip_testers.add("JS") skip_testers.add("Rust") if prefix == '2.0.0-compression': - skip_testers.add("C#") skip_testers.add("JS") # See https://github.com/apache/arrow/pull/9822 for how to diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb index df6c0778dc805..0c1b98ab32c95 100644 --- a/dev/release/binary-task.rb +++ b/dev/release/binary-task.rb @@ -1089,6 +1089,7 @@ def available_apt_targets ["ubuntu", "focal", "main"], ["ubuntu", "jammy", "main"], ["ubuntu", "mantic", "main"], + ["ubuntu", "noble", "main"], ] end @@ -2121,8 +2122,10 @@ def apt_test_targets_default # "ubuntu-focal-arm64", "ubuntu-jammy", # "ubuntu-jammy-arm64", - "ubuntu-lunar", - # "ubuntu-lunar-arm64", + "ubuntu-mantic", + # "ubuntu-mantic-arm64", + "ubuntu-noble", + # "ubuntu-noble-arm64", ] end diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh index f18f7d10c73e6..4df574700e812 100755 --- a/dev/release/post-08-docs.sh +++ b/dev/release/post-08-docs.sh @@ -86,6 +86,21 @@ if [ "$is_major_release" = "yes" ] ; then fi git add docs git commit -m "[Website] Update documentations for ${version}" + +# Update DOCUMENTATION_OPTIONS.theme_switcher_version_match and +# DOCUMENTATION_OPTIONS.show_version_warning_banner +pushd docs/${previous_series} +find ./ \ + -type f \ + -exec \ + 
sed -i.bak \ + -e "s/DOCUMENTATION_OPTIONS.theme_switcher_version_match = '';/DOCUMENTATION_OPTIONS.theme_switcher_version_match = '${previous_version}';/g" \ + -e "s/DOCUMENTATION_OPTIONS.show_version_warning_banner = false/DOCUMENTATION_OPTIONS.show_version_warning_banner = true/g" \ + {} \; +find ./ -name '*.bak' -delete +popd +git add docs/${previous_series} +git commit -m "[Website] Update warning banner for ${previous_series}" git clean -d -f -x popd diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 04fc7fd563f65..a61b5ba094c8a 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -196,7 +196,9 @@ test_apt() { "ubuntu:jammy" \ "arm64v8/ubuntu:jammy" \ "ubuntu:mantic" \ - "arm64v8/ubuntu:mantic"; do \ + "arm64v8/ubuntu:mantic" \ + "ubuntu:noble" \ + "arm64v8/ubuntu:noble"; do \ case "${target}" in arm64v8/*) if [ "$(arch)" = "aarch64" -o -e /usr/bin/qemu-aarch64-static ]; then diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index b8ffbfdb715b6..367445c595c4b 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -340,7 +340,6 @@ outputs: # test_cpp_extension_in_python requires a compiler - {{ compiler("cxx") }} # [linux] - pytest - - pytest-lazy-fixture - backports.zoneinfo # [py<39] - boto3 - cffi diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-noble/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-noble/Dockerfile new file mode 100644 index 0000000000000..0e37ee94bb0a3 --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-noble/Dockerfile @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM ubuntu:noble + +RUN \ + echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +RUN \ + echo 'APT::Install-Recommends "false";' > \ + /etc/apt/apt.conf.d/disable-install-recommends + +ARG DEBUG + +RUN \ + quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ + apt update ${quiet} && \ + apt install -y -V ${quiet} \ + build-essential \ + debhelper \ + devscripts \ + fakeroot \ + gnupg \ + lsb-release && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble-arm64/from new file mode 100644 index 0000000000000..4414c353871c6 --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble-arm64/from @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +arm64v8/ubuntu:noble diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile new file mode 100644 index 0000000000000..33f2d9a35371b --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG FROM=ubuntu:noble +FROM ${FROM} + +RUN \ + echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +RUN \ + echo 'APT::Install-Recommends "false";' > \ + /etc/apt/apt.conf.d/disable-install-recommends + +ARG DEBUG +RUN \ + quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ + apt update ${quiet} && \ + apt install -y -V ${quiet} \ + build-essential \ + clang \ + clang-tools \ + cmake \ + debhelper \ + devscripts \ + git \ + gtk-doc-tools \ + libboost-filesystem-dev \ + libboost-system-dev \ + libbrotli-dev \ + libbz2-dev \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libgirepository1.0-dev \ + libglib2.0-doc \ + libgmock-dev \ + libgoogle-glog-dev \ + libgrpc++-dev \ + libgtest-dev \ + liblz4-dev \ + libmlir-15-dev \ + libprotobuf-dev \ + libprotoc-dev \ + libre2-dev \ + libsnappy-dev \ + libssl-dev \ + libthrift-dev \ + libutf8proc-dev \ + libzstd-dev \ + llvm-dev \ + lsb-release \ + meson \ + mlir-15-tools \ + ninja-build \ + nlohmann-json3-dev \ + pkg-config \ + protobuf-compiler-grpc \ + python3-dev \ + python3-pip \ + python3-setuptools \ + rapidjson-dev \ + tzdata \ + valac \ + zlib1g-dev && \ + if apt list | grep -q '^libcuda'; then \ + apt install -y -V ${quiet} nvidia-cuda-toolkit; \ + else \ + :; \ + fi && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb index ecd61054daeb1..51fe0b9a75b0c 100644 --- a/dev/tasks/linux-packages/package-task.rb +++ b/dev/tasks/linux-packages/package-task.rb @@ -279,6 +279,8 @@ def apt_targets_default # "ubuntu-jammy-arm64", "ubuntu-mantic", # "ubuntu-mantic-arm64", + "ubuntu-noble", + # "ubuntu-noble-arm64", ] end diff --git a/dev/tasks/python-wheels/github.osx.amd64.yml b/dev/tasks/python-wheels/github.osx.amd64.yml index 526412f84214b..e31a681653b37 100644 --- a/dev/tasks/python-wheels/github.osx.amd64.yml +++ b/dev/tasks/python-wheels/github.osx.amd64.yml @@ -85,6 +85,7 @@ jobs: --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index 35d74f1462453..380c2e42f1d88 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -71,6 +71,7 @@ jobs: --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ diff --git a/dev/tasks/python-wheels/github.windows.yml b/dev/tasks/python-wheels/github.windows.yml index 1641796a719e2..01f4977a9b0b1 100644 --- a/dev/tasks/python-wheels/github.windows.yml +++ b/dev/tasks/python-wheels/github.windows.yml @@ -29,7 +29,7 @@ jobs: # this is a private repository at the moment (mostly because of licensing # consideration of windows images with visual studio), but anyone can # recreate the image by manually building it via: - # `archery build python-wheel-windows-vs2017` + # `archery build python-wheel-windows-vs2019` # note that we don't run docker build since there wouldn't be a cache hit # and rebuilding the dependencies takes a fair amount of time REPO: ghcr.io/ursacomputing/arrow @@ -46,17 +46,17 @@ jobs: run: | cd arrow @rem We want to use only - @rem archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2017 + @rem archery docker run -e 
SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019 @rem but it doesn't use pulled caches. @rem It always build an image from scratch. @rem We can remove this workaround once we find a way to use @rem pulled caches when build an image. echo on - archery docker pull --no-ignore-pull-failures python-wheel-windows-vs2017 + archery docker pull --no-ignore-pull-failures python-wheel-windows-vs2019 if errorlevel 1 ( - archery docker build --no-pull python-wheel-windows-vs2017 || exit /B 1 + archery docker build --no-pull python-wheel-windows-vs2019 || exit /B 1 ) - archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2017 + archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019 - uses: actions/upload-artifact@v3 with: @@ -77,5 +77,5 @@ jobs: shell: cmd run: | cd arrow - archery docker push python-wheel-windows-vs2017 + archery docker push python-wheel-windows-vs2019 {% endif %} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 6c59364d51a50..cf04d29715306 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -465,7 +465,8 @@ tasks: "debian-trixie", "ubuntu-focal", "ubuntu-jammy", - "ubuntu-mantic"] %} + "ubuntu-mantic", + "ubuntu-noble"] %} {% for architecture in ["amd64", "arm64"] %} {{ target }}-{{ architecture }}: ci: github @@ -747,6 +748,10 @@ tasks: - arrow-jdbc-{no_rc_snapshot_version}-tests.jar - arrow-jdbc-{no_rc_snapshot_version}.jar - arrow-jdbc-{no_rc_snapshot_version}.pom + - arrow-maven-plugins-{no_rc_snapshot_version}-cyclonedx.json + - arrow-maven-plugins-{no_rc_snapshot_version}-cyclonedx.xml + - arrow-maven-plugins-{no_rc_snapshot_version}-src.zip + - arrow-maven-plugins-{no_rc_snapshot_version}.pom - arrow-memory-core-{no_rc_snapshot_version}-cyclonedx.json - arrow-memory-core-{no_rc_snapshot_version}-cyclonedx.xml - arrow-memory-core-{no_rc_snapshot_version}-javadoc.jar @@ -761,6 +766,13 @@ tasks: - arrow-memory-netty-{no_rc_snapshot_version}-tests.jar - arrow-memory-netty-{no_rc_snapshot_version}.jar - arrow-memory-netty-{no_rc_snapshot_version}.pom + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-cyclonedx.json + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-cyclonedx.xml + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-javadoc.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-sources.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-tests.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}.pom - arrow-memory-unsafe-{no_rc_snapshot_version}-cyclonedx.json - arrow-memory-unsafe-{no_rc_snapshot_version}-cyclonedx.xml - arrow-memory-unsafe-{no_rc_snapshot_version}-javadoc.jar @@ -838,6 +850,13 @@ tasks: - flight-sql-jdbc-driver-{no_rc_snapshot_version}-tests.jar - flight-sql-jdbc-driver-{no_rc_snapshot_version}.jar - flight-sql-jdbc-driver-{no_rc_snapshot_version}.pom + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-cyclonedx.json + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-cyclonedx.xml + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-javadoc.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-sources.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-src.zip + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}.jar + - 
module-info-compiler-maven-plugin-{no_rc_snapshot_version}.pom ############################## NuGet packages ############################### diff --git a/docker-compose.yml b/docker-compose.yml index a08345c198fa0..7ae625a017417 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -172,7 +172,7 @@ x-hierarchy: - python-wheel-manylinux-2-28 - python-wheel-manylinux-test-imports - python-wheel-manylinux-test-unittests - - python-wheel-windows-vs2017 + - python-wheel-windows-vs2019 - python-wheel-windows-test volumes: @@ -320,6 +320,8 @@ services: # Shrink test runtime by enabling minimal optimizations ARROW_C_FLAGS_DEBUG: "-g1 -Og" ARROW_CXX_FLAGS_DEBUG: "-g1 -Og" + # GH-39973: Do not use debug memory pool for valgrind + ARROW_DEBUG_MEMORY_POOL: "none" ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_FLIGHT: "OFF" ARROW_FLIGHT_SQL: "OFF" @@ -598,6 +600,8 @@ services: CXX: clang++-${CLANG_TOOLS} # Avoid creating huge static libraries ARROW_BUILD_STATIC: "OFF" + # GH-39973: Do not use debug memory pool for ASAN + ARROW_DEBUG_MEMORY_POOL: "none" ARROW_ENABLE_TIMING_TESTS: # inherit # GH-33920: Disable Flight SQL to reduce build time. # We'll be able to re-enable this with Ubuntu 24.04 because @@ -1030,7 +1034,7 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2023-10-03-72cdc42 + base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2024-02-04-ea37246 vcpkg: ${VCPKG} python: ${PYTHON} manylinux: 2014 @@ -1053,7 +1057,7 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2023-10-03-72cdc42 + base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2024-02-04-ea37246 vcpkg: ${VCPKG} python: ${PYTHON} manylinux: 2_28 @@ -1098,19 +1102,19 @@ services: CHECK_UNITTESTS: "ON" command: /arrow/ci/scripts/python_wheel_unix_test.sh /arrow - python-wheel-windows-vs2017: - image: ${REPO}:python-${PYTHON}-wheel-windows-vs2017-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + python-wheel-windows-vs2019: + image: ${REPO}:python-${PYTHON}-wheel-windows-vs2019-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} build: args: vcpkg: ${VCPKG} python: ${PYTHON} context: . - dockerfile: ci/docker/python-wheel-windows-vs2017.dockerfile + dockerfile: ci/docker/python-wheel-windows-vs2019.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. # Uncomment if no local cache is available. # cache_from: - # - abrarov/msvc-2017:2.11.0 - # - ${REPO}:python-${PYTHON}-wheel-windows-vs2017-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + # - abrarov/msvc-2019:2.11.0 + # - ${REPO}:python-${PYTHON}-wheel-windows-vs2019-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} volumes: - "${DOCKER_VOLUME_PREFIX}python-wheel-windows-clcache:C:/clcache" - type: bind @@ -1119,12 +1123,12 @@ services: command: arrow\\ci\\scripts\\python_wheel_windows_build.bat python-wheel-windows-test: - image: ${REPO}:python-${PYTHON}-wheel-windows-test-vs2017-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + image: ${REPO}:python-${PYTHON}-wheel-windows-test-vs2019-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} build: args: python: ${PYTHON} context: . 
- dockerfile: ci/docker/python-wheel-windows-test-vs2017.dockerfile + dockerfile: ci/docker/python-wheel-windows-test-vs2019.dockerfile volumes: - "${DOCKER_VOLUME_PREFIX}python-wheel-windows-clcache:C:/clcache" - type: bind @@ -1709,9 +1713,7 @@ services: arch: ${ARCH} # Use a newer JDK as it seems to improve stability jdk: 17 - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 + maven: ${MAVEN} node: ${NODE} go: ${GO} volumes: *conda-volumes @@ -1742,6 +1744,7 @@ services: args: r: ${R} jdk: ${JDK} + maven: ${MAVEN} node: ${NODE} base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 environment: @@ -1843,9 +1846,7 @@ services: arch: ${ARCH} python: ${PYTHON} jdk: ${JDK} - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 + maven: ${MAVEN} hdfs: ${HDFS} links: - impala:impala @@ -1886,9 +1887,7 @@ services: arch: ${ARCH} python: ${PYTHON} jdk: ${JDK} - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 + maven: ${MAVEN} spark: ${SPARK} numpy: ${NUMPY} shm_size: *shm-size diff --git a/docs/requirements.txt b/docs/requirements.txt index aee2eb662c06b..5d6fec7ddf72e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,7 +5,7 @@ breathe ipython numpydoc -pydata-sphinx-theme==0.14.1 +pydata-sphinx-theme~=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 0fa80aa1106c1..eb7c797df5e27 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -58,8 +58,10 @@ that changing their value later will have an effect. - ``abort`` exits the processus with a non-zero return value; - ``trap`` issues a platform-specific debugger breakpoint / trap instruction; - ``warn`` prints a warning on stderr and continues execution; + - ``none`` disables memory checks; - If this variable is not set, or has empty an value, memory checks are disabled. + If this variable is not set, or has an empty value, it has the same effect + as the value ``none`` - memory checks are disabled. .. note:: While this functionality can be useful and has little overhead, it diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index 812212f536169..ef4bf1cf3238d 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -251,7 +251,7 @@ Examples array has format string ``d:12,5``. * A ``list`` array has format string ``+l``, and its single child has format string ``L``. -* A ``large_list_view`` array has format string ``+Lv``, and its single +* A ``large_list_view`` array has format string ``+vL``, and its single child has format string ``L``. * A ``struct`` has format string ``+s``; its two children have names ``ints`` and ``floats``, and format strings ``i`` and diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 73b5e063ff1a0..e6f6c3dbbd3d1 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -63,6 +63,8 @@ may expose data type-specific methods or properties. FixedSizeBinaryArray LargeBinaryArray LargeStringArray + BinaryViewArray, + StringViewArray, Time32Array Time64Array Date32Array @@ -75,6 +77,8 @@ may expose data type-specific methods or properties. 
ListArray FixedSizeListArray LargeListArray + ListViewArray + LargeListViewArray MapArray RunEndEncodedArray StructArray @@ -119,6 +123,8 @@ classes may expose data type-specific methods or properties. FixedSizeBinaryScalar LargeBinaryScalar LargeStringScalar + BinaryViewScalar + StringViewScalar Time32Scalar Time64Scalar Date32Scalar @@ -131,6 +137,8 @@ classes may expose data type-specific methods or properties. RunEndEncodedScalar ListScalar LargeListScalar + ListViewScalar + LargeListViewScalar MapScalar StructScalar UnionScalar diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b879643017a90..928c607d139ce 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -590,4 +590,4 @@ User-Defined Functions :toctree: ../generated/ register_scalar_function - ScalarUdfContext + UdfContext diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 4066ef314234d..62bf4b7723558 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -55,9 +55,13 @@ These should be used to create Arrow data types and schemas. large_binary large_string large_utf8 + binary_view + string_view decimal128 list_ large_list + list_view + large_list_view map_ struct dictionary @@ -147,6 +151,8 @@ represents a given data type (such as ``int32``) or general category is_list is_large_list is_fixed_size_list + is_list_view + is_large_list_view is_struct is_union is_nested @@ -168,6 +174,8 @@ represents a given data type (such as ``int32``) or general category is_large_binary is_large_unicode is_large_string + is_binary_view + is_string_view is_fixed_size_binary is_map is_dictionary diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index e8a5b613c6099..c02059a4f8faa 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -445,9 +445,9 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of -:class:`pyarrow.compute.ScalarUdfContext`. +:class:`pyarrow.compute.UdfContext`. This context exposes several useful attributes, particularly a -:attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for +:attr:`~pyarrow.compute.UdfContext.memory_pool` to be used for allocations in the context of the user-defined function. You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst index fda90c4f2a58c..23a4b73bd0965 100644 --- a/docs/source/python/pandas.rst +++ b/docs/source/python/pandas.rst @@ -197,7 +197,7 @@ use the ``datetime64[ns]`` type in Pandas and are converted to an Arrow .. 
ipython:: python - df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=3)}) + df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=3)}) df.dtypes df diff --git a/docs/source/status.rst b/docs/source/status.rst index 03a87012342c2..4bff37c8527fa 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -40,7 +40,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | UInt8/16/32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Float16 | ✓ (1) | | ✓ | ✓ | ✓ (2)| ✓ | ✓ | | +| Float16 | ✓ (1) | ✓ (2) | ✓ | ✓ | ✓ (3)| ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Float32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -104,7 +104,7 @@ Data Types | Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | | (special) | | | | | | | | | +===================+=======+=======+=======+============+=======+=======+=======+=======+ -| Dictionary | ✓ | ✓ (3) | ✓ | ✓ | ✓ | ✓ (3) | ✓ | | +| Dictionary | ✓ | ✓ (4) | ✓ | ✓ | ✓ | ✓ (3) | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Extension | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -114,8 +114,9 @@ Data Types Notes: * \(1) Casting to/from Float16 in C++ is not supported. -* \(2) Float16 support in C# is only available when targeting .NET 6+. -* \(3) Nested dictionaries not supported +* \(2) Casting to/from Float16 in Java is not supported. +* \(3) Float16 support in C# is only available when targeting .NET 6+. +* \(4) Nested dictionaries not supported .. seealso:: The :ref:`format_columnar` specification. @@ -144,7 +145,7 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Sparse tensors | ✓ | | | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Buffer compression | ✓ | ✓ (3) | ✓ | | ✓ (4) | ✓ | ✓ | | +| Buffer compression | ✓ | ✓ (3) | ✓ | | ✓ | ✓ | ✓ | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Endianness conversion | ✓ (2) | | ✓ (2) | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -159,8 +160,6 @@ Notes: * \(3) LZ4 Codec currently is quite inefficient. ARROW-11901 tracks improving performance. -* \(4) Compression when writing is not supported, only decompression when reading. - .. seealso:: The :ref:`format-ipc` specification. @@ -256,9 +255,9 @@ support/not support individual features. 
+--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | CancelQuery | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| ClosePreparedStatement | ✓ | ✓ | ✓ | | ✓ | | | | +| ClosePreparedStatement | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| CreatePreparedStatement | ✓ | ✓ | ✓ | | ✓ | | | | +| CreatePreparedStatement | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | CreatePreparedSubstraitPlan | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -266,35 +265,35 @@ support/not support individual features. +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | EndTransaction | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetCatalogs | ✓ | ✓ | ✓ | | ✓ | | | | +| GetCatalogs | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetCrossReference | ✓ | ✓ | ✓ | | ✓ | | | | +| GetCrossReference | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetDbSchemas | ✓ | ✓ | ✓ | | ✓ | | | | +| GetDbSchemas | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetExportedKeys | ✓ | ✓ | ✓ | | ✓ | | | | +| GetExportedKeys | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetImportedKeys | ✓ | ✓ | ✓ | | ✓ | | | | +| GetImportedKeys | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetPrimaryKeys | ✓ | ✓ | ✓ | | ✓ | | | | +| GetPrimaryKeys | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetSqlInfo | ✓ | ✓ | ✓ | | ✓ | | | | +| GetSqlInfo | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetTables | ✓ | ✓ | ✓ | | ✓ | | | | +| GetTables | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetTableTypes | ✓ | ✓ | ✓ | | ✓ | | | | +| GetTableTypes | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetXdbcTypeInfo | ✓ | ✓ | ✓ | | ✓ | | | | +| GetXdbcTypeInfo | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| PreparedStatementQuery | ✓ | ✓ | ✓ | | ✓ | | | | +| PreparedStatementQuery | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| PreparedStatementUpdate | ✓ | ✓ | ✓ | | ✓ | | | | +| 
PreparedStatementUpdate | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | StatementSubstraitPlan | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| StatementQuery | ✓ | ✓ | ✓ | | ✓ | | | | +| StatementQuery | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| StatementUpdate | ✓ | ✓ | ✓ | | ✓ | | | | +| StatementUpdate | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ .. seealso:: diff --git a/go/arrow/flight/cookie_middleware.go b/go/arrow/flight/cookie_middleware.go index 27754a13b829a..39c86d8303434 100644 --- a/go/arrow/flight/cookie_middleware.go +++ b/go/arrow/flight/cookie_middleware.go @@ -23,6 +23,7 @@ import ( "sync" "time" + "golang.org/x/exp/maps" "google.golang.org/grpc/metadata" ) @@ -40,11 +41,34 @@ func NewClientCookieMiddleware() ClientMiddleware { return CreateClientMiddleware(&clientCookieMiddleware{jar: make(map[string]http.Cookie)}) } +func NewCookieMiddleware() CookieMiddleware { + return &clientCookieMiddleware{jar: make(map[string]http.Cookie)} +} + +// CookieMiddleware is a go-routine safe middleware for flight clients +// which properly handles Set-Cookie headers for storing cookies. +// This can be passed into `CreateClientMiddleware` to create a new +// middleware object. You can also clone it to create middleware for a +// new client which starts with the same cookies. +type CookieMiddleware interface { + CustomClientMiddleware + // Clone creates a new CookieMiddleware that starts out with the same + // cookies that this one already has. This is useful when creating a + // new client connection for the same server. 
+ Clone() CookieMiddleware +} + type clientCookieMiddleware struct { jar map[string]http.Cookie mx sync.Mutex } +func (cc *clientCookieMiddleware) Clone() CookieMiddleware { + cc.mx.Lock() + defer cc.mx.Unlock() + return &clientCookieMiddleware{jar: maps.Clone(cc.jar)} +} + func (cc *clientCookieMiddleware) StartCall(ctx context.Context) context.Context { cc.mx.Lock() defer cc.mx.Unlock() diff --git a/go/arrow/flight/cookie_middleware_test.go b/go/arrow/flight/cookie_middleware_test.go index 0adf4927652d4..4007d056b2c99 100644 --- a/go/arrow/flight/cookie_middleware_test.go +++ b/go/arrow/flight/cookie_middleware_test.go @@ -239,3 +239,63 @@ func TestCookieExpiration(t *testing.T) { cookieMiddleware.expectedCookies = map[string]string{} makeReq(client, t) } + +func TestCookiesClone(t *testing.T) { + cookieMiddleware := &serverAddCookieMiddleware{} + + s := flight.NewServerWithMiddleware([]flight.ServerMiddleware{ + flight.CreateServerMiddleware(cookieMiddleware), + }) + s.Init("localhost:0") + f := &flightServer{} + s.RegisterFlightService(f) + + go s.Serve() + defer s.Shutdown() + + makeReq := func(c flight.Client, t *testing.T) { + flightStream, err := c.ListFlights(context.Background(), &flight.Criteria{}) + assert.NoError(t, err) + + for { + _, err := flightStream.Recv() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + assert.NoError(t, err) + } + } + } + + credsOpt := grpc.WithTransportCredentials(insecure.NewCredentials()) + cookies := flight.NewCookieMiddleware() + client1, err := flight.NewClientWithMiddleware(s.Addr().String(), nil, + []flight.ClientMiddleware{flight.CreateClientMiddleware(cookies)}, credsOpt) + require.NoError(t, err) + defer client1.Close() + + // set cookies + cookieMiddleware.cookies = []*http.Cookie{ + {Name: "foo", Value: "bar"}, + {Name: "foo2", Value: "bar2", MaxAge: 1}, + } + makeReq(client1, t) + + // validate set + cookieMiddleware.expectedCookies = map[string]string{ + "foo": "bar", "foo2": "bar2", + } + makeReq(client1, t) + + client2, err := flight.NewClientWithMiddleware(s.Addr().String(), nil, + []flight.ClientMiddleware{flight.CreateClientMiddleware(cookies.Clone())}, credsOpt) + require.NoError(t, err) + defer client2.Close() + + // validate clone worked + cookieMiddleware.expectedCookies = map[string]string{ + "foo": "bar", "foo2": "bar2", + } + makeReq(client2, t) +} diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 441f88f39f43a..068bfa84c3144 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -450,6 +450,31 @@ func (c *Client) PrepareSubstrait(ctx context.Context, plan SubstraitPlan, opts return parsePreparedStatementResponse(c, c.Alloc, stream) } +func (c *Client) LoadPreparedStatementFromResult(result *CreatePreparedStatementResult) (*PreparedStatement, error) { + var ( + err error + dsSchema, paramSchema *arrow.Schema + ) + if result.DatasetSchema != nil { + dsSchema, err = flight.DeserializeSchema(result.DatasetSchema, c.Alloc) + if err != nil { + return nil, err + } + } + if result.ParameterSchema != nil { + paramSchema, err = flight.DeserializeSchema(result.ParameterSchema, c.Alloc) + if err != nil { + return nil, err + } + } + return &PreparedStatement{ + client: c, + handle: result.PreparedStatementHandle, + datasetSchema: dsSchema, + paramSchema: paramSchema, + }, nil +} + func parsePreparedStatementResponse(c *Client, mem memory.Allocator, results pb.FlightService_DoActionClient) (*PreparedStatement, error) { if err := 
results.CloseSend(); err != nil { return nil, err @@ -1027,6 +1052,46 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption return p.client.getFlightInfo(ctx, desc, opts...) } +// ExecutePut calls DoPut for the prepared statement on the server. If SetParameters +// has been called then the parameter bindings will be sent before execution. +// +// Will error if already closed. +func (p *PreparedStatement) ExecutePut(ctx context.Context, opts ...grpc.CallOption) error { + if p.closed { + return errors.New("arrow/flightsql: prepared statement already closed") + } + + cmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: p.handle} + + desc, err := descForCommand(cmd) + if err != nil { + return err + } + + if p.hasBindParameters() { + pstream, err := p.client.Client.DoPut(ctx, opts...) + if err != nil { + return err + } + + wr, err := p.writeBindParameters(pstream, desc) + if err != nil { + return err + } + if err = wr.Close(); err != nil { + return err + } + pstream.CloseSend() + + // wait for the server to ack the result + if _, err = pstream.Recv(); err != nil && err != io.EOF { + return err + } + } + + return nil +} + // ExecutePoll executes the prepared statement on the server and returns a PollInfo // indicating the progress of execution. // diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index c8b9f7f1246c1..f35aeefcf4628 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -665,6 +665,36 @@ func (s *FlightSqlClientSuite) TestRenewFlightEndpoint() { s.Equal(&mockedRenewedEndpoint, renewedEndpoint) } +func (s *FlightSqlClientSuite) TestPreparedStatementLoadFromResult() { + const query = "query" + + result := &pb.ActionCreatePreparedStatementResult{ + PreparedStatementHandle: []byte(query), + } + + parameterSchemaResult := arrow.NewSchema([]arrow.Field{{Name: "p_id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil) + result.ParameterSchema = flight.SerializeSchema(parameterSchemaResult, memory.DefaultAllocator) + datasetSchemaResult := arrow.NewSchema([]arrow.Field{{Name: "ds_id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil) + result.DatasetSchema = flight.SerializeSchema(datasetSchemaResult, memory.DefaultAllocator) + + prepared, err := s.sqlClient.LoadPreparedStatementFromResult(result) + s.NoError(err) + + s.Equal(string(prepared.Handle()), "query") + + paramSchema := prepared.ParameterSchema() + paramRec, _, err := array.RecordFromJSON(memory.DefaultAllocator, paramSchema, strings.NewReader(`[{"p_id": 1}]`)) + s.NoError(err) + defer paramRec.Release() + + datasetSchema := prepared.DatasetSchema() + datasetRec, _, err := array.RecordFromJSON(memory.DefaultAllocator, datasetSchema, strings.NewReader(`[{"ds_id": 1}]`)) + s.NoError(err) + defer datasetRec.Release() + + s.Equal(string(prepared.Handle()), "query") +} + func TestFlightSqlClient(t *testing.T) { suite.Run(t, new(FlightSqlClientSuite)) } diff --git a/go/arrow/flight/flightsql/types.go b/go/arrow/flight/flightsql/types.go index d89e68f028bb8..c70a8bdc4ec26 100644 --- a/go/arrow/flight/flightsql/types.go +++ b/go/arrow/flight/flightsql/types.go @@ -852,3 +852,5 @@ const ( // cancellation request. 
CancelResultNotCancellable = pb.ActionCancelQueryResult_CANCEL_RESULT_NOT_CANCELLABLE ) + +type CreatePreparedStatementResult = pb.ActionCreatePreparedStatementResult diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go index ac857d17e632d..4d603c547ca6a 100755 --- a/go/parquet/file/column_writer.go +++ b/go/parquet/file/column_writer.go @@ -397,7 +397,6 @@ func (w *columnWriter) FlushBufferedDataPages() (err error) { } } w.pages = w.pages[:0] - w.totalCompressedBytes = 0 return } @@ -542,7 +541,9 @@ func (w *columnWriter) Close() (err error) { if !w.closed { w.closed = true if w.hasDict && !w.fallbackToNonDict { - w.WriteDictionaryPage() + if err = w.WriteDictionaryPage(); err != nil { + return err + } } if err = w.FlushBufferedDataPages(); err != nil { @@ -659,7 +660,10 @@ func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int if values.Data().Offset() > 0 { data := values.Data() - buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[data.Offset()*arrow.Int32SizeBytes : data.Len()*arrow.Int32SizeBytes]) + elemSize := data.DataType().(arrow.FixedWidthDataType).Bytes() + start := data.Offset() * elemSize + end := start + data.Len()*elemSize + buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end]) } data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0) diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index 8011ac2487995..dd597e280b850 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -24,6 +24,8 @@ import ( "sync" "testing" + "github.com/apache/arrow/go/v16/arrow" + "github.com/apache/arrow/go/v16/arrow/array" "github.com/apache/arrow/go/v16/arrow/bitutil" "github.com/apache/arrow/go/v16/arrow/memory" arrutils "github.com/apache/arrow/go/v16/internal/utils" @@ -36,6 +38,7 @@ import ( "github.com/apache/arrow/go/v16/parquet/internal/testutils" "github.com/apache/arrow/go/v16/parquet/internal/utils" "github.com/apache/arrow/go/v16/parquet/metadata" + "github.com/apache/arrow/go/v16/parquet/pqarrow" "github.com/apache/arrow/go/v16/parquet/schema" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -426,6 +429,26 @@ func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parque } } +func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(version parquet.Version) { + p.GenerateData(SmallSize) + props := parquet.DefaultColumnProperties() + props.DictionaryEnabled = true + + if version == parquet.V1_0 { + props.Encoding = parquet.Encodings.PlainDict + } else { + props.Encoding = parquet.Encodings.RLEDict + } + + writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version)) + p.WriteBatchValues(writer, nil, nil) + writer.FallbackToPlain() + p.NotEqual(0, writer.TotalCompressedBytes()) + writer.Close() + p.NotEqual(0, writer.TotalCompressedBytes()) + p.NotEqual(0, writer.TotalBytesWritten()) +} + func (p *PrimitiveWriterTestSuite) TestRequiredPlain() { p.testRequiredWithEncoding(parquet.Encodings.Plain) } @@ -575,6 +598,14 @@ func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV2() { p.testDictionaryFallbackEncoding(parquet.V2_LATEST) } +func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV1() { + p.testDictionaryFallbackAndCompressedSize(parquet.V1_0) +} + +func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV2() { + p.testDictionaryFallbackAndCompressedSize(parquet.V2_LATEST) +} + func (p 
*PrimitiveWriterTestSuite) TestOptionalNullValueChunk() { // test case for NULL values p.SetupSchema(parquet.Repetitions.Optional, 1) @@ -708,3 +739,38 @@ func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() { b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i]) } } + +func TestDictionaryReslice(t *testing.T) { + pts := []arrow.DataType{ + arrow.PrimitiveTypes.Int8, + arrow.PrimitiveTypes.Int16, + arrow.PrimitiveTypes.Int32, + arrow.PrimitiveTypes.Int64, + arrow.PrimitiveTypes.Uint8, + arrow.PrimitiveTypes.Uint16, + arrow.PrimitiveTypes.Uint32, + arrow.PrimitiveTypes.Uint64, + } + for _, pt := range pts { + t.Run(pt.String(), func(t *testing.T) { + mem := memory.NewGoAllocator() + dt := &arrow.DictionaryType{ + IndexType: pt, + ValueType: &arrow.StringType{}, + } + field := arrow.Field{Name: "test_field", Type: dt, Nullable: true} + schema := arrow.NewSchema([]arrow.Field{field}, nil) + b := array.NewRecordBuilder(mem, schema) + for i := 0; i < 2000; i++ { + b.Field(0).(*array.BinaryDictionaryBuilder).AppendString("test_value") + } + rec := b.NewRecord() + out := &bytes.Buffer{} + pqw, err := pqarrow.NewFileWriter(rec.Schema(), out, nil, pqarrow.NewArrowWriterProperties()) + assert.NoError(t, err) + err = pqw.WriteBuffered(rec) + assert.NoError(t, err) + + }) + } +} diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java index 5b01077b17996..f95133fc7e44c 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java @@ -21,7 +21,8 @@ * String constants used for metadata returned on Vectors. */ public class Constants { - private Constants() {} + private Constants() { + } public static final String SQL_CATALOG_NAME_KEY = "SQL_CATALOG_NAME"; public static final String SQL_SCHEMA_NAME_KEY = "SQL_SCHEMA_NAME"; diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java index 438a949b736f1..4478cdfbee6f7 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java @@ -231,7 +231,8 @@ public void setDate(int parameterIndex, Date x, Calendar cal) throws SQLExceptio } @Override - public void setTime(int parameterIndex, Time x, Calendar cal) throws SQLException {} + public void setTime(int parameterIndex, Time x, Calendar cal) throws SQLException { + } @Override public void setTimestamp(int parameterIndex, Timestamp x, Calendar cal) throws SQLException { @@ -241,7 +242,8 @@ public void setTimestamp(int parameterIndex, Timestamp x, Calendar cal) throws S } @Override - public void setNull(int parameterIndex, int sqlType, String typeName) throws SQLException {} + public void setNull(int parameterIndex, int sqlType, String typeName) throws SQLException { + } @Override public void setURL(int parameterIndex, URL x) throws SQLException { @@ -259,62 +261,80 @@ public void setRowId(int parameterIndex, RowId x) throws SQLException { } @Override - public void setNString(int parameterIndex, String value) throws SQLException {} + public void setNString(int parameterIndex, String value) throws SQLException { + } @Override public void setNCharacterStream(int parameterIndex, Reader value, long length) - throws SQLException {} + 
throws SQLException { + } @Override - public void setNClob(int parameterIndex, NClob value) throws SQLException {} + public void setNClob(int parameterIndex, NClob value) throws SQLException { + } @Override - public void setClob(int parameterIndex, Reader reader, long length) throws SQLException {} + public void setClob(int parameterIndex, Reader reader, long length) throws SQLException { + } @Override public void setBlob(int parameterIndex, InputStream inputStream, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setNClob(int parameterIndex, Reader reader, long length) throws SQLException {} + public void setNClob(int parameterIndex, Reader reader, long length) throws SQLException { + } @Override - public void setSQLXML(int parameterIndex, SQLXML xmlObject) throws SQLException {} + public void setSQLXML(int parameterIndex, SQLXML xmlObject) throws SQLException { + } @Override public void setObject(int parameterIndex, Object x, int targetSqlType, int scaleOrLength) - throws SQLException {} + throws SQLException { + } @Override - public void setAsciiStream(int parameterIndex, InputStream x, long length) throws SQLException {} + public void setAsciiStream(int parameterIndex, InputStream x, long length) throws SQLException { + } @Override - public void setBinaryStream(int parameterIndex, InputStream x, long length) throws SQLException {} + public void setBinaryStream(int parameterIndex, InputStream x, long length) throws SQLException { + } @Override public void setCharacterStream(int parameterIndex, Reader reader, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setAsciiStream(int parameterIndex, InputStream x) throws SQLException {} + public void setAsciiStream(int parameterIndex, InputStream x) throws SQLException { + } @Override - public void setBinaryStream(int parameterIndex, InputStream x) throws SQLException {} + public void setBinaryStream(int parameterIndex, InputStream x) throws SQLException { + } @Override - public void setCharacterStream(int parameterIndex, Reader reader) throws SQLException {} + public void setCharacterStream(int parameterIndex, Reader reader) throws SQLException { + } @Override - public void setNCharacterStream(int parameterIndex, Reader value) throws SQLException {} + public void setNCharacterStream(int parameterIndex, Reader value) throws SQLException { + } @Override - public void setClob(int parameterIndex, Reader reader) throws SQLException {} + public void setClob(int parameterIndex, Reader reader) throws SQLException { + } @Override - public void setBlob(int parameterIndex, InputStream inputStream) throws SQLException {} + public void setBlob(int parameterIndex, InputStream inputStream) throws SQLException { + } @Override - public void setNClob(int parameterIndex, Reader reader) throws SQLException {} + public void setNClob(int parameterIndex, Reader reader) throws SQLException { + } @Override public ResultSet executeQuery(String sql) throws SQLException { @@ -327,7 +347,8 @@ public int executeUpdate(String sql) throws SQLException { } @Override - public void close() throws SQLException {} + public void close() throws SQLException { + } @Override public int getMaxFieldSize() throws SQLException { diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java index c712741b51f5b..ccc7681c5bc8b 100644 --- 
a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java @@ -348,7 +348,8 @@ public static class MockColumnMetaData { private int displaySize; - private MockColumnMetaData() {} + private MockColumnMetaData() { + } private String getLabel() { return label; diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index 265a9a71b80e2..79e51470a426e 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -75,7 +75,7 @@ org.apache.hadoop hadoop-common - 3.3.3 + 3.3.6 test diff --git a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java index 9b599234bdf51..d61799e990f77 100644 --- a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java +++ b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java @@ -32,7 +32,8 @@ class OrcJniUtils { private static final String LIBRARY_NAME = "arrow_orc_jni"; private static boolean isLoaded = false; - private OrcJniUtils() {} + private OrcJniUtils() { + } static void loadOrcAdapterLibraryFromJar() throws IOException, IllegalAccessException { diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java b/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java index 6104cb1a132e4..af28333746290 100644 --- a/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java +++ b/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java @@ -18,7 +18,9 @@ package org.apache.arrow.compression; import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.nio.channels.Channels; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -27,63 +29,223 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.GenerateSampleData; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.compression.CompressionUtil; import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.ipc.ArrowFileReader; import org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; import org.apache.arrow.vector.ipc.message.IpcOption; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; +import org.junit.After; import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class TestArrowReaderWriterWithCompression { - @Test - public void testArrowFileZstdRoundTrip() throws Exception { - // Prepare sample data - final BufferAllocator allocator = new 
RootAllocator(Integer.MAX_VALUE); + private BufferAllocator allocator; + private ByteArrayOutputStream out; + private VectorSchemaRoot root; + + @BeforeEach + public void setup() { + if (allocator == null) { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + out = new ByteArrayOutputStream(); + root = null; + } + + @After + public void tearDown() { + if (root != null) { + root.close(); + } + if (allocator != null) { + allocator.close(); + } + if (out != null) { + out.reset(); + } + + } + + private void createAndWriteArrowFile(DictionaryProvider provider, + CompressionUtil.CodecType codecType) throws IOException { List fields = new ArrayList<>(); fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>())); - VectorSchemaRoot root = VectorSchemaRoot.create(new Schema(fields), allocator); + root = VectorSchemaRoot.create(new Schema(fields), allocator); + final int rowCount = 10; GenerateSampleData.generateTestData(root.getVector(0), rowCount); root.setRowCount(rowCount); - // Write an in-memory compressed arrow file - ByteArrayOutputStream out = new ByteArrayOutputStream(); - try (final ArrowFileWriter writer = - new ArrowFileWriter(root, null, Channels.newChannel(out), new HashMap<>(), - IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, CompressionUtil.CodecType.ZSTD, Optional.of(7))) { + try (final ArrowFileWriter writer = new ArrowFileWriter(root, provider, Channels.newChannel(out), + new HashMap<>(), IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) { writer.start(); writer.writeBatch(); writer.end(); } + } + + private void createAndWriteArrowStream(DictionaryProvider provider, + CompressionUtil.CodecType codecType) throws IOException { + List fields = new ArrayList<>(); + fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>())); + root = VectorSchemaRoot.create(new Schema(fields), allocator); + + final int rowCount = 10; + GenerateSampleData.generateTestData(root.getVector(0), rowCount); + root.setRowCount(rowCount); + + try (final ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, Channels.newChannel(out), + IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + } - // Read the in-memory compressed arrow file with CommonsCompressionFactory provided + private Dictionary createDictionary(VarCharVector dictionaryVector) { + setVector(dictionaryVector, + "foo".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8), + "baz".getBytes(StandardCharsets.UTF_8)); + + return new Dictionary(dictionaryVector, + new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/null)); + } + + @Test + public void testArrowFileZstdRoundTrip() throws Exception { + createAndWriteArrowFile(null, CompressionUtil.CodecType.ZSTD); + // with compression + try (ArrowFileReader reader = + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Assertions.assertTrue(reader.loadNextBatch()); + Assertions.assertTrue(root.equals(reader.getVectorSchemaRoot())); + Assertions.assertFalse(reader.loadNextBatch()); + } + // without compression try (ArrowFileReader reader = - new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), - allocator, CommonsCompressionFactory.INSTANCE)) { - Assert.assertEquals(1, 
reader.getRecordBlocks().size()); + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assertions.assertEquals("Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage()); + } + } + + @Test + public void testArrowStreamZstdRoundTrip() throws Exception { + createAndWriteArrowStream(null, CompressionUtil.CodecType.ZSTD); + // with compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { Assert.assertTrue(reader.loadNextBatch()); Assert.assertTrue(root.equals(reader.getVectorSchemaRoot())); Assert.assertFalse(reader.loadNextBatch()); } + // without compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assert.assertEquals( + "Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage() + ); + } + } - // Read the in-memory compressed arrow file without CompressionFactory provided + @Test + public void testArrowFileZstdRoundTripWithDictionary() throws Exception { + VarCharVector dictionaryVector = (VarCharVector) + FieldType.nullable(new ArrowType.Utf8()).createNewSingleVector("f1_file", allocator, null); + Dictionary dictionary = createDictionary(dictionaryVector); + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary); + + createAndWriteArrowFile(provider, CompressionUtil.CodecType.ZSTD); + + // with compression + try (ArrowFileReader reader = + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Assertions.assertTrue(reader.loadNextBatch()); + Assertions.assertTrue(root.equals(reader.getVectorSchemaRoot())); + Assertions.assertFalse(reader.loadNextBatch()); + } + // without compression try (ArrowFileReader reader = - new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), - allocator, NoCompressionCodec.Factory.INSTANCE)) { - Assert.assertEquals(1, reader.getRecordBlocks().size()); + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assertions.assertEquals("Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage()); + } + dictionaryVector.close(); + } + + @Test + public void testArrowStreamZstdRoundTripWithDictionary() throws Exception { + VarCharVector dictionaryVector = (VarCharVector) + FieldType.nullable(new ArrowType.Utf8()).createNewSingleVector("f1_stream", allocator, null); + Dictionary dictionary = createDictionary(dictionaryVector); + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary); + 
+ createAndWriteArrowStream(provider, CompressionUtil.CodecType.ZSTD); + + // with compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { + Assertions.assertTrue(reader.loadNextBatch()); + Assertions.assertTrue(root.equals(reader.getVectorSchemaRoot())); + Assertions.assertFalse(reader.loadNextBatch()); + } + // without compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assertions.assertEquals("Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage()); + } + dictionaryVector.close(); + } - Exception exception = Assert.assertThrows(IllegalArgumentException.class, () -> reader.loadNextBatch()); - String expectedMessage = "Please add arrow-compression module to use CommonsCompressionFactory for ZSTD"; - Assert.assertEquals(expectedMessage, exception.getMessage()); + public static void setVector(VarCharVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } } + vector.setValueCount(length); } } diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java index 13b247452348d..6d33cf057ed3a 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java @@ -32,6 +32,7 @@ import org.apache.arrow.dataset.file.DatasetFileWriter; import org.apache.arrow.dataset.file.FileFormat; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.Float16; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateMilliVector; @@ -39,6 +40,7 @@ import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.DurationVector; import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -89,7 +91,6 @@ public class TestAllTypes extends TestDataset { private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { // Notes: - // - Float16 is not supported by Java. // - IntervalMonthDayNano is not supported by Parquet. // - Map (GH-38250) and SparseUnion are resulting in serialization errors when writing with the Dataset API. 
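The reworked compression tests above all follow the same write-then-read shape: build a small VectorSchemaRoot, write it through an ArrowFileWriter or ArrowStreamWriter configured with CommonsCompressionFactory and a codec type, then read the bytes back once with the compression factory and once without. A minimal standalone sketch of that pattern, assuming the arrow-compression module is on the classpath (the 10-row sample and ZSTD level 7 simply mirror the tests; any values would do, and the class name is illustrative):

import java.io.ByteArrayOutputStream;
import java.nio.channels.Channels;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Optional;

import org.apache.arrow.compression.CommonsCompressionFactory;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.GenerateSampleData;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.compression.CompressionUtil;
import org.apache.arrow.vector.ipc.ArrowFileReader;
import org.apache.arrow.vector.ipc.ArrowFileWriter;
import org.apache.arrow.vector.ipc.message.IpcOption;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;

public class ZstdRoundTripSketch {
  public static void main(String[] args) throws Exception {
    try (BufferAllocator allocator = new RootAllocator()) {
      List<Field> fields = new ArrayList<>();
      fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>()));
      try (VectorSchemaRoot root = VectorSchemaRoot.create(new Schema(fields), allocator)) {
        GenerateSampleData.generateTestData(root.getVector(0), 10);
        root.setRowCount(10);

        // Write a ZSTD-compressed Arrow file into memory (compression level 7, as in the tests above).
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (ArrowFileWriter writer = new ArrowFileWriter(root, null, Channels.newChannel(out),
            new HashMap<>(), IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE,
            CompressionUtil.CodecType.ZSTD, Optional.of(7))) {
          writer.start();
          writer.writeBatch();
          writer.end();
        }

        // Read it back; the factory passed here decides how record batches are decompressed.
        try (ArrowFileReader reader = new ArrowFileReader(
            new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator,
            CommonsCompressionFactory.INSTANCE)) {
          while (reader.loadNextBatch()) {
            System.out.println(reader.getVectorSchemaRoot().getRowCount());
          }
        }
      }
    }
  }
}

Reading the same bytes with NoCompressionCodec.Factory.INSTANCE instead of CommonsCompressionFactory.INSTANCE is what the negative cases above rely on to trigger the "Please add arrow-compression module to use CommonsCompressionFactory for ZSTD" error.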
// "Unhandled type for Arrow to Parquet schema conversion" errors: IntervalDay, IntervalYear, DenseUnion @@ -109,6 +110,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { Field.nullablePrimitive("uint16", new ArrowType.Int(16, false)), Field.nullablePrimitive("uint32", new ArrowType.Int(32, false)), Field.nullablePrimitive("uint64", new ArrowType.Int(64, false)), + Field.nullablePrimitive("float16", new ArrowType.FloatingPoint(FloatingPointPrecision.HALF)), Field.nullablePrimitive("float32", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), Field.nullablePrimitive("float64", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), Field.nullablePrimitive("utf8", ArrowType.Utf8.INSTANCE), @@ -148,6 +150,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { root.getVector("uint16").setNull(0); root.getVector("uint32").setNull(0); root.getVector("uint64").setNull(0); + root.getVector("float16").setNull(0); root.getVector("float32").setNull(0); root.getVector("float64").setNull(0); root.getVector("utf8").setNull(0); @@ -180,6 +183,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { ((UInt2Vector) root.getVector("uint16")).set(1, 1); ((UInt4Vector) root.getVector("uint32")).set(1, 1); ((UInt8Vector) root.getVector("uint64")).set(1, 1); + ((Float2Vector) root.getVector("float16")).set(1, Float16.toFloat16(+32.875f)); ((Float4Vector) root.getVector("float32")).set(1, 1.0f); ((Float8Vector) root.getVector("float64")).set(1, 1.0); ((VarCharVector) root.getVector("utf8")).set(1, new Text("a")); diff --git a/java/dev/checkstyle/checkstyle.xml b/java/dev/checkstyle/checkstyle.xml index c27f382ddda76..b63a4a9cba1f3 100644 --- a/java/dev/checkstyle/checkstyle.xml +++ b/java/dev/checkstyle/checkstyle.xml @@ -60,6 +60,11 @@ + + + + + @@ -72,10 +77,6 @@ - - - - @@ -223,13 +224,12 @@ - - - - - + + + + diff --git a/java/dev/checkstyle/suppressions.xml b/java/dev/checkstyle/suppressions.xml index 585985bf32dbc..a3536e2ca9212 100644 --- a/java/dev/checkstyle/suppressions.xml +++ b/java/dev/checkstyle/suppressions.xml @@ -40,5 +40,5 @@ - + diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java index fc491ebe0df98..8f251a7c7ef07 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java @@ -437,7 +437,8 @@ public ClientStreamListener getWriter() { */ public void getResult() { // After exchange is complete, make sure stream is drained to propagate errors through reader - while (reader.next()) { }; + while (reader.next()) { + } } /** Shut down the streams in this call. */ diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java index eb5e492b4cd46..b711d7ef6b5d7 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java @@ -125,7 +125,8 @@ public void enterIdle() { } } - private FlightGrpcUtils() {} + private FlightGrpcUtils() { + } /** * Creates a Flight service. 
diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java index 7a5a941603ace..84beee7d40564 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java @@ -194,7 +194,8 @@ public void close() throws Exception { } } // Drain the stream without the lock (as next() implicitly needs the lock) - while (next()) { } + while (next()) { + } } catch (FlightRuntimeException e) { suppressor = e; } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java index e80fb41c67273..80ddad90a1d28 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java @@ -119,5 +119,6 @@ default void start(VectorSchemaRoot root, DictionaryProvider dictionaries) { *

The default value can be toggled globally by setting the JVM property arrow.flight.enable_zero_copy_write * or the environment variable ARROW_FLIGHT_ENABLE_ZERO_COPY_WRITE. */ - default void setUseZeroCopy(boolean enabled) {} + default void setUseZeroCopy(boolean enabled) { + } } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java index e3ccdc626d71b..8a37115f1f024 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java @@ -47,5 +47,6 @@ public byte[] parseBytes(byte[] serialized) { public static final Context.Key PEER_IDENTITY_KEY = Context.keyWithDefault("arrow-flight-peer-identity", ""); - private AuthConstants() {} + private AuthConstants() { + } } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java index ad1a36a935fd7..3647e113cc0f6 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java @@ -115,7 +115,9 @@ public boolean hasNext() { @Override public void onError(Throwable t) { completed = true; - while (future == null) {/* busy wait */} + while (future == null) { + /* busy wait */ + } future.cancel(true); } diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java index bcff54bd7f66f..a1fa1f1d18509 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java @@ -303,10 +303,12 @@ public void onBeforeSendingHeaders(CallHeaders outgoingHeaders) { } @Override - public void onCallCompleted(CallStatus status) {} + public void onCallCompleted(CallStatus status) { + } @Override - public void onCallErrored(Throwable err) {} + public void onCallErrored(Throwable err) { + } } static class MultiHeaderClientMiddlewareFactory implements FlightClientMiddleware.Factory { @@ -356,6 +358,7 @@ public void onHeadersReceived(CallHeaders incomingHeaders) { } @Override - public void onCallCompleted(CallStatus status) {} + public void onCallCompleted(CallStatus status) { + } } } diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java index b8aa46fb5674a..13238f318eaaa 100644 --- a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java @@ -55,7 +55,8 @@ public FlightProducer producer(BufferAllocator allocator, Location location) thr } @Override - public void buildServer(FlightServer.Builder builder) throws Exception {} + public void buildServer(FlightServer.Builder builder) throws Exception { + } @Override public void client(BufferAllocator allocator, Location location, FlightClient client) diff --git 
a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java index fdf6c508d93b0..de6dccad4a846 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java @@ -31,7 +31,8 @@ public final class IntervalStringUtils { /** * Constructor Method of class. */ - private IntervalStringUtils( ) {} + private IntervalStringUtils( ) { + } /** * Formats a period similar to Oracle INTERVAL YEAR TO MONTH data type
. diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java index b7977462e9c01..78d252f7824c3 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java @@ -84,7 +84,7 @@ public void testGetDefaultKeyStoreInstancePassword() throws IOException, keyStoreMockedStatic .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit")) - .thenReturn(keyStoreMock); + .thenReturn(keyStoreMock); KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit"); Assert.assertEquals(receiveKeyStore, keyStoreMock); } diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index d0290b6814ed5..6337efcf7e348 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -96,7 +96,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.5 + 3.1.0 sign-artifacts diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java index e903b4e873278..fa5d285b90997 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java @@ -43,7 +43,8 @@ public static ConfigOptions getDefault() { return new ConfigOptions(); } - public ConfigOptions() {} + public ConfigOptions() { + } public ConfigOptions withOptimize(boolean optimize) { this.optimize = optimize; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java index e0c072cfbe52e..703cfaa8be88b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java @@ -23,7 +23,8 @@ * Utility methods for working with {@link Decimal} values. */ public class DecimalTypeUtil { - private DecimalTypeUtil() {} + private DecimalTypeUtil() { + } /** * Enum for supported mathematical operations. diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 90f8684b455a8..e7377cc5c9db4 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -33,7 +33,8 @@ * Utility methods to convert between Arrow and Gandiva types. 
*/ public class ArrowTypeHelper { - private ArrowTypeHelper() {} + private ArrowTypeHelper() { + } static final int WIDTH_8 = 8; static final int WIDTH_16 = 16; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index 8656e886aae24..3d2ea27d044e7 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -29,7 +29,8 @@ * Contains helper functions for constructing expression trees. */ public class TreeBuilder { - private TreeBuilder() {} + private TreeBuilder() { + } /** * Helper functions to create literal constants. diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 3a88ec762e19c..c2b13119fc440 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -235,7 +235,7 @@ com.puppycrawl.tools checkstyle - 8.19 + 8.29 org.slf4j @@ -271,7 +271,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.10 + 2.7.11 package @@ -333,7 +333,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java index ff2b25dfa30ab..b8de6d819eaf8 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java @@ -34,7 +34,8 @@ public interface AllocationListener { * * @param size the buffer size being allocated */ - default void onPreAllocation(long size) {} + default void onPreAllocation(long size) { + } /** * Called each time a new buffer has been allocated. @@ -43,7 +44,8 @@ default void onPreAllocation(long size) {} * * @param size the buffer size being allocated */ - default void onAllocation(long size) {} + default void onAllocation(long size) { + } /** * Informed each time a buffer is released from allocation. @@ -51,7 +53,8 @@ default void onAllocation(long size) {} *

An exception cannot be thrown by this method. * @param size The size of the buffer being released. */ - default void onRelease(long size) {} + default void onRelease(long size) { + } /** @@ -73,7 +76,8 @@ default boolean onFailedAllocation(long size, AllocationOutcome outcome) { * @param parentAllocator The parent allocator to which a child was added * @param childAllocator The child allocator that was just added */ - default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator childAllocator) {} + default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator childAllocator) { + } /** * Called immediately after a child allocator was removed from the parent allocator. @@ -81,5 +85,6 @@ default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator child * @param parentAllocator The parent allocator from which a child was removed * @param childAllocator The child allocator that was just removed */ - default void onChildRemoved(BufferAllocator parentAllocator, BufferAllocator childAllocator) {} + default void onChildRemoved(BufferAllocator parentAllocator, BufferAllocator childAllocator) { + } } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java index 8779c7a3434ea..189c800ba0fe5 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -702,18 +702,18 @@ private void verifyAllocator( void print(StringBuilder sb, int level, Verbosity verbosity) { CommonUtil.indent(sb, level) - .append("Allocator(") - .append(name) - .append(") ") - .append(reservation) - .append('/') - .append(getAllocatedMemory()) - .append('/') - .append(getPeakMemoryAllocation()) - .append('/') - .append(getLimit()) - .append(" (res/actual/peak/limit)") - .append('\n'); + .append("Allocator(") + .append(name) + .append(") ") + .append(reservation) + .append('/') + .append(getAllocatedMemory()) + .append('/') + .append(getPeakMemoryAllocation()) + .append('/') + .append(getLimit()) + .append(" (res/actual/peak/limit)") + .append('\n'); if (DEBUG) { CommonUtil.indent(sb, level + 1).append(String.format("child allocators: %d\n", childAllocators.size())); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java index 1ca3e08ecf046..62d268a1f4493 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java @@ -478,20 +478,20 @@ public long getAccountedSize() { */ void print(StringBuilder sb, int indent, BaseAllocator.Verbosity verbosity) { CommonUtil.indent(sb, indent) - .append("ledger[") - .append(ledgerId) - .append("] allocator: ") - .append(allocator.getName()) - .append("), isOwning: ") - .append(", size: ") - .append(", references: ") - .append(bufRefCnt.get()) - .append(", life: ") - .append(lCreationTime) - .append("..") - .append(lDestructionTime) - .append(", allocatorManager: [") - .append(", life: "); + .append("ledger[") + .append(ledgerId) + .append("] allocator: ") + .append(allocator.getName()) + .append("), isOwning: ") + .append(", size: ") + .append(", references: ") + .append(bufRefCnt.get()) + .append(", life: ") + .append(lCreationTime) + .append("..") + .append(lDestructionTime) + 
.append(", allocatorManager: [") + .append(", life: "); if (!BaseAllocator.DEBUG) { sb.append("]\n"); @@ -499,8 +499,8 @@ void print(StringBuilder sb, int indent, BaseAllocator.Verbosity verbosity) { Preconditions.checkArgument(buffers != null, "IdentityHashMap of buffers must not be null"); synchronized (buffers) { sb.append("] holds ") - .append(buffers.size()) - .append(" buffers. \n"); + .append(buffers.size()) + .append(" buffers. \n"); for (ArrowBuf buf : buffers.keySet()) { buf.print(sb, indent + 2, verbosity); sb.append('\n'); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java index 7d4de18751ba9..64a4232d8aeb7 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java @@ -141,10 +141,12 @@ public boolean release(int decrement) { } @Override - public void retain() { } + public void retain() { + } @Override - public void retain(int increment) { } + public void retain(int increment) { + } @Override public ArrowBuf retain(ArrowBuf srcBuffer, BufferAllocator targetAllocator) { diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java index 9579245ca7004..79d21fa040876 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java @@ -32,7 +32,8 @@ public class ByteFunctionHelpers { private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; - private ByteFunctionHelpers() {} + private ByteFunctionHelpers() { + } /** * Helper function to check for equality of bytes in two ArrowBufs. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java index ccca7b1e03093..707c5f1556062 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java @@ -24,7 +24,8 @@ */ public final class CommonUtil { - private CommonUtil() { } + private CommonUtil() { + } /** * Rounds up the provided value to the nearest power of two. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/Float16.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/Float16.java new file mode 100644 index 0000000000000..8040158fd090e --- /dev/null +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/Float16.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.memory.util; + + +import org.apache.arrow.util.VisibleForTesting; + +/** + * Lifted from Apache Parquet MR project: + * https://github.com/apache/parquet-mr/blob/e87b80308869b77f914fcfd04364686e11158950/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java + *

+ * Changes made:
+ * <ul>
+ *   <li>Modify the data type input from Parquet-MR Binary (toFloat(Binary b)) to Arrow Java short (toFloat(short b))</li>
+ *   <li>Expose NAN and POSITIVE_INFINITY variables</li>
+ * </ul>
+ *
+ * The class is a utility class to manipulate half-precision 16-bit
+ * IEEE 754
+ * floating point data types (also called fp16 or binary16). A half-precision float can be
+ * created from or converted to single-precision floats, and is stored in a short data type.
+ * The IEEE 754 standard specifies a float16 as having the following format:
+ * <ul>
+ *   <li>Sign bit: 1 bit</li>
+ *   <li>Exponent width: 5 bits</li>
+ *   <li>Significand: 10 bits</li>
+ * </ul>
+ *
+ * <p>The format is laid out as follows:
+ * <pre>
+ * 1   11111   1111111111
+ * ^   --^--   -----^----
+ * sign  |          |_______ significand
+ *       |
+ *      -- exponent
+ * </pre>
+ * Half-precision floating points can be useful to save memory and/or + * bandwidth at the expense of range and precision when compared to single-precision + * floating points (float32). + * Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java + */ +public class Float16 { + // Positive infinity of type half-precision float. + public static final short POSITIVE_INFINITY = (short) 0x7c00; + // A Not-a-Number representation of a half-precision float. + public static final short NaN = (short) 0x7e00; + // The bitmask to and a number with to obtain the sign bit. + private static final int SIGN_MASK = 0x8000; + // The offset to shift by to obtain the exponent bits. + private static final int EXPONENT_SHIFT = 10; + // The bitmask to and a number shifted by EXPONENT_SHIFT right, to obtain exponent bits. + private static final int SHIFTED_EXPONENT_MASK = 0x1f; + // The bitmask to and a number with to obtain significand bits. + private static final int SIGNIFICAND_MASK = 0x3ff; + // The offset of the exponent from the actual value. + private static final int EXPONENT_BIAS = 15; + // The offset to shift by to obtain the sign bit. + private static final int SIGN_SHIFT = 15; + // The bitmask to AND with to obtain exponent and significand bits. + private static final int EXPONENT_SIGNIFICAND_MASK = 0x7fff; + + private static final int FP32_SIGN_SHIFT = 31; + private static final int FP32_EXPONENT_SHIFT = 23; + private static final int FP32_SHIFTED_EXPONENT_MASK = 0xff; + private static final int FP32_SIGNIFICAND_MASK = 0x7fffff; + private static final int FP32_EXPONENT_BIAS = 127; + private static final int FP32_QNAN_MASK = 0x400000; + private static final int FP32_DENORMAL_MAGIC = 126 << 23; + private static final float FP32_DENORMAL_FLOAT = Float.intBitsToFloat(FP32_DENORMAL_MAGIC); + + /** + * Returns true if the specified half-precision float value represents + * a Not-a-Number, false otherwise. + * + * @param h A half-precision float value + * @return True if the value is a NaN, false otherwise + * + */ + @VisibleForTesting + public static boolean isNaN(short h) { + return (h & EXPONENT_SIGNIFICAND_MASK) > POSITIVE_INFINITY; + } + + /** + *

Compares the two specified half-precision float values. The following
+ * conditions apply during the comparison:
+ * <ul>
+ *   <li>NaN is considered by this method to be equal to itself and greater
+ *       than all other half-precision float values (including {@code #POSITIVE_INFINITY})</li>
+ *   <li>POSITIVE_ZERO is considered by this method to be greater than NEGATIVE_ZERO.</li>
+ * </ul>
+ * + * @param x The first half-precision float value to compare. + * @param y The second half-precision float value to compare + * + * @return The value {@code 0} if {@code x} is numerically equal to {@code y}, a + * value less than {@code 0} if {@code x} is numerically less than {@code y}, + * and a value greater than {@code 0} if {@code x} is numerically greater + * than {@code y} + * + */ + @VisibleForTesting + public static int compare(short x, short y) { + boolean xIsNaN = isNaN(x); + boolean yIsNaN = isNaN(y); + + if (!xIsNaN && !yIsNaN) { + int first = ((x & SIGN_MASK) != 0 ? 0x8000 - (x & 0xffff) : x & 0xffff); + int second = ((y & SIGN_MASK) != 0 ? 0x8000 - (y & 0xffff) : y & 0xffff); + // Returns true if the first half-precision float value is less + // (smaller toward negative infinity) than the second half-precision float value. + if (first < second) { + return -1; + } + + // Returns true if the first half-precision float value is greater + // (larger toward positive infinity) than the second half-precision float value. + if (first > second) { + return 1; + } + } + + // Collapse NaNs, akin to halfToIntBits(), but we want to keep + // (signed) short value types to preserve the ordering of -0.0 + // and +0.0 + short xBits = xIsNaN ? NaN : x; + short yBits = yIsNaN ? NaN : y; + return (xBits == yBits ? 0 : (xBits < yBits ? -1 : 1)); + } + + /** + * Converts the specified half-precision float value into a + * single-precision float value. The following special cases are handled: + * If the input is NaN, the returned value is Float NaN. + * If the input is POSITIVE_INFINITY or NEGATIVE_INFINITY, the returned value is respectively + * Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is +/-0.0f. + * Otherwise, the returned value is a normalized single-precision float value. + * + * @param b The half-precision float value to convert to single-precision + * @return A normalized single-precision float value + */ + @VisibleForTesting + public static float toFloat(short b) { + int bits = b & 0xffff; + int s = bits & SIGN_MASK; + int e = (bits >>> EXPONENT_SHIFT) & SHIFTED_EXPONENT_MASK; + int m = (bits) & SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0) { // Denormal or 0 + if (m != 0) { + // Convert denorm fp16 into normalized fp32 + float o = Float.intBitsToFloat(FP32_DENORMAL_MAGIC + m); + o -= FP32_DENORMAL_FLOAT; + return s == 0 ? o : -o; + } + } else { + outM = m << 13; + if (e == 0x1f) { // Infinite or NaN + outE = 0xff; + if (outM != 0) { // SNaNs are quieted + outM |= FP32_QNAN_MASK; + } + } else { + outE = e - EXPONENT_BIAS + FP32_EXPONENT_BIAS; + } + } + int out = (s << 16) | (outE << FP32_EXPONENT_SHIFT) | outM; + return Float.intBitsToFloat(out); + } + + /** + * Converts the specified single-precision float value into a + * half-precision float value. The following special cases are handled: + * + * If the input is NaN, the returned value is NaN. + * If the input is Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY, + * the returned value is respectively POSITIVE_INFINITY or NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is + * POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_VALUE, the returned value + * is flushed to POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_NORMAL, the returned value + * is a denorm half-precision float. 
+ * Otherwise, the returned value is rounded to the nearest + * representable half-precision float value. + * + * @param f The single-precision float value to convert to half-precision + * @return A half-precision float value + */ + public static short toFloat16(float f) { + int bits = Float.floatToRawIntBits(f); + int s = (bits >>> FP32_SIGN_SHIFT); + int e = (bits >>> FP32_EXPONENT_SHIFT) & FP32_SHIFTED_EXPONENT_MASK; + int m = (bits) & FP32_SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0xff) { // Infinite or NaN + outE = 0x1f; + outM = m != 0 ? 0x200 : 0; + } else { + e = e - FP32_EXPONENT_BIAS + EXPONENT_BIAS; + if (e >= 0x1f) { // Overflow + outE = 0x1f; + } else if (e <= 0) { // Underflow + if (e < -10) { + // The absolute fp32 value is less than MIN_VALUE, flush to +/-0 + } else { + // The fp32 value is a normalized float less than MIN_NORMAL, + // we convert to a denorm fp16 + m = m | 0x800000; + int shift = 14 - e; + outM = m >> shift; + int lowm = m & ((1 << shift) - 1); + int hway = 1 << (shift - 1); + // if above halfway or exactly halfway and outM is odd + if (lowm + (outM & 1) > hway) { + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } else { + outE = e; + outM = m >> 13; + // if above halfway or exactly halfway and outM is odd + if ((m & 0x1fff) + (outM & 0x1) > 0x1000) { + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } + // The outM is added here as the +1 increments for outM above can + // cause an overflow in the exponent bit which is OK. + return (short) ((s << SIGN_SHIFT) | (outE << EXPONENT_SHIFT) + outM); + } + + /** + * Returns a string representation of the specified half-precision + * float value. Calling this method is equivalent to calling + * Float.toString(toFloat(h)). See {@link Float#toString(float)} + * for more information on the format of the string representation. + * + * @param h A half-precision float value in binary little-endian format + * @return A string representation of the specified value + */ + @VisibleForTesting + public static String toFloatString(short h) { + return Float.toString(Float16.toFloat(h)); + } +} diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java index db63bbd14ba5f..94a7873664216 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java @@ -22,7 +22,8 @@ /** Contains utilities for dealing with a 64-bit address base. 
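To make the Float16 conversions above concrete, here is a small sketch of the round trip, including the arithmetic behind the 0x501c bit pattern that the TestArrowBuf case later in this diff expects for +32.875f (class and variable names are illustrative):

import org.apache.arrow.memory.util.Float16;

public class Float16Sketch {
  public static void main(String[] args) {
    // +32.875f = 1.00000111b x 2^5: sign 0, exponent 5 + 15 = 20 (10100b),
    // significand 0000011100b, giving 0 10100 0000011100b = 0x501c.
    short half = Float16.toFloat16(+32.875f);
    System.out.println(Integer.toHexString(half & 0xffff));  // 501c
    System.out.println(Float16.toFloat(half));                // 32.875
    System.out.println(Float16.toFloatString(half));          // same value, via Float.toString
    System.out.println(Float16.isNaN(Float16.NaN));           // true
    // Per the compare() contract above, NaN orders above everything, including positive infinity.
    System.out.println(Float16.compare(Float16.NaN, Float16.POSITIVE_INFINITY) > 0); // true
  }
}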
*/ public final class LargeMemoryUtil { - private LargeMemoryUtil() {} + private LargeMemoryUtil() { + } /** * Casts length to an int, but raises an exception the value is outside diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java index 6b01a61ebca39..b88372abaaee1 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java @@ -34,7 +34,8 @@ * Utility methods for manipulating {@link java.util.Collections} and their subclasses/implementations. */ public final class Collections2 { - private Collections2() {} + private Collections2() { + } /** * Creates a {@link List} from the elements remaining in iterator. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java index 8083033007d9c..5e4323cfc9c61 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java @@ -111,7 +111,8 @@ * @since 2.0 */ public final class Preconditions { - private Preconditions() {} + private Preconditions() { + } /** * Ensures the truth of an expression involving one or more parameters to the calling method. diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java index 9ba42abc1ce89..b4385b72a38cf 100644 --- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java +++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java @@ -29,6 +29,7 @@ import java.nio.ByteOrder; import java.util.Arrays; +import org.apache.arrow.memory.util.Float16; import org.junit.Test; import org.slf4j.LoggerFactory; @@ -180,4 +181,14 @@ public void testEnabledHistoricalLog() { ((Logger) LoggerFactory.getLogger("org.apache.arrow")).setLevel(null); } } + + @Test + public void testArrowBufFloat16() { + try (BufferAllocator allocator = new RootAllocator(); + ArrowBuf buf = allocator.buffer(1024) + ) { + buf.setShort(0, Float16.toFloat16(+32.875f)); + assertEquals((short) 0x501c, buf.getShort(0)); + } + } } diff --git a/java/performance/pom.xml b/java/performance/pom.xml index a1d53171f549b..ba5a6616dca77 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -139,7 +139,7 @@ org.codehaus.mojo exec-maven-plugin - 1.6.0 + 3.1.1 run-java-benchmarks diff --git a/java/pom.xml b/java/pom.xml index 3951f1c1bc8ed..6442987f5a192 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,11 +33,11 @@ 5.10.1 2.0.11 33.0.0-jre - 4.1.105.Final - 1.60.0 + 4.1.106.Final + 1.61.1 3.23.1 - 2.16.0 - 2.7.1 + 2.16.1 + 3.3.6 23.5.26 1.11.3 @@ -304,7 +304,7 @@ com.puppycrawl.tools checkstyle - 8.19 + 8.29 org.slf4j @@ -364,7 +364,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.10 + 2.7.11 package @@ -395,7 +395,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins @@ -438,14 +438,14 @@ org.immutables value - 2.8.2 + 2.10.0 maven-enforcer-plugin - 3.0.0-M2 + 3.4.1 org.apache.maven.plugins @@ -598,7 +598,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins @@ -653,7 +653,7 @@ org.immutables value - 2.8.2 + 2.10.0 provided @@ -803,7 
+803,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins @@ -1038,7 +1038,7 @@ org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.1.1 cdata-cmake @@ -1099,7 +1099,7 @@ org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.1.1 jni-cpp-cmake @@ -1214,7 +1214,7 @@ org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.1.1 jni-cpp-cmake diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java index bb7cedeb74579..3d9bca58a763c 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java @@ -34,7 +34,8 @@ * first argument and the output is written to standard out. */ public class FileToStream { - private FileToStream() {} + private FileToStream() { + } /** * Reads an Arrow file from in and writes it back to out. diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 2a921804202f0..6c2a967712454 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -49,6 +49,16 @@ { class: "SmallInt", valueHolder: "Int2Holder"}, ] }, + { + major: "Fixed", + width: 2, + javaType: "short", + boxedType: "Short", + fields: [{name: "value", type: "short"}], + minor: [ + { class: "Float2", valueHolder: "Int2Holder"}, + ] + }, { major: "Fixed", width: 4, diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 56a6cc90b321b..822d4822987fb 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -39,7 +39,9 @@ @SuppressWarnings("unused") public class UnionReader extends AbstractFieldReader { - private BaseReader[] readers = new BaseReader[45]; + private static final int NUM_SUPPORTED_TYPES = 46; + + private BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; public UnionVector data; public UnionReader(UnionVector data) { @@ -50,7 +52,7 @@ public MinorType getMinorType() { return TYPES[data.getTypeValue(idx())]; } - private static MinorType[] TYPES = new MinorType[45]; + private static MinorType[] TYPES = new MinorType[NUM_SUPPORTED_TYPES]; static { for (MinorType minorType : MinorType.values()) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java index 6824756d8aca7..abece39475016 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java @@ -22,7 +22,8 @@ /** Helper utility methods for allocating storage for Vectors. */ public class AllocationHelper { - private AllocationHelper() {} + private AllocationHelper() { + } /** * Allocates the vector. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java index 568554ba75ed6..10f343e260ccc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java @@ -33,7 +33,8 @@ */ public class BitVectorHelper { - private BitVectorHelper() {} + private BitVectorHelper() { + } /** * Get the index of byte corresponding to bit index in validity buffer. 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/Float2Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/Float2Vector.java new file mode 100644 index 0000000000000..9d3f25769abff --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/Float2Vector.java @@ -0,0 +1,434 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.Float16; +import org.apache.arrow.vector.complex.impl.Float2ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.Float2Holder; +import org.apache.arrow.vector.holders.NullableFloat2Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Float2Vector implements a fixed width (2 bytes) vector of + * short values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class Float2Vector extends BaseFixedWidthVector implements FloatingPointVector { + public static final byte TYPE_WIDTH = 2; + + /** + * Instantiate a Float2Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public Float2Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.FLOAT2.getType()), allocator); + } + + /** + * Instantiate a Float2Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float2Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a Float2Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float2Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + } + + @Override + protected FieldReader getReaderImpl() { + return new Float2ReaderImpl(Float2Vector.this); + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. 
+ * + * @return {@link MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.FLOAT2; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public short get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getShort((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableFloat2Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getShort((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + @Override + public Short getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getShort((long) index * TYPE_WIDTH); + } + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *

This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + static short get(final ArrowBuf buffer, final int index) { + return buffer.getShort((long) index * TYPE_WIDTH); + } + + @Override + public double getValueAsDouble(int index) { + return getValueAsFloat(index); + } + + public float getValueAsFloat(int index) { + return Float16.toFloat(this.get(index)); + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + private void setValue(int index, short value) { + valueBuffer.setShort((long) index * TYPE_WIDTH, value); + } + + private void setValue(int index, float value) { + valueBuffer.setShort((long) index * TYPE_WIDTH, Float16.toFloat16(value)); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, short value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void setWithPossibleTruncate(int index, float value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableFloat2Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, Float2Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, short)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, short value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #setWithPossibleTruncate(int, float)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafeWithPossibleTruncate(int index, float value) { + handleSafe(index); + setWithPossibleTruncate(index, value); + } + + /** + * Same as {@link #set(int, NullableFloat2Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableFloat2Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, Float2Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, Float2Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, short value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setWithPossibleTruncate(int index, int isSet, float value) { + if (isSet > 0) { + setWithPossibleTruncate(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, short)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, short value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Same as {@link #set(int, int, short)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafeWithPossibleTruncate(int index, int isSet, float value) { + handleSafe(index); + setWithPossibleTruncate(index, isSet, value); + } + + @Override + public void setWithPossibleTruncate(int index, double value) { + throw new UnsupportedOperationException("The operation for double data types is not supported."); + } + + @Override + public void setSafeWithPossibleTruncate(int index, double value) { + throw new UnsupportedOperationException("The operation for double data types is not supported."); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. 
+ * + * @param field Field object used by the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator) { + return new TransferImpl(field, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((Float2Vector) to); + } + + private class TransferImpl implements TransferPair { + Float2Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new Float2Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(Field field, BufferAllocator allocator) { + to = new Float2Vector(field, allocator); + } + + public TransferImpl(Float2Vector to) { + this.to = to; + } + + @Override + public Float2Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Float2Vector.this); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java index 6cda18a8a53d3..be501ce245410 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java @@ -27,7 +27,8 @@ * with sample data. This class should be used for that purpose. */ public class GenerateSampleData { - private GenerateSampleData() {} + private GenerateSampleData() { + } /** Populates vector with valueCount random values. */ public static void generateTestData(final ValueVector vector, final int valueCount) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java index d7b147feb152f..3b734bbf6608b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java @@ -300,7 +300,8 @@ public int getNullCount() { * @param index position of element */ @Override - public void setNull(int index) {} + public void setNull(int index) { + } @Override public boolean isNull(int index) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java index 0de99ab011f66..76db0734464ed 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java @@ -41,7 +41,8 @@ public class Range { /** * Constructs a new instance. */ - public Range() {} + public Range() { + } /** * Constructs a new instance. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java index 0098f68360a1a..2cd64c4fc6766 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java @@ -23,7 +23,8 @@ * Utility methods for state machines based on enums. 
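As a usage sketch for the new vector class (field name and values are illustrative): setWithPossibleTruncate/setSafeWithPossibleTruncate narrow a java float to half precision on the way in, set/setSafe store an already-encoded short, and getValueAsFloat/getValueAsDouble widen on the way out.

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.memory.util.Float16;
import org.apache.arrow.vector.Float2Vector;

public class Float2VectorSketch {
  public static void main(String[] args) throws Exception {
    try (BufferAllocator allocator = new RootAllocator();
         Float2Vector vector = new Float2Vector("f16", allocator)) {
      vector.allocateNew(3);
      vector.setSafeWithPossibleTruncate(0, 1.5f);    // narrowed from a java float
      vector.setSafe(1, Float16.toFloat16(-0.125f));  // pre-encoded short
      vector.setNull(2);
      vector.setValueCount(3);

      System.out.println(vector.getValueAsFloat(0));  // 1.5
      System.out.println(vector.getValueAsDouble(1)); // -0.125
      System.out.println(vector.isNull(2));           // true
    }
  }
}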
 */
 public class StateTool {
-  private StateTool() {}
+  private StateTool() {
+  }
 
   static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StateTool.class);
 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java
index 9c399669affc3..b16315caa9f51 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java
@@ -25,7 +25,8 @@
  * Magic header/footer helpers for {@link ArrowFileWriter} and {@link ArrowFileReader} formatted files.
  */
 class ArrowMagic {
-  private ArrowMagic(){}
+  private ArrowMagic(){
+  }
 
   private static final byte[] MAGIC = "ARROW1".getBytes(StandardCharsets.UTF_8);
 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java
index 04c57d7e82fef..01f4e925c69b3 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java
@@ -251,7 +251,7 @@ private void load(ArrowDictionaryBatch dictionaryBatch, FieldVector vector) {
     VectorSchemaRoot root = new VectorSchemaRoot(
         Collections.singletonList(vector.getField()),
         Collections.singletonList(vector), 0);
-    VectorLoader loader = new VectorLoader(root);
+    VectorLoader loader = new VectorLoader(root, this.compressionFactory);
     try {
       loader.load(dictionaryBatch.getDictionary());
     } finally {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java
index a33c55de53f23..1cc201ae56f4b 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java
@@ -61,9 +61,14 @@ public abstract class ArrowWriter implements AutoCloseable {
 
   private final DictionaryProvider dictionaryProvider;
   private final Set<Long> dictionaryIdsUsed = new HashSet<>();
 
+  private final CompressionCodec.Factory compressionFactory;
+  private final CompressionUtil.CodecType codecType;
+  private final Optional<Integer> compressionLevel;
   private boolean started = false;
   private boolean ended = false;
 
+  private final CompressionCodec codec;
+
   protected IpcOption option;
 
   protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) {
@@ -89,16 +94,19 @@ protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, Writab
   protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out, IpcOption option,
                         CompressionCodec.Factory compressionFactory, CompressionUtil.CodecType codecType,
                         Optional<Integer> compressionLevel) {
-    this.unloader = new VectorUnloader(
-        root, /*includeNullCount*/ true,
-        compressionLevel.isPresent() ?
-            compressionFactory.createCodec(codecType, compressionLevel.get()) :
-            compressionFactory.createCodec(codecType),
-        /*alignBuffers*/ true);
     this.out = new WriteChannel(out);
     this.option = option;
     this.dictionaryProvider = provider;
 
+    this.compressionFactory = compressionFactory;
+    this.codecType = codecType;
+    this.compressionLevel = compressionLevel;
+    this.codec = this.compressionLevel.isPresent() ?
+        this.compressionFactory.createCodec(this.codecType, this.compressionLevel.get()) :
+        this.compressionFactory.createCodec(this.codecType);
+    this.unloader = new VectorUnloader(root, /*includeNullCount*/ true, codec,
+        /*alignBuffers*/ true);
+
     List<Field> fields = new ArrayList<>(root.getSchema().getFields().size());
 
     MetadataV4UnionChecker.checkForUnion(root.getSchema().getFields().iterator(), option.metadataVersion);
@@ -133,7 +141,8 @@ protected void writeDictionaryBatch(Dictionary dictionary) throws IOException {
         Collections.singletonList(vector.getField()),
         Collections.singletonList(vector), count);
 
-    VectorUnloader unloader = new VectorUnloader(dictRoot);
+    VectorUnloader unloader = new VectorUnloader(dictRoot, /*includeNullCount*/ true, this.codec,
+        /*alignBuffers*/ true);
     ArrowRecordBatch batch = unloader.getRecordBatch();
     ArrowDictionaryBatch dictionaryBatch = new ArrowDictionaryBatch(id, batch, false);
     try {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java
index 26736ed91c5ca..59b3bb07bcf16 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java
@@ -31,7 +31,8 @@
  * Utility methods for {@linkplain org.apache.arrow.vector.ipc.message.FBSerializable}s.
  */
 public class FBSerializables {
-  private FBSerializables() {}
+  private FBSerializables() {
+  }
 
   /**
    * Writes every element of all to builder and calls {@link FlatBufferBuilder#endVector()} afterwards.
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java
index f29157524f2df..0b0e0d66a98f0 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java
@@ -18,6 +18,7 @@
 package org.apache.arrow.vector.types;
 
 import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE;
+import static org.apache.arrow.vector.types.FloatingPointPrecision.HALF;
 import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE;
 import static org.apache.arrow.vector.types.UnionMode.Dense;
 import static org.apache.arrow.vector.types.UnionMode.Sparse;
@@ -33,6 +34,7 @@
 import org.apache.arrow.vector.ExtensionTypeVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.FixedSizeBinaryVector;
+import org.apache.arrow.vector.Float2Vector;
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
 import org.apache.arrow.vector.IntVector;
@@ -79,6 +81,7 @@
 import org.apache.arrow.vector.complex.impl.DenseUnionWriter;
 import org.apache.arrow.vector.complex.impl.DurationWriterImpl;
 import org.apache.arrow.vector.complex.impl.FixedSizeBinaryWriterImpl;
+import org.apache.arrow.vector.complex.impl.Float2WriterImpl;
 import org.apache.arrow.vector.complex.impl.Float4WriterImpl;
 import org.apache.arrow.vector.complex.impl.Float8WriterImpl;
 import org.apache.arrow.vector.complex.impl.IntWriterImpl;
@@ -432,6 +435,17 @@ public FieldWriter getNewFieldWriter(ValueVector vector) {
       return new IntervalYearWriterImpl((IntervalYearVector) vector);
     }
   },
+  FLOAT2(new FloatingPoint(HALF)) {
+    @Override
+    public FieldVector getNewVector(Field field, BufferAllocator allocator, CallBack schemaChangeCallback) {
+      return new Float2Vector(field, allocator);
+    }
+
+    @Override
+
public FieldWriter getNewFieldWriter(ValueVector vector) { + return new Float2WriterImpl((Float2Vector) vector); + } + }, // 4 byte ieee 754 FLOAT4(new FloatingPoint(SINGLE)) { @Override @@ -894,7 +908,7 @@ public MinorType visit(Int type) { public MinorType visit(FloatingPoint type) { switch (type.getPrecision()) { case HALF: - throw new UnsupportedOperationException("NYI: " + type); + return MinorType.FLOAT2; case SINGLE: return MinorType.FLOAT4; case DOUBLE: diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java index 9e8b6d26f6fd7..f7f975a0d0e7b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java @@ -26,7 +26,8 @@ /** Utility class for Date, DateTime, TimeStamp, Interval data types. */ public class DateUtility { - private DateUtility() {} + private DateUtility() { + } private static final String UTC = "UTC"; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 0dfb61dcdf269..4635822e5141b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -29,7 +29,8 @@ * Utility methods for configurable precision Decimal values (e.g. {@link BigDecimal}). */ public class DecimalUtility { - private DecimalUtility() {} + private DecimalUtility() { + } public static final byte [] zeroes = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java index 9592f3975ab99..76fb585e6bd3a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java @@ -35,7 +35,8 @@ * Utility methods for working with Dictionaries used in Dictionary encodings. */ public class DictionaryUtility { - private DictionaryUtility() {} + private DictionaryUtility() { + } /** * Convert field and child fields that have a dictionary encoding to message format, so fields diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java index 39488e96efda0..5fa4c1b2260e3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java @@ -26,7 +26,8 @@ */ public final class ObjectMapperFactory { - private ObjectMapperFactory() {} + private ObjectMapperFactory() { + } /** * Creates a new {@link ObjectMapper} instance. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java index f8167604c21ad..5b3d00f6b7362 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java @@ -33,7 +33,8 @@ * Schema utility class including serialization and deserialization. 
*/ public class SchemaUtility { - private SchemaUtility() {} + private SchemaUtility() { + } /** * Deserialize Arrow schema from byte array. diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 614aff18d4554..10091aebdd50b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -332,6 +332,204 @@ public void testSizeOfValueBuffer() { } } + @Test + public void testFixedFloat2() { + try (final Float2Vector floatVector = new Float2Vector(EMPTY_SCHEMA_PATH, allocator)) { + boolean error = false; + int initialCapacity = 16; + + /* we should not throw exception for these values of capacity */ + floatVector.setInitialCapacity(MAX_VALUE_COUNT - 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT); + + try { + floatVector.setInitialCapacity(MAX_VALUE_COUNT * 4); + } catch (OversizedAllocationException oe) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + floatVector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, floatVector.getValueCapacity()); + + /* allocate 32 bytes (16 * 2) */ + floatVector.allocateNew(); + /* underlying buffer should be able to store 16 values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); + + floatVector.zeroVector(); + + /* populate the floatVector */ + floatVector.set(0, (short) 0x101c); // Float16.toFloat16(+0.00050163269043f) + floatVector.set(2, (short) 0x901c); // Float16.toFloat16(-0.00050163269043f) + floatVector.set(4, (short) 0x101d); // Float16.toFloat16(+0.000502109527588f) + floatVector.set(6, (short) 0x901d); // Float16.toFloat16(-0.000502109527588f) + floatVector.set(8, (short) 0x121c); // Float16.toFloat16(+0.00074577331543f) + floatVector.set(10, (short) 0x921c); // Float16.toFloat16(-0.00074577331543f) + floatVector.set(12, (short) 0x501c); // Float16.toFloat16(+32.875f) + floatVector.set(14, (short) 0xd01c); // Float16.toFloat16(-32.875f) + + try { + floatVector.set(initialCapacity, (short) 0x141c); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check vector contents */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals((short) 0x901d, floatVector.get(6)); + assertEquals((short) 0x121c, floatVector.get(8)); + assertEquals((short) 0x921c, floatVector.get(10)); + assertEquals((short) 0x501c, floatVector.get(12)); + assertEquals((short) 0xd01c, floatVector.get(14)); + + try { + floatVector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + } + + /* this should trigger a realloc() */ + floatVector.setSafe(initialCapacity, (short) 0x141c); // Float16.toFloat16(+0.00100326538086f) + + /* underlying buffer should now be able to store double the number of values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); + + /* vector data should still be intact after realloc */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals((short) 0x901d, floatVector.get(6)); + 
assertEquals((short) 0x121c, floatVector.get(8)); + assertEquals((short) 0x921c, floatVector.get(10)); + assertEquals((short) 0x501c, floatVector.get(12)); + assertEquals((short) 0xd01c, floatVector.get(14)); + assertEquals((short) 0x141c, floatVector.get(initialCapacity)); + + /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); + floatVector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertTrue("non-zero data not expected at index: " + i, floatVector.isNull(i)); + } + } + } + + @Test + public void testFixedFloat2WithPossibleTruncate() { + try (final Float2Vector floatVector = new Float2Vector(EMPTY_SCHEMA_PATH, allocator)) { + boolean error = false; + int initialCapacity = 16; + + /* we should not throw exception for these values of capacity */ + floatVector.setInitialCapacity(MAX_VALUE_COUNT - 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT); + + try { + floatVector.setInitialCapacity(MAX_VALUE_COUNT * 4); + } catch (OversizedAllocationException oe) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + floatVector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, floatVector.getValueCapacity()); + + /* allocate 32 bytes (16 * 2) */ + floatVector.allocateNew(); + /* underlying buffer should be able to store 16 values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); + + floatVector.zeroVector(); + + /* populate the floatVector */ + floatVector.set(0, (short) 0x101c); // Float16.toFloat16(+0.00050163269043f) + floatVector.set(2, (short) 0x901c); // Float16.toFloat16(-0.00050163269043f) + floatVector.set(4, (short) 0x101d); // Float16.toFloat16(+0.000502109527588f) + floatVector.setWithPossibleTruncate(6, 2049.0f); // in f32=2049.000000, out f16=2048 + floatVector.setWithPossibleTruncate(8, 4098.0f); // in f32=4098.000000, out f16=4096 + floatVector.setWithPossibleTruncate(10, 8196.0f); // in f32=8196.000000, out f16=8192 + floatVector.setWithPossibleTruncate(12, 16392.0f); // in f32=16392.000000, out f16=16384 + floatVector.setWithPossibleTruncate(14, 32784.0f); // in f32=32784.000000, out f16=32768 + + try { + floatVector.setWithPossibleTruncate(initialCapacity, 1.618034f); // in f32=1.618034, out f16=1.6181641 + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check vector contents */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals(2048.0f, floatVector.getValueAsFloat(6), 0); + assertEquals(4096.0f, floatVector.getValueAsFloat(8), 0); + assertEquals(8192.0f, floatVector.getValueAsFloat(10), 0); + assertEquals(16384.0f, floatVector.getValueAsDouble(12), 0); + assertEquals(32768.0f, floatVector.getValueAsDouble(14), 0); + + try { + floatVector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + } + + /* this should trigger a realloc() */ + floatVector.setSafeWithPossibleTruncate(initialCapacity, 1.618034f); // in f32=1.618034, out f16=1.6181641 + + /* underlying buffer should now be able to store double the number of values */ + 
assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); + + /* vector data should still be intact after realloc */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals(2048.0f, floatVector.getValueAsFloat(6), 0); + assertEquals(4096.0f, floatVector.getValueAsFloat(8), 0); + assertEquals(8192.0f, floatVector.getValueAsFloat(10), 0); + assertEquals(16384.0f, floatVector.getValueAsDouble(12), 0); + assertEquals(32768.0f, floatVector.getValueAsDouble(14), 0); + assertEquals(1.6181641f, floatVector.getValueAsDouble(initialCapacity), 0); + + /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); + floatVector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertTrue("non-zero data not expected at index: " + i, floatVector.isNull(i)); + } + } + } + @Test /* Float4Vector */ public void testFixedType3() { try (final Float4Vector floatVector = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index f9f0357861c15..9e96e75880522 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -75,7 +75,8 @@ */ public class ValueVectorDataPopulator { - private ValueVectorDataPopulator(){} + private ValueVectorDataPopulator() { + } /** * Populate values for BigIntVector. 
diff --git a/js/package.json b/js/package.json index 57f9267afa3a8..bb70fd0a395b0 100644 --- a/js/package.json +++ b/js/package.json @@ -79,7 +79,7 @@ "cross-env": "7.0.3", "del": "7.1.0", "del-cli": "5.1.0", - "esbuild": "0.19.2", + "esbuild": "0.20.0", "esbuild-plugin-alias": "0.2.1", "eslint": "8.52.0", "eslint-plugin-jest": "27.4.2", @@ -102,7 +102,7 @@ "memfs": "4.5.0", "mkdirp": "3.0.1", "multistream": "4.1.0", - "regenerator-runtime": "0.14.0", + "regenerator-runtime": "0.14.1", "rollup": "4.3.0", "rxjs": "7.8.1", "ts-jest": "29.1.1", diff --git a/js/yarn.lock b/js/yarn.lock index 10d2a256e1cac..7b3180740d3da 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -416,225 +416,230 @@ resolved "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz#1d572bfbbe14b7704e0ba0f39b74815b84870d70" integrity sha512-dBVuXR082gk3jsFp7Rd/JI4kytwGHecnCoTtXFb7DB6CNHp4rg5k1bhg0nWdLGLnOV71lmDzGQaLMy8iPLY0pw== +"@esbuild/aix-ppc64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.20.0.tgz#509621cca4e67caf0d18561a0c56f8b70237472f" + integrity sha512-fGFDEctNh0CcSwsiRPxiaqX0P5rq+AqE0SRhYGZ4PX46Lg1FNR6oCxJghf8YgY0WQEgQuh3lErUFE4KxLeRmmw== + "@esbuild/android-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.17.19.tgz#bafb75234a5d3d1b690e7c2956a599345e84a2fd" integrity sha512-KBMWvEZooR7+kzY0BtbTQn0OAYY7CsiydT63pVEaPtVYF0hXbUaOyZog37DKxK7NF3XacBJOpYT4adIJh+avxA== -"@esbuild/android-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.19.2.tgz#bc35990f412a749e948b792825eef7df0ce0e073" - integrity sha512-lsB65vAbe90I/Qe10OjkmrdxSX4UJDjosDgb8sZUKcg3oefEuW2OT2Vozz8ef7wrJbMcmhvCC+hciF8jY/uAkw== +"@esbuild/android-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.20.0.tgz#109a6fdc4a2783fc26193d2687827045d8fef5ab" + integrity sha512-aVpnM4lURNkp0D3qPoAzSG92VXStYmoVPOgXveAUoQBWRSuQzt51yvSju29J6AHPmwY1BjH49uR29oyfH1ra8Q== "@esbuild/android-arm@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.17.19.tgz#5898f7832c2298bc7d0ab53701c57beb74d78b4d" integrity sha512-rIKddzqhmav7MSmoFCmDIb6e2W57geRsM94gV2l38fzhXMwq7hZoClug9USI2pFRGL06f4IOPHHpFNOkWieR8A== -"@esbuild/android-arm@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.19.2.tgz#edd1c8f23ba353c197f5b0337123c58ff2a56999" - integrity sha512-tM8yLeYVe7pRyAu9VMi/Q7aunpLwD139EY1S99xbQkT4/q2qa6eA4ige/WJQYdJ8GBL1K33pPFhPfPdJ/WzT8Q== +"@esbuild/android-arm@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.20.0.tgz#1397a2c54c476c4799f9b9073550ede496c94ba5" + integrity sha512-3bMAfInvByLHfJwYPJRlpTeaQA75n8C/QKpEaiS4HrFWFiJlNI0vzq/zCjBrhAYcPyVPG7Eo9dMrcQXuqmNk5g== "@esbuild/android-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.17.19.tgz#658368ef92067866d95fb268719f98f363d13ae1" integrity sha512-uUTTc4xGNDT7YSArp/zbtmbhO0uEEK9/ETW29Wk1thYUJBz3IVnvgEiEwEa9IeLyvnpKrWK64Utw2bgUmDveww== -"@esbuild/android-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.19.2.tgz#2dcdd6e6f1f2d82ea1b746abd8da5b284960f35a" - integrity sha512-qK/TpmHt2M/Hg82WXHRc/W/2SGo/l1thtDHZWqFq7oi24AjZ4O/CpPSu6ZuYKFkEgmZlFoa7CooAyYmuvnaG8w== +"@esbuild/android-x64@0.20.0": + version "0.20.0" 
+ resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.20.0.tgz#2b615abefb50dc0a70ac313971102f4ce2fdb3ca" + integrity sha512-uK7wAnlRvjkCPzh8jJ+QejFyrP8ObKuR5cBIsQZ+qbMunwR8sbd8krmMbxTLSrDhiPZaJYKQAU5Y3iMDcZPhyQ== "@esbuild/darwin-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.17.19.tgz#584c34c5991b95d4d48d333300b1a4e2ff7be276" integrity sha512-80wEoCfF/hFKM6WE1FyBHc9SfUblloAWx6FJkFWTWiCoht9Mc0ARGEM47e67W9rI09YoUxJL68WHfDRYEAvOhg== -"@esbuild/darwin-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.19.2.tgz#55b36bc06d76f5c243987c1f93a11a80d8fc3b26" - integrity sha512-Ora8JokrvrzEPEpZO18ZYXkH4asCdc1DLdcVy8TGf5eWtPO1Ie4WroEJzwI52ZGtpODy3+m0a2yEX9l+KUn0tA== +"@esbuild/darwin-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.20.0.tgz#5c122ed799eb0c35b9d571097f77254964c276a2" + integrity sha512-AjEcivGAlPs3UAcJedMa9qYg9eSfU6FnGHJjT8s346HSKkrcWlYezGE8VaO2xKfvvlZkgAhyvl06OJOxiMgOYQ== "@esbuild/darwin-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.17.19.tgz#7751d236dfe6ce136cce343dce69f52d76b7f6cb" integrity sha512-IJM4JJsLhRYr9xdtLytPLSH9k/oxR3boaUIYiHkAawtwNOXKE8KoU8tMvryogdcT8AU+Bflmh81Xn6Q0vTZbQw== -"@esbuild/darwin-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.19.2.tgz#982524af33a6424a3b5cb44bbd52559623ad719c" - integrity sha512-tP+B5UuIbbFMj2hQaUr6EALlHOIOmlLM2FK7jeFBobPy2ERdohI4Ka6ZFjZ1ZYsrHE/hZimGuU90jusRE0pwDw== +"@esbuild/darwin-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.20.0.tgz#9561d277002ba8caf1524f209de2b22e93d170c1" + integrity sha512-bsgTPoyYDnPv8ER0HqnJggXK6RyFy4PH4rtsId0V7Efa90u2+EifxytE9pZnsDgExgkARy24WUQGv9irVbTvIw== "@esbuild/freebsd-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.17.19.tgz#cacd171665dd1d500f45c167d50c6b7e539d5fd2" integrity sha512-pBwbc7DufluUeGdjSU5Si+P3SoMF5DQ/F/UmTSb8HXO80ZEAJmrykPyzo1IfNbAoaqw48YRpv8shwd1NoI0jcQ== -"@esbuild/freebsd-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.19.2.tgz#8e478a0856645265fe79eac4b31b52193011ee06" - integrity sha512-YbPY2kc0acfzL1VPVK6EnAlig4f+l8xmq36OZkU0jzBVHcOTyQDhnKQaLzZudNJQyymd9OqQezeaBgkTGdTGeQ== +"@esbuild/freebsd-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.20.0.tgz#84178986a3138e8500d17cc380044868176dd821" + integrity sha512-kQ7jYdlKS335mpGbMW5tEe3IrQFIok9r84EM3PXB8qBFJPSc6dpWfrtsC/y1pyrz82xfUIn5ZrnSHQQsd6jebQ== "@esbuild/freebsd-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.17.19.tgz#0769456eee2a08b8d925d7c00b79e861cb3162e4" integrity sha512-4lu+n8Wk0XlajEhbEffdy2xy53dpR06SlzvhGByyg36qJw6Kpfk7cp45DR/62aPH9mtJRmIyrXAS5UWBrJT6TQ== -"@esbuild/freebsd-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.19.2.tgz#01b96604f2540db023c73809bb8ae6cd1692d6f3" - integrity sha512-nSO5uZT2clM6hosjWHAsS15hLrwCvIWx+b2e3lZ3MwbYSaXwvfO528OF+dLjas1g3bZonciivI8qKR/Hm7IWGw== +"@esbuild/freebsd-x64@0.20.0": + version "0.20.0" + resolved 
"https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.20.0.tgz#3f9ce53344af2f08d178551cd475629147324a83" + integrity sha512-uG8B0WSepMRsBNVXAQcHf9+Ko/Tr+XqmK7Ptel9HVmnykupXdS4J7ovSQUIi0tQGIndhbqWLaIL/qO/cWhXKyQ== "@esbuild/linux-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.17.19.tgz#38e162ecb723862c6be1c27d6389f48960b68edb" integrity sha512-ct1Tg3WGwd3P+oZYqic+YZF4snNl2bsnMKRkb3ozHmnM0dGWuxcPTTntAF6bOP0Sp4x0PjSF+4uHQ1xvxfRKqg== -"@esbuild/linux-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.19.2.tgz#7e5d2c7864c5c83ec789b59c77cd9c20d2594916" - integrity sha512-ig2P7GeG//zWlU0AggA3pV1h5gdix0MA3wgB+NsnBXViwiGgY77fuN9Wr5uoCrs2YzaYfogXgsWZbm+HGr09xg== +"@esbuild/linux-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.20.0.tgz#24efa685515689df4ecbc13031fa0a9dda910a11" + integrity sha512-uTtyYAP5veqi2z9b6Gr0NUoNv9F/rOzI8tOD5jKcCvRUn7T60Bb+42NDBCWNhMjkQzI0qqwXkQGo1SY41G52nw== "@esbuild/linux-arm@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.17.19.tgz#1a2cd399c50040184a805174a6d89097d9d1559a" integrity sha512-cdmT3KxjlOQ/gZ2cjfrQOtmhG4HJs6hhvm3mWSRDPtZ/lP5oe8FWceS10JaSJC13GBd4eH/haHnqf7hhGNLerA== -"@esbuild/linux-arm@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.19.2.tgz#c32ae97bc0246664a1cfbdb4a98e7b006d7db8ae" - integrity sha512-Odalh8hICg7SOD7XCj0YLpYCEc+6mkoq63UnExDCiRA2wXEmGlK5JVrW50vZR9Qz4qkvqnHcpH+OFEggO3PgTg== +"@esbuild/linux-arm@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.20.0.tgz#6b586a488e02e9b073a75a957f2952b3b6e87b4c" + integrity sha512-2ezuhdiZw8vuHf1HKSf4TIk80naTbP9At7sOqZmdVwvvMyuoDiZB49YZKLsLOfKIr77+I40dWpHVeY5JHpIEIg== "@esbuild/linux-ia32@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.17.19.tgz#e28c25266b036ce1cabca3c30155222841dc035a" integrity sha512-w4IRhSy1VbsNxHRQpeGCHEmibqdTUx61Vc38APcsRbuVgK0OPEnQ0YD39Brymn96mOx48Y2laBQGqgZ0j9w6SQ== -"@esbuild/linux-ia32@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.19.2.tgz#3fc4f0fa026057fe885e4a180b3956e704f1ceaa" - integrity sha512-mLfp0ziRPOLSTek0Gd9T5B8AtzKAkoZE70fneiiyPlSnUKKI4lp+mGEnQXcQEHLJAcIYDPSyBvsUbKUG2ri/XQ== +"@esbuild/linux-ia32@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.20.0.tgz#84ce7864f762708dcebc1b123898a397dea13624" + integrity sha512-c88wwtfs8tTffPaoJ+SQn3y+lKtgTzyjkD8NgsyCtCmtoIC8RDL7PrJU05an/e9VuAke6eJqGkoMhJK1RY6z4w== "@esbuild/linux-loong64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.17.19.tgz#0f887b8bb3f90658d1a0117283e55dbd4c9dcf72" integrity sha512-2iAngUbBPMq439a+z//gE+9WBldoMp1s5GWsUSgqHLzLJ9WoZLZhpwWuym0u0u/4XmZ3gpHmzV84PonE+9IIdQ== -"@esbuild/linux-loong64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.19.2.tgz#633bcaea443f3505fb0ed109ab840c99ad3451a4" - integrity sha512-hn28+JNDTxxCpnYjdDYVMNTR3SKavyLlCHHkufHV91fkewpIyQchS1d8wSbmXhs1fiYDpNww8KTFlJ1dHsxeSw== +"@esbuild/linux-loong64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.20.0.tgz#1922f571f4cae1958e3ad29439c563f7d4fd9037" + integrity 
sha512-lR2rr/128/6svngnVta6JN4gxSXle/yZEZL3o4XZ6esOqhyR4wsKyfu6qXAL04S4S5CgGfG+GYZnjFd4YiG3Aw== "@esbuild/linux-mips64el@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.17.19.tgz#f5d2a0b8047ea9a5d9f592a178ea054053a70289" integrity sha512-LKJltc4LVdMKHsrFe4MGNPp0hqDFA1Wpt3jE1gEyM3nKUvOiO//9PheZZHfYRfYl6AwdTH4aTcXSqBerX0ml4A== -"@esbuild/linux-mips64el@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.19.2.tgz#e0bff2898c46f52be7d4dbbcca8b887890805823" - integrity sha512-KbXaC0Sejt7vD2fEgPoIKb6nxkfYW9OmFUK9XQE4//PvGIxNIfPk1NmlHmMg6f25x57rpmEFrn1OotASYIAaTg== +"@esbuild/linux-mips64el@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.20.0.tgz#7ca1bd9df3f874d18dbf46af009aebdb881188fe" + integrity sha512-9Sycc+1uUsDnJCelDf6ZNqgZQoK1mJvFtqf2MUz4ujTxGhvCWw+4chYfDLPepMEvVL9PDwn6HrXad5yOrNzIsQ== "@esbuild/linux-ppc64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.17.19.tgz#876590e3acbd9fa7f57a2c7d86f83717dbbac8c7" integrity sha512-/c/DGybs95WXNS8y3Ti/ytqETiW7EU44MEKuCAcpPto3YjQbyK3IQVKfF6nbghD7EcLUGl0NbiL5Rt5DMhn5tg== -"@esbuild/linux-ppc64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.19.2.tgz#d75798da391f54a9674f8c143b9a52d1dbfbfdde" - integrity sha512-dJ0kE8KTqbiHtA3Fc/zn7lCd7pqVr4JcT0JqOnbj4LLzYnp+7h8Qi4yjfq42ZlHfhOCM42rBh0EwHYLL6LEzcw== +"@esbuild/linux-ppc64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.20.0.tgz#8f95baf05f9486343bceeb683703875d698708a4" + integrity sha512-CoWSaaAXOZd+CjbUTdXIJE/t7Oz+4g90A3VBCHLbfuc5yUQU/nFDLOzQsN0cdxgXd97lYW/psIIBdjzQIwTBGw== "@esbuild/linux-riscv64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.17.19.tgz#7f49373df463cd9f41dc34f9b2262d771688bf09" integrity sha512-FC3nUAWhvFoutlhAkgHf8f5HwFWUL6bYdvLc/TTuxKlvLi3+pPzdZiFKSWz/PF30TB1K19SuCxDTI5KcqASJqA== -"@esbuild/linux-riscv64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.19.2.tgz#012409bd489ed1bb9b775541d4a46c5ded8e6dd8" - integrity sha512-7Z/jKNFufZ/bbu4INqqCN6DDlrmOTmdw6D0gH+6Y7auok2r02Ur661qPuXidPOJ+FSgbEeQnnAGgsVynfLuOEw== +"@esbuild/linux-riscv64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.20.0.tgz#ca63b921d5fe315e28610deb0c195e79b1a262ca" + integrity sha512-mlb1hg/eYRJUpv8h/x+4ShgoNLL8wgZ64SUr26KwglTYnwAWjkhR2GpoKftDbPOCnodA9t4Y/b68H4J9XmmPzA== "@esbuild/linux-s390x@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.17.19.tgz#e2afd1afcaf63afe2c7d9ceacd28ec57c77f8829" integrity sha512-IbFsFbxMWLuKEbH+7sTkKzL6NJmG2vRyy6K7JJo55w+8xDk7RElYn6xvXtDW8HCfoKBFK69f3pgBJSUSQPr+4Q== -"@esbuild/linux-s390x@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.19.2.tgz#ece3ed75c5a150de8a5c110f02e97d315761626b" - integrity sha512-U+RinR6aXXABFCcAY4gSlv4CL1oOVvSSCdseQmGO66H+XyuQGZIUdhG56SZaDJQcLmrSfRmx5XZOWyCJPRqS7g== +"@esbuild/linux-s390x@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.20.0.tgz#cb3d069f47dc202f785c997175f2307531371ef8" + integrity 
sha512-fgf9ubb53xSnOBqyvWEY6ukBNRl1mVX1srPNu06B6mNsNK20JfH6xV6jECzrQ69/VMiTLvHMicQR/PgTOgqJUQ== "@esbuild/linux-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.17.19.tgz#8a0e9738b1635f0c53389e515ae83826dec22aa4" integrity sha512-68ngA9lg2H6zkZcyp22tsVt38mlhWde8l3eJLWkyLrp4HwMUr3c1s/M2t7+kHIhvMjglIBrFpncX1SzMckomGw== -"@esbuild/linux-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.19.2.tgz#dea187019741602d57aaf189a80abba261fbd2aa" - integrity sha512-oxzHTEv6VPm3XXNaHPyUTTte+3wGv7qVQtqaZCrgstI16gCuhNOtBXLEBkBREP57YTd68P0VgDgG73jSD8bwXQ== +"@esbuild/linux-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.20.0.tgz#ac617e0dc14e9758d3d7efd70288c14122557dc7" + integrity sha512-H9Eu6MGse++204XZcYsse1yFHmRXEWgadk2N58O/xd50P9EvFMLJTQLg+lB4E1cF2xhLZU5luSWtGTb0l9UeSg== "@esbuild/netbsd-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.17.19.tgz#c29fb2453c6b7ddef9a35e2c18b37bda1ae5c462" integrity sha512-CwFq42rXCR8TYIjIfpXCbRX0rp1jo6cPIUPSaWwzbVI4aOfX96OXY8M6KNmtPcg7QjYeDmN+DD0Wp3LaBOLf4Q== -"@esbuild/netbsd-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.19.2.tgz#bbfd7cf9ab236a23ee3a41b26f0628c57623d92a" - integrity sha512-WNa5zZk1XpTTwMDompZmvQLHszDDDN7lYjEHCUmAGB83Bgs20EMs7ICD+oKeT6xt4phV4NDdSi/8OfjPbSbZfQ== +"@esbuild/netbsd-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.20.0.tgz#6cc778567f1513da6e08060e0aeb41f82eb0f53c" + integrity sha512-lCT675rTN1v8Fo+RGrE5KjSnfY0x9Og4RN7t7lVrN3vMSjy34/+3na0q7RIfWDAj0e0rCh0OL+P88lu3Rt21MQ== "@esbuild/openbsd-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.17.19.tgz#95e75a391403cb10297280d524d66ce04c920691" integrity sha512-cnq5brJYrSZ2CF6c35eCmviIN3k3RczmHz8eYaVlNasVqsNY+JKohZU5MKmaOI+KkllCdzOKKdPs762VCPC20g== -"@esbuild/openbsd-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.19.2.tgz#fa5c4c6ee52a360618f00053652e2902e1d7b4a7" - integrity sha512-S6kI1aT3S++Dedb7vxIuUOb3oAxqxk2Rh5rOXOTYnzN8JzW1VzBd+IqPiSpgitu45042SYD3HCoEyhLKQcDFDw== +"@esbuild/openbsd-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.20.0.tgz#76848bcf76b4372574fb4d06cd0ed1fb29ec0fbe" + integrity sha512-HKoUGXz/TOVXKQ+67NhxyHv+aDSZf44QpWLa3I1lLvAwGq8x1k0T+e2HHSRvxWhfJrFxaaqre1+YyzQ99KixoA== "@esbuild/sunos-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.17.19.tgz#722eaf057b83c2575937d3ffe5aeb16540da7273" integrity sha512-vCRT7yP3zX+bKWFeP/zdS6SqdWB8OIpaRq/mbXQxTGHnIxspRtigpkUcDMlSCOejlHowLqII7K2JKevwyRP2rg== -"@esbuild/sunos-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.19.2.tgz#52a2ac8ac6284c02d25df22bb4cfde26fbddd68d" - integrity sha512-VXSSMsmb+Z8LbsQGcBMiM+fYObDNRm8p7tkUDMPG/g4fhFX5DEFmjxIEa3N8Zr96SjsJ1woAhF0DUnS3MF3ARw== +"@esbuild/sunos-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.20.0.tgz#ea4cd0639bf294ad51bc08ffbb2dac297e9b4706" + integrity sha512-GDwAqgHQm1mVoPppGsoq4WJwT3vhnz/2N62CzhvApFD1eJyTroob30FPpOZabN+FgCjhG+AgcZyOPIkR8dfD7g== "@esbuild/win32-arm64@0.17.19": version "0.17.19" resolved 
"https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.17.19.tgz#9aa9dc074399288bdcdd283443e9aeb6b9552b6f" integrity sha512-yYx+8jwowUstVdorcMdNlzklLYhPxjniHWFKgRqH7IFlUEa0Umu3KuYplf1HUZZ422e3NU9F4LGb+4O0Kdcaag== -"@esbuild/win32-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.19.2.tgz#719ed5870855de8537aef8149694a97d03486804" - integrity sha512-5NayUlSAyb5PQYFAU9x3bHdsqB88RC3aM9lKDAz4X1mo/EchMIT1Q+pSeBXNgkfNmRecLXA0O8xP+x8V+g/LKg== +"@esbuild/win32-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.20.0.tgz#a5c171e4a7f7e4e8be0e9947a65812c1535a7cf0" + integrity sha512-0vYsP8aC4TvMlOQYozoksiaxjlvUcQrac+muDqj1Fxy6jh9l9CZJzj7zmh8JGfiV49cYLTorFLxg7593pGldwQ== "@esbuild/win32-ia32@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.17.19.tgz#95ad43c62ad62485e210f6299c7b2571e48d2b03" integrity sha512-eggDKanJszUtCdlVs0RB+h35wNlb5v4TWEkq4vZcmVt5u/HiDZrTXe2bWFQUez3RgNHwx/x4sk5++4NSSicKkw== -"@esbuild/win32-ia32@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.19.2.tgz#24832223880b0f581962c8660f8fb8797a1e046a" - integrity sha512-47gL/ek1v36iN0wL9L4Q2MFdujR0poLZMJwhO2/N3gA89jgHp4MR8DKCmwYtGNksbfJb9JoTtbkoe6sDhg2QTA== +"@esbuild/win32-ia32@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.20.0.tgz#f8ac5650c412d33ea62d7551e0caf82da52b7f85" + integrity sha512-p98u4rIgfh4gdpV00IqknBD5pC84LCub+4a3MO+zjqvU5MVXOc3hqR2UgT2jI2nh3h8s9EQxmOsVI3tyzv1iFg== "@esbuild/win32-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.17.19.tgz#8cfaf2ff603e9aabb910e9c0558c26cf32744061" integrity sha512-lAhycmKnVOuRYNtRtatQR1LPQf2oYCkRGkSFnseDAKPl8lu5SOsK/e1sXe5a0Pc5kHIHe6P2I/ilntNv2xf3cA== -"@esbuild/win32-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.19.2.tgz#1205014625790c7ff0e471644a878a65d1e34ab0" - integrity sha512-tcuhV7ncXBqbt/Ybf0IyrMcwVOAPDckMK9rXNHtF17UTK18OKLpg08glminN06pt2WCoALhXdLfSPbVvK/6fxw== +"@esbuild/win32-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.20.0.tgz#2efddf82828aac85e64cef62482af61c29561bee" + integrity sha512-NgJnesu1RtWihtTtXGFMU5YSE6JyyHPMxCwBZK7a6/8d31GuSo9l0Ss7w1Jw5QnKUawG6UEehs883kcXf5fYwg== "@eslint-community/eslint-utils@^4.2.0", "@eslint-community/eslint-utils@^4.4.0": version "4.4.0" @@ -2888,33 +2893,34 @@ esbuild-plugin-alias@0.2.1: resolved "https://registry.npmjs.org/esbuild-plugin-alias/-/esbuild-plugin-alias-0.2.1.tgz#45a86cb941e20e7c2bc68a2bea53562172494fcb" integrity sha512-jyfL/pwPqaFXyKnj8lP8iLk6Z0m099uXR45aSN8Av1XD4vhvQutxxPzgA2bTcAwQpa1zCXDcWOlhFgyP3GKqhQ== -esbuild@0.19.2: - version "0.19.2" - resolved "https://registry.npmjs.org/esbuild/-/esbuild-0.19.2.tgz#b1541828a89dfb6f840d38538767c6130dca2aac" - integrity sha512-G6hPax8UbFakEj3hWO0Vs52LQ8k3lnBhxZWomUJDxfz3rZTLqF5k/FCzuNdLx2RbpBiQQF9H9onlDDH1lZsnjg== +esbuild@0.20.0: + version "0.20.0" + resolved "https://registry.yarnpkg.com/esbuild/-/esbuild-0.20.0.tgz#a7170b63447286cd2ff1f01579f09970e6965da4" + integrity sha512-6iwE3Y2RVYCME1jLpBqq7LQWK3MW6vjV2bZy6gt/WrqkY+WE74Spyc0ThAOYpMtITvnjX09CrC6ym7A/m9mebA== optionalDependencies: - "@esbuild/android-arm" "0.19.2" - "@esbuild/android-arm64" "0.19.2" - "@esbuild/android-x64" "0.19.2" - "@esbuild/darwin-arm64" "0.19.2" - 
"@esbuild/darwin-x64" "0.19.2" - "@esbuild/freebsd-arm64" "0.19.2" - "@esbuild/freebsd-x64" "0.19.2" - "@esbuild/linux-arm" "0.19.2" - "@esbuild/linux-arm64" "0.19.2" - "@esbuild/linux-ia32" "0.19.2" - "@esbuild/linux-loong64" "0.19.2" - "@esbuild/linux-mips64el" "0.19.2" - "@esbuild/linux-ppc64" "0.19.2" - "@esbuild/linux-riscv64" "0.19.2" - "@esbuild/linux-s390x" "0.19.2" - "@esbuild/linux-x64" "0.19.2" - "@esbuild/netbsd-x64" "0.19.2" - "@esbuild/openbsd-x64" "0.19.2" - "@esbuild/sunos-x64" "0.19.2" - "@esbuild/win32-arm64" "0.19.2" - "@esbuild/win32-ia32" "0.19.2" - "@esbuild/win32-x64" "0.19.2" + "@esbuild/aix-ppc64" "0.20.0" + "@esbuild/android-arm" "0.20.0" + "@esbuild/android-arm64" "0.20.0" + "@esbuild/android-x64" "0.20.0" + "@esbuild/darwin-arm64" "0.20.0" + "@esbuild/darwin-x64" "0.20.0" + "@esbuild/freebsd-arm64" "0.20.0" + "@esbuild/freebsd-x64" "0.20.0" + "@esbuild/linux-arm" "0.20.0" + "@esbuild/linux-arm64" "0.20.0" + "@esbuild/linux-ia32" "0.20.0" + "@esbuild/linux-loong64" "0.20.0" + "@esbuild/linux-mips64el" "0.20.0" + "@esbuild/linux-ppc64" "0.20.0" + "@esbuild/linux-riscv64" "0.20.0" + "@esbuild/linux-s390x" "0.20.0" + "@esbuild/linux-x64" "0.20.0" + "@esbuild/netbsd-x64" "0.20.0" + "@esbuild/openbsd-x64" "0.20.0" + "@esbuild/sunos-x64" "0.20.0" + "@esbuild/win32-arm64" "0.20.0" + "@esbuild/win32-ia32" "0.20.0" + "@esbuild/win32-x64" "0.20.0" esbuild@^0.17.11: version "0.17.19" @@ -5961,10 +5967,10 @@ redent@^4.0.0: indent-string "^5.0.0" strip-indent "^4.0.0" -regenerator-runtime@0.14.0: - version "0.14.0" - resolved "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.0.tgz#5e19d68eb12d486f797e15a3c6a918f7cec5eb45" - integrity sha512-srw17NI0TUWHuGa5CFGGmhfNIeja30WMBfbslPNhf6JrqQlLN5gcrvig1oqPxiVaXb0oW0XRKtH6Nngs5lKCIA== +regenerator-runtime@0.14.1: + version "0.14.1" + resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f" + integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw== regex-not@^1.0.0, regex-not@^1.0.2: version "1.0.2" diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9da94885ec6b2..2ee97ddb662e5 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -163,10 +163,11 @@ def print_entry(label, value): time32, time64, timestamp, date32, date64, duration, month_day_nano_interval, float16, float32, float64, - binary, string, utf8, + binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, map_, struct, + list_, large_list, list_view, large_list_view, + map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, @@ -174,8 +175,9 @@ def print_entry(label, value): field, type_for_alias, DataType, DictionaryType, StructType, - ListType, LargeListType, MapType, FixedSizeListType, - UnionType, SparseUnionType, DenseUnionType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, @@ -201,10 +203,12 @@ def print_entry(label, value): Int32Array, UInt32Array, Int64Array, UInt64Array, HalfFloatArray, FloatArray, DoubleArray, - ListArray, LargeListArray, MapArray, - FixedSizeListArray, UnionArray, + ListArray, LargeListArray, FixedSizeListArray, + ListViewArray, 
LargeListViewArray, + MapArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, + BinaryViewArray, StringViewArray, FixedSizeBinaryArray, DictionaryArray, Date32Array, Date64Array, TimestampArray, @@ -219,12 +223,13 @@ def print_entry(label, value): HalfFloatScalar, FloatScalar, DoubleScalar, Decimal128Scalar, Decimal256Scalar, ListScalar, LargeListScalar, FixedSizeListScalar, + ListViewScalar, LargeListViewScalar, Date32Scalar, Date64Scalar, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, MonthDayNanoIntervalScalar, - BinaryScalar, LargeBinaryScalar, - StringScalar, LargeStringScalar, + BinaryScalar, LargeBinaryScalar, BinaryViewScalar, + StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, ExtensionScalar) diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index a2ff045f256ac..67ee7590560f0 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -2013,8 +2013,9 @@ cdef CStatus _data_stream_next(void* self, CFlightPayload* payload) except *: max_attempts = 128 for _ in range(max_attempts): if stream.current_stream != nullptr: - check_flight_status( - stream.current_stream.get().Next().Value(payload)) + with nogil: + check_flight_status( + stream.current_stream.get().Next().Value(payload)) # If the stream ended, see if there's another stream from the # generator if payload.ipc_message.metadata != nullptr: diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 13b8c748cb8ca..f5bab99a49f7a 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -245,6 +245,11 @@ cdef class S3FileSystem(FileSystem): retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3) The retry strategy to use with S3; fail after max_attempts. Available strategies are AwsStandardS3RetryStrategy, AwsDefaultS3RetryStrategy. + force_virtual_addressing : bool, default False + Whether to use virtual addressing of buckets. + If true, then virtual addressing is always enabled. + If false, then virtual addressing is only enabled if `endpoint_override` is empty. + This can be used for non-AWS backends that only support virtual hosted-style access. 
Examples -------- @@ -268,7 +273,9 @@ cdef class S3FileSystem(FileSystem): role_arn=None, session_name=None, external_id=None, load_frequency=900, proxy_options=None, allow_bucket_creation=False, allow_bucket_deletion=False, - retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3)): + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy( + max_attempts=3), + force_virtual_addressing=False): cdef: optional[CS3Options] options shared_ptr[CS3FileSystem] wrapped @@ -380,6 +387,7 @@ cdef class S3FileSystem(FileSystem): options.value().allow_bucket_creation = allow_bucket_creation options.value().allow_bucket_deletion = allow_bucket_deletion + options.value().force_virtual_addressing = force_virtual_addressing if isinstance(retry_strategy, AwsStandardS3RetryStrategy): options.value().retry_strategy = CS3RetryStrategy.GetAwsStandardRetryStrategy( @@ -447,6 +455,7 @@ cdef class S3FileSystem(FileSystem): opts.proxy_options.username), 'password': frombytes( opts.proxy_options.password)}, + force_virtual_addressing=opts.force_virtual_addressing, ),) ) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1416f5f4346d9..ad01d45571ba1 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2460,6 +2460,578 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) +cdef class ListViewArray(Array): + """ + Concrete class for Arrow arrays of a list view data type. + """ + + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _sizes = asarray(sizes, type='int32') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. 
+ + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + def flatten(self, memory_pool=None): + """ + Unnest this ListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + memory_pool : MemoryPool, optional + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + +cdef class LargeListViewArray(Array): + """ + Concrete class for Arrow arrays of a large list view data type. + + Identical to ListViewArray, but with 64-bit offsets. + """ + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int64') + _sizes = asarray(sizes, type='int64') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CLargeListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list view sizes as an int64 array. 
+
+        The returned array will not have a validity bitmap, so you cannot
+        expect to pass it to `LargeListViewArray.from_arrays` and get back the
+        same list array if the original one has nulls.
+
+        Returns
+        -------
+        sizes : Int64Array
+
+        Examples
+        --------
+
+        >>> import pyarrow as pa
+        >>> values = [1, 2, None, 3, 4]
+        >>> offsets = [0, 0, 1]
+        >>> sizes = [2, 0, 4]
+        >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values)
+        >>> array.sizes
+        <pyarrow.lib.Int64Array object at ...>
+        [
+          2,
+          0,
+          4
+        ]
+        """
+        return pyarrow_wrap_array((<CLargeListViewArray*> self.ap).sizes())
+
+    def flatten(self, memory_pool=None):
+        """
+        Unnest this LargeListViewArray by one level.
+
+        The returned Array is logically a concatenation of all the sub-lists
+        in this Array.
+
+        Note that this method is different from ``self.values`` in that
+        it takes care of the slicing offset as well as null elements backed
+        by non-empty sub-lists.
+
+        Parameters
+        ----------
+        memory_pool : MemoryPool, optional
+
+        Returns
+        -------
+        result : Array
+
+        Examples
+        --------
+
+        >>> import pyarrow as pa
+        >>> values = [1, 2, 3, 4]
+        >>> offsets = [2, 1, 0]
+        >>> sizes = [2, 2, 2]
+        >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values)
+        >>> array
+        <pyarrow.lib.LargeListViewArray object at ...>
+        [
+          [
+            3,
+            4
+          ],
+          [
+            2,
+            3
+          ],
+          [
+            1,
+            2
+          ]
+        ]
+        >>> array.flatten()
+        <pyarrow.lib.Int64Array object at ...>
+        [
+          3,
+          4,
+          2,
+          3,
+          1,
+          2
+        ]
+        """
+        cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool)
+        with nogil:
+            out = GetResultValue((<CLargeListViewArray*> self.ap).Flatten(cpool))
+        cdef Array result = pyarrow_wrap_array(out)
+        result.validate()
+        return result
+
+
 cdef class MapArray(ListArray):
     """
     Concrete class for Arrow arrays of a map data type.
@@ -2942,6 +3514,12 @@ cdef class LargeStringArray(Array):
                              null_count, offset)
 
 
+cdef class StringViewArray(Array):
+    """
+    Concrete class for Arrow arrays of string (or utf8) view data type.
+    """
+
+
 cdef class BinaryArray(Array):
     """
     Concrete class for Arrow arrays of variable-sized binary data type.
@@ -2968,6 +3546,12 @@ cdef class LargeBinaryArray(Array):
         return (<CLargeBinaryArray*> self.ap).total_values_length()
 
 
+cdef class BinaryViewArray(Array):
+    """
+    Concrete class for Arrow arrays of variable-sized binary view data type.
+    """
+
+
 cdef class DictionaryArray(Array):
     """
     Concrete class for dictionary-encoded Arrow arrays.
@@ -3529,7 +4113,7 @@ cdef class ExtensionArray(Array):
         return result
 
 
-class FixedShapeTensorArray(ExtensionArray):
+cdef class FixedShapeTensorArray(ExtensionArray):
     """
     Concrete class for fixed shape tensor extension arrays.
 
@@ -3570,17 +4154,48 @@ class FixedShapeTensorArray(ExtensionArray):
 
     def to_numpy_ndarray(self):
         """
-        Convert fixed shape tensor extension array to a numpy array (with dim+1).
+        Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray.
+
+        The resulting ndarray will have (ndim + 1) dimensions.
+        The size of the first dimension will be the length of the fixed shape tensor array
+        and the rest of the dimensions will match the permuted shape of the fixed
+        shape tensor.
 
-        Note: ``permutation`` should be trivial (``None`` or ``[0, 1, ..., len(shape)-1]``).
+        The conversion is zero-copy.
+
+        Returns
+        -------
+        numpy.ndarray
+            Ndarray representing tensors in the fixed shape tensor array concatenated
+            along the first dimension.
+        """
+
+        return self.to_tensor().to_numpy()
+
+    def to_tensor(self):
+        """
+        Convert fixed shape tensor extension array to a pyarrow.Tensor.
+
+        The resulting Tensor will have (ndim + 1) dimensions.
+ The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. """ - if self.type.permutation is None or self.type.permutation == list(range(len(self.type.shape))): - np_flat = np.asarray(self.storage.flatten()) - numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape)) - return numpy_tensor - else: - raise ValueError( - 'Only non-permuted tensors can be converted to numpy tensors.') + + cdef: + CFixedShapeTensorArray* ext_array = (self.ap) + CResult[shared_ptr[CTensor]] ctensor + with nogil: + ctensor = ext_array.ToTensor() + return pyarrow_wrap_tensor(GetResultValue(ctensor)) @staticmethod def from_numpy_ndarray(obj): @@ -3588,9 +4203,7 @@ class FixedShapeTensorArray(ExtensionArray): Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. The first dimension of ndarray will become the length of the fixed shape tensor array. - - Numpy array needs to be C-contiguous in memory - (``obj.flags["C_CONTIGUOUS"]==True``). + If input array data is not contiguous a copy will be made. Parameters ---------- @@ -3624,17 +4237,25 @@ class FixedShapeTensorArray(ExtensionArray): ] ] """ - if not obj.flags["C_CONTIGUOUS"]: - raise ValueError('The data in the numpy array need to be in a single, ' - 'C-style contiguous segment.') + + if len(obj.shape) < 2: + raise ValueError( + "Cannot convert 1D array or scalar to fixed shape tensor array") + if np.prod(obj.shape) == 0: + raise ValueError("Expected a non-empty ndarray") + + permutation = (-np.array(obj.strides)).argsort(kind='stable') + if permutation[0] != 0: + raise ValueError('First stride needs to be largest to ensure that ' + 'individual tensor data is contiguous in memory.') arrow_type = from_numpy_dtype(obj.dtype) - shape = obj.shape[1:] - size = obj.size / obj.shape[0] + shape = np.take(obj.shape, permutation) + values = np.ravel(obj, order="K") return ExtensionArray.from_storage( - fixed_shape_tensor(arrow_type, shape), - FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size) + fixed_shape_tensor(arrow_type, shape[1:], permutation=permutation[1:] - 1), + FixedSizeListArray.from_arrays(values, shape[1:].prod()) ) @@ -3661,6 +4282,8 @@ cdef dict _array_classes = { _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, _Type_LARGE_LIST: LargeListArray, + _Type_LIST_VIEW: ListViewArray, + _Type_LARGE_LIST_VIEW: LargeListViewArray, _Type_MAP: MapArray, _Type_FIXED_SIZE_LIST: FixedSizeListArray, _Type_SPARSE_UNION: UnionArray, @@ -3669,6 +4292,8 @@ cdef dict _array_classes = { _Type_STRING: StringArray, _Type_LARGE_BINARY: LargeBinaryArray, _Type_LARGE_STRING: LargeStringArray, + _Type_BINARY_VIEW: BinaryViewArray, + _Type_STRING_VIEW: StringViewArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, _Type_DECIMAL128: Decimal128Array, diff --git a/python/pyarrow/builder.pxi b/python/pyarrow/builder.pxi index a34ea5412e14a..2af39e2c589e6 100644 --- a/python/pyarrow/builder.pxi +++ b/python/pyarrow/builder.pxi @@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable): def __len__(self): return self.builder.get().length() + + +cdef class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. 
+ + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ + cdef: + unique_ptr[CStringViewBuilder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CStringViewBuilder(pool)) + + def append(self, value): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + if value is None or value is np.nan: + self.builder.get().AppendNull() + elif isinstance(value, (bytes, str)): + self.builder.get().Append(tobytes(value)) + else: + raise TypeError('StringViewBuilder only accepts string objects') + + def append_values(self, values): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + for value in values: + self.append(value) + + def finish(self): + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 74e92594b04e5..8056d99354965 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -126,10 +126,14 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LARGE_BINARY" arrow::Type::LARGE_BINARY" _Type_LARGE_STRING" arrow::Type::LARGE_STRING" _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY" + _Type_BINARY_VIEW" arrow::Type::BINARY_VIEW" + _Type_STRING_VIEW" arrow::Type::STRING_VIEW" _Type_LIST" arrow::Type::LIST" _Type_LARGE_LIST" arrow::Type::LARGE_LIST" _Type_FIXED_SIZE_LIST" arrow::Type::FIXED_SIZE_LIST" + _Type_LIST_VIEW" arrow::Type::LIST_VIEW" + _Type_LARGE_LIST_VIEW" arrow::Type::LARGE_LIST_VIEW" _Type_STRUCT" arrow::Type::STRUCT" _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION" _Type_DENSE_UNION" arrow::Type::DENSE_UNION" @@ -364,6 +368,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] value_type() shared_ptr[CField] value_field() + cdef cppclass CListViewType" arrow::ListViewType"(CDataType): + CListViewType(const shared_ptr[CDataType]& value_type) + CListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + + cdef cppclass CLargeListViewType" arrow::LargeListViewType"(CDataType): + CLargeListViewType(const shared_ptr[CDataType]& value_type) + CLargeListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + cdef cppclass CMapType" arrow::MapType"(CDataType): CMapType(const shared_ptr[CField]& key_field, const shared_ptr[CField]& item_field, c_bool keys_sorted) @@ -483,6 +499,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] run_end_type, shared_ptr[CDataType] value_type) + cdef shared_ptr[CDataType] CMakeListViewType" arrow::list_view"( + shared_ptr[CField] value_type) + + cdef shared_ptr[CDataType] 
CMakeLargeListViewType" arrow::large_list_view"( + shared_ptr[CField] value_type) + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -688,6 +710,70 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CListViewArray" arrow::ListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + const int32_t* raw_value_offsets() + const int32_t* raw_value_sizes() + int32_t value_offset(int i) + int32_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + + cdef cppclass CLargeListViewArray" arrow::LargeListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + int64_t value_offset(int i) + int64_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + cdef cppclass CMapArray" arrow::MapArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( @@ -1148,6 +1234,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CListScalar" arrow::ListScalar"(CBaseListScalar): pass + cdef cppclass CListViewScalar" arrow::ListViewScalar"(CBaseListScalar): + pass + + cdef cppclass CLargeListViewScalar" arrow::LargeListViewScalar"(CBaseListScalar): + pass + cdef cppclass CMapScalar" arrow::MapScalar"(CListScalar): pass @@ -1295,7 +1387,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil: cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder): CStringBuilder(CMemoryPool* pool) + CStatus Append(const c_string& value) + + cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder): + CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool) + CStatus Append(const char* value, int32_t length) + cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder): + CStringViewBuilder(CMemoryPool* pool) CStatus Append(const c_string& value) cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder): @@ -2695,26 +2794,26 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() -cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": +cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): + CResult[shared_ptr[CTensor]] MakeTensor(const shared_ptr[CExtensionScalar]& 
scalar) const + @staticmethod CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, const vector[int64_t]& shape, const vector[int64_t]& permutation, const vector[c_string]& dim_names) - CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, - const c_string& serialized_data) const - - c_string Serialize() const - const shared_ptr[CDataType] value_type() const vector[int64_t] shape() const vector[int64_t] permutation() const vector[c_string] dim_names() + cdef cppclass CFixedShapeTensorArray \ + " arrow::extension::FixedShapeTensorArray"(CExtensionArray): + const CResult[shared_ptr[CTensor]] ToTensor() const cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index cb30f4e750eff..7876fb0f96671 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -167,6 +167,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_bool background_writes c_bool allow_bucket_creation c_bool allow_bucket_deletion + c_bool force_virtual_addressing shared_ptr[const CKeyValueMetadata] default_metadata c_string role_arn c_string session_name diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 58ec34addbc0a..48350212c2076 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -120,6 +120,16 @@ cdef class LargeListType(DataType): const CLargeListType* list_type +cdef class ListViewType(DataType): + cdef: + const CListViewType* list_view_type + + +cdef class LargeListViewType(DataType): + cdef: + const CLargeListViewType* list_view_type + + cdef class MapType(DataType): cdef: const CMapType* map_type @@ -425,6 +435,14 @@ cdef class LargeListArray(BaseListArray): pass +cdef class ListViewArray(Array): + pass + + +cdef class LargeListViewArray(Array): + pass + + cdef class MapArray(ListArray): pass @@ -445,6 +463,14 @@ cdef class BinaryArray(Array): pass +cdef class StringViewArray(Array): + pass + + +cdef class BinaryViewArray(Array): + pass + + cdef class DictionaryArray(Array): cdef: object _indices, _dictionary diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 29a0bed55949c..3245e50f0fe69 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -106,8 +106,12 @@ Type_STRING = _Type_STRING Type_LARGE_BINARY = _Type_LARGE_BINARY Type_LARGE_STRING = _Type_LARGE_STRING Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY +Type_BINARY_VIEW = _Type_BINARY_VIEW +Type_STRING_VIEW = _Type_STRING_VIEW Type_LIST = _Type_LIST Type_LARGE_LIST = _Type_LARGE_LIST +Type_LIST_VIEW = _Type_LIST_VIEW +Type_LARGE_LIST_VIEW = _Type_LARGE_LIST_VIEW Type_MAP = _Type_MAP Type_FIXED_SIZE_LIST = _Type_FIXED_SIZE_LIST Type_STRUCT = _Type_STRUCT diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 72e16f2cec387..966273b4bea84 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -87,6 +87,10 @@ cdef api object pyarrow_wrap_data_type( out = ListType.__new__(ListType) elif type.get().id() == _Type_LARGE_LIST: out = LargeListType.__new__(LargeListType) + elif type.get().id() == _Type_LIST_VIEW: + out = ListViewType.__new__(ListViewType) + elif type.get().id() == _Type_LARGE_LIST_VIEW: + out = LargeListViewType.__new__(LargeListViewType) elif type.get().id() == _Type_MAP: out = MapType.__new__(MapType) elif type.get().id() == _Type_FIXED_SIZE_LIST: diff 
--git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 9a66dc81226d4..41bfde39adb6f 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar): pass +cdef class BinaryViewScalar(BinaryScalar): + pass + + +cdef class StringViewScalar(StringScalar): + pass + + cdef class ListScalar(Scalar): """ Concrete class for list-like scalars. @@ -712,6 +720,14 @@ cdef class LargeListScalar(ListScalar): pass +cdef class ListViewScalar(ListScalar): + pass + + +cdef class LargeListViewScalar(ListScalar): + pass + + cdef class StructScalar(Scalar, collections.abc.Mapping): """ Concrete class for struct scalars. @@ -1027,6 +1043,48 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +cdef class FixedShapeTensorScalar(ExtensionScalar): + """ + Concrete class for fixed shape tensor extension scalar. + """ + + def to_numpy(self): + """ + Convert fixed shape tensor scalar to a numpy.ndarray. + + The resulting ndarray's shape matches the permuted shape of the + fixed shape tensor scalar. + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + """ + return self.to_tensor().to_numpy() + + def to_tensor(self): + """ + Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape + and strides derived from corresponding FixedShapeTensorType. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor represented stored in FixedShapeTensorScalar. + """ + cdef: + CFixedShapeTensorType* c_type = static_pointer_cast[CFixedShapeTensorType, CDataType]( + self.wrapped.get().type).get() + shared_ptr[CExtensionScalar] scalar = static_pointer_cast[CExtensionScalar, CScalar](self.wrapped) + shared_ptr[CTensor] ctensor + + with nogil: + ctensor = GetResultValue(c_type.MakeTensor(scalar)) + return pyarrow_wrap_tensor(ctensor) + + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, @@ -1051,11 +1109,15 @@ cdef dict _scalar_classes = { _Type_BINARY: BinaryScalar, _Type_LARGE_BINARY: LargeBinaryScalar, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar, + _Type_BINARY_VIEW: BinaryViewScalar, _Type_STRING: StringScalar, _Type_LARGE_STRING: LargeStringScalar, + _Type_STRING_VIEW: StringViewScalar, _Type_LIST: ListScalar, _Type_LARGE_LIST: LargeListScalar, _Type_FIXED_SIZE_LIST: FixedSizeListScalar, + _Type_LIST_VIEW: ListViewScalar, + _Type_LARGE_LIST_VIEW: LargeListViewScalar, _Type_STRUCT: StructScalar, _Type_MAP: MapScalar, _Type_DICTIONARY: DictionaryScalar, diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc index c266abc169d49..2c86c86a919be 100644 --- a/python/pyarrow/src/arrow/python/helpers.cc +++ b/python/pyarrow/src/arrow/python/helpers.cc @@ -63,6 +63,8 @@ std::shared_ptr GetPrimitiveType(Type::type type) { GET_PRIMITIVE_TYPE(STRING, utf8); GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary); GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8); + GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view); + GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view); GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval); default: return nullptr; diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index d1d94ac17a13e..3c4d59d6594a2 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -486,6 +486,10 @@ class PyValue { return view.ParseString(obj); } + static Status Convert(const 
BinaryViewType*, const O&, I obj, PyBytesView& view) { + return view.ParseString(obj); + } + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { ARROW_RETURN_NOT_OK(view.ParseString(obj)); @@ -499,8 +503,8 @@ class PyValue { } template - static enable_if_string Convert(const T*, const O& options, I obj, - PyBytesView& view) { + static enable_if_t::value || is_string_view_type::value, Status> + Convert(const T*, const O& options, I obj, PyBytesView& view) { if (options.strict) { // Strict conversion, force output to be unicode / utf8 and validate that // any binary values are utf8 @@ -570,18 +574,12 @@ struct PyConverterTrait; template struct PyConverterTrait< - T, - enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value && !is_binary_view_like_type::value) || - std::is_same::value>> { + T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; -template -struct PyConverterTrait> { - // not implemented -}; - template struct PyConverterTrait> { using type = PyListConverter; @@ -699,11 +697,22 @@ class PyPrimitiveConverter:: PyBytesView view_; }; +template +struct OffsetTypeTrait { + using type = typename T::offset_type; +}; + +template +struct OffsetTypeTrait> { + using type = int64_t; +}; + template -class PyPrimitiveConverter> +class PyPrimitiveConverter< + T, enable_if_t::value || is_binary_view_like_type::value>> : public PrimitiveConverter { public: - using OffsetType = typename T::offset_type; + using OffsetType = typename OffsetTypeTrait::type; Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 3c450d61a7659..abda784fb7c18 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -4172,6 +4172,8 @@ cdef class Table(_Tabular): reader.reset(new TableBatchReader(deref(self.table))) if max_chunksize is not None: + if not max_chunksize > 0: + raise ValueError("'max_chunksize' should be strictly positive") c_max_chunksize = max_chunksize reader.get().set_chunksize(c_max_chunksize) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index a5941e8c8d1a8..0da757a4bc56e 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -24,7 +24,6 @@ import urllib.request import pytest -from pytest_lazyfixture import lazy_fixture import hypothesis as h from ..conftest import groups, defaults @@ -259,13 +258,13 @@ def gcs_server(): @pytest.fixture( params=[ - lazy_fixture('builtin_pickle'), - lazy_fixture('cloudpickle') + 'builtin_pickle', + 'cloudpickle' ], scope='session' ) def pickle_module(request): - return request.param + return request.getfixturevalue(request.param) @pytest.fixture(scope='session') diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f851d4e0b6c29..bd9ae214b041e 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3573,3 +3573,74 @@ def test_run_end_encoded_from_buffers(): with pytest.raises(ValueError): pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers, 1, offset, children) + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_from_arrays(list_array_type): + # test in order offsets, similar to ListArray representation + values = [1, 2, 3, 4, 5, 6, None, 7] + offsets = 
[0, 2, 4, 6] + sizes = [2, 2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[1, 2], [3, 4], [5, 6], [None, 7]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test out of order offsets with overlapping values + values = [1, 2, 3, 4] + offsets = [2, 1, 0] + sizes = [2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[3, 4], [2, 3], [1, 2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test null offsets and empty list values + values = [] + offsets = [0, None] + sizes = [0, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[], None] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == [0, 0] + assert array.sizes.to_pylist() == sizes + + # test null sizes and empty list values + values = [] + offsets = [0, 0] + sizes = [None, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [None, []] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == [0, 0] + + # test null bitmask + values = [1, 2] + offsets = [0, 0, 1] + sizes = [1, 0, 1] + mask = pa.array([False, True, False]) + array = list_array_type.from_arrays(offsets, sizes, values, mask=mask) + + assert array.to_pylist() == [[1], None, [2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_flatten(list_array_type): + values = [1, 2, 3, 4] + offsets = [3, 2, 1, 0] + sizes = [1, 1, 1, 1] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.flatten().to_pylist() == [4, 3, 2, 1] diff --git a/python/pyarrow/tests/test_builder.py b/python/pyarrow/tests/test_builder.py index 50d801026b7d8..abc8a0013df37 100644 --- a/python/pyarrow/tests/test_builder.py +++ b/python/pyarrow/tests/test_builder.py @@ -20,7 +20,7 @@ import numpy as np import pyarrow as pa -from pyarrow.lib import StringBuilder +from pyarrow.lib import StringBuilder, StringViewBuilder def test_weakref(): @@ -65,3 +65,22 @@ def test_string_builder_append_after_finish(): sbuilder.append("No effect") expected = [None, None, "text", None, "other text"] assert arr.to_pylist() == expected + + +def test_string_view_builder(): + builder = StringViewBuilder() + builder.append(b"a byte string") + builder.append("a string") + builder.append("a longer not-inlined string") + builder.append(np.nan) + builder.append_values([None, "text"]) + assert len(builder) == 6 + assert builder.null_count == 2 + arr = builder.finish() + assert isinstance(arr, pa.Array) + assert arr.null_count == 2 + assert arr.type == 'string_view' + expected = [ + "a byte string", "a string", "a longer not-inlined string", None, None, "text" + ] + assert arr.to_pylist() == expected diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 49c4f1a6e79d6..55ea28f50fbb3 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -763,6 +763,16 @@ def test_sequence_unicode(): assert arr.to_pylist() == data +@pytest.mark.parametrize("ty", 
[pa.string(), pa.large_string(), pa.string_view()]) +def test_sequence_unicode_explicit_type(ty): + data = ['foo', 'bar', None, 'mañana'] + arr = pa.array(data, type=ty) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == ty + assert arr.to_pylist() == data + + def check_array_mixed_unicode_bytes(binary_type, string_type): values = ['qux', b'foo', bytearray(b'barz')] b_values = [b'qux', b'foo', b'barz'] @@ -787,6 +797,7 @@ def check_array_mixed_unicode_bytes(binary_type, string_type): def test_array_mixed_unicode_bytes(): check_array_mixed_unicode_bytes(pa.binary(), pa.string()) check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string()) + check_array_mixed_unicode_bytes(pa.binary_view(), pa.string_view()) @pytest.mark.large_memory @@ -818,7 +829,7 @@ def test_large_binary_value(ty): @pytest.mark.large_memory -@pytest.mark.parametrize("ty", [pa.binary(), pa.string()]) +@pytest.mark.parametrize("ty", [pa.binary(), pa.string(), pa.string_view()]) def test_string_too_large(ty): # Construct a binary array with a single value larger than 4GB s = b"0123456789abcdefghijklmnopqrstuvwxyz" @@ -836,7 +847,7 @@ def test_sequence_bytes(): u1.decode('utf-8'), # unicode gets encoded, bytearray(b'bar'), None] - for ty in [None, pa.binary(), pa.large_binary()]: + for ty in [None, pa.binary(), pa.large_binary(), pa.binary_view()]: arr = pa.array(data, type=ty) assert len(arr) == 6 assert arr.null_count == 1 @@ -844,7 +855,7 @@ def test_sequence_bytes(): assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None] -@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()]) +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()]) def test_sequence_utf8_to_unicode(ty): # ARROW-1225 data = [b'foo', None, b'bar'] @@ -2431,6 +2442,8 @@ def test_array_from_pylist_offset_overflow(): pa.binary(3)), ([b"a"], [pa.scalar("a", type=pa.large_binary())], pa.large_binary()), (["a"], [pa.scalar("a", type=pa.large_string())], pa.large_string()), + ([b"a"], [pa.scalar("a", type=pa.binary_view())], pa.binary_view()), + (["a"], [pa.scalar("a", type=pa.string_view())], pa.string_view()), ( ["a"], [pa.scalar("a", type=pa.dictionary(pa.int64(), pa.string()))], diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 31f24187e3b37..bc1dd8a09a768 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -667,6 +667,31 @@ def row_num(x): 'b': ["e", "j"], } + def test_chunker_out_of_sync(self): + # GH-39892: if there are newlines in values, the parser may become + # out of sync with the chunker. In this case, we try to produce an + # informative error message. + rows = b"""a,b,c\nd,e,"f\n"\ng,h,i\n""" + expected = { + 'a': ["d", "g"], + 'b': ["e", "h"], + 'c': ["f\n", "i"], + } + for block_size in range(8, 15): + # Sanity check: parsing works with newlines_in_values=True + d = self.read_bytes( + rows, parse_options=ParseOptions(newlines_in_values=True), + read_options=ReadOptions(block_size=block_size)).to_pydict() + assert d == expected + # With these block sizes, a block would end on the physical newline + # inside the quoted cell value, leading to a mismatch between + # CSV chunker and parser. 
+ for block_size in range(8, 11): + with pytest.raises(ValueError, + match="cell values spanning multiple lines"): + self.read_bytes( + rows, read_options=ReadOptions(block_size=block_size)) + class BaseCSVTableRead(BaseTestCSV): diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a4838d63a6b0b..a9054f0b174aa 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -100,7 +100,6 @@ def assert_dataset_fragment_convenience_methods(dataset): @pytest.fixture -@pytest.mark.parquet def mockfs(): mockfs = fs._MockFileSystem() @@ -221,7 +220,6 @@ def multisourcefs(request): @pytest.fixture -@pytest.mark.parquet def dataset(mockfs): format = ds.ParquetFileFormat() selector = fs.FileSelector('subdir', recursive=True) @@ -2692,7 +2690,6 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module) @pytest.fixture -@pytest.mark.parquet def s3_example_simple(s3_server): from pyarrow.fs import FileSystem diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index a88e20eefe098..fe38bf651baae 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1318,39 +1318,120 @@ def test_tensor_type(): assert tensor_type.permutation is None -def test_tensor_class_methods(): - tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) - storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], - pa.list_(pa.float32(), 6)) +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_class_methods(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3]) + storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) expected = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr.to_numpy_ndarray() + [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type) + np.testing.assert_array_equal(arr.to_tensor(), expected) + np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) + + expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type) + result = arr[1:].to_numpy_ndarray() np.testing.assert_array_equal(result, expected) - expected = np.array([[[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr[:1].to_numpy_ndarray() + values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] + flat_arr = np.array(values[0], dtype=value_type) + bw = value_type.itemsize + storage = pa.array(values, pa.list_(arrow_type, 12)) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = np.array( + [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + result = flat_arr.reshape(1, 2, 3, 2) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) np.testing.assert_array_equal(result, expected) - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="C") + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw 
* 12, bw * 6, bw, bw * 3)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 3, 2, 2), + strides=(bw * 12, bw, bw * 6, bw * 2)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + assert result.type.permutation == [2, 0, 1] + assert result.type.shape == [2, 2, 3] + assert result.to_tensor().shape == (1, 3, 2, 2) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + + +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_array_from_numpy(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + dtype=value_type, order="C") tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) - assert tensor_array_from_numpy.type.value_type == pa.float32() + assert tensor_array_from_numpy.type.value_type == arrow_type assert tensor_array_from_numpy.type.shape == [2, 3] - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="F") - with pytest.raises(ValueError, match="C-style contiguous segment"): + arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], + dtype=value_type, order="F") + with pytest.raises(ValueError, match="First stride needs to be largest"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]) - storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) - arr = pa.ExtensionArray.from_storage(tensor_type, storage) - with pytest.raises(ValueError, match="non-permuted tensors"): - arr.to_numpy_ndarray() + flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + bw = value_type.itemsize + + arr = flat_arr.reshape(1, 3, 4) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + assert tensor_array_from_numpy.type.shape == [3, 4] + assert tensor_array_from_numpy.type.permutation == [0, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + assert tensor_array_from_numpy.type.shape == [2, 2, 3] + assert tensor_array_from_numpy.type.permutation == [0, 2, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = flat_arr.reshape(1, 2, 3, 2) + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=value_type) + expected = arr[1:] + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() + np.testing.assert_array_equal(result, expected) + + arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + arr = np.array(1, dtype=value_type) + with 
pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + arr = np.array([], dtype=value_type) + + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0))) + + with pytest.raises(ValueError, match="Expected a non-empty ndarray"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0, 3, 2))) + + with pytest.raises(ValueError, match="Expected a non-empty ndarray"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((3, 0, 2))) @pytest.mark.parametrize("tensor_type", ( @@ -1485,10 +1566,7 @@ def test_legacy_int_type(): batch = pa.RecordBatch.from_arrays([ext_arr], names=['ext']) buf = ipc_write_batch(batch) - with pytest.warns( - RuntimeWarning, - match="pickle-based deserialization of pyarrow.PyExtensionType " - "subclasses is disabled by default"): + with pytest.warns((RuntimeWarning, FutureWarning)): batch = ipc_read_batch(buf) assert isinstance(batch.column(0).type, pa.UnknownExtensionType) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index d0fa253e314e9..6ba5137e4f63e 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -362,79 +362,79 @@ def py_fsspec_s3fs(request, s3_server): @pytest.fixture(params=[ pytest.param( - pytest.lazy_fixture('localfs'), + 'localfs', id='LocalFileSystem()' ), pytest.param( - pytest.lazy_fixture('localfs_with_mmap'), + 'localfs_with_mmap', id='LocalFileSystem(use_mmap=True)' ), pytest.param( - pytest.lazy_fixture('subtree_localfs'), + 'subtree_localfs', id='SubTreeFileSystem(LocalFileSystem())' ), pytest.param( - pytest.lazy_fixture('s3fs'), + 's3fs', id='S3FileSystem', marks=pytest.mark.s3 ), pytest.param( - pytest.lazy_fixture('gcsfs'), + 'gcsfs', id='GcsFileSystem', marks=pytest.mark.gcs ), pytest.param( - pytest.lazy_fixture('hdfs'), + 'hdfs', id='HadoopFileSystem', marks=pytest.mark.hdfs ), pytest.param( - pytest.lazy_fixture('mockfs'), + 'mockfs', id='_MockFileSystem()' ), pytest.param( - pytest.lazy_fixture('py_localfs'), + 'py_localfs', id='PyFileSystem(ProxyHandler(LocalFileSystem()))' ), pytest.param( - pytest.lazy_fixture('py_mockfs'), + 'py_mockfs', id='PyFileSystem(ProxyHandler(_MockFileSystem()))' ), pytest.param( - pytest.lazy_fixture('py_fsspec_localfs'), + 'py_fsspec_localfs', id='PyFileSystem(FSSpecHandler(fsspec.LocalFileSystem()))' ), pytest.param( - pytest.lazy_fixture('py_fsspec_memoryfs'), + 'py_fsspec_memoryfs', id='PyFileSystem(FSSpecHandler(fsspec.filesystem("memory")))' ), pytest.param( - pytest.lazy_fixture('py_fsspec_s3fs'), + 'py_fsspec_s3fs', id='PyFileSystem(FSSpecHandler(s3fs.S3FileSystem()))', marks=pytest.mark.s3 ), ]) def filesystem_config(request): - return request.param + return request.getfixturevalue(request.param) @pytest.fixture -def fs(request, filesystem_config): +def fs(filesystem_config): return filesystem_config['fs'] @pytest.fixture -def pathfn(request, filesystem_config): +def pathfn(filesystem_config): return filesystem_config['pathfn'] @pytest.fixture -def allow_move_dir(request, filesystem_config): +def allow_move_dir(filesystem_config): return filesystem_config['allow_move_dir'] @pytest.fixture -def allow_append_to_file(request, filesystem_config): +def allow_append_to_file(filesystem_config): return filesystem_config['allow_append_to_file'] @@ -1186,6 +1186,10 @@ def test_s3_options(pickle_module): assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2 
assert fs2 != fs + fs = S3FileSystem(endpoint_override='localhost:8999', force_virtual_addressing=True) + assert isinstance(fs, S3FileSystem) + assert pickle_module.loads(pickle_module.dumps(fs)) == fs + with pytest.raises(ValueError): S3FileSystem(access_key='access') with pytest.raises(ValueError): diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index f75ec8158a9da..407011d90b734 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -142,16 +142,16 @@ def stream_fixture(): @pytest.fixture(params=[ pytest.param( - pytest.lazy_fixture('file_fixture'), + 'file_fixture', id='File Format' ), pytest.param( - pytest.lazy_fixture('stream_fixture'), + 'stream_fixture', id='Stream Format' ) ]) def format_fixture(request): - return request.param + return request.getfixturevalue(request.param) def test_empty_file(): diff --git a/python/pyarrow/tests/test_memory.py b/python/pyarrow/tests/test_memory.py index d9fdeb152c35e..4f199952344f2 100644 --- a/python/pyarrow/tests/test_memory.py +++ b/python/pyarrow/tests/test_memory.py @@ -243,13 +243,35 @@ def test_debug_memory_pool_warn(pool_factory): assert "Wrong size on deallocation" in res.stderr -@pytest.mark.parametrize('pool_factory', supported_factories()) -def test_debug_memory_pool_disabled(pool_factory): - res = run_debug_memory_pool(pool_factory.__name__, "") +def check_debug_memory_pool_disabled(pool_factory, env_value, msg): + res = run_debug_memory_pool(pool_factory.__name__, env_value) # The subprocess either returned successfully or was killed by a signal # (due to writing out of bounds), depending on the underlying allocator. if os.name == "posix": assert res.returncode <= 0 else: res.check_returncode() - assert res.stderr == "" + if msg == "": + assert res.stderr == "" + else: + assert msg in res.stderr + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_none(pool_factory): + check_debug_memory_pool_disabled(pool_factory, "none", "") + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_empty(pool_factory): + check_debug_memory_pool_disabled(pool_factory, "", "") + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_unknown(pool_factory): + env_value = "some_arbitrary_value" + msg = ( + f"Invalid value for ARROW_DEBUG_MEMORY_POOL: '{env_value}'. " + "Valid values are 'abort', 'trap', 'warn', 'none'." 
+ ) + check_debug_memory_pool_disabled(pool_factory, env_value, msg) diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 8b8c50882b749..39dac4eb81dfb 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -154,6 +154,8 @@ def test_set_timezone_db_path_non_windows(): pa.ListType, pa.LargeListType, pa.FixedSizeListType, + pa.ListViewType, + pa.LargeListViewType, pa.UnionType, pa.SparseUnionType, pa.DenseUnionType, @@ -185,6 +187,8 @@ def test_set_timezone_db_path_non_windows(): pa.UnionArray, pa.BinaryArray, pa.StringArray, + pa.BinaryViewArray, + pa.StringViewArray, pa.FixedSizeBinaryArray, pa.DictionaryArray, pa.Date32Array, @@ -221,8 +225,12 @@ def test_set_timezone_db_path_non_windows(): pa.StringScalar, pa.BinaryScalar, pa.FixedSizeBinaryScalar, + pa.BinaryViewScalar, + pa.StringViewScalar, pa.ListScalar, pa.LargeListScalar, + pa.ListViewScalar, + pa.LargeListViewScalar, pa.MapScalar, pa.FixedSizeListScalar, pa.UnionScalar, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 74dee59558239..074fb757e265a 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -51,10 +51,15 @@ (b"bytes", None, pa.BinaryScalar), ("largestring", pa.large_string(), pa.LargeStringScalar), (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar), + ("string_view", pa.string_view(), pa.StringViewScalar), + (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar), (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar), ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar), + # TODO GH-39855 + # ([1, 2, 3], pa.list_view(pa.int8()), pa.ListViewScalar), + # ([1, 2, 3, 4], pa.large_list_view(pa.int8()), pa.LargeListViewScalar), (datetime.date.today(), None, pa.Date32Scalar), (datetime.date.today(), pa.date64(), pa.Date64Scalar), (datetime.datetime.now(), None, pa.TimestampScalar), @@ -488,7 +493,8 @@ def test_month_day_nano_interval(): @pytest.mark.parametrize('value', ['foo', 'mañana']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.string(), pa.StringScalar), - (pa.large_string(), pa.LargeStringScalar) + (pa.large_string(), pa.LargeStringScalar), + (pa.string_view(), pa.StringViewScalar), ]) def test_string(value, ty, scalar_typ): s = pa.scalar(value, type=ty) @@ -506,7 +512,8 @@ def test_string(value, ty, scalar_typ): @pytest.mark.parametrize('value', [b'foo', b'bar']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.binary(), pa.BinaryScalar), - (pa.large_binary(), pa.LargeBinaryScalar) + (pa.large_binary(), pa.LargeBinaryScalar), + (pa.binary_view(), pa.BinaryViewScalar), ]) def test_binary(value, ty, scalar_typ): s = pa.scalar(value, type=ty) @@ -533,7 +540,10 @@ def test_fixed_size_binary(): @pytest.mark.parametrize(('ty', 'klass'), [ (pa.list_(pa.string()), pa.ListScalar), - (pa.large_list(pa.string()), pa.LargeListScalar) + (pa.large_list(pa.string()), pa.LargeListScalar), + # TODO GH-39855 + # (pa.list_view(pa.string()), pa.ListViewScalar), + # (pa.large_list_view(pa.string()), pa.LargeListViewScalar) ]) def test_list(ty, klass): v = ['foo', None] diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index ff38c614c251f..d6def54570581 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1089,6 +1089,9 @@ def test_table_to_batches(): table_from_iter = 
pa.Table.from_batches(iter([batch1, batch2, batch1])) assert table.equals(table_from_iter) + with pytest.raises(ValueError): + table.to_batches(max_chunksize=0) + def test_table_basics(): data = [ diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index c8a52c6b626c2..0add5786088d3 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -61,9 +61,13 @@ def get_many_types(): pa.binary(10), pa.large_string(), pa.large_binary(), + pa.string_view(), + pa.binary_view(), pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), + pa.list_view(pa.int32()), + pa.large_list_view(pa.uint16()), pa.map_(pa.string(), pa.int32()), pa.map_(pa.field('key', pa.int32(), nullable=False), pa.field('value', pa.int32())), @@ -167,6 +171,18 @@ def test_is_list(): assert not types.is_list(pa.int32()) +def test_is_list_view(): + a = pa.list_view(pa.int32()) + b = pa.large_list_view(pa.int32()) + + assert types.is_list_view(a) + assert not types.is_large_list_view(a) + assert not types.is_list(a) + assert types.is_large_list_view(b) + assert not types.is_list_view(b) + assert not types.is_large_list(b) + + def test_is_map(): m = pa.map_(pa.utf8(), pa.int32()) @@ -244,6 +260,12 @@ def test_is_binary_string(): assert types.is_fixed_size_binary(pa.binary(5)) assert not types.is_fixed_size_binary(pa.binary()) + assert types.is_string_view(pa.string_view()) + assert not types.is_string_view(pa.string()) + assert types.is_binary_view(pa.binary_view()) + assert not types.is_binary_view(pa.binary()) + assert not types.is_binary_view(pa.string_view()) + def test_is_temporal_date_time_timestamp(): date_types = [pa.date32(), pa.date64()] @@ -565,6 +587,41 @@ def test_large_list_type(): pa.large_list(None) +def test_list_view_type(): + ty = pa.list_view(pa.int64()) + assert isinstance(ty, pa.ListViewType) + assert ty.value_type == pa.int64() + assert ty.value_field == pa.field("item", pa.int64(), nullable=True) + + # nullability matters in comparison + ty_non_nullable = pa.list_view(pa.field("item", pa.int64(), nullable=False)) + assert ty != ty_non_nullable + + # field names don't matter by default + ty_named = pa.list_view(pa.field("element", pa.int64())) + assert ty == ty_named + assert not ty.equals(ty_named, check_metadata=True) + + # metadata doesn't matter by default + ty_metadata = pa.list_view( + pa.field("item", pa.int64(), metadata={"hello": "world"})) + assert ty == ty_metadata + assert not ty.equals(ty_metadata, check_metadata=True) + + with pytest.raises(TypeError): + pa.list_view(None) + + +def test_large_list_view_type(): + ty = pa.large_list_view(pa.utf8()) + assert isinstance(ty, pa.LargeListViewType) + assert ty.value_type == pa.utf8() + assert ty.value_field == pa.field("item", pa.utf8(), nullable=True) + + with pytest.raises(TypeError): + pa.large_list_view(None) + + def test_map_type(): ty = pa.map_(pa.utf8(), pa.int32()) assert isinstance(ty, pa.MapType) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index b6dc53d633543..50b10c5512dc1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -557,6 +557,101 @@ cdef class LargeListType(DataType): return pyarrow_wrap_data_type(self.list_type.value_type()) +cdef class ListViewType(DataType): + """ + Concrete class for list view data types. 
+
+    Examples
+    --------
+    Create an instance of ListViewType:
+
+    >>> import pyarrow as pa
+    >>> pa.list_view(pa.string())
+    ListViewType(list_view<item: string>)
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        DataType.init(self, type)
+        self.list_view_type = <const CListViewType*> type.get()
+
+    def __reduce__(self):
+        return list_view, (self.value_field,)
+
+    @property
+    def value_field(self):
+        """
+        The field for list view values.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> pa.list_view(pa.string()).value_field
+        pyarrow.Field<item: string>
+        """
+        return pyarrow_wrap_field(self.list_view_type.value_field())
+
+    @property
+    def value_type(self):
+        """
+        The data type of list view values.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> pa.list_view(pa.string()).value_type
+        DataType(string)
+        """
+        return pyarrow_wrap_data_type(self.list_view_type.value_type())
+
+
+cdef class LargeListViewType(DataType):
+    """
+    Concrete class for large list view data types
+    (like ListViewType, but with 64-bit offsets).
+
+    Examples
+    --------
+    Create an instance of LargeListViewType:
+
+    >>> import pyarrow as pa
+    >>> pa.large_list_view(pa.string())
+    LargeListViewType(large_list_view<item: string>)
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        DataType.init(self, type)
+        self.list_view_type = <const CLargeListViewType*> type.get()
+
+    def __reduce__(self):
+        return large_list_view, (self.value_field,)
+
+    @property
+    def value_field(self):
+        """
+        The field for large list view values.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> pa.large_list_view(pa.string()).value_field
+        pyarrow.Field<item: string>
+        """
+        return pyarrow_wrap_field(self.list_view_type.value_field())
+
+    @property
+    def value_type(self):
+        """
+        The data type of large list view values.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> pa.large_list_view(pa.string()).value_type
+        DataType(string)
+        """
+        return pyarrow_wrap_data_type(self.list_view_type.value_type())
+
+
 cdef class MapType(DataType):
     """
     Concrete class for map data types.
@@ -1658,20 +1753,6 @@ cdef class FixedShapeTensorType(BaseExtensionType):
         else:
             return None
 
-    def __arrow_ext_serialize__(self):
-        """
-        Serialized representation of metadata to reconstruct the type object.
-        """
-        return self.tensor_ext_type.Serialize()
-
-    @classmethod
-    def __arrow_ext_deserialize__(self, storage_type, serialized):
-        """
-        Return an FixedShapeTensor type instance from the storage type and serialized
-        metadata.
-        """
-        return self.tensor_ext_type.Deserialize(storage_type, serialized)
-
     def __arrow_ext_class__(self):
         return FixedShapeTensorArray
 
@@ -1679,6 +1760,9 @@ cdef class FixedShapeTensorType(BaseExtensionType):
         return fixed_shape_tensor, (self.value_type, self.shape,
                                     self.dim_names, self.permutation)
 
+    def __arrow_ext_scalar_class__(self):
+        return FixedShapeTensorScalar
+
 
 _py_extension_type_auto_load = False
 
@@ -4375,6 +4459,36 @@ def large_utf8():
     return large_string()
 
 
+def binary_view():
+    """
+    Create a variable-length binary view type.
+
+    Examples
+    --------
+    Create an instance of a binary view type:
+
+    >>> import pyarrow as pa
+    >>> pa.binary_view()
+    DataType(binary_view)
+    """
+    return primitive_type(_Type_BINARY_VIEW)
+
+
+def string_view():
+    """
+    Create UTF8 variable-length string view type.
+ + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string_view() + DataType(string_view) + """ + return primitive_type(_Type_STRING_VIEW) + + def list_(value_type, int list_size=-1): """ Create ListType instance from child data type or field. @@ -4498,6 +4612,82 @@ cpdef LargeListType large_list(value_type): return out +cpdef ListViewType list_view(value_type): + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('ListView requires DataType or Field') + + list_view_type = CMakeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + +cpdef LargeListViewType large_list_view(value_type): + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('LargeListView requires DataType or Field') + + list_view_type = CMakeLargeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + cpdef MapType map_(key_type, item_type, keys_sorted=False): """ Create MapType instance from key and item data types or fields. 
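For reference, a minimal usage sketch of the list-view APIs added in this patch (the list_view type factory together with ListViewArray.from_arrays and flatten); the input values, offsets and sizes below are illustrative only:

    import pyarrow as pa

    # A list-view type wraps a value type, analogous to pa.list_()
    ty = pa.list_view(pa.int32())

    # Offsets and sizes are independent arrays, so sub-lists may overlap or
    # be out of order with respect to the underlying values buffer.
    values = pa.array([1, 2, 3, 4])
    offsets = pa.array([2, 1, 0], type=pa.int32())
    sizes = pa.array([2, 2, 2], type=pa.int32())
    arr = pa.ListViewArray.from_arrays(offsets, sizes, values)

    assert arr.to_pylist() == [[3, 4], [2, 3], [1, 2]]
    # flatten() honours offsets and sizes, unlike the raw .values buffer
    assert arr.flatten().to_pylist() == [3, 4, 2, 3, 1, 2]
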
@@ -4946,8 +5136,9 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N
 
     cdef FixedShapeTensorType out = FixedShapeTensorType.__new__(FixedShapeTensorType)
 
-    c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make(
-        value_type.sp_type, c_shape, c_permutation, c_dim_names))
+    with nogil:
+        c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make(
+            value_type.sp_type, c_shape, c_permutation, c_dim_names))
 
     out.init(c_tensor_ext_type)
 
@@ -4991,6 +5182,8 @@ cdef dict _type_aliases = {
     'large_str': large_string,
     'large_utf8': large_string,
     'large_binary': large_binary,
+    'binary_view': binary_view,
+    'string_view': string_view,
     'date32': date32,
     'date64': date64,
     'date32[day]': date32,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index 5d7dbe4b451b9..0f68ca9fe574b 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -151,6 +151,16 @@ def is_fixed_size_list(t):
     return t.id == lib.Type_FIXED_SIZE_LIST
 
 
+@doc(is_null, datatype="list view")
+def is_list_view(t):
+    return t.id == lib.Type_LIST_VIEW
+
+
+@doc(is_null, datatype="large list view")
+def is_large_list_view(t):
+    return t.id == lib.Type_LARGE_LIST_VIEW
+
+
 @doc(is_null, datatype="struct")
 def is_struct(t):
     return t.id == lib.Type_STRUCT
@@ -243,6 +253,16 @@ def is_fixed_size_binary(t):
     return t.id == lib.Type_FIXED_SIZE_BINARY
 
 
+@doc(is_null, datatype="variable-length binary view")
+def is_binary_view(t):
+    return t.id == lib.Type_BINARY_VIEW
+
+
+@doc(is_null, datatype="variable-length string (utf-8) view")
+def is_string_view(t):
+    return t.id == lib.Type_STRING_VIEW
+
+
 @doc(is_null, datatype="date")
 def is_date(t):
     return t.id in _DATE_TYPES
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 437de105ab8e7..9079618ad1c7d 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -18,7 +18,12 @@
 [build-system]
 requires = [
     "cython >= 0.29.31",
-    "oldest-supported-numpy>=0.14",
+    # Starting with NumPy 1.25, NumPy is (by default) as backward-compatible
+    # as oldest-supported-numpy was (customizable with a NPY_TARGET_VERSION
+    # define). For older Python versions (where NumPy 1.25 is not yet available)
+    # continue using oldest-supported-numpy.
+ "oldest-supported-numpy>=0.14; python_version<'3.9'", + "numpy>=1.25; python_version>='3.9'", "setuptools_scm < 8.0.0", "setuptools >= 40.1.0", "wheel" diff --git a/python/requirements-build.txt b/python/requirements-build.txt index 56e9d479ee9ba..e1372e807f88d 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,4 +1,5 @@ cython>=0.29.31 -oldest-supported-numpy>=0.14 +oldest-supported-numpy>=0.14; python_version<'3.9' +numpy>=1.25; python_version>='3.9' setuptools_scm<8.0.0 setuptools>=38.6.0 diff --git a/python/requirements-test.txt b/python/requirements-test.txt index b3ba5d852b968..2108d70a543f5 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -2,5 +2,4 @@ cffi hypothesis pandas pytest<8 -pytest-lazy-fixture pytz diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index f42ee4a018f3c..044f9de5f8214 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,5 +1,6 @@ cython>=0.29.31 -oldest-supported-numpy>=0.14 +oldest-supported-numpy>=0.14; python_version<'3.9' +numpy>=1.25; python_version>='3.9' setuptools_scm<8.0.0 setuptools>=58 wheel diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index c74a8ca6908a7..a1046bc18c704 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -2,7 +2,6 @@ cffi cython hypothesis pytest<8 -pytest-lazy-fixture pytz tzdata; sys_platform == 'win32' diff --git a/python/setup.py b/python/setup.py index d7a2da2077cdd..098d75a3186af 100755 --- a/python/setup.py +++ b/python/setup.py @@ -449,7 +449,7 @@ def has_ext_modules(foo): install_requires = ( - 'numpy >= 1.16.6, <2', + 'numpy >= 1.16.6', ) diff --git a/r/PACKAGING.md b/r/PACKAGING.md index 7f42ecf562e59..4edeb4f2130cc 100644 --- a/r/PACKAGING.md +++ b/r/PACKAGING.md @@ -26,6 +26,7 @@ For a high-level overview of the release process see the ## Before the release candidate is cut - [ ] [Create a GitHub issue](https://github.com/apache/arrow/issues/new/) entitled `[R] CRAN packaging checklist for version X.X.X` and copy this checklist to the issue. +- [ ] Review deprecated functions to advance their deprecation status, including removing preprocessor directives that no longer apply (search for `ARROW_VERSION_MAJOR` in r/src). - [ ] Evaluate the status of any failing [nightly tests and nightly packaging builds](http://crossbow.voltrondata.com). These checks replicate most of the checks that CRAN runs, so we need them all to be passing or to understand that the failures may (though won't necessarily) result in a rejection from CRAN. - [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_arrow.html) - [ ] Ensure the contents of the README are accurate and up to date. diff --git a/r/R/python.R b/r/R/python.R index 023d914f16a9e..1159806bf7c25 100644 --- a/r/R/python.R +++ b/r/R/python.R @@ -339,15 +339,9 @@ install_pyarrow <- function(envname = NULL, nightly = FALSE, ...) 
{ } pyarrow_compatible_pointer <- function(ptr) { - pa <- reticulate::import("pyarrow") - version_string <- pa$`__version__` - # remove trailing .devXXX because it won't work with package_version() - pyarrow_version <- package_version(gsub("\\.dev.*?$", "", version_string)) - - # pyarrow pointers changed in version 7.0.0 - if (pyarrow_version >= "7.0.0") { - return(ptr) - } else { - return(external_pointer_addr_double(ptr)) - } + # GH-39933: Workaround because there is no built-in way to send a + # 64-bit integer to Python from an R object + py <- reticulate::import_builtins(convert = FALSE) + addr <- external_pointer_addr_character(ptr) + py$int(addr) } diff --git a/r/configure.win b/r/configure.win index 2d9e5cdf54e44..b6ac19faea2d4 100755 --- a/r/configure.win +++ b/r/configure.win @@ -17,33 +17,58 @@ # specific language governing permissions and limitations # under the License. +: ${PKG_CONFIG:="pkg-config"} +# Library settings +PKG_CONFIG_NAME="arrow" +PKG_TEST_HEADER="" + +VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //` + +# Development mode, also increases verbosity in the bundled build +ARROW_R_DEV=`echo $ARROW_R_DEV | tr '[:upper:]' '[:lower:]'` +# If present, `pkg-config` will be used to find libarrow on the system, +# unless this is set to false +ARROW_USE_PKG_CONFIG=`echo $ARROW_USE_PKG_CONFIG | tr '[:upper:]' '[:lower:]'` # generate code -if [ "$ARROW_R_DEV" == "TRUE" ]; then +if [ "$ARROW_R_DEV" == "true" ]; then echo "*** Generating code with data-raw/codegen.R" "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" data-raw/codegen.R fi -OPENSSL_LIBS="-lcrypto -lcrypt32" -MIMALLOC_LIBS="-lbcrypt -lpsapi" -BROTLI_LIBS="-lbrotlienc -lbrotlidec -lbrotlicommon" # Common goes last since dec and enc depend on it -AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management \ - -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ - -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ - -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" -# pkg-config --libs libcurl -GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ - -lz -lws2_32 -lnghttp2 -ldbghelp" +# Test if pkg-config is available to use +if ${PKG_CONFIG} --version >/dev/null 2>&1; then + PKG_CONFIG_AVAILABLE="true" + echo "*** pkg-config found." +else + echo "*** pkg-config not found." + PKG_CONFIG_AVAILABLE="false" + ARROW_USE_PKG_CONFIG="false" +fi -function configure_release() { - VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) + +function configure_binaries() { # Try to find/download a C++ Arrow binary, "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" "tools/nixlibs.R" $VERSION # If binary not found, script exits nonzero if [ $? 
-ne 0 ]; then + _LIBARROW_FOUND="false" echo "Arrow C++ library was not found" + # return 0 so set -e doesn't exit the script + return 0 fi + OPENSSL_LIBS="-lcrypto -lcrypt32" + MIMALLOC_LIBS="-lbcrypt -lpsapi" + BROTLI_LIBS="-lbrotlienc -lbrotlidec -lbrotlicommon" # Common goes last since dec and enc depend on it + AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management \ + -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ + -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ + -luserenv -lversion -lws2_32 -lbcrypt -lwininet -lwinhttp" + # pkg-config --libs libcurl + GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ + -lz -lws2_32 -lnghttp2 -ldbghelp" + # Set the right flags to point to and enable arrow/parquet if [ -d "windows/arrow-$VERSION" ]; then RWINLIB="../windows/arrow-$VERSION" @@ -75,12 +100,160 @@ function configure_release() { # It seems that order matters PKG_LIBS="${PKG_LIBS} -lws2_32" fi + +} + +# Once libarrow is obtained, this function sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` +# either from pkg-config or by inferring things about the directory in $1 +set_pkg_vars () { + set_lib_dir_with_pc + + # Check cmake options for enabled features. This uses LIB_DIR that + # is set by the above set_lib_dir_* call. + add_feature_flags + set_pkg_vars_with_pc + + # Set any user-defined CXXFLAGS + if [ "$ARROW_R_CXXFLAGS" ]; then + PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" + fi + + # We use expr because the product version returns more than just 10.13 and we want to + # match the substring. However, expr always outputs the number of matched characters + # to stdout, to avoid noise in the log we redirect the output to /dev/null + if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then + # avoid C++17 availability warnings on macOS < 11 + PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY" + fi +} + +# If we have pkg-config, it will tell us what libarrow needs +set_lib_dir_with_pc () { + LIB_DIR="`${PKG_CONFIG} --variable=libdir ${PKG_CONFIG_NAME}`" +} +set_pkg_vars_with_pc () { + pkg_config_names="${PKG_CONFIG_NAME} ${PKG_CONFIG_NAMES_FEATURES}" + PKG_CFLAGS="`${PKG_CONFIG} --cflags ${pkg_config_names}` $PKG_CFLAGS" + PKG_CFLAGS="$PKG_CFLAGS $PKG_CFLAGS_FEATURES" + PKG_LIBS=`${PKG_CONFIG} --libs-only-l --libs-only-other ${pkg_config_names}` + PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES" + PKG_DIRS=`${PKG_CONFIG} --libs-only-L ${pkg_config_names}` +} + +add_feature_flags () { + PKG_CFLAGS_FEATURES="" + PKG_CONFIG_NAMES_FEATURES="" + PKG_LIBS_FEATURES="" + PKG_LIBS_FEATURES_WITHOUT_PC="" + + # Now we need to check what features it was built with and enable + # the corresponding feature flags in the R bindings (-DARROW_R_WITH_stuff). + # We do this by inspecting ArrowOptions.cmake, which the libarrow build + # generates. + ARROW_OPTS_CMAKE="$LIB_DIR/cmake/Arrow/ArrowOptions.cmake" + if [ ! 
-f "${ARROW_OPTS_CMAKE}" ]; then + echo "*** $ARROW_OPTS_CMAKE not found; some features will not be enabled" + else + if arrow_built_with ARROW_PARQUET; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_PARQUET" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES parquet" + PKG_LIBS_FEATURES_WITHOUT_PC="-lparquet $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: parquet is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_DATASET; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_DATASET" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-dataset" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_dataset $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_dataset is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_ACERO; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_ACERO" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-acero" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_acero $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_acero is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_SUBSTRAIT; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_SUBSTRAIT" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-substrait" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_substrait $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_substrait is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_JSON; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_JSON" + fi + if arrow_built_with ARROW_S3; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" + fi + if arrow_built_with ARROW_GCS; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" + fi + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + # If pkg-config is available it will handle this for us automatically + SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" + fi + fi +} + + +arrow_built_with() { + # Function to check cmake options for features + grep -i 'set('"$1"' "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 +} + +function configure_rtools() { + # Use pkg-config to find arrow from rtools + _LIBARROW_PREFIX="`${PKG_CONFIG} --variable=prefix ${PKG_CONFIG_NAME}`" + _LIBARROW_FOUND="true" + echo "*** Trying Arrow C++ found by pkg-config: $_LIBARROW_PREFIX" + + PC_LIB_VERSION=`${PKG_CONFIG} --modversion ${PKG_CONFIG_NAME}` + # This is in an R script for convenience and testability. + # Success means the found C++ library is ok to use. + # Error means the versions don't line up and we shouldn't use it. + # More specific messaging to the user is in the R script + if ! ${R_HOME}/bin/Rscript tools/check-versions.R $VERSION $PC_LIB_VERSION 2> /dev/null; then + _LIBARROW_FOUND="false" + fi + + # We should have a valid libarrow build in $_LIBARROW_FOUND +# Now set `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` based on that. 
+if [ "$_LIBARROW_FOUND" == "true" ]; then + set_pkg_vars ${_LIBARROW_PREFIX} + # add mingw specific windows flags + PKG_LIBS="$PKG_LIBS -lws2_32 -lole32 -lwldap32 -lsecur32 -lncrypt -lcrypt32 -lshlwapi" + # override -fno-exceptions from aws-cpp-sdk pc file + PKG_CFLAGS="$PKG_CFLAGS -fexceptions" +else + # To make it easier to debug which code path was taken add a specific + # message to the log in addition to the 'NOTE' + echo "*** Failed to find Arrow C++ libraries in rtools" +fi +} + +function configure_release() { + if [ "$ARROW_USE_PKG_CONFIG" != "false" ] && $PKG_CONFIG --exists $PKG_CONFIG_NAME; then + configure_rtools + else + configure_binaries + fi + + if [ "$_LIBARROW_FOUND" == "false" ]; then + echo "------------------------- NOTE ---------------------------" + echo "There was an issue preparing the Arrow C++ libraries." + echo "See https://arrow.apache.org/docs/r/articles/install.html" + echo "----------------------------------------------------------" + exit 1 + fi } # Returns 1 if CMAKE options is set "ON", otherwise 0 function cmake_option() { ARROW_OPTS_CMAKE="$ARROW_HOME/lib/cmake/Arrow/ArrowOptions.cmake" - grep -cm1 "set($1 \"ON\")" $ARROW_OPTS_CMAKE + arrow_built_with $1 } function configure_dev() { diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index d2db11e14a787..a81210f0ad914 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1050,6 +1050,7 @@ class RDictionaryConverter> template struct RConverterTrait; +#if ARROW_VERSION_MAJOR >= 15 template struct RConverterTrait< T, enable_if_t::value && !is_interval_type::value && @@ -1061,6 +1062,14 @@ template struct RConverterTrait> { // not implemented }; +#else +template +struct RConverterTrait< + T, enable_if_t::value && !is_interval_type::value && + !is_extension_type::value>> { + using type = RPrimitiveConverter; +}; +#endif template struct RConverterTrait> { diff --git a/r/tools/check-versions.R b/r/tools/check-versions.R index 3d8cbf02a14c9..34b2ef680c547 100644 --- a/r/tools/check-versions.R +++ b/r/tools/check-versions.R @@ -20,6 +20,20 @@ args <- commandArgs(TRUE) # TESTING is set in test-check-version.R; it won't be set when called from configure test_mode <- exists("TESTING") +release_version_supported <- function(r_version, cpp_version) { + r_version <- package_version(r_version) + cpp_version <- package_version(cpp_version) + major <- function(x) as.numeric(x[1, 1]) + minimum_cpp_version <- package_version("13.0.0") + + allow_mismatch <- identical(tolower(Sys.getenv("ARROW_R_ALLOW_CPP_VERSION_MISMATCH", "false")), "true") + # If we allow a version mismatch we still need to cover the minimum version (13.0.0 for now) + # we don't allow newer C++ versions as new features without additional feature gates are likely to + # break the R package + version_valid <- cpp_version >= minimum_cpp_version && major(cpp_version) <= major(r_version) + allow_mismatch && version_valid || major(r_version) == major(cpp_version) +} + check_versions <- function(r_version, cpp_version) { r_parsed <- package_version(r_version) r_dev_version <- r_parsed[1, 4] @@ -39,20 +53,10 @@ check_versions <- function(r_version, cpp_version) { "*** > or retry with FORCE_BUNDLED_BUILD=true" ) cat(paste0(msg, "\n", collapse = "")) - } else if (r_is_patch && as.character(r_parsed[1, 1:3]) == cpp_version) { - # Patch releases we do for CRAN feedback get an extra x.y.z.1 version. 
- # These should work with the x.y.z C++ library (which never has .1 added) - cat( - sprintf( - "*** > Using C++ library version %s with R package %s\n", - cpp_version, - r_version - ) - ) - } else if (r_version != cpp_version) { + } else if (cpp_is_dev || !release_version_supported(r_version, cpp_parsed)) { cat( sprintf( - "**** Not using: C++ library version (%s) does not match R package (%s)\n", + "**** Not using: C++ library version (%s): not supported by R package version %s\n", cpp_version, r_version ) @@ -61,7 +65,12 @@ check_versions <- function(r_version, cpp_version) { # Add ALLOW_VERSION_MISMATCH env var to override stop()? (Could be useful for debugging) } else { # OK - cat(sprintf("**** C++ and R library versions match: %s\n", cpp_version)) + cat( + sprintf( + "**** C++ library version %s is supported by R version %s\n", + cpp_version, r_version + ) + ) } } diff --git a/r/tools/nixlibs-allowlist.txt b/r/tools/nixlibs-allowlist.txt index 9c368e6ed15a2..bd9f0c1b2c084 100644 --- a/r/tools/nixlibs-allowlist.txt +++ b/r/tools/nixlibs-allowlist.txt @@ -2,4 +2,3 @@ ubuntu centos redhat rhel -darwin diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 17c6ab0a8078b..0af41888b95b7 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -222,7 +222,7 @@ check_allowlist <- function(os, allowed = "https://raw.githubusercontent.com/apa # Try a remote allowlist so that we can add/remove without a release suppressWarnings(readLines(allowed)), # Fallback to default: allowed only on Ubuntu and CentOS/RHEL - error = function(e) c("ubuntu", "centos", "redhat", "rhel", "darwin") + error = function(e) c("ubuntu", "centos", "redhat", "rhel") ) # allowlist should contain valid regular expressions (plain strings ok too) any(grepl(paste(allowlist, collapse = "|"), os)) diff --git a/r/tools/test-check-versions.R b/r/tools/test-check-versions.R index 9c284507b8801..f558648bed1e3 100644 --- a/r/tools/test-check-versions.R +++ b/r/tools/test-check-versions.R @@ -24,10 +24,10 @@ TESTING <- TRUE source("check-versions.R", local = TRUE) -test_that("check_versions", { +test_that("check_versions without mismatch", { expect_output( check_versions("10.0.0", "10.0.0"), - "**** C++ and R library versions match: 10.0.0", + "**** C++ library version 10.0.0 is supported by R version 10.0.0", fixed = TRUE ) expect_output( @@ -35,7 +35,7 @@ test_that("check_versions", { check_versions("10.0.0", "10.0.0-SNAPSHOT"), "version mismatch" ), - "**** Not using: C++ library version (10.0.0-SNAPSHOT) does not match R package (10.0.0)", + "**** Not using: C++ library version (10.0.0-SNAPSHOT): not supported by R package version 10.0.0", fixed = TRUE ) expect_output( @@ -43,20 +43,12 @@ test_that("check_versions", { check_versions("10.0.0.9000", "10.0.0-SNAPSHOT"), "version mismatch" ), - "**** Not using: C++ library version (10.0.0-SNAPSHOT) does not match R package (10.0.0.9000)", - fixed = TRUE - ) - expect_output( - expect_error( - check_versions("10.0.0.9000", "10.0.0"), - "version mismatch" - ), - "**** Not using: C++ library version (10.0.0) does not match R package (10.0.0.9000)", + "**** Not using: C++ library version (10.0.0-SNAPSHOT): not supported by R package version 10.0.0.9000", fixed = TRUE ) expect_output( check_versions("10.0.0.3", "10.0.0"), - "*** > Using C++ library version 10.0.0 with R package 10.0.0.3", + "**** C++ library version 10.0.0 is supported by R version 10.0.0.3", fixed = TRUE ) expect_output( @@ -65,3 +57,25 @@ test_that("check_versions", { fixed = TRUE ) }) + +test_that("check_versions 
with mismatch", { + withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "false")) + + expect_false( + release_version_supported("15.0.0", "13.0.0") + ) + + withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "true")) + + expect_true( + release_version_supported("15.0.0", "13.0.0") + ) + + expect_false( + release_version_supported("15.0.0", "16.0.0") + ) + + expect_false( + release_version_supported("15.0.0", "12.0.0") + ) +}) diff --git a/testing b/testing index ad82a736c170e..25d16511e8d42 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit ad82a736c170e97b7c8c035ebd8a801c17eec170 +Subproject commit 25d16511e8d42c2744a1d94d90169e3a36e92631
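To make the compatibility rule introduced in r/tools/check-versions.R easier to follow, here is a hypothetical Python restatement of release_version_supported() (illustration only: the shipped helper is written in R, reads ARROW_R_ALLOW_CPP_VERSION_MISMATCH from the environment, and check_versions() still handles dev/snapshot versions separately):

    # Hypothetical Python restatement of the rule in release_version_supported();
    # names and signature are illustrative, not part of the R package's API.
    def release_version_supported(r_version, cpp_version, allow_mismatch=False):
        def major(version):
            return int(version.split(".")[0])

        def parse(version):
            return tuple(int(part) for part in version.split("."))

        minimum_cpp = (13, 0, 0)  # oldest libarrow the R package still accepts
        # With the mismatch override, any libarrow >= 13.0.0 whose major version
        # does not exceed the R package's major version is accepted; without it,
        # the two major versions must match exactly.
        version_valid = (parse(cpp_version) >= minimum_cpp
                         and major(cpp_version) <= major(r_version))
        return (allow_mismatch and version_valid) or major(r_version) == major(cpp_version)

    # Mirrors the expectations added in r/tools/test-check-versions.R:
    assert not release_version_supported("15.0.0", "13.0.0", allow_mismatch=False)
    assert release_version_supported("15.0.0", "13.0.0", allow_mismatch=True)
    assert not release_version_supported("15.0.0", "16.0.0", allow_mismatch=True)
    assert not release_version_supported("15.0.0", "12.0.0", allow_mismatch=True)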