diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml
index fa84f4dc3b08b..3e8b35fa18270 100644
--- a/.github/workflows/archery.yml
+++ b/.github/workflows/archery.yml
@@ -32,7 +32,7 @@ on:
       - 'docker-compose.yml'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 jobs:
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 0f19f7351c325..e6ceccc2e0841 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -38,7 +38,7 @@ on:
       - 'format/Flight.proto'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
@@ -53,7 +53,7 @@ jobs:
     name: ${{ matrix.title }}
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
-    timeout-minutes: 45
+    timeout-minutes: 60
     strategy:
       fail-fast: false
       matrix:
diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml
index b339b8f46555d..59e5113685fc9 100644
--- a/.github/workflows/csharp.yml
+++ b/.github/workflows/csharp.yml
@@ -30,7 +30,7 @@ on:
       - 'csharp/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 jobs:
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index 9ef46c31fa34e..4b368cdc88373 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -23,7 +23,7 @@ on:
   pull_request:

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 5d8034ff09f1f..70300b7be36a7 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -33,7 +33,7 @@ on:
       - 'go/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
@@ -238,13 +238,12 @@
     timeout-minutes: 60
     strategy:
       fail-fast: false
-      matrix:
-        go: [1.15]
+      matrix:
         mingw-n-bits:
           #- 32 runtime handling for CGO needs 64-bit currently
           - 64
     env:
-      ARROW_GO_TESTCGO: "1"
+      ARROW_GO_TESTCGO: "1"
     steps:
       - name: Disable Crash Dialogs
         run: |
           reg add `
             "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" `
             /v DontShowUI `
             /t REG_DWORD `
             /d 1 `
             /f
-      - name: Install go
-        uses: actions/setup-go@v1
-        with:
-          go-version: ${{ matrix.go }}
       - name: Checkout Arrow
         uses: actions/checkout@v2
         with:
           fetch-depth: 0
@@ -278,7 +273,10 @@
         run: |
           echo "CGO_CPPFLAGS=-I$(cygpath --windows ${MINGW_PREFIX}/include)" >> $GITHUB_ENV
           echo "CGO_LDFLAGS=-g -O2 -L$(cygpath --windows ${MINGW_PREFIX}/lib) -L$(cygpath --windows ${MINGW_PREFIX}/bin)" >> $GITHUB_ENV
-          echo "$(cygpath --windows ${MINGW_PREFIX}/bin)" >> $GITHUB_PATH
+          echo "GOROOT=$(cygpath --windows ${MINGW_PREFIX}/lib/go)" >> $GITHUB_ENV
+          echo "GOPATH=$(cygpath --windows ${HOME}/gopath)" >> $GITHUB_ENV
+          mkdir -p $(cygpath --windows ${HOME}/gopath)
+          echo "MINGW_PREFIX=$(cygpath --windows ${MINGW_PREFIX})" >> $GITHUB_ENV
       - name: Build
         shell: bash
         run: ci/scripts/go_build.sh .
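Note on the change repeated across every workflow in this patch: the concurrency group key swaps github.ref for "github.head_ref || github.sha". github.head_ref is only populated for pull_request events, so successive pushes to the same PR branch share one group and cancel-in-progress: true cancels the superseded run, while push builds fall back to the commit SHA, which is unique, and are therefore never cancelled. A minimal sketch of the same pattern in isolation (the workflow name and echo step below are illustrative, not part of this patch):

name: example
on: [push, pull_request]

concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - run: echo "only the newest run per PR branch survives"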
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 7a4deb8e3ea31..c93d4133fc7cc 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -42,7 +42,7 @@ on:
       - 'format/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml
index 72f4df7e36e38..df942771bba25 100644
--- a/.github/workflows/java.yml
+++ b/.github/workflows/java.yml
@@ -36,7 +36,7 @@ on:
       - 'java/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml
index 48351f3c22ad3..d4296b674dfb4 100644
--- a/.github/workflows/java_jni.yml
+++ b/.github/workflows/java_jni.yml
@@ -36,7 +36,7 @@ on:
       - 'java/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
@@ -78,3 +78,36 @@
     if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
     continue-on-error: true
     run: archery docker push debian-java-jni
+
+  docker_integration_python:
+    name: AMD64 Debian 9 Java C Data Interface Integration
+    runs-on: ubuntu-latest
+    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+    timeout-minutes: 90
+    steps:
+      - name: Checkout Arrow
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Fetch Submodules and Tags
+        run: ci/scripts/util_checkout.sh
+      - name: Free Up Disk Space
+        run: ci/scripts/util_cleanup.sh
+      - name: Cache Docker Volumes
+        uses: actions/cache@v2
+        with:
+          path: .docker
+          key: maven-${{ hashFiles('java/**') }}
+          restore-keys: maven-
+      - name: Setup Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+      - name: Setup Archery
+        run: pip install -e dev/archery[docker]
+      - name: Execute Docker Build
+        run: archery docker run conda-python-java-integration
+      - name: Docker Push
+        if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+        continue-on-error: true
+        run: archery docker push conda-python-java-integration
diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml
index cbd6ce0acef8f..8f8c43403ee4b 100644
--- a/.github/workflows/js.yml
+++ b/.github/workflows/js.yml
@@ -32,7 +32,7 @@ on:
       - 'js/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.github/workflows/julia.yml b/.github/workflows/julia.yml
index 226ec3e6ad042..519a4fab2972d 100644
--- a/.github/workflows/julia.yml
+++ b/.github/workflows/julia.yml
@@ -27,7 +27,7 @@ on:
       - 'julia/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 jobs:
diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml
index 1ac3df6020ba6..268bb63cb2f2c 100644
--- a/.github/workflows/matlab.yml
+++ b/.github/workflows/matlab.yml
@@ -32,13 +32,13 @@ on:
       - 'cpp/src/arrow/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 jobs:

   matlab:
-    name: AMD64 Ubuntu 20.04 MATLAB
+    name: AMD64 Ubuntu 20.04 MATLAB
     runs-on: ubuntu-latest
     steps:
       - name: Check out repository
@@ -54,7 +54,7 @@ jobs:
       - name: Build MATLAB Interface
         run: ci/scripts/matlab_build.sh $(pwd)
       - name: Run MATLAB Tests
-        env:
+        env:
         # libarrow.so requires a more recent version of libstdc++.so
         # than is bundled with MATLAB under /sys/os/glnxa64.
         # Therefore, if a MEX function that depends on libarrow.so
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index cce66fe71ba79..81ec8c093a472 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -32,7 +32,7 @@ on:
       - 'python/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.github/workflows/r-without-arrow.yml b/.github/workflows/r-without-arrow.yml
index a517cfcd18744..12afdd43c4aeb 100644
--- a/.github/workflows/r-without-arrow.yml
+++ b/.github/workflows/r-without-arrow.yml
@@ -28,7 +28,7 @@ on:
       - "r/src/**"

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
index 900acdac794fb..4f8709fe5bc4a 100644
--- a/.github/workflows/r.yml
+++ b/.github/workflows/r.yml
@@ -40,7 +40,7 @@ on:
       - "r/**"

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml
index 067b40aefe92f..2afb753231866 100644
--- a/.github/workflows/ruby.yml
+++ b/.github/workflows/ruby.yml
@@ -44,7 +44,7 @@ on:
       - 'ruby/**'

 concurrency:
-  group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }}
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true

 env:
diff --git a/.gitignore b/.gitignore
index 6f123362ef1fb..8b12a9a5f7a97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,8 @@ perf.data.old
 cpp/.idea/
 .clangd/
 cpp/.clangd/
+.cache/clangd/
+cpp/.cache/clangd/
 cpp/apidoc/xml/
 docs/example.gz
 docs/example1.dat
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ed715d7f4fc4..1e5474b1a5c44 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,1988 @@
-# Apache Arrow 3.0.0 (2021-01-18)
+# Apache Arrow 6.0.0 (2021-10-26)
+
+## Bug Fixes
+
+* [ARROW-6946](https://issues.apache.org/jira/browse/ARROW-6946) - [Go] Run tests with assert build tag enabled
+* [ARROW-8452](https://issues.apache.org/jira/browse/ARROW-8452) - [Go][Integration] Go JSON producer generates incorrect nullable flag for nested types
+* [ARROW-8453](https://issues.apache.org/jira/browse/ARROW-8453) - [Integration][Go] Recursive nested types unsupported
+* [ARROW-8999](https://issues.apache.org/jira/browse/ARROW-8999) - [Python][C++] Non-deterministic segfault in "AMD64 MacOS 10.15 Python 3.7" build
+* [ARROW-9948](https://issues.apache.org/jira/browse/ARROW-9948) - [C++] Decimal128 does not check scale range when rescaling; can cause buffer overflow
+* [ARROW-10213](https://issues.apache.org/jira/browse/ARROW-10213) - [C++] Temporal cast from timestamp to date rounds instead of extracting date component
+* [ARROW-10373](https://issues.apache.org/jira/browse/ARROW-10373) - [C++] ValidateFull() does not validate null\_count
+* [ARROW-10773](https://issues.apache.org/jira/browse/ARROW-10773) - [R] parallel as.data.frame.Table hangs indefinitely on Windows
+* [ARROW-11518](https://issues.apache.org/jira/browse/ARROW-11518) - [C++] [Parquet] Parquet reader crashes when reading boolean columns
+* [ARROW-11579](https://issues.apache.org/jira/browse/ARROW-11579) - [R] read\_feather hanging on Windows
+* [ARROW-11634](https://issues.apache.org/jira/browse/ARROW-11634) - [C++][Parquet] Parquet statistics (min/max) for dictionary columns are incorrect
+* [ARROW-11729](https://issues.apache.org/jira/browse/ARROW-11729) - [R] Add examples to the datasets documentation
+* [ARROW-12011](https://issues.apache.org/jira/browse/ARROW-12011) - [C++][Python] Crashes and incorrect results when converting large integers to dates
+* [ARROW-12072](https://issues.apache.org/jira/browse/ARROW-12072) - (ipc.Writer).Write panics with \`arrow/array: index out of range\`
+* [ARROW-12087](https://issues.apache.org/jira/browse/ARROW-12087) - [C++] Fix sort\_indices, array\_sort\_indices timestamp support discrepancy
+* [ARROW-12513](https://issues.apache.org/jira/browse/ARROW-12513) - [C++][Parquet] Parquet Writer always puts null\_count=0 in Parquet statistics for dictionary-encoded array with nulls
+* [ARROW-12540](https://issues.apache.org/jira/browse/ARROW-12540) - [C++] Implement cast from date32[day] to utf8
+* [ARROW-12636](https://issues.apache.org/jira/browse/ARROW-12636) - [JS] ESM Tree-Shaking produces broken code
+* [ARROW-12700](https://issues.apache.org/jira/browse/ARROW-12700) - [R] Read/Write\_feather stuck forever after bad write, R, Win32
+* [ARROW-12837](https://issues.apache.org/jira/browse/ARROW-12837) - [C++] Array::ToString() segfaults with null buffer.
+* [ARROW-13134](https://issues.apache.org/jira/browse/ARROW-13134) - [C++] SSL-related arrow-s3fs-test failures with aws-sdk-cpp 1.9.51
+* [ARROW-13151](https://issues.apache.org/jira/browse/ARROW-13151) - [Python] Unable to read single child field of struct column from Parquet
+* [ARROW-13198](https://issues.apache.org/jira/browse/ARROW-13198) - [C++][Dataset] Async scanner occasionally segfaulting in CI
+* [ARROW-13293](https://issues.apache.org/jira/browse/ARROW-13293) - [R] open\_dataset followed by collect hangs (while compute works)
+* [ARROW-13304](https://issues.apache.org/jira/browse/ARROW-13304) - [C++] Unable to install nightly on Ubuntu 21.04 due to day of week options
+* [ARROW-13336](https://issues.apache.org/jira/browse/ARROW-13336) - [Doc][Python] make clean doesn't clean up "generated" documentation
+* [ARROW-13422](https://issues.apache.org/jira/browse/ARROW-13422) - [R] Clarify README about S3 support on Windows
+* [ARROW-13424](https://issues.apache.org/jira/browse/ARROW-13424) - [C++] conda-forge benchmark library rejected
+* [ARROW-13425](https://issues.apache.org/jira/browse/ARROW-13425) - [Dev][Archery] Archery import pandas which imports pyarrow
+* [ARROW-13429](https://issues.apache.org/jira/browse/ARROW-13429) - [C++][Gandiva] Gandiva crashes when compiling If-else expression with binary type
+* [ARROW-13430](https://issues.apache.org/jira/browse/ARROW-13430) - [Integration][Go] Various errors in the integration tests
+* [ARROW-13436](https://issues.apache.org/jira/browse/ARROW-13436) - [Python][Doc] Clarify what should be expected if read\_table is passed an empty list of columns
+* [ARROW-13437](https://issues.apache.org/jira/browse/ARROW-13437) - [C++] Slice of FixedSizeList fails ValidateFull
+* [ARROW-13441](https://issues.apache.org/jira/browse/ARROW-13441) - [CSV] Streaming reader conversion should skip empty blocks
+* [ARROW-13443](https://issues.apache.org/jira/browse/ARROW-13443) - [C++] Fix the incorrect mapping from flatbuf::MetadataVersion to arrow::ipc::MetadataVersion
+* [ARROW-13445](https://issues.apache.org/jira/browse/ARROW-13445) - [Java][Packaging] Fix artifact patterns for the Java jars
+* [ARROW-13446](https://issues.apache.org/jira/browse/ARROW-13446) - [Release] Fix verification on amazon linux
+* [ARROW-13447](https://issues.apache.org/jira/browse/ARROW-13447) - [Release] Verification script for arm64 and universal2 macOS wheels
+* [ARROW-13450](https://issues.apache.org/jira/browse/ARROW-13450) - [Python][Packaging] Set deployment target to 10.13 for universal2 wheels
+* [ARROW-13469](https://issues.apache.org/jira/browse/ARROW-13469) - [C++] Suppress -Wmissing-field-initializers in DayMilliseconds arrow/type.h
+* [ARROW-13474](https://issues.apache.org/jira/browse/ARROW-13474) - [C++][Python] PyArrow crash when filter/take empty Extension array
+* [ARROW-13477](https://issues.apache.org/jira/browse/ARROW-13477) - [Release] Pass ARTIFACTORY\_API\_KEY to the upload script
+* [ARROW-13484](https://issues.apache.org/jira/browse/ARROW-13484) - [Release] Packages not available for Amazon Linux 2
+* [ARROW-13490](https://issues.apache.org/jira/browse/ARROW-13490) - [R] [CI] Need to gate duckdb examples on duckdb version
+* [ARROW-13492](https://issues.apache.org/jira/browse/ARROW-13492) - [R] [CI] Move r tools 35 build back to per-commit/pre-PR
+* [ARROW-13493](https://issues.apache.org/jira/browse/ARROW-13493) - [C++] Anonymous structs in an anonymous union are a GNU extension
+* [ARROW-13495](https://issues.apache.org/jira/browse/ARROW-13495) - [C++] UBSAN error in BitUtil when writing dataset
+* [ARROW-13496](https://issues.apache.org/jira/browse/ARROW-13496) - [CI][R] Repair r-sanitizer job
+* [ARROW-13497](https://issues.apache.org/jira/browse/ARROW-13497) - [C++][R] FunctionOptions not used by aggregation nodes
+* [ARROW-13499](https://issues.apache.org/jira/browse/ARROW-13499) - [R] Aggregation on expression doesn't NSE correctly
+* [ARROW-13500](https://issues.apache.org/jira/browse/ARROW-13500) - [C++] warning: unrecognized command line option '-Wno-unknown-warning-option' when building with gcc 9.3
+* [ARROW-13504](https://issues.apache.org/jira/browse/ARROW-13504) - [Python] It is impossible to skip s3 or hdfs tests with pytest markers
+* [ARROW-13507](https://issues.apache.org/jira/browse/ARROW-13507) - [R] LTO job on CRAN fails
+* [ARROW-13509](https://issues.apache.org/jira/browse/ARROW-13509) - [C++] Take compute function should pass through ChunkedArray type to handle empty input arrays
+* [ARROW-13522](https://issues.apache.org/jira/browse/ARROW-13522) - [C++] Regression with compute \`utf8\_\*trim\` functions on macOS.
+* [ARROW-13523](https://issues.apache.org/jira/browse/ARROW-13523) - Unified the test case name
+* [ARROW-13524](https://issues.apache.org/jira/browse/ARROW-13524) - [C++] Fix description for ApplicationVersion::VersionEq
+* [ARROW-13529](https://issues.apache.org/jira/browse/ARROW-13529) - Too many releases in IPC writer when writing slices
+* [ARROW-13538](https://issues.apache.org/jira/browse/ARROW-13538) - [R] [CI] Don't test DuckDB in the minimal build
+* [ARROW-13543](https://issues.apache.org/jira/browse/ARROW-13543) - [R] Handle summarize() with 0 arguments or no aggregate functions
+* [ARROW-13556](https://issues.apache.org/jira/browse/ARROW-13556) - [C++] on Ubuntu 21.04 with system libs flight is not linked against libprotobuf
+* [ARROW-13559](https://issues.apache.org/jira/browse/ARROW-13559) - [CI][C++] test-conda-cpp-valgrind nightly build failure
+* [ARROW-13560](https://issues.apache.org/jira/browse/ARROW-13560) - [R] Allow Scanner$create() to accept filter / project even with arrow\_dplyr\_querys
+* [ARROW-13580](https://issues.apache.org/jira/browse/ARROW-13580) - [C++] quoted\_strings\_can\_be\_null only applied to string columns
+* [ARROW-13597](https://issues.apache.org/jira/browse/ARROW-13597) - [C++] [R] ExecNode factory named source not present in registry
+* [ARROW-13600](https://issues.apache.org/jira/browse/ARROW-13600) - [C++] Maybe uninitialized warnings
+* [ARROW-13602](https://issues.apache.org/jira/browse/ARROW-13602) - [C++] Tests dereferencing type-punned pointer compiler warnings
+* [ARROW-13603](https://issues.apache.org/jira/browse/ARROW-13603) - [GLib] GARROW\_VERSION\_CHECK() always returns false
+* [ARROW-13605](https://issues.apache.org/jira/browse/ARROW-13605) - [C++] Data race in GroupByNode found by ThreadSanitizer
+* [ARROW-13608](https://issues.apache.org/jira/browse/ARROW-13608) - [R] symbol initialization appears to be depending on undefined behavior
+* [ARROW-13611](https://issues.apache.org/jira/browse/ARROW-13611) - [C++] Scanning datasets does not enforce back pressure
+* [ARROW-13624](https://issues.apache.org/jira/browse/ARROW-13624) - [R] readr short type mapping has T and t backwards
+* [ARROW-13628](https://issues.apache.org/jira/browse/ARROW-13628) - [Format] Add MonthDayNano interval type.
+* [ARROW-13630](https://issues.apache.org/jira/browse/ARROW-13630) - [CI][C++] Travis s390x CI job is failing and blocks endianness related code verification
+* [ARROW-13632](https://issues.apache.org/jira/browse/ARROW-13632) - [Python] Filter mask is always applied to elements at the start of FixedSizeListArray when filtering a slice
+* [ARROW-13638](https://issues.apache.org/jira/browse/ARROW-13638) - [C++][R] GroupByNode accesses FunctionOptions after Init/ExecNode\_Aggregate keep\_alives aren't kept alive
+* [ARROW-13639](https://issues.apache.org/jira/browse/ARROW-13639) - [C++] Concatenate with an empty dictionary segfaults (ASan failure in TestFilterKernelWithString/0.FilterDictionary)
+* [ARROW-13654](https://issues.apache.org/jira/browse/ARROW-13654) - [C++][Parquet] Appending a FileMetaData object to itselfs explodes memory
+* [ARROW-13655](https://issues.apache.org/jira/browse/ARROW-13655) - [C++][Parquet] Reading large Parquet file can give "MaxMessageSize reached" error with Thrift 0.14
+* [ARROW-13662](https://issues.apache.org/jira/browse/ARROW-13662) - [CI] Failing test test\_extract\_datetime\_components with pandas 0.24
+* [ARROW-13662](https://issues.apache.org/jira/browse/ARROW-13662) - [CI] Failing test test\_extract\_datetime\_components with pandas 0.24
+* [ARROW-13669](https://issues.apache.org/jira/browse/ARROW-13669) - [C++] Variant emplace methods appear to be missing curly braces.
+* [ARROW-13671](https://issues.apache.org/jira/browse/ARROW-13671) - [Dev] Fix conda recipe on Arm 64K page system
+* [ARROW-13676](https://issues.apache.org/jira/browse/ARROW-13676) - [C++] Coredump writing Arrow table to Parquet file
+* [ARROW-13681](https://issues.apache.org/jira/browse/ARROW-13681) - [C++] list\_parent\_indices only computes for first chunk
+* [ARROW-13685](https://issues.apache.org/jira/browse/ARROW-13685) - [C++] Cannot write dataset to S3FileSystem if bucket already exists
+* [ARROW-13689](https://issues.apache.org/jira/browse/ARROW-13689) - [C\#] Initial C\# Integration Tests
+* [ARROW-13694](https://issues.apache.org/jira/browse/ARROW-13694) - [R] Arrow filter crashes (R aborted session)
+* [ARROW-13743](https://issues.apache.org/jira/browse/ARROW-13743) - [CI] OSX job fails due to incompatible git and libcurl
+* [ARROW-13744](https://issues.apache.org/jira/browse/ARROW-13744) - [CI] c++14 and 17 nightly job fails
+* [ARROW-13747](https://issues.apache.org/jira/browse/ARROW-13747) - [CI][C++] s3fs test failed in conda-python-pandas nightly job
+* [ARROW-13755](https://issues.apache.org/jira/browse/ARROW-13755) - [Python] Allow usage of field\_names in partitioning when saving datasets
+* [ARROW-13761](https://issues.apache.org/jira/browse/ARROW-13761) - [R] arrow::filter() crashes (aborts R session)
+* [ARROW-13784](https://issues.apache.org/jira/browse/ARROW-13784) - [Python] Table.from\_arrays should raise an error when array is empty but names is not
+* [ARROW-13786](https://issues.apache.org/jira/browse/ARROW-13786) - [R] [CI] Don't fail the RCHK build if arrow doesn't build
+* [ARROW-13788](https://issues.apache.org/jira/browse/ARROW-13788) - [C++] Temporal component extraction functions don't support date32/64
+* [ARROW-13792](https://issues.apache.org/jira/browse/ARROW-13792) - [Java] The toString representation is incorrect for unsigned integer vectors
+* [ARROW-13799](https://issues.apache.org/jira/browse/ARROW-13799) - [R] case\_when error handling is capturing strings
+* [ARROW-13800](https://issues.apache.org/jira/browse/ARROW-13800) - [R] Use divide instead of divide\_checked
+* [ARROW-13812](https://issues.apache.org/jira/browse/ARROW-13812) - [C++] Valgrind failure in Grouper.BooleanKey (uninitialized values)
+* [ARROW-13814](https://issues.apache.org/jira/browse/ARROW-13814) - [CI] Nightly integration build with spark master failing to compile spark
+* [ARROW-13819](https://issues.apache.org/jira/browse/ARROW-13819) - [C++] Build fails with "'subseconds' may be used uninitialized in this function"
+* [ARROW-13846](https://issues.apache.org/jira/browse/ARROW-13846) - [C++] Fix crashes on invalid IPC file (OSS-Fuzz)
+* [ARROW-13850](https://issues.apache.org/jira/browse/ARROW-13850) - [C++] Fix crashes on invalid Parquet file (OSS-Fuzz)
+* [ARROW-13860](https://issues.apache.org/jira/browse/ARROW-13860) - [R] arrow 5.0.0 write\_parquet throws error writing grouped data.frame
+* [ARROW-13872](https://issues.apache.org/jira/browse/ARROW-13872) - [Java] ExtensionTypeVector does not work with RangeEqualsVisitor
+* [ARROW-13876](https://issues.apache.org/jira/browse/ARROW-13876) - [C++] Uniform null handling in compute functions
+* [ARROW-13877](https://issues.apache.org/jira/browse/ARROW-13877) - [C++] Added support for fixed sized list to compute functions that process lists
+* [ARROW-13878](https://issues.apache.org/jira/browse/ARROW-13878) - [C++] Add fixed\_size\_binary support to compute functions
+* [ARROW-13880](https://issues.apache.org/jira/browse/ARROW-13880) - [C++] Compute function sort\_indices does not support timestamps with time zones
+* [ARROW-13881](https://issues.apache.org/jira/browse/ARROW-13881) - [Python] Error message says "Please use a release of Arrow Flight built with gRPC 1.27 or higher." although I'm using gRPC 1.39
+* [ARROW-13882](https://issues.apache.org/jira/browse/ARROW-13882) - [C++] Add compute function min\_max support for more types
+* [ARROW-13884](https://issues.apache.org/jira/browse/ARROW-13884) - Arrow 5.0.0 cannot compile with Typescript 4.2.2
+* [ARROW-13912](https://issues.apache.org/jira/browse/ARROW-13912) - [R] TrimOptions implementation breaks test-r-minimal-build due to dependencies
+* [ARROW-13913](https://issues.apache.org/jira/browse/ARROW-13913) - [C++] segfault if compute function index called with no options supplied
+* [ARROW-13915](https://issues.apache.org/jira/browse/ARROW-13915) - [R][CI] R UCRT C++ bundles are incomplete
+* [ARROW-13916](https://issues.apache.org/jira/browse/ARROW-13916) - [C++] Implement strftime on date32/64 types
+* [ARROW-13921](https://issues.apache.org/jira/browse/ARROW-13921) - [Python][Packaging] Pin minimum setuptools version for the macos wheels
+* [ARROW-13940](https://issues.apache.org/jira/browse/ARROW-13940) - [R] Turn on multithreading with Arrow engine queries
+* [ARROW-13961](https://issues.apache.org/jira/browse/ARROW-13961) - [C++] iso\_calendar may be uninitialized
+* [ARROW-13976](https://issues.apache.org/jira/browse/ARROW-13976) - Adapt to arm architecture CPU in hdfs\_internal.cc
+* [ARROW-13978](https://issues.apache.org/jira/browse/ARROW-13978) - [C++] Bump gtest to 1.11 to unbreak builds with recent clang
+* [ARROW-13981](https://issues.apache.org/jira/browse/ARROW-13981) - [Java] VectorSchemaRootAppender doesn't work for BitVector
+* [ARROW-13982](https://issues.apache.org/jira/browse/ARROW-13982) - [C++] Async scanner stalls if a fragment generates no batches
+* [ARROW-13983](https://issues.apache.org/jira/browse/ARROW-13983) - [C++] fcntl(..., F\_RDADVISE, ...) may fail on macOS with NFS mount
+* [ARROW-13996](https://issues.apache.org/jira/browse/ARROW-13996) - [Go][Parquet] Fix file offsets for row groups
+* [ARROW-13997](https://issues.apache.org/jira/browse/ARROW-13997) - [C++] restore exec node based query performance
+* [ARROW-14001](https://issues.apache.org/jira/browse/ARROW-14001) - [Go] AppendBooleans in BitmapWriter is broken
+* [ARROW-14004](https://issues.apache.org/jira/browse/ARROW-14004) - [Python] to\_pandas() converts to float instead of using pandas nullable types
+* [ARROW-14014](https://issues.apache.org/jira/browse/ARROW-14014) - FlightClient.ClientStreamListener not notified on error when parsing invalid trailers
+* [ARROW-14017](https://issues.apache.org/jira/browse/ARROW-14017) - [C++] NULLPTR is not included in type\_fwd.h
+* [ARROW-14020](https://issues.apache.org/jira/browse/ARROW-14020) - [R] Writing datafames with list columns is slow and scales poorly with nesting level
+* [ARROW-14024](https://issues.apache.org/jira/browse/ARROW-14024) - [C++] ScanOptions::batch\_size not respected in parquet/IPC readers
+* [ARROW-14026](https://issues.apache.org/jira/browse/ARROW-14026) - [C++] Batch readahead not working correctly in Parquet scanner
+* [ARROW-14027](https://issues.apache.org/jira/browse/ARROW-14027) - [C++][R] Ensure groupers accept scalar inputs (was: Allow me to group\_by + summarise() with partitioning fields)
+* [ARROW-14040](https://issues.apache.org/jira/browse/ARROW-14040) - [C++] Spurious test failure in ScanNode.MinimalGroupedAggEndToEnd
+* [ARROW-14053](https://issues.apache.org/jira/browse/ARROW-14053) - [C++] AsyncReaderTests.InvalidRowsSkipped is flaky
+* [ARROW-14057](https://issues.apache.org/jira/browse/ARROW-14057) - [C++] Bump aws-c-common version
+* [ARROW-14063](https://issues.apache.org/jira/browse/ARROW-14063) - [R] open\_dataset() does not work on CSVs without header rows
+* [ARROW-14076](https://issues.apache.org/jira/browse/ARROW-14076) - Unable to use \`red-arrow\` gem on Heroku/Ubuntu 20.04 (focal)
+* [ARROW-14090](https://issues.apache.org/jira/browse/ARROW-14090) - [C++][Parquet] rows\_written\_ should be int64\_t instead of int
+* [ARROW-14103](https://issues.apache.org/jira/browse/ARROW-14103) - [R] [C++] Allow min/max in grouped aggregation
+* [ARROW-14109](https://issues.apache.org/jira/browse/ARROW-14109) - Segfault When Reading JSON With Duplicate Keys
+* [ARROW-14124](https://issues.apache.org/jira/browse/ARROW-14124) - [R] Timezone support in R <= 3.4
+* [ARROW-14129](https://issues.apache.org/jira/browse/ARROW-14129) - [C++] An empty dictionary array crashes on \`unique\` and \`value\_counts\`.
+* [ARROW-14139](https://issues.apache.org/jira/browse/ARROW-14139) - [IR] [C++] Table flatbuffer object fails to compile on older GCCs
+* [ARROW-14141](https://issues.apache.org/jira/browse/ARROW-14141) - [IR] [C++] Join missing from RelationImpl
+* [ARROW-14156](https://issues.apache.org/jira/browse/ARROW-14156) - [C++] StructArray::Flatten is incorrect in some cases
+* [ARROW-14162](https://issues.apache.org/jira/browse/ARROW-14162) - [R] Simple arrange %\>% head does not respect ordering
+* [ARROW-14173](https://issues.apache.org/jira/browse/ARROW-14173) - [IR] Allow typed null literals to be represented
+* [ARROW-14179](https://issues.apache.org/jira/browse/ARROW-14179) - [C++] Import/Export of UnionArray in C data interface has wrong buffer count
+* [ARROW-14192](https://issues.apache.org/jira/browse/ARROW-14192) - [C++][Dataset] Backpressure broken on ordered scans
+* [ARROW-14195](https://issues.apache.org/jira/browse/ARROW-14195) - [R] Fix ExecPlan binding annotations
+* [ARROW-14197](https://issues.apache.org/jira/browse/ARROW-14197) - [C++] Hashjoin + datasets hanging
+* [ARROW-14200](https://issues.apache.org/jira/browse/ARROW-14200) - [R] strftime on a date should not use or be confused by timezones
+* [ARROW-14203](https://issues.apache.org/jira/browse/ARROW-14203) - [C++] Fix description of ExecBatch.length for Scalars in aggregate kernels
+* [ARROW-14204](https://issues.apache.org/jira/browse/ARROW-14204) - [C++] Fails to compile Arrow without RE2 due to missing ifdef guard in MatchLike
+* [ARROW-14206](https://issues.apache.org/jira/browse/ARROW-14206) - [Go] Fix Build for ARM and s390x
+* [ARROW-14206](https://issues.apache.org/jira/browse/ARROW-14206) - [Go] Fix Build for ARM and s390x
+* [ARROW-14208](https://issues.apache.org/jira/browse/ARROW-14208) - [C++] Build errors with Visual Studio 2019
+* [ARROW-14210](https://issues.apache.org/jira/browse/ARROW-14210) - [C++] CMAKE\_AR is not passed to bzip2 thirdparty dependency
+* [ARROW-14211](https://issues.apache.org/jira/browse/ARROW-14211) - [C++] Valgrind and TSAN errors in arrow-compute-hash-join-node-test
+* [ARROW-14214](https://issues.apache.org/jira/browse/ARROW-14214) - [Python][CI] wheel-windows-cp36-amd64 nightly build failure
+* [ARROW-14216](https://issues.apache.org/jira/browse/ARROW-14216) - [R] Disable auto-cleaning of duckdb tables
+* [ARROW-14219](https://issues.apache.org/jira/browse/ARROW-14219) - [R] [CI] DuckDB valgrind failure
+* [ARROW-14220](https://issues.apache.org/jira/browse/ARROW-14220) - [C++] Missing ending quote in thirdpartyversions
+* [ARROW-14221](https://issues.apache.org/jira/browse/ARROW-14221) - [R] [CI] DuckDB tests fail on R < 4.0
+* [ARROW-14223](https://issues.apache.org/jira/browse/ARROW-14223) - [C++] Add google\_cloud\_cpp\_storage to ARROW\_THIRDPARTY\_DEPENDENCIES
+* [ARROW-14224](https://issues.apache.org/jira/browse/ARROW-14224) - [R] [CI] R sanitizer build failing
+* [ARROW-14226](https://issues.apache.org/jira/browse/ARROW-14226) - [R] Handle n\_distinct() with args != 1
+* [ARROW-14237](https://issues.apache.org/jira/browse/ARROW-14237) - [R] [CI] Disable altrep in R <= 3.5
+* [ARROW-14240](https://issues.apache.org/jira/browse/ARROW-14240) - [C++] nlohmann\_json\_ep always rebuilt
+* [ARROW-14246](https://issues.apache.org/jira/browse/ARROW-14246) - [C++] find\_package(CURL) in build\_google\_cloud\_cpp\_storage fails
+* [ARROW-14247](https://issues.apache.org/jira/browse/ARROW-14247) - [C++] Valgrind error in parquet-arrow-test
+* [ARROW-14249](https://issues.apache.org/jira/browse/ARROW-14249) - [R] Slow down in dataframe-to-table benchmark
+* [ARROW-14252](https://issues.apache.org/jira/browse/ARROW-14252) - [R] Partial matching of arguments warning
+* [ARROW-14255](https://issues.apache.org/jira/browse/ARROW-14255) - [Python] FlightClient.do\_action is a generator instead of returning one.
+* [ARROW-14257](https://issues.apache.org/jira/browse/ARROW-14257) - [Doc][Python] dataset doc build fails
+* [ARROW-14260](https://issues.apache.org/jira/browse/ARROW-14260) - [C++] GTest linker error with vcpkg and Visual Studio 2019
+* [ARROW-14283](https://issues.apache.org/jira/browse/ARROW-14283) - [C++][CI] LLVM 13 cannot be used on macOS GHA builds
+* [ARROW-14285](https://issues.apache.org/jira/browse/ARROW-14285) - [C++] Fix crashes when pretty-printing data from valid IPC file (OSS-Fuzz)
+* [ARROW-14299](https://issues.apache.org/jira/browse/ARROW-14299) - [Dev][CI] "linux-apt-r" dockerfile reinstalls Minio
+* [ARROW-14300](https://issues.apache.org/jira/browse/ARROW-14300) - [R][CI] "test-r-gcc-11" nightly build failure
+* [ARROW-14301](https://issues.apache.org/jira/browse/ARROW-14301) - [C++][CI] "test-ubuntu-20.04-cpp-17" nightly build crash in GCSFS test
+* [ARROW-14302](https://issues.apache.org/jira/browse/ARROW-14302) - [C++] Valgrind errors
+* [ARROW-14305](https://issues.apache.org/jira/browse/ARROW-14305) - [C++] Valgrind errors in arrow-compute-hash-join-node-test
+* [ARROW-14307](https://issues.apache.org/jira/browse/ARROW-14307) - [R] crashes when reading empty feather with POSIXct column
+* [ARROW-14313](https://issues.apache.org/jira/browse/ARROW-14313) - [Doc][Dev] Installation instructions for Archery incomplete
+* [ARROW-14321](https://issues.apache.org/jira/browse/ARROW-14321) - [R] segfault converting dictionary ChunkedArray with 0 chunks
+* [ARROW-14340](https://issues.apache.org/jira/browse/ARROW-14340) - [C++] Fix xsimd build error on apple m1
+* [ARROW-14370](https://issues.apache.org/jira/browse/ARROW-14370) - [C++] ASAN CI job failed
+* [ARROW-14373](https://issues.apache.org/jira/browse/ARROW-14373) - [Packaging][Java] Missing LLVM dependency in the macOS java-jars build
+* [ARROW-14377](https://issues.apache.org/jira/browse/ARROW-14377) - [Packaging][Python] Python 3.9 installation fails in macOS wheel build
+* [ARROW-14381](https://issues.apache.org/jira/browse/ARROW-14381) - [CI][Python] Spark integration failures
+* [ARROW-14382](https://issues.apache.org/jira/browse/ARROW-14382) - [C++][Compute] Remove duplicate ThreadIndexer definition
+* [ARROW-14392](https://issues.apache.org/jira/browse/ARROW-14392) - [C++] Bundled gRPC misses bundled Abseil include path
+* [ARROW-14393](https://issues.apache.org/jira/browse/ARROW-14393) - [C++] GTest linking errors during the source release verification
+* [ARROW-14397](https://issues.apache.org/jira/browse/ARROW-14397) - [C++] Fix valgrind error in test utility
+* [ARROW-14406](https://issues.apache.org/jira/browse/ARROW-14406) - [Python][CI] Nightly dask integration jobs fail
+* [ARROW-14411](https://issues.apache.org/jira/browse/ARROW-14411) - [Release][Integration] Go integration tests fail for 6.0.0-RC1
+* [ARROW-14417](https://issues.apache.org/jira/browse/ARROW-14417) - [R] Joins ignore projection on left dataset
+* [ARROW-14423](https://issues.apache.org/jira/browse/ARROW-14423) - [Python] Fix version constraints in pyproject.toml
+* [ARROW-14424](https://issues.apache.org/jira/browse/ARROW-14424) - [Packaging][Python] Disable windows wheel testing for python 3.6
+* [ARROW-14434](https://issues.apache.org/jira/browse/ARROW-14434) - R crashes when making an empty selection for Datasets with DateTime
+* [PARQUET-2067](https://issues.apache.org/jira/browse/PARQUET-2067) - [C++] null\_count and num\_nulls incorrect for repeated columns
+* [PARQUET-2089](https://issues.apache.org/jira/browse/PARQUET-2089) - [C++] RowGroupMetaData file\_offset set incorrectly
+
+
+## New Features and Improvements
+
+* [ARROW-1565](https://issues.apache.org/jira/browse/ARROW-1565) - [C++][Compute] Implement TopK/BottomK
+* [ARROW-1568](https://issues.apache.org/jira/browse/ARROW-1568) - [C++] Implement "drop null" kernels that return array without nulls
+* [ARROW-4333](https://issues.apache.org/jira/browse/ARROW-4333) - [C++] Sketch out design for kernels and "query" execution in compute layer
+* [ARROW-4700](https://issues.apache.org/jira/browse/ARROW-4700) - [C++] Add DecimalType support to arrow::json::TableReader
+* [ARROW-5002](https://issues.apache.org/jira/browse/ARROW-5002) - [C++] Implement Hash Aggregation query execution node
+* [ARROW-5244](https://issues.apache.org/jira/browse/ARROW-5244) - [C++] Review experimental / unstable APIs
+* [ARROW-6072](https://issues.apache.org/jira/browse/ARROW-6072) - [C++] Implement casting List <-\> LargeList
+* [ARROW-6607](https://issues.apache.org/jira/browse/ARROW-6607) - [Python] Support for set/list columns when converting from Pandas
+* [ARROW-6626](https://issues.apache.org/jira/browse/ARROW-6626) - [Python] Handle nested "set" values as lists when converting to Arrow
+* [ARROW-6870](https://issues.apache.org/jira/browse/ARROW-6870) - [C\#] Add Support for Dictionary Arrays and Dictionary Encoding
+* [ARROW-7102](https://issues.apache.org/jira/browse/ARROW-7102) - [Python] Make filesystems compatible with fsspec
+* [ARROW-7179](https://issues.apache.org/jira/browse/ARROW-7179) - [C++][Compute] Consolidate fill\_null and coalesce
+* [ARROW-7901](https://issues.apache.org/jira/browse/ARROW-7901) - [Integration][Go] Add null type (and integration test)
+* [ARROW-8022](https://issues.apache.org/jira/browse/ARROW-8022) - [C++] Provide or Vendor a small\_vector implementation
+* [ARROW-8147](https://issues.apache.org/jira/browse/ARROW-8147) - [C++] Add google-cloud-cpp to ThirdpartyToolchain
+* [ARROW-8379](https://issues.apache.org/jira/browse/ARROW-8379) - [R] Investigate/fix thread safety issues (esp. Windows)
+* [ARROW-8621](https://issues.apache.org/jira/browse/ARROW-8621) - [Release][Go] Add Module support by creating tags
+* [ARROW-8780](https://issues.apache.org/jira/browse/ARROW-8780) - [Python] A fsspec-compatible wrapper for pyarrow.fs filesystems
+* [ARROW-8928](https://issues.apache.org/jira/browse/ARROW-8928) - [C++] Measure microperformance associated with ExecBatchIterator
+* [ARROW-9226](https://issues.apache.org/jira/browse/ARROW-9226) - [Python] pyarrow.fs.HadoopFileSystem - retrieve options from core-site.xml or hdfs-site.xml if available
+* [ARROW-9434](https://issues.apache.org/jira/browse/ARROW-9434) - [C++] Store type\_code information in UnionScalar::value
+* [ARROW-9719](https://issues.apache.org/jira/browse/ARROW-9719) - [Doc][Python] Better document the new pa.fs.HadoopFileSystem
+* [ARROW-10094](https://issues.apache.org/jira/browse/ARROW-10094) - [Python][Doc] Update pandas doc
+* [ARROW-10415](https://issues.apache.org/jira/browse/ARROW-10415) - [R] Support for dplyr::distinct()
+* [ARROW-10898](https://issues.apache.org/jira/browse/ARROW-10898) - [C++] Investigate Table sort performance
+* [ARROW-11238](https://issues.apache.org/jira/browse/ARROW-11238) - [Python] Make SubTreeFileSystem print method more informative
+* [ARROW-11243](https://issues.apache.org/jira/browse/ARROW-11243) - [C++] Parse time32 from string and infer in CSV reader
+* [ARROW-11460](https://issues.apache.org/jira/browse/ARROW-11460) - [R] Use system libraries if present on Linux
+* [ARROW-11691](https://issues.apache.org/jira/browse/ARROW-11691) - [Developer][CI] Provide a consolidated .env file for benchmark-relevant environment variables
+* [ARROW-11748](https://issues.apache.org/jira/browse/ARROW-11748) - [C++] Ensure Decimal128 and Decimal256's fields are in native endian order
+* [ARROW-11828](https://issues.apache.org/jira/browse/ARROW-11828) - [C++] Expose CSVWriter object in api
+* [ARROW-11885](https://issues.apache.org/jira/browse/ARROW-11885) - [R] Turn off some capabilities when LIBARROW\_MINIMAL=true
+* [ARROW-11981](https://issues.apache.org/jira/browse/ARROW-11981) - [C++][Dataset][Compute] Replace UnionDataset with Union ExecNode
+* [ARROW-12063](https://issues.apache.org/jira/browse/ARROW-12063) - [C++] Add nulls position option to sort functions
+* [ARROW-12181](https://issues.apache.org/jira/browse/ARROW-12181) - [C++][R] The "CSV dataset" in test-dataset.R is failing on RTools 3.5
+* [ARROW-12216](https://issues.apache.org/jira/browse/ARROW-12216) - [R] Proactively disable multithreading on RTools3.5 (32bit?)
+* [ARROW-12359](https://issues.apache.org/jira/browse/ARROW-12359) - [C++] Deprecate or remove FileSystem::OpenAppendStream
+* [ARROW-12388](https://issues.apache.org/jira/browse/ARROW-12388) - [C++][Gandiva] Implement cast numbers from varbinary functions in gandiva
+* [ARROW-12410](https://issues.apache.org/jira/browse/ARROW-12410) - [C++][Gandiva] Implement regexp\_replace function on Gandiva
+* [ARROW-12479](https://issues.apache.org/jira/browse/ARROW-12479) - [C++][Gandiva] Implement castBigInt, castInt, castIntervalDay and castIntervalYear extra functions
+* [ARROW-12563](https://issues.apache.org/jira/browse/ARROW-12563) - Add space,add\_months and datediff functions for string
+* [ARROW-12615](https://issues.apache.org/jira/browse/ARROW-12615) - [C++] Add options for handling NAs to stddev and variance
+* [ARROW-12650](https://issues.apache.org/jira/browse/ARROW-12650) - [Doc][Python] Improve documentation regarding dealing with memory mapped files
+* [ARROW-12657](https://issues.apache.org/jira/browse/ARROW-12657) - [C++][Python][Compute] String hex to numeric conversion and bit shifting
+* [ARROW-12669](https://issues.apache.org/jira/browse/ARROW-12669) - [C++] Kernel to return Array of elements at index of list in ListArray
+* [ARROW-12673](https://issues.apache.org/jira/browse/ARROW-12673) - [C++] Configure a custom handler for rows with incorrect column counts
+* [ARROW-12688](https://issues.apache.org/jira/browse/ARROW-12688) - [R] Use DuckDB to query an Arrow Dataset
+* [ARROW-12714](https://issues.apache.org/jira/browse/ARROW-12714) - [C++] String title case kernel
+* [ARROW-12725](https://issues.apache.org/jira/browse/ARROW-12725) - [C++][Compute] GroupBy: improve performance by encoding keys in row format only when they are inserted into hash table
+* [ARROW-12728](https://issues.apache.org/jira/browse/ARROW-12728) - [C++][Compute] Implement count\_distinct/distinct hash aggregate kernels
+* [ARROW-12744](https://issues.apache.org/jira/browse/ARROW-12744) - [C++][Compute] Add rounding kernel
+* [ARROW-12759](https://issues.apache.org/jira/browse/ARROW-12759) - [C++][Compute] Wrap grouped aggregation in an ExecNode
+* [ARROW-12763](https://issues.apache.org/jira/browse/ARROW-12763) - [R] Optimize dplyr queries that use head/tail after arrange
+* [ARROW-12846](https://issues.apache.org/jira/browse/ARROW-12846) - [Release] Improve upload of binaries
+* [ARROW-12866](https://issues.apache.org/jira/browse/ARROW-12866) - [C++][Gandiva] Implement STRPOS function on Gandiva
+* [ARROW-12871](https://issues.apache.org/jira/browse/ARROW-12871) - [R] upgrade to testthat 3e
+* [ARROW-12876](https://issues.apache.org/jira/browse/ARROW-12876) - [R] Fix build flags on Raspberry Pi
+* [ARROW-12944](https://issues.apache.org/jira/browse/ARROW-12944) - [C++] String capitalize kernel
+* [ARROW-12946](https://issues.apache.org/jira/browse/ARROW-12946) - [C++] String swap case kernel
+* [ARROW-12953](https://issues.apache.org/jira/browse/ARROW-12953) - [C++][Compute] Refactor CheckScalar\* to take Datum arguments
+* [ARROW-12959](https://issues.apache.org/jira/browse/ARROW-12959) - [C++][R] Option for is\_null(NaN) to evaluate to true
+* [ARROW-12965](https://issues.apache.org/jira/browse/ARROW-12965) - [Java] Java implementation of Arrow C data interface
+* [ARROW-12980](https://issues.apache.org/jira/browse/ARROW-12980) - [C++] Kernels to extract datetime components should be timezone aware
+* [ARROW-12981](https://issues.apache.org/jira/browse/ARROW-12981) - [R] Install source package from CRAN alone
+* [ARROW-13033](https://issues.apache.org/jira/browse/ARROW-13033) - [C++] Kernel to localize naive timestamps to a timezone (preserving clock-time)
+* [ARROW-13056](https://issues.apache.org/jira/browse/ARROW-13056) - [Dev][MATLAB] Expand PR labeler for supported language
+* [ARROW-13067](https://issues.apache.org/jira/browse/ARROW-13067) - [C++][Compute] Implement integer to decimal cast
+* [ARROW-13089](https://issues.apache.org/jira/browse/ARROW-13089) - [Python] Allow creating RecordBatch from Python dict
+* [ARROW-13112](https://issues.apache.org/jira/browse/ARROW-13112) - [R] altrep vectors for strings and other types
+* [ARROW-13132](https://issues.apache.org/jira/browse/ARROW-13132) - [C++] Add Scalar validation
+* [ARROW-13138](https://issues.apache.org/jira/browse/ARROW-13138) - [C++] Implement kernel to extract datetime components (year, month, day, etc) from date type objects
+* [ARROW-13141](https://issues.apache.org/jira/browse/ARROW-13141) - [C++][Python] HadoopFileSystem: automatically set CLASSPATH based on HADOOP\_HOME env variable?
+* [ARROW-13163](https://issues.apache.org/jira/browse/ARROW-13163) - [C++][Gandiva] Implement REPEAT function on Gandiva
+* [ARROW-13164](https://issues.apache.org/jira/browse/ARROW-13164) - [R] altrep vectors from Array with nulls
+* [ARROW-13172](https://issues.apache.org/jira/browse/ARROW-13172) - [Java] Make TYPE\_WIDTH in Vector public
+* [ARROW-13174](https://issues.apache.org/jira/browse/ARROW-13174) - [C++][Compute] Add strftime kernel
+* [ARROW-13202](https://issues.apache.org/jira/browse/ARROW-13202) - [MATLAB] Enable GitHub Actions CI for MATLAB Interface on Linux
+* [ARROW-13218](https://issues.apache.org/jira/browse/ARROW-13218) - [Doc] Document/clarify conventions for timestamp storage
+* [ARROW-13220](https://issues.apache.org/jira/browse/ARROW-13220) - [C++] Add a 'choose' kernel/scalar compute function
+* [ARROW-13222](https://issues.apache.org/jira/browse/ARROW-13222) - [C++] Support variable-width types in case\_when function
+* [ARROW-13227](https://issues.apache.org/jira/browse/ARROW-13227) - [C++][Compute] Document ExecNode, ExecPlan
+* [ARROW-13257](https://issues.apache.org/jira/browse/ARROW-13257) - [Java][Dataset] Allow passing empty columns for projection
+* [ARROW-13260](https://issues.apache.org/jira/browse/ARROW-13260) - [Doc] Host different released versions of the documentation + version switcher
+* [ARROW-13268](https://issues.apache.org/jira/browse/ARROW-13268) - [C++][Compute] Add ExecNode for semi and anti-semi join
+* [ARROW-13279](https://issues.apache.org/jira/browse/ARROW-13279) - [R] Use C++ DayOfWeekOptions in wday implementation instead of manually calculating via Expression
+* [ARROW-13287](https://issues.apache.org/jira/browse/ARROW-13287) - [C++] [Dataset] FileSystemDataset::Write should use an async scan
+* [ARROW-13295](https://issues.apache.org/jira/browse/ARROW-13295) - [C++] Implement hash\_aggregate mean/stdev/variance kernels
+* [ARROW-13298](https://issues.apache.org/jira/browse/ARROW-13298) - [C++] Implement hash\_aggregate any/all Boolean kernels
+* [ARROW-13307](https://issues.apache.org/jira/browse/ARROW-13307) - [C++] Remove reflection-based enums (was: Use reflection-based enums for compute options)
+* [ARROW-13311](https://issues.apache.org/jira/browse/ARROW-13311) - [C++][Documentation] List hash aggregate kernels somewhere
+* [ARROW-13317](https://issues.apache.org/jira/browse/ARROW-13317) - [Python] Improve documentation on what 'use\_threads' does in 'read\_feather'
+* [ARROW-13326](https://issues.apache.org/jira/browse/ARROW-13326) - [R] [Archery] Add linting to dev CI
+* [ARROW-13327](https://issues.apache.org/jira/browse/ARROW-13327) - [Python] Improve consistency of explicit C++ types in PyArrow files
+* [ARROW-13330](https://issues.apache.org/jira/browse/ARROW-13330) - [Go][Parquet] Add Encoding Package Part 2
+* [ARROW-13344](https://issues.apache.org/jira/browse/ARROW-13344) - [R] Initial bindings for ExecPlan/ExecNode
+* [ARROW-13345](https://issues.apache.org/jira/browse/ARROW-13345) - [C++] Implement logN compute function
+* [ARROW-13358](https://issues.apache.org/jira/browse/ARROW-13358) - [C++] Extend type support for if\_else kernel
+* [ARROW-13379](https://issues.apache.org/jira/browse/ARROW-13379) - [Dev][Docs] Improvements to archery docs
+* [ARROW-13390](https://issues.apache.org/jira/browse/ARROW-13390) - [C++] Improve type support for 'coalesce' kernel
+* [ARROW-13397](https://issues.apache.org/jira/browse/ARROW-13397) - [R] Update arrow.Rmd vignette
+* [ARROW-13399](https://issues.apache.org/jira/browse/ARROW-13399) - [R] Update dataset.Rmd vignette
+* [ARROW-13402](https://issues.apache.org/jira/browse/ARROW-13402) - [R] Update flight.Rmd vignette
+* [ARROW-13403](https://issues.apache.org/jira/browse/ARROW-13403) - [R] Update developing.Rmd vignette
+* [ARROW-13404](https://issues.apache.org/jira/browse/ARROW-13404) - [Python] [Doc] Make Python landing page less coupled to the rest of arrow documentation
+* [ARROW-13405](https://issues.apache.org/jira/browse/ARROW-13405) - [Doc] Make "Libraries" the entry point for the documentation
+* [ARROW-13416](https://issues.apache.org/jira/browse/ARROW-13416) - [C++] Implement mod compute function
+* [ARROW-13420](https://issues.apache.org/jira/browse/ARROW-13420) - [JS] Update dependencies
+* [ARROW-13421](https://issues.apache.org/jira/browse/ARROW-13421) - [C++] Add functionality for reading in columns as floats from delimited files where a comma has been used as a decimal separator
+* [ARROW-13433](https://issues.apache.org/jira/browse/ARROW-13433) - [R] Remove CLI hack from Valgrind test
+* [ARROW-13434](https://issues.apache.org/jira/browse/ARROW-13434) - [R] group\_by() with an unnammed expression
+* [ARROW-13435](https://issues.apache.org/jira/browse/ARROW-13435) - [R] Add function arrow\_table() as alias for Table$create()
+* [ARROW-13444](https://issues.apache.org/jira/browse/ARROW-13444) - [C++] C++20 compatibility by updating std::result\_of to std::invoke\_result
+* [ARROW-13448](https://issues.apache.org/jira/browse/ARROW-13448) - [R] Bindings for strftime
+* [ARROW-13453](https://issues.apache.org/jira/browse/ARROW-13453) - [R] DuckDB has not yet released 0.2.8
+* [ARROW-13455](https://issues.apache.org/jira/browse/ARROW-13455) - [C++][Docs] Typo in RecordBatch::SetColumn
+* [ARROW-13458](https://issues.apache.org/jira/browse/ARROW-13458) - [C++][Docs] Typo in RecordBatch::schema
+* [ARROW-13459](https://issues.apache.org/jira/browse/ARROW-13459) - [C++][Docs] Missing param docs for RecordBatch::SetColumn
+* [ARROW-13461](https://issues.apache.org/jira/browse/ARROW-13461) - [Python][Packaging] Build M1 wheels for python 3.8
+* [ARROW-13463](https://issues.apache.org/jira/browse/ARROW-13463) - [Release][Python] Verify python 3.8 macOS arm64 wheel
+* [ARROW-13465](https://issues.apache.org/jira/browse/ARROW-13465) - [R] to\_arrow() from duckdb
+* [ARROW-13466](https://issues.apache.org/jira/browse/ARROW-13466) - [R] make installation fail if Arrow C++ dependencies cannot be installed
+* [ARROW-13468](https://issues.apache.org/jira/browse/ARROW-13468) - [Release] Fix binary download/upload failures
+* [ARROW-13472](https://issues.apache.org/jira/browse/ARROW-13472) - [R] Remove .engine = "duckdb" argument
+* [ARROW-13475](https://issues.apache.org/jira/browse/ARROW-13475) - [Release] Don't consider rust tarballs when cleaning up old releases
+* [ARROW-13476](https://issues.apache.org/jira/browse/ARROW-13476) - [Doc][Python] Ensure that ipc/io documentation uses context managers instead of manually closing streams
+* [ARROW-13478](https://issues.apache.org/jira/browse/ARROW-13478) - [Release] Unnecessary rc-number argument for the version bumping post-release script
+* [ARROW-13480](https://issues.apache.org/jira/browse/ARROW-13480) - [C++] [R] [Python] Dataset SyncScanner may freeze on error
+* [ARROW-13482](https://issues.apache.org/jira/browse/ARROW-13482) - [C++][Compute] Provide a registry for ExecNode implementations
+* [ARROW-13485](https://issues.apache.org/jira/browse/ARROW-13485) - [Release] Replace ${PREVIOUS\_RELEASE}.9000 in r/NEWS.md by post-12-bump-versions.sh
+* [ARROW-13488](https://issues.apache.org/jira/browse/ARROW-13488) - [Website] Update Linux packages install information for 5.0.0
+* [ARROW-13489](https://issues.apache.org/jira/browse/ARROW-13489) - [R] Bump CI jobs after 5.0.0
+* [ARROW-13501](https://issues.apache.org/jira/browse/ARROW-13501) - [R] Bindings for count aggregation
+* [ARROW-13502](https://issues.apache.org/jira/browse/ARROW-13502) - [R] Bindings for min/max aggregation
+* [ARROW-13503](https://issues.apache.org/jira/browse/ARROW-13503) - [GLib][Ruby][Flight] Add support for DoGet
+* [ARROW-13506](https://issues.apache.org/jira/browse/ARROW-13506) - Upgrade ORC to 1.6.9
+* [ARROW-13508](https://issues.apache.org/jira/browse/ARROW-13508) - [C++] Allow custom RetryStrategy objects to be passed to S3FileSystem
+* [ARROW-13510](https://issues.apache.org/jira/browse/ARROW-13510) - [CI][R][C++] Add -Wall to fedora-clang-devel as-cran checks
+* [ARROW-13511](https://issues.apache.org/jira/browse/ARROW-13511) - [CI][R] Fail in the docker build step if R deps don't install
+* [ARROW-13516](https://issues.apache.org/jira/browse/ARROW-13516) - [C++] Mingw-w64 + Clang (lld) doesn't support --version-script
+* [ARROW-13519](https://issues.apache.org/jira/browse/ARROW-13519) - [R] Make doc examples less noisy
+* [ARROW-13520](https://issues.apache.org/jira/browse/ARROW-13520) - [C++] Implement hash\_aggregate approximate quantile kernel
+* [ARROW-13521](https://issues.apache.org/jira/browse/ARROW-13521) - [C++][Docs] Add note about tdigest in compute functions docs
+* [ARROW-13525](https://issues.apache.org/jira/browse/ARROW-13525) - [Python] Mention alternatives in deprecation message of ParquetDataset attributes
+* [ARROW-13528](https://issues.apache.org/jira/browse/ARROW-13528) - [R] Bindings for mean, var, sd aggregation
+* [ARROW-13532](https://issues.apache.org/jira/browse/ARROW-13532) - [C++][Compute] Join: add set membership test method to the grouper
+* [ARROW-13534](https://issues.apache.org/jira/browse/ARROW-13534) - [C++] Improve csv chunker
+* [ARROW-13540](https://issues.apache.org/jira/browse/ARROW-13540) - [C++][Compute] Add OrderByNode for ordering of rows in an ExecPlan
+* [ARROW-13541](https://issues.apache.org/jira/browse/ARROW-13541) - [C++][Python] Implement ExtensionScalar
+* [ARROW-13542](https://issues.apache.org/jira/browse/ARROW-13542) - [C++][Compute][Dataset] Add dataset::WriteNode for writing rows from an ExecPlan to disk
+* [ARROW-13544](https://issues.apache.org/jira/browse/ARROW-13544) - [Java] Remove APIs that have been deprecated for long
+* [ARROW-13544](https://issues.apache.org/jira/browse/ARROW-13544) - [Java] Remove APIs that have been deprecated for long
+* [ARROW-13544](https://issues.apache.org/jira/browse/ARROW-13544) - [Java] Remove APIs that have been deprecated for long
+* [ARROW-13548](https://issues.apache.org/jira/browse/ARROW-13548) - [C++] Implement datediff kernel
+* [ARROW-13549](https://issues.apache.org/jira/browse/ARROW-13549) - [C++] Implement timestamp to date/time cast that extracts value
+* [ARROW-13550](https://issues.apache.org/jira/browse/ARROW-13550) - [R] Support .groups argument to dplyr::summarize()
+* [ARROW-13552](https://issues.apache.org/jira/browse/ARROW-13552) - [C++] Remove deprecated APIs
+* [ARROW-13557](https://issues.apache.org/jira/browse/ARROW-13557) - [Packaging][Python] Skip test\_cancellation test case on M1
+* [ARROW-13561](https://issues.apache.org/jira/browse/ARROW-13561) - [C++] Implement week kernel that accepts WeekOptions
+* [ARROW-13562](https://issues.apache.org/jira/browse/ARROW-13562) - [R] Styler followups
+* [ARROW-13565](https://issues.apache.org/jira/browse/ARROW-13565) - [Packaging][Ubuntu] Drop support for 20.10
+* [ARROW-13572](https://issues.apache.org/jira/browse/ARROW-13572) - [C++][Python] Add basic ORC support to the pyarrow.datasets API
+* [ARROW-13573](https://issues.apache.org/jira/browse/ARROW-13573) - [C++] Support dictionaries directly in case\_when kernel
+* [ARROW-13574](https://issues.apache.org/jira/browse/ARROW-13574) - [C++] Add 'count all' option to count (hash) aggregate kernel
+* [ARROW-13575](https://issues.apache.org/jira/browse/ARROW-13575) - [C++] Implement product aggregate & hash aggregate kernels
+* [ARROW-13576](https://issues.apache.org/jira/browse/ARROW-13576) - [C++][Compute] Replace ExecNode::InputReceived with ::MakeTask
+* [ARROW-13577](https://issues.apache.org/jira/browse/ARROW-13577) - [Python][FlightRPC] pyarrow client do\_put close method after write\_table did not throw flight error
+* [ARROW-13585](https://issues.apache.org/jira/browse/ARROW-13585) - [GLib] Add support for C ABI interface
+* [ARROW-13587](https://issues.apache.org/jira/browse/ARROW-13587) - [R] Handle --use-LTO override
+* [ARROW-13595](https://issues.apache.org/jira/browse/ARROW-13595) - [C++] Add debug mode check for compute kernel output type
+* [ARROW-13604](https://issues.apache.org/jira/browse/ARROW-13604) - [Java] Remove deprecation annotations for APIs representing unsupported operations
+* [ARROW-13606](https://issues.apache.org/jira/browse/ARROW-13606) - [R] Actually disable LTO
+* [ARROW-13613](https://issues.apache.org/jira/browse/ARROW-13613) - [C++] Implement sum/mean aggregations over decimals
+* [ARROW-13614](https://issues.apache.org/jira/browse/ARROW-13614) - [C++] Implement min\_max aggregation over decimal
+* [ARROW-13618](https://issues.apache.org/jira/browse/ARROW-13618) - [R] Use Arrow engine for summarize() by default
+* [ARROW-13620](https://issues.apache.org/jira/browse/ARROW-13620) - [R] Binding for n\_distinct()
+* [ARROW-13626](https://issues.apache.org/jira/browse/ARROW-13626) - [R] Bindings for log base b
+* [ARROW-13627](https://issues.apache.org/jira/browse/ARROW-13627) - [C++] ScalarAggregateOptions don't make sense (in hash aggregation)
+* [ARROW-13629](https://issues.apache.org/jira/browse/ARROW-13629) - [Ruby] Add support for building/converting map
+* [ARROW-13633](https://issues.apache.org/jira/browse/ARROW-13633) - [Packaging][Debian] Add support for bookworm
+* [ARROW-13634](https://issues.apache.org/jira/browse/ARROW-13634) - [R] Update distro() in nixlibs.R to map from "bookworm" to 12
+* [ARROW-13635](https://issues.apache.org/jira/browse/ARROW-13635) - [Packaging][Python] Define --with-lg-page for jemalloc in the arm manylinux builds
+* [ARROW-13637](https://issues.apache.org/jira/browse/ARROW-13637) - [Python][Doc] Make docstrings conform to same style
+* [ARROW-13642](https://issues.apache.org/jira/browse/ARROW-13642) - [C++][Compute] Implement many-to-many inner hash join
+* [ARROW-13645](https://issues.apache.org/jira/browse/ARROW-13645) - [Java] Allow NullVectors to have distinct field names
+* [ARROW-13646](https://issues.apache.org/jira/browse/ARROW-13646) - [Go][Parquet] Add Metadata Package
+* [ARROW-13648](https://issues.apache.org/jira/browse/ARROW-13648) - [Dev] Use \#!/usr/bin/env instead of \#!/bin where possible
+* [ARROW-13650](https://issues.apache.org/jira/browse/ARROW-13650) - [C++] Create dataset writer to encapsulate dataset writer logic
+* [ARROW-13651](https://issues.apache.org/jira/browse/ARROW-13651) - [Ruby] Add support for converting [Symbol] to Arrow array
+* [ARROW-13652](https://issues.apache.org/jira/browse/ARROW-13652) - [Python] Expose the CopyFiles utility in Python
+* [ARROW-13660](https://issues.apache.org/jira/browse/ARROW-13660) - [C++][Compute] Remove \`seq\` as a parameter of ExecNode::InputReceived
+* [ARROW-13670](https://issues.apache.org/jira/browse/ARROW-13670) - [C++] Do a round of compiler warning cleanups
+* [ARROW-13674](https://issues.apache.org/jira/browse/ARROW-13674) - [Dev][CI] PR checks workflow should check for JIRA components
+* [ARROW-13675](https://issues.apache.org/jira/browse/ARROW-13675) - [Doc][Python] Add a recipe on how to save partitioned datasets to the Cookbook
+* [ARROW-13679](https://issues.apache.org/jira/browse/ARROW-13679) - [GLib][Ruby] Add support for group aggregation
+* [ARROW-13680](https://issues.apache.org/jira/browse/ARROW-13680) - [C++] Create an asynchronous nursery to simplify capture logic
+* [ARROW-13682](https://issues.apache.org/jira/browse/ARROW-13682) - [C++] Add TDigest::Merge(const TDigest&)
+* [ARROW-13684](https://issues.apache.org/jira/browse/ARROW-13684) - [C++][Compute] Strftime kernel follow-up
+* [ARROW-13686](https://issues.apache.org/jira/browse/ARROW-13686) - [Python] Update deprecated pytest yield\_fixture functions
+* [ARROW-13687](https://issues.apache.org/jira/browse/ARROW-13687) - [Ruby] Add support for loading table by Arrow Dataset
+* [ARROW-13691](https://issues.apache.org/jira/browse/ARROW-13691) - [C++] Add option to handle NAs to VarianceOptions
+* [ARROW-13693](https://issues.apache.org/jira/browse/ARROW-13693) - [Website] arrow-site should pin down a specific Ruby version and leverage toolings like rbenv
+* [ARROW-13696](https://issues.apache.org/jira/browse/ARROW-13696) - [Python] Support for MapType with Fields
+* [ARROW-13699](https://issues.apache.org/jira/browse/ARROW-13699) - [Python][Doc] Refactor the FileSystem Interface documentation
+* [ARROW-13700](https://issues.apache.org/jira/browse/ARROW-13700) - [Docs][C++] Clarify DayOfWeekOptions args
+* [ARROW-13702](https://issues.apache.org/jira/browse/ARROW-13702) - [Python] test\_parquet\_dataset\_deprecated\_properties missing a dataset mark
+* [ARROW-13704](https://issues.apache.org/jira/browse/ARROW-13704) - [C\#] Add support for reading streaming format delta dictionaries
+* [ARROW-13705](https://issues.apache.org/jira/browse/ARROW-13705) - [Website] Pin node version
+* [ARROW-13721](https://issues.apache.org/jira/browse/ARROW-13721) - [Doc][Cookbook] Specifying Schemas - Python
+* [ARROW-13733](https://issues.apache.org/jira/browse/ARROW-13733) - [Java] Allow JDBC adapters to reuse vector schema roots
+* [ARROW-13734](https://issues.apache.org/jira/browse/ARROW-13734) - [Format] Clarify allowed values for time types
+* [ARROW-13736](https://issues.apache.org/jira/browse/ARROW-13736) - [C++] Reconcile PrettyPrint and StringFormatter
+* [ARROW-13737](https://issues.apache.org/jira/browse/ARROW-13737) - [C++] Support scalar columns in hash aggregations (was: hash\_sum on scalar column segfaults)
+* [ARROW-13739](https://issues.apache.org/jira/browse/ARROW-13739) - [R] Support dplyr::count() and tally()
+* [ARROW-13740](https://issues.apache.org/jira/browse/ARROW-13740) - [R] summarize() should not eagerly evaluate
+* [ARROW-13757](https://issues.apache.org/jira/browse/ARROW-13757) - [R] Fix download of C++ source for CRAN patch releases
+* [ARROW-13759](https://issues.apache.org/jira/browse/ARROW-13759) - [C++] Update linting and formatting scripts to specify python3 in shebang line
+* [ARROW-13760](https://issues.apache.org/jira/browse/ARROW-13760) - [C++] Bump Protobuf version to 3.15 when Flight is enabled
+* [ARROW-13764](https://issues.apache.org/jira/browse/ARROW-13764) - [C++] Implement ScalarAggregateOptions for count\_distinct (grouped)
+* [ARROW-13768](https://issues.apache.org/jira/browse/ARROW-13768) - [R] Allow JSON to be an optional component
+* [ARROW-13772](https://issues.apache.org/jira/browse/ARROW-13772) - [R] Binding for median() and quantile() aggregation functions
+* [ARROW-13776](https://issues.apache.org/jira/browse/ARROW-13776) - [C++] Offline thirdparty versions.txt is missing extensions for some files
+* [ARROW-13777](https://issues.apache.org/jira/browse/ARROW-13777) - [R] mutate after group\_by should be ok as long as there are only scalar functions
+* [ARROW-13778](https://issues.apache.org/jira/browse/ARROW-13778) - [R] Handle complex summarize expressions
+* [ARROW-13782](https://issues.apache.org/jira/browse/ARROW-13782) - [C++] Add option to handle NAs to TDigest, Index, Mode, Quantile aggregates
+* [ARROW-13783](https://issues.apache.org/jira/browse/ARROW-13783) - [Python] Improve Table.to\_string (and maybe \_\_repr\_\_) to also preview data of the table
+* [ARROW-13785](https://issues.apache.org/jira/browse/ARROW-13785) - [C++] Print methods for ExecPlan and ExecNode
+* [ARROW-13787](https://issues.apache.org/jira/browse/ARROW-13787) - [C++] Verify third-party downloads
+* [ARROW-13789](https://issues.apache.org/jira/browse/ARROW-13789) - [Go] Implement Arrow Scalar Values for Go
+* [ARROW-13793](https://issues.apache.org/jira/browse/ARROW-13793) - [C++] Migrate ORCFileReader to Result
+* [ARROW-13794](https://issues.apache.org/jira/browse/ARROW-13794) - [C++] Deprecate Parquet pseudo-version "2.0"
+* [ARROW-13797](https://issues.apache.org/jira/browse/ARROW-13797) - [C++] Implement column projection pushdown to ORC reader in Datasets API
+* [ARROW-13803](https://issues.apache.org/jira/browse/ARROW-13803) - [C++] Segfault on filtering taxi dataset
+* [ARROW-13804](https://issues.apache.org/jira/browse/ARROW-13804) - [Go] Add Support for Interval Type Month, Day, Nano
+* [ARROW-13806](https://issues.apache.org/jira/browse/ARROW-13806) - [Python] Add conversion to/from Pandas/Python for Month, Day Nano
Interval Type +* [ARROW-13809](https://issues.apache.org/jira/browse/ARROW-13809) - [C ABI] Add support for Month, Day, Nanosecond interval type to C-ABI +* [ARROW-13810](https://issues.apache.org/jira/browse/ARROW-13810) - [C++][Compute] Predicate IsAsciiCharacter allows invalid types and values +* [ARROW-13815](https://issues.apache.org/jira/browse/ARROW-13815) - [R] Adapt to new callstack changes in rlang +* [ARROW-13816](https://issues.apache.org/jira/browse/ARROW-13816) - [Go] Implement Consumer APIs for C Data Interface +* [ARROW-13820](https://issues.apache.org/jira/browse/ARROW-13820) - [R] Rename na.min\_count to min\_count and na.rm to skip\_nulls +* [ARROW-13821](https://issues.apache.org/jira/browse/ARROW-13821) - [R] Handle na.rm in sd, var bindings +* [ARROW-13823](https://issues.apache.org/jira/browse/ARROW-13823) - Exclude .factorypath from git and RAT plugin +* [ARROW-13824](https://issues.apache.org/jira/browse/ARROW-13824) - [C++][Compute] Make constexpr BooleanToNumber kernel +* [ARROW-13831](https://issues.apache.org/jira/browse/ARROW-13831) - [GLib][Ruby] Add support for writing by Arrow Dataset +* [ARROW-13835](https://issues.apache.org/jira/browse/ARROW-13835) - [Python] Document utility to unify schemas +* [ARROW-13842](https://issues.apache.org/jira/browse/ARROW-13842) - [C++] Bump vendored date library version +* [ARROW-13843](https://issues.apache.org/jira/browse/ARROW-13843) - [C++][CI] Exercise ToString / PrettyPrint in fuzzing setup +* [ARROW-13845](https://issues.apache.org/jira/browse/ARROW-13845) - [C++] Reconcile RandomArrayGenerator::ArrayOf variants +* [ARROW-13847](https://issues.apache.org/jira/browse/ARROW-13847) - Avoid unnecessary copies of collection +* [ARROW-13849](https://issues.apache.org/jira/browse/ARROW-13849) - [C++] Add min and max aggregation functions +* [ARROW-13852](https://issues.apache.org/jira/browse/ARROW-13852) - [R] Handle Dataset schema metadata in ExecPlan +* [ARROW-13853](https://issues.apache.org/jira/browse/ARROW-13853) - [R] String to\_title, to\_lower, to\_upper kernels +* [ARROW-13855](https://issues.apache.org/jira/browse/ARROW-13855) - [C++] [Python] Add support for exporting extension types +* [ARROW-13857](https://issues.apache.org/jira/browse/ARROW-13857) - [R][CI] Remove checkbashisms download +* [ARROW-13859](https://issues.apache.org/jira/browse/ARROW-13859) - [Java] Add code coverage support +* [ARROW-13866](https://issues.apache.org/jira/browse/ARROW-13866) - [R] Implement Options for all compute kernels available via list\_compute\_functions +* [ARROW-13869](https://issues.apache.org/jira/browse/ARROW-13869) - [R] Implement options for non-bound MatchSubstringOptions kernels +* [ARROW-13871](https://issues.apache.org/jira/browse/ARROW-13871) - [C++] JSON reader can fail if a list array key is present in one chunk but not in a later chunk +* [ARROW-13874](https://issues.apache.org/jira/browse/ARROW-13874) - [R] Implement TrimOptions +* [ARROW-13883](https://issues.apache.org/jira/browse/ARROW-13883) - [Python] Allow more than numpy.array as masks when creating arrays +* [ARROW-13890](https://issues.apache.org/jira/browse/ARROW-13890) - [R] Split up test-dataset.R and test-dplyr.R +* [ARROW-13893](https://issues.apache.org/jira/browse/ARROW-13893) - [R] Make head/tail lazy on datasets and queries +* [ARROW-13897](https://issues.apache.org/jira/browse/ARROW-13897) - [Python] TimestampScalar.as\_py() and DurationScalar.as\_py() docs inaccurately describe return types +* 
[ARROW-13898](https://issues.apache.org/jira/browse/ARROW-13898) - [C++][Compute] Add support for string binary transforms +* [ARROW-13899](https://issues.apache.org/jira/browse/ARROW-13899) - [Ruby] Implement slicer by compute kernels +* [ARROW-13901](https://issues.apache.org/jira/browse/ARROW-13901) - [R] Implement IndexOptions +* [ARROW-13904](https://issues.apache.org/jira/browse/ARROW-13904) - [R] Implement ModeOptions +* [ARROW-13905](https://issues.apache.org/jira/browse/ARROW-13905) - [R] Implement ReplaceSliceOptions +* [ARROW-13906](https://issues.apache.org/jira/browse/ARROW-13906) - [R] Implement PartitionNthOptions +* [ARROW-13908](https://issues.apache.org/jira/browse/ARROW-13908) - [R] Implement ExtractRegexOptions +* [ARROW-13909](https://issues.apache.org/jira/browse/ARROW-13909) - [GLib] Add GArrowVarianceOptions +* [ARROW-13910](https://issues.apache.org/jira/browse/ARROW-13910) - [Ruby] Arrow::Table\#[]/Arrow::RecordBatch\#[] accepts Range and selectors +* [ARROW-13919](https://issues.apache.org/jira/browse/ARROW-13919) - [GLib] Add GArrowFunctionDoc +* [ARROW-13924](https://issues.apache.org/jira/browse/ARROW-13924) - [R] Bindings for stringr::str\_starts, stringr::str\_ends, base::startsWith and base::endsWith +* [ARROW-13925](https://issues.apache.org/jira/browse/ARROW-13925) - [R] Remove system installation devdocs jobs +* [ARROW-13927](https://issues.apache.org/jira/browse/ARROW-13927) - [R] Add Karl to the contributors list for the package +* [ARROW-13928](https://issues.apache.org/jira/browse/ARROW-13928) - [R] Rename the version(s) tasks so that it's clearer which is which +* [ARROW-13937](https://issues.apache.org/jira/browse/ARROW-13937) - [C++][Compute] Add explicit output values to sign function and fix unary type checks +* [ARROW-13942](https://issues.apache.org/jira/browse/ARROW-13942) - [Dev] cmake\_format autotune doesn't work +* [ARROW-13944](https://issues.apache.org/jira/browse/ARROW-13944) - [C++] Bump xsimd to latest version +* [ARROW-13958](https://issues.apache.org/jira/browse/ARROW-13958) - [Python] Migrate Python ORC bindings to use new Result-based APIs +* [ARROW-13959](https://issues.apache.org/jira/browse/ARROW-13959) - [R] Update tests for extracting components from date32 objects +* [ARROW-13962](https://issues.apache.org/jira/browse/ARROW-13962) - [R] Catch up on the NEWS +* [ARROW-13963](https://issues.apache.org/jira/browse/ARROW-13963) - [Go] Shift Bitmap Reader/Writer implementations from Parquet to Arrow bitutil package +* [ARROW-13964](https://issues.apache.org/jira/browse/ARROW-13964) - [Go] Remove Parquet bitmap reader/writer implementations and use the shared arrow bitutils versions +* [ARROW-13965](https://issues.apache.org/jira/browse/ARROW-13965) - [C++] dynamic\_casts in parquet TypedColumnWriterImpl impacting performance +* [ARROW-13966](https://issues.apache.org/jira/browse/ARROW-13966) - [C++] Comparison kernel(s) for decimals +* [ARROW-13967](https://issues.apache.org/jira/browse/ARROW-13967) - [Go] Implement Concatenate function for Arrays +* [ARROW-13973](https://issues.apache.org/jira/browse/ARROW-13973) - [C++] Add a SelectKSinkNode +* [ARROW-13974](https://issues.apache.org/jira/browse/ARROW-13974) - [C++] Resolve follow-up reviews for TopK/BottomK +* [ARROW-13975](https://issues.apache.org/jira/browse/ARROW-13975) - [C++][Compute] Add decimal support to round functions +*
[ARROW-13977](https://issues.apache.org/jira/browse/ARROW-13977) - [Format] Clarify leap seconds and leap days for interval type +* [ARROW-13979](https://issues.apache.org/jira/browse/ARROW-13979) - [Go] Enable -race argument for Go tests +* [ARROW-13990](https://issues.apache.org/jira/browse/ARROW-13990) - [R] Bindings for round kernels +* [ARROW-13994](https://issues.apache.org/jira/browse/ARROW-13994) - [Doc][C++] Build document misses git submodule update +* [ARROW-13995](https://issues.apache.org/jira/browse/ARROW-13995) - [R] Bindings for join node +* [ARROW-13999](https://issues.apache.org/jira/browse/ARROW-13999) - [C++][CI] Make must be installed to build LZ4 on MinGW +* [ARROW-14002](https://issues.apache.org/jira/browse/ARROW-14002) - [Python] unify\_schema should accept tuples too +* [ARROW-14003](https://issues.apache.org/jira/browse/ARROW-14003) - [C++][Python] Not providing a sort\_key in the "select\_k\_unstable" kernel crashes +* [ARROW-14005](https://issues.apache.org/jira/browse/ARROW-14005) - [R] Fix tests for PartitionNthOptions so that they can run on various platforms +* [ARROW-14006](https://issues.apache.org/jira/browse/ARROW-14006) - [C++][Python] Support cast of naive timestamps to strings +* [ARROW-14007](https://issues.apache.org/jira/browse/ARROW-14007) - [C++] Fix compiler warnings in decimal promotion machinery +* [ARROW-14008](https://issues.apache.org/jira/browse/ARROW-14008) - [R][Compute] ExecPlan\_run should return RecordBatchReader instead of Table +* [ARROW-14009](https://issues.apache.org/jira/browse/ARROW-14009) - [C++] Ensure SourceNode truly feeds batches to plan in parallel +* [ARROW-14012](https://issues.apache.org/jira/browse/ARROW-14012) - [Python] Update kernel categories in compute doc to match C++ +* [ARROW-14013](https://issues.apache.org/jira/browse/ARROW-14013) - [C++][Docs] Instructions on installing on Fedora Linux +* [ARROW-14016](https://issues.apache.org/jira/browse/ARROW-14016) - [C++] Wrong type\_name used for directory partitioning +* [ARROW-14019](https://issues.apache.org/jira/browse/ARROW-14019) - [R] expect\_dplyr\_equal() test helper function ignores grouping +* [ARROW-14023](https://issues.apache.org/jira/browse/ARROW-14023) - [Ruby] Arrow::Table\#slice accepts Hash +* [ARROW-14025](https://issues.apache.org/jira/browse/ARROW-14025) - [R][C++] PreBuffer is not enabled when scanning parquet via exec nodes +* [ARROW-14030](https://issues.apache.org/jira/browse/ARROW-14030) - [GLib] Use arrow::Result based ORC API +* [ARROW-14031](https://issues.apache.org/jira/browse/ARROW-14031) - [Ruby] Use min and max separately +* [ARROW-14033](https://issues.apache.org/jira/browse/ARROW-14033) - [Ruby][Doc] Add macOS development guide for Red Arrow +* [ARROW-14035](https://issues.apache.org/jira/browse/ARROW-14035) - [C++][Compute] Implement non-hash count\_distinct aggregate kernel +* [ARROW-14036](https://issues.apache.org/jira/browse/ARROW-14036) - [R] Binding for n\_distinct() with no grouping +* [ARROW-14043](https://issues.apache.org/jira/browse/ARROW-14043) - [Python] Add support for unsigned indexes in dictionary array?
+* [ARROW-14044](https://issues.apache.org/jira/browse/ARROW-14044) - [R] Handle group\_by .drop parameter in summarize +* [ARROW-14049](https://issues.apache.org/jira/browse/ARROW-14049) - [C++][Java] Upgrade ORC to 1.7.0 +* [ARROW-14050](https://issues.apache.org/jira/browse/ARROW-14050) - [C++] tdigest, quantile return empty arrays when nulls not skipped +* [ARROW-14052](https://issues.apache.org/jira/browse/ARROW-14052) - [C++] Add appx\_median, hash\_appx\_median functions +* [ARROW-14054](https://issues.apache.org/jira/browse/ARROW-14054) - [C++][Docs] Improve clarity of row\_conversion\_example.cpp +* [ARROW-14055](https://issues.apache.org/jira/browse/ARROW-14055) - [Docs] Add canonical url to the docs +* [ARROW-14056](https://issues.apache.org/jira/browse/ARROW-14056) - [C++][Doc] Mention ArrayData +* [ARROW-14061](https://issues.apache.org/jira/browse/ARROW-14061) - [Go] Add Cgo Arrow Memory Pool Allocator +* [ARROW-14062](https://issues.apache.org/jira/browse/ARROW-14062) - [Format] Initial arrow-internal specification of compute IR +* [ARROW-14064](https://issues.apache.org/jira/browse/ARROW-14064) - [CI] Use Debian 11 +* [ARROW-14069](https://issues.apache.org/jira/browse/ARROW-14069) - [R] By default, filter out hash functions in list\_compute\_functions() +* [ARROW-14070](https://issues.apache.org/jira/browse/ARROW-14070) - [C++][CI] Remove support for VisualStudio 2015 +* [ARROW-14072](https://issues.apache.org/jira/browse/ARROW-14072) - [GLib][Parquet] Add support for getting number of rows through metadata +* [ARROW-14073](https://issues.apache.org/jira/browse/ARROW-14073) - [C++] De-duplicate sort keys +* [ARROW-14084](https://issues.apache.org/jira/browse/ARROW-14084) - [GLib][Ruby][Dataset] Add support for scanning from directory +* [ARROW-14088](https://issues.apache.org/jira/browse/ARROW-14088) - [GLib][Ruby][Dataset] Add support for filter +* [ARROW-14106](https://issues.apache.org/jira/browse/ARROW-14106) - [Go][C] Implement Exporting the C data interface +* [ARROW-14107](https://issues.apache.org/jira/browse/ARROW-14107) - [R][CI] Parallelize Windows CI jobs +* [ARROW-14111](https://issues.apache.org/jira/browse/ARROW-14111) - [C++] Add extraction function support for time32/time64 +* [ARROW-14116](https://issues.apache.org/jira/browse/ARROW-14116) - [C++][Docs] Consistent variable names in WriteCSV example +* [ARROW-14127](https://issues.apache.org/jira/browse/ARROW-14127) - [C++][Docs] Example of using compute function and output +* [ARROW-14128](https://issues.apache.org/jira/browse/ARROW-14128) - [Go] Implement MakeArrayFromScalar for nested types +* [ARROW-14132](https://issues.apache.org/jira/browse/ARROW-14132) - [C++] Test mixed quoting and escaping in CSV chunker test +* [ARROW-14135](https://issues.apache.org/jira/browse/ARROW-14135) - [Python] Missing Python tests for compute kernels +* [ARROW-14140](https://issues.apache.org/jira/browse/ARROW-14140) - [R] skip arrow\_binary/arrow\_large\_binary class from R metadata +* [ARROW-14143](https://issues.apache.org/jira/browse/ARROW-14143) - [IR] [C++] Add explicit cast node to IR +* [ARROW-14146](https://issues.apache.org/jira/browse/ARROW-14146) - [Dev] Update merge script to specify python3 in shebang line +* [ARROW-14150](https://issues.apache.org/jira/browse/ARROW-14150) - [C++] Skip delimiter checking in CSV chunker if quoting is false +* [ARROW-14155](https://issues.apache.org/jira/browse/ARROW-14155) - [Go] Add functions for creating fingerprints/hashes of data types and scalars +* 
[ARROW-14157](https://issues.apache.org/jira/browse/ARROW-14157) - [C++] Refactor Abseil build in ThirdpartyToolchain +* [ARROW-14165](https://issues.apache.org/jira/browse/ARROW-14165) - [C++] Improve table sort performance \#2 +* [ARROW-14178](https://issues.apache.org/jira/browse/ARROW-14178) - [C++] Boost download location has moved +* [ARROW-14180](https://issues.apache.org/jira/browse/ARROW-14180) - [Packaging] Add support for AlmaLinux 8 +* [ARROW-14189](https://issues.apache.org/jira/browse/ARROW-14189) - [Docs] Add version dropdown to the sphinx docs +* [ARROW-14191](https://issues.apache.org/jira/browse/ARROW-14191) - [C++][Dataset] Dataset writes should respect backpressure +* [ARROW-14194](https://issues.apache.org/jira/browse/ARROW-14194) - [Docs] Improve vertical spacing in the sphinx API docs +* [ARROW-14198](https://issues.apache.org/jira/browse/ARROW-14198) - [Java] Upgrade Netty and gRPC dependencies +* [ARROW-14207](https://issues.apache.org/jira/browse/ARROW-14207) - [C++] Add missing dependencies for bundled Boost targets +* [ARROW-14212](https://issues.apache.org/jira/browse/ARROW-14212) - [GLib][Ruby] Add GArrowTableConcatenateOptions +* [ARROW-14217](https://issues.apache.org/jira/browse/ARROW-14217) - [Python][CI] Add support for python 3.10 +* [ARROW-14222](https://issues.apache.org/jira/browse/ARROW-14222) - [C++] Create GcsFileSystem skeleton +* [ARROW-14228](https://issues.apache.org/jira/browse/ARROW-14228) - [R] Allow for creation of nullable fields +* [ARROW-14230](https://issues.apache.org/jira/browse/ARROW-14230) - [C++] Deprecate ArrayBuilder::Advance +* [ARROW-14232](https://issues.apache.org/jira/browse/ARROW-14232) - [C++] Update crc32c dependency to 1.1.2 +* [ARROW-14235](https://issues.apache.org/jira/browse/ARROW-14235) - [C++][Compute] Use a node counter as the label if no label is supplied +* [ARROW-14236](https://issues.apache.org/jira/browse/ARROW-14236) - [C++] Install GCS testbench for CI builds +* [ARROW-14239](https://issues.apache.org/jira/browse/ARROW-14239) - [R] Don't use rlang::as\_label +* [ARROW-14241](https://issues.apache.org/jira/browse/ARROW-14241) - [C++] Dataset ORC build failing in java-jars nightly build +* [ARROW-14243](https://issues.apache.org/jira/browse/ARROW-14243) - [C++] Split up vector\_sort.cc +* [ARROW-14244](https://issues.apache.org/jira/browse/ARROW-14244) - [C++] Investigate scalar\_temporal.cc compilation speed +* [ARROW-14258](https://issues.apache.org/jira/browse/ARROW-14258) - [R] Warn if an SF column is made into a table +* [ARROW-14259](https://issues.apache.org/jira/browse/ARROW-14259) - [R] converting from R vector to Array when the R vector is altrep +* [ARROW-14261](https://issues.apache.org/jira/browse/ARROW-14261) - [C++] Includes should be in alphabetical order +* [ARROW-14269](https://issues.apache.org/jira/browse/ARROW-14269) - [C++] Consolidate utf8 benchmark +* [ARROW-14274](https://issues.apache.org/jira/browse/ARROW-14274) - [C++] Upgrade vendored base64 code +* [ARROW-14284](https://issues.apache.org/jira/browse/ARROW-14284) - [C++][Python] Improve error message when trying use SyncScanner when requiring async +* [ARROW-14291](https://issues.apache.org/jira/browse/ARROW-14291) - [CI][C++] Add cpp/examples/ files to lint targets +* [ARROW-14295](https://issues.apache.org/jira/browse/ARROW-14295) - [Doc] Indicate location of archery +* [ARROW-14296](https://issues.apache.org/jira/browse/ARROW-14296) - [Go] Update flatbuf generated code +* 
[ARROW-14304](https://issues.apache.org/jira/browse/ARROW-14304) - [R] Update news for 6.0.0 +* [ARROW-14309](https://issues.apache.org/jira/browse/ARROW-14309) - [Python] CompressedInputStream doesn't support str or file objects +* [ARROW-14317](https://issues.apache.org/jira/browse/ARROW-14317) - [Doc] Update implementation status +* [ARROW-14326](https://issues.apache.org/jira/browse/ARROW-14326) - [Docs] Add C/GLib and Ruby to C Data/Stream interface supported libraries +* [ARROW-14327](https://issues.apache.org/jira/browse/ARROW-14327) - [Release] Remove conda-\* from packaging group +* [ARROW-14335](https://issues.apache.org/jira/browse/ARROW-14335) - [GLib][Ruby] Add support for expression +* [ARROW-14337](https://issues.apache.org/jira/browse/ARROW-14337) - [C++] Arrow doesn't build on M1 when SIMD acceleration is enabled +* [ARROW-14341](https://issues.apache.org/jira/browse/ARROW-14341) - [C++] Refine decimal benchmark +* [ARROW-14343](https://issues.apache.org/jira/browse/ARROW-14343) - [Packaging][Python] Enable NEON SIMD optimization for M1 wheels +* [ARROW-14345](https://issues.apache.org/jira/browse/ARROW-14345) - [C++] Implement streaming reads for GCS FileSystem +* [ARROW-14348](https://issues.apache.org/jira/browse/ARROW-14348) - [R] add group\_vars.RecordBatchReader method +* [ARROW-14349](https://issues.apache.org/jira/browse/ARROW-14349) - [IR] Remove RelBase +* [ARROW-14358](https://issues.apache.org/jira/browse/ARROW-14358) - Update CMake options in documentation +* [ARROW-14361](https://issues.apache.org/jira/browse/ARROW-14361) - [C++] Define a DEFAULT value for ARROW\_SIMD\_LEVEL +* [ARROW-14364](https://issues.apache.org/jira/browse/ARROW-14364) - [CI][C++] Support LLVM 13 +* [ARROW-14368](https://issues.apache.org/jira/browse/ARROW-14368) - [CI] ubuntu-16.04 isn't available on Azure Pipelines +* [ARROW-14369](https://issues.apache.org/jira/browse/ARROW-14369) - [C++][Python] Failed to build with g++ 4.8.5 +* [ARROW-14386](https://issues.apache.org/jira/browse/ARROW-14386) - [Packaging][Java] devtoolset is upgraded to 10 in the manylinux2014 image +* [ARROW-14387](https://issues.apache.org/jira/browse/ARROW-14387) - [Release][Ruby] Check Homebrew/MSYS2 package version before releasing +* [ARROW-14396](https://issues.apache.org/jira/browse/ARROW-14396) - [R][Doc] Remove relic note in write\_dataset that columns cannot be renamed +* [ARROW-14400](https://issues.apache.org/jira/browse/ARROW-14400) - [Go] Equals and ApproxEquals for Tables and Chunked Arrays +* [ARROW-14401](https://issues.apache.org/jira/browse/ARROW-14401) - [C++] Bundled crc32c's include path is wrong +* [ARROW-14402](https://issues.apache.org/jira/browse/ARROW-14402) - [Release][Yum] Signing RPM fails +* [ARROW-14404](https://issues.apache.org/jira/browse/ARROW-14404) - [Release][APT] Skip arm64 Debian GNU/Linux bookworm verification +* [ARROW-14408](https://issues.apache.org/jira/browse/ARROW-14408) - [Packaging][Crossbow] Option for skipping artifact pattern validation +* [ARROW-14410](https://issues.apache.org/jira/browse/ARROW-14410) - [Python][Packaging] Use numpy 1.21.3 to build python 3.10 wheels for macOS and windows +* [ARROW-14452](https://issues.apache.org/jira/browse/ARROW-14452) - [Release][JS] Update Javascript testing +* [PARQUET-490](https://issues.apache.org/jira/browse/PARQUET-490) - [C++] Incorporate DELTA\_BINARY\_PACKED value encoder into library and add unit tests + + + +# Apache Arrow 5.0.0 (2021-07-28) + +## Bug Fixes + +*
[ARROW-6189](https://issues.apache.org/jira/browse/ARROW-6189) - [Rust] [Parquet] Plain encoded boolean column chunks limited to 2048 values +* [ARROW-6312](https://issues.apache.org/jira/browse/ARROW-6312) - [C++] Declare required Libs.private in arrow.pc package config +* [ARROW-7948](https://issues.apache.org/jira/browse/ARROW-7948) - [Go][Integration] Decimal integration failures +* [ARROW-9594](https://issues.apache.org/jira/browse/ARROW-9594) - [Python] DictionaryArray.to\_numpy does not correctly convert null indexes to null values +* [ARROW-10910](https://issues.apache.org/jira/browse/ARROW-10910) - [Python] Segmentation Fault when None given to read\_table with legacy dataset +* [ARROW-10958](https://issues.apache.org/jira/browse/ARROW-10958) - [GLib] "Nested data conversions not implemented" through glib, but not through pyarrow +* [ARROW-11077](https://issues.apache.org/jira/browse/ARROW-11077) - [Rust] ParquetFileArrowReader panics when trying to read nested list +* [ARROW-11146](https://issues.apache.org/jira/browse/ARROW-11146) - [CI][Python] Failing conda-python-3.8-jpype Nightly Build +* [ARROW-11161](https://issues.apache.org/jira/browse/ARROW-11161) - [Python][C++] S3Filesystem: file Content-Type not set correctly? +* [ARROW-11633](https://issues.apache.org/jira/browse/ARROW-11633) - [CI] [Documentation] Maven default skin not found +* [ARROW-11780](https://issues.apache.org/jira/browse/ARROW-11780) - [C++][Python] StructArray.from\_arrays() crashes Python interpreter +* [ARROW-11908](https://issues.apache.org/jira/browse/ARROW-11908) - [Rust] Intermittent Flight integration test failures +* [ARROW-12007](https://issues.apache.org/jira/browse/ARROW-12007) - [C++] Loading parquet file returns "Invalid UTF8 payload" error +* [ARROW-12055](https://issues.apache.org/jira/browse/ARROW-12055) - [R] is.na() evaluates to FALSE on Arrow NaN values +* [ARROW-12096](https://issues.apache.org/jira/browse/ARROW-12096) - [Python][C++] Pyarrow Parquet reader overflows INT96 timestamps when converting to Arrow Array (timestamp[ns]) +* [ARROW-12122](https://issues.apache.org/jira/browse/ARROW-12122) - [Python] Cannot install via pip M1 mac +* [ARROW-12142](https://issues.apache.org/jira/browse/ARROW-12142) - [Python] undefined symbol: \_ZN5arrow6StatusC1ENS\_10StatusCodeERKNSt7\_\_cxx1112basic\_stringIcSt11char\_traitsIcESaIcEEE +* [ARROW-12150](https://issues.apache.org/jira/browse/ARROW-12150) - [Python] Bad type inference of mixed-precision Decimals +* [ARROW-12232](https://issues.apache.org/jira/browse/ARROW-12232) - [Rust][Datafusion] Error with CAST: Unsupported SQL type Time +* [ARROW-12240](https://issues.apache.org/jira/browse/ARROW-12240) - [Python] invalid-offsetof warning from apple clang-12 +* [ARROW-12377](https://issues.apache.org/jira/browse/ARROW-12377) - [Doc][Java] Java doc build broken +* [ARROW-12407](https://issues.apache.org/jira/browse/ARROW-12407) - [Python] Deprecation warning when building PyArrow +* [ARROW-12431](https://issues.apache.org/jira/browse/ARROW-12431) - [Python] pa.array mask inverted when type is binary and value to be converted is numpy array +* [ARROW-12472](https://issues.apache.org/jira/browse/ARROW-12472) - [Python] read\_table fails when passing a PEP519 filesystem object +* [ARROW-12482](https://issues.apache.org/jira/browse/ARROW-12482) - [Doc][Python] Mention CSVStreamingReader pitfalls with type inference +* [ARROW-12491](https://issues.apache.org/jira/browse/ARROW-12491) - [Packaging] Required dependency on LZ4 \>= 1.8 missing from
CentOS RPM packages +* [ARROW-12503](https://issues.apache.org/jira/browse/ARROW-12503) - [C++] Ensure using "lib/" for jemalloc's library directory +* [ARROW-12508](https://issues.apache.org/jira/browse/ARROW-12508) - [R] expect\_as\_vector implementation causes test failure on R <= 3.3 and variables defined outside of test\_that break build when no arrow install +* [ARROW-12543](https://issues.apache.org/jira/browse/ARROW-12543) - [CI][Python] Failing conda-python-3.9 Nightly Build +* [ARROW-12568](https://issues.apache.org/jira/browse/ARROW-12568) - [Python][C++] Segfault when casting a sliced ListArray of int64 in v4.0.0 +* [ARROW-12569](https://issues.apache.org/jira/browse/ARROW-12569) - [R] [CI] Run revdep in CI +* [ARROW-12570](https://issues.apache.org/jira/browse/ARROW-12570) - [JS] Fix issues that blocked the v4.0.0 release +* [ARROW-12579](https://issues.apache.org/jira/browse/ARROW-12579) - [Python] Pyarrow 4.0.0 dependency numpy 1.19.4 throws errors on Apple silicon/M1 compilation +* [ARROW-12589](https://issues.apache.org/jira/browse/ARROW-12589) - [C++] Compiling on windows doesn't work when -DARROW\_WITH\_BACKTRACE=OFF +* [ARROW-12601](https://issues.apache.org/jira/browse/ARROW-12601) - [R][Packaging] Fix pkg-config check in r/configure +* [ARROW-12604](https://issues.apache.org/jira/browse/ARROW-12604) - [R][Packaging] Dataset, Parquet off in autobrew and CRAN Mac builds +* [ARROW-12605](https://issues.apache.org/jira/browse/ARROW-12605) - [Documentation] Repair line numbers in dataset.rst +* [ARROW-12606](https://issues.apache.org/jira/browse/ARROW-12606) - [C++] Quantile and Mode functions failing on arrays with offset +* [ARROW-12610](https://issues.apache.org/jira/browse/ARROW-12610) - [C++] Skip TestS3FSGeneric TestDeleteDir and TestDeleteDirContents on windows as they are flaky +* [ARROW-12611](https://issues.apache.org/jira/browse/ARROW-12611) - [CI][Python] Nightly test-conda-python-pandas-0.24 is failing due to numpy compat issue +* [ARROW-12613](https://issues.apache.org/jira/browse/ARROW-12613) - [Python] AttributeError when comparing a Scalar with None +* [ARROW-12614](https://issues.apache.org/jira/browse/ARROW-12614) - [C++][Compute] Revert support for Tables in ExecuteScalarExpression +* [ARROW-12617](https://issues.apache.org/jira/browse/ARROW-12617) - [Python] pyarrow.orc.write\_table signature reverses that of pyarrow.parquet.write\_table +* [ARROW-12620](https://issues.apache.org/jira/browse/ARROW-12620) - [C++] Dataset writing can only include projected columns if input columns are also included +* [ARROW-12622](https://issues.apache.org/jira/browse/ARROW-12622) - [Python] Segfault when reading CSV inside Flight server +* [ARROW-12630](https://issues.apache.org/jira/browse/ARROW-12630) - [Dev][Integration] conda-integration docker build fails +* [ARROW-12639](https://issues.apache.org/jira/browse/ARROW-12639) - [CI][Archery] Archery build fails to create branch +* [ARROW-12640](https://issues.apache.org/jira/browse/ARROW-12640) - [C++] Fix errors from VS 2019 in cpp/src/parquet/types.h +* [ARROW-12642](https://issues.apache.org/jira/browse/ARROW-12642) - [R] LIBARROW\_MINIMAL, LIBARROW\_DOWNLOAD, NOT\_CRAN env vars should not be case-sensitive +* [ARROW-12644](https://issues.apache.org/jira/browse/ARROW-12644) - [C++][Dataset] Support reading date/time-partitioned datasets accounting for URL encoding (Spark) +* [ARROW-12646](https://issues.apache.org/jira/browse/ARROW-12646) - [C++][CI][Packaging][Python] Bump vcpkg version to its latest release +* 
[ARROW-12663](https://issues.apache.org/jira/browse/ARROW-12663) - [C++] segfault when arrow header is compiled with nvcc 11.2 +* [ARROW-12668](https://issues.apache.org/jira/browse/ARROW-12668) - [C++][Dataset] CountRows occasionally segfaulting +* [ARROW-12670](https://issues.apache.org/jira/browse/ARROW-12670) - [C++] extract\_regex gives bizarre behavior after nulls or non-matches +* [ARROW-12672](https://issues.apache.org/jira/browse/ARROW-12672) - [C++] Segfault casting result of "fill\_null()" (not bitmap but unknown null\_count) +* [ARROW-12679](https://issues.apache.org/jira/browse/ARROW-12679) - [Java] JDBC adapter does not preserve SQL-nullability +* [ARROW-12684](https://issues.apache.org/jira/browse/ARROW-12684) - [Go][Flight] Fix nil dereference in error case +* [ARROW-12708](https://issues.apache.org/jira/browse/ARROW-12708) - [C++] Valgrind errors when calling negate\_checked +* [ARROW-12729](https://issues.apache.org/jira/browse/ARROW-12729) - [R] Fix length method for Table, RecordBatch +* [ARROW-12746](https://issues.apache.org/jira/browse/ARROW-12746) - [Go][Flight] Client Auth handler overwrites outgoing metadata +* [ARROW-12756](https://issues.apache.org/jira/browse/ARROW-12756) - [C++] MSVC build fails with latest gtest from vcpkg +* [ARROW-12757](https://issues.apache.org/jira/browse/ARROW-12757) - [Dev][Archery] Warning about RUST variable in "archery docker run" +* [ARROW-12762](https://issues.apache.org/jira/browse/ARROW-12762) - [Python] ListType doesn't preserve field name after pickle and unpickle +* [ARROW-12769](https://issues.apache.org/jira/browse/ARROW-12769) - [Python] Negative out of range slices yield invalid arrays +* [ARROW-12771](https://issues.apache.org/jira/browse/ARROW-12771) - [C++] Arrow compute hash\_count skips following chunked arrays in streaming execution +* [ARROW-12772](https://issues.apache.org/jira/browse/ARROW-12772) - [CI] Merge script test fails due to missing dependency +* [ARROW-12773](https://issues.apache.org/jira/browse/ARROW-12773) - [Docs] Clarify Java support for ORC and Parquet via JNI bindings +* [ARROW-12774](https://issues.apache.org/jira/browse/ARROW-12774) - [C++][Compute] replace\_substring\_regex() creates invalid arrays =\> crash +* [ARROW-12776](https://issues.apache.org/jira/browse/ARROW-12776) - [Archery][Integration] Fix decimal case generation in write\_js\_test\_json +* [ARROW-12779](https://issues.apache.org/jira/browse/ARROW-12779) - [Python][FlightRPC] Flight server segfaults with certain data +* [ARROW-12780](https://issues.apache.org/jira/browse/ARROW-12780) - [CI][C++] MinGW builds failing when trying to build Gandiva +* [ARROW-12790](https://issues.apache.org/jira/browse/ARROW-12790) - [Python] Cannot read from HDFS with blanks in path names +* [ARROW-12793](https://issues.apache.org/jira/browse/ARROW-12793) - [Python] PYARROW\_BUILD\_TYPE=Debug does not work correctly +* [ARROW-12797](https://issues.apache.org/jira/browse/ARROW-12797) - [JS] Update readme with new links and remove outdated examples +* [ARROW-12798](https://issues.apache.org/jira/browse/ARROW-12798) - [JS] Use == null Comparison +* [ARROW-12799](https://issues.apache.org/jira/browse/ARROW-12799) - [JS] Use Nullish Coalescing Operator (??) 
For Defaults +* [ARROW-12804](https://issues.apache.org/jira/browse/ARROW-12804) - [C++] Array methods IsNull and IsValid are confused for NullType +* [ARROW-12807](https://issues.apache.org/jira/browse/ARROW-12807) - [C++] Fix merge conflicts with Future refactor/async IPC +* [ARROW-12838](https://issues.apache.org/jira/browse/ARROW-12838) - [Java][Gandiva] Fix JNI CI test for Gandiva +* [ARROW-12842](https://issues.apache.org/jira/browse/ARROW-12842) - [Java][FlightRPC] Error metadata from FlightStatusException is not propagated to client +* [ARROW-12850](https://issues.apache.org/jira/browse/ARROW-12850) - [R] is.nan() evaluates to null on Arrow null values +* [ARROW-12854](https://issues.apache.org/jira/browse/ARROW-12854) - [Dev][Release] Windows wheel verification script fails to download artifacts +* [ARROW-12857](https://issues.apache.org/jira/browse/ARROW-12857) - [C++] hash\_aggregate\_test not building on master +* [ARROW-12864](https://issues.apache.org/jira/browse/ARROW-12864) - [C++] Remove needless out argument from arrow::internal::InvertBitmap +* [ARROW-12865](https://issues.apache.org/jira/browse/ARROW-12865) - [C++][Python] Python FlightRPC server cannot find RE2 symbols +* [ARROW-12882](https://issues.apache.org/jira/browse/ARROW-12882) - [C++][Gandiva] Fix behavior of convert\_replace function for empty replacement char +* [ARROW-12887](https://issues.apache.org/jira/browse/ARROW-12887) - [CI] AppVeyor pip install failure during setup +* [ARROW-12906](https://issues.apache.org/jira/browse/ARROW-12906) - [Python] \`fill\_null\` called with a null value seg faults on non fixed-sized types. +* [ARROW-12907](https://issues.apache.org/jira/browse/ARROW-12907) - [Java] Memory leak possible when exception reading from channel happens +* [ARROW-12911](https://issues.apache.org/jira/browse/ARROW-12911) - [Python] Export scalar aggregate options to pc.sum (sum of zero rows gives null; should give 0) +* [ARROW-12917](https://issues.apache.org/jira/browse/ARROW-12917) - [C++][R][pyarrow] Failure importing some decimal types using the C data interface +* [ARROW-12918](https://issues.apache.org/jira/browse/ARROW-12918) - [C++] Build errors with Visual Studio 16.10.31321.278 +* [ARROW-12919](https://issues.apache.org/jira/browse/ARROW-12919) - [Developer Tools] Crossbow comment bot failing to react to comments +* [ARROW-12935](https://issues.apache.org/jira/browse/ARROW-12935) - [C++][CI] Compiler error on some clang versions +* [ARROW-12941](https://issues.apache.org/jira/browse/ARROW-12941) - [C++] csv reader skip\_row does not properly update num\_rows\_seen +* [ARROW-12942](https://issues.apache.org/jira/browse/ARROW-12942) - [C++][Compute] The result of Arrow compute hash\_min\_max is incorrect if there are new groups in the subsequent chunks +* [ARROW-12956](https://issues.apache.org/jira/browse/ARROW-12956) - [C++] Fix crash on Parquet file (OSS-Fuzz) +* [ARROW-12969](https://issues.apache.org/jira/browse/ARROW-12969) - [C++] match\_substring doesn't match empty needle to empty haystack +* [ARROW-12974](https://issues.apache.org/jira/browse/ARROW-12974) - [R] test-r-without-arrow build fails because of example requiring Arrow +* [ARROW-12983](https://issues.apache.org/jira/browse/ARROW-12983) - [C++][Python] Converter::Extend gets stuck in infinite loop causing OOM if values don't fit in single chunk +* [ARROW-12987](https://issues.apache.org/jira/browse/ARROW-12987) - [CI] test-ubuntu-18.04 nightly builds are failing due to Gandiva "TestUpper" test failure +*
[ARROW-12988](https://issues.apache.org/jira/browse/ARROW-12988) - [CI] The kartothek nightly integration build is failing (test\_update\_dataset\_from\_ddf\_empty) +* [ARROW-12989](https://issues.apache.org/jira/browse/ARROW-12989) - [CI] "Dev PR" jobs unduly cancelled +* [ARROW-12991](https://issues.apache.org/jira/browse/ARROW-12991) - [CI] Travis ARM builds often crash +* [ARROW-12993](https://issues.apache.org/jira/browse/ARROW-12993) - [Python] Address boundary error with invalid Feather file and stackprinter +* [ARROW-12995](https://issues.apache.org/jira/browse/ARROW-12995) - [C++] CSV reader should validate options +* [ARROW-12998](https://issues.apache.org/jira/browse/ARROW-12998) - [C++] Datasets needs dependency on xsimd +* [ARROW-13001](https://issues.apache.org/jira/browse/ARROW-13001) - [Go] Build failure in parquet/internal/bmi on s390x +* [ARROW-13003](https://issues.apache.org/jira/browse/ARROW-13003) - [C++] unaligned access in compute/exec/ cc files +* [ARROW-13008](https://issues.apache.org/jira/browse/ARROW-13008) - [C++] Deprecation warning when compiling minimal example +* [ARROW-13010](https://issues.apache.org/jira/browse/ARROW-13010) - [C++][Compute] Support outputting to slices from kleene kernels +* [ARROW-13018](https://issues.apache.org/jira/browse/ARROW-13018) - [C++][Docs] Use consistent terminology for nulls (min\_count) in scalar aggregate kernels +* [ARROW-13026](https://issues.apache.org/jira/browse/ARROW-13026) - [C++][CI] s390x job setup fails +* [ARROW-13037](https://issues.apache.org/jira/browse/ARROW-13037) - [R] Incorrect param when creating Expression crashes R +* [ARROW-13039](https://issues.apache.org/jira/browse/ARROW-13039) - [R] Fix error message handling +* [ARROW-13041](https://issues.apache.org/jira/browse/ARROW-13041) - [C++] Unary kernels can leave uninitialized data under null entries +* [ARROW-13046](https://issues.apache.org/jira/browse/ARROW-13046) - [Release] JS package failing test prior to publish +* [ARROW-13048](https://issues.apache.org/jira/browse/ARROW-13048) - [C++] S3FileSystem fails moving filepaths containing = or + +* [ARROW-13053](https://issues.apache.org/jira/browse/ARROW-13053) - [Python] Build fails on MacOS Big Sur using homebrewed Arrow libraries +* [ARROW-13069](https://issues.apache.org/jira/browse/ARROW-13069) - [Website] Add Daniël to committer list +* [ARROW-13073](https://issues.apache.org/jira/browse/ARROW-13073) - [Developer] archery benchmark list: unexpected keyword 'benchmark\_filter' +* [ARROW-13080](https://issues.apache.org/jira/browse/ARROW-13080) - [Release] Generate the API docs in ubuntu 20.10 +* [ARROW-13083](https://issues.apache.org/jira/browse/ARROW-13083) - [Python] Wrong SCM version detection both in setup.py and crossbow +* [ARROW-13085](https://issues.apache.org/jira/browse/ARROW-13085) - [Python] Apache Arrow minimal cpp build segfaults with pyarrow libs +* [ARROW-13090](https://issues.apache.org/jira/browse/ARROW-13090) - [Python] Test failure with fsspec 2021.6.0 +* [ARROW-13104](https://issues.apache.org/jira/browse/ARROW-13104) - [C++] ByteStreamSplit implementation uses invalid pointer cast +* [ARROW-13108](https://issues.apache.org/jira/browse/ARROW-13108) - [Python] Pyarrow 4.0.0 crashes upon import on macOS 10.13.6 +* [ARROW-13116](https://issues.apache.org/jira/browse/ARROW-13116) - [R] Test for
RecordBatchReader to C-interface fails on arrow-r-minimal due to missing dependencies +* [ARROW-13125](https://issues.apache.org/jira/browse/ARROW-13125) - [R] Throw error when 2+ args passed to desc() in arrange() +* [ARROW-13128](https://issues.apache.org/jira/browse/ARROW-13128) - [C\#] TimestampArray conversion logic for nano and micro is wrong +* [ARROW-13135](https://issues.apache.org/jira/browse/ARROW-13135) - [C++] Fix Status propagation in END\_PARQUET\_CATCH\_EXCEPTIONS +* [ARROW-13139](https://issues.apache.org/jira/browse/ARROW-13139) - [C++] ReadaheadGenerator cannot be safely copied/moved +* [ARROW-13145](https://issues.apache.org/jira/browse/ARROW-13145) - [C++][CI] Flight test crashes on MinGW +* [ARROW-13148](https://issues.apache.org/jira/browse/ARROW-13148) - [Dev][Archery] Crossbow build submission fails +* [ARROW-13153](https://issues.apache.org/jira/browse/ARROW-13153) - [C++] \`parquet\_dataset\` loses ordering of files in \`\_metadata\` +* [ARROW-13154](https://issues.apache.org/jira/browse/ARROW-13154) - [C++] Unions can not have 126 and 127 as type\_codes +* [ARROW-13169](https://issues.apache.org/jira/browse/ARROW-13169) - [R] [C++] sorted partition keys can cause issues +* [ARROW-13173](https://issues.apache.org/jira/browse/ARROW-13173) - [C++] TestAsyncUtil.ReadaheadFailed asserts occasionally +* [ARROW-13187](https://issues.apache.org/jira/browse/ARROW-13187) - [C++][Python] Possibly memory not deallocated when reading in CSV +* [ARROW-13189](https://issues.apache.org/jira/browse/ARROW-13189) - [R] Disable row-level metadata application on datasets +* [ARROW-13203](https://issues.apache.org/jira/browse/ARROW-13203) - [R] Fix optional component checks causing failures +* [ARROW-13207](https://issues.apache.org/jira/browse/ARROW-13207) - [Python][Doc] Dataset documentation still suggests deprecated scan method as the preferred iterative approach +* [ARROW-13216](https://issues.apache.org/jira/browse/ARROW-13216) - [R] Type checks test fails with rtools35 +* [ARROW-13217](https://issues.apache.org/jira/browse/ARROW-13217) - [C++][Gandiva] Correct convert\_replace function for invalid chars on string beginning +* [ARROW-13223](https://issues.apache.org/jira/browse/ARROW-13223) - [C++][CI] Fix thread sanitizer failures +* [ARROW-13225](https://issues.apache.org/jira/browse/ARROW-13225) - [Go][Flight] Implement Custom Middleware Interface and Enable Integration Tests +* [ARROW-13229](https://issues.apache.org/jira/browse/ARROW-13229) - [Python] ascii\_trim, ascii\_ltrim and ascii\_rtrim lack options +* [ARROW-13239](https://issues.apache.org/jira/browse/ARROW-13239) - [Doc][Python] Dataset.head function doesn't mention required argument +* [ARROW-13243](https://issues.apache.org/jira/browse/ARROW-13243) - [R] altrep function call in R 3.5 +* [ARROW-13246](https://issues.apache.org/jira/browse/ARROW-13246) - [C++] CSV skip\_rows\_after\_names can discard data prematurely +* [ARROW-13249](https://issues.apache.org/jira/browse/ARROW-13249) - [Java][CI] Consistent timeout in the Java JNI build +* [ARROW-13253](https://issues.apache.org/jira/browse/ARROW-13253) - [C++][FlightRPC] Segfault when sending record batch \>2GB +* [ARROW-13254](https://issues.apache.org/jira/browse/ARROW-13254) - [Python] Processes killed and semaphore objects leaked when reading pandas data +* [ARROW-13265](https://issues.apache.org/jira/browse/ARROW-13265) - [R] cli valgrind errors in nightlies +* [ARROW-13266](https://issues.apache.org/jira/browse/ARROW-13266) - [JS] Improve benchmark names
& add suite name to json +* [ARROW-13281](https://issues.apache.org/jira/browse/ARROW-13281) - [C++][Gandiva] Error on timestampDiffMonth function behavior for negative diff values +* [ARROW-13284](https://issues.apache.org/jira/browse/ARROW-13284) - [C++] Wrong pkg\_check\_modules() option name +* [ARROW-13288](https://issues.apache.org/jira/browse/ARROW-13288) - [Python] Missing default values of kernel options in PyArrow +* [ARROW-13290](https://issues.apache.org/jira/browse/ARROW-13290) - Compilation fails on clang-12 and gcc-11 due to missing include +* [ARROW-13305](https://issues.apache.org/jira/browse/ARROW-13305) - [C++] Unable to install nightly on Ubuntu 21.04 due to CSV options +* [ARROW-13315](https://issues.apache.org/jira/browse/ARROW-13315) - [R] Wrap r\_task\_group includes with ARROW\_R\_WITH\_ARROW checking +* [ARROW-13321](https://issues.apache.org/jira/browse/ARROW-13321) - [C++][Python] MakeArrayFromScalar doesn't work for FixedSizeBinaryType +* [ARROW-13324](https://issues.apache.org/jira/browse/ARROW-13324) - [R] Typo in bindings for utf8\_reverse and ascii\_reverse +* [ARROW-13332](https://issues.apache.org/jira/browse/ARROW-13332) - [C++] TSAN failure in TestAsyncUtil.ReadaheadFailed +* [ARROW-13341](https://issues.apache.org/jira/browse/ARROW-13341) - [C++] Segfault in arrow-compute-plan-test ExecPlanExecution.SourceScalarAggSink +* [ARROW-13350](https://issues.apache.org/jira/browse/ARROW-13350) - [Python][CI] conda-python-3.7-pandas-0.24 nightly build failing in test\_extract\_datetime\_components +* [ARROW-13352](https://issues.apache.org/jira/browse/ARROW-13352) - [C++] Valgrind failure in case\_when kernel +* [ARROW-13353](https://issues.apache.org/jira/browse/ARROW-13353) - [Documentation] Build failing with sphinx.util.cfamily.DefinitionError +* [ARROW-13360](https://issues.apache.org/jira/browse/ARROW-13360) - [C++] Missing dependencies in C++ thirdparty offline dependencies versions.txt +* [ARROW-13363](https://issues.apache.org/jira/browse/ARROW-13363) - [R] is.nan() errors on non-floating point data +* [ARROW-13368](https://issues.apache.org/jira/browse/ARROW-13368) - [C++][Doc] Rename project to make\_struct in docs +* [ARROW-13381](https://issues.apache.org/jira/browse/ARROW-13381) - [C++] ArrayFromJSON doesn't work for float value dictionary type +* [ARROW-13382](https://issues.apache.org/jira/browse/ARROW-13382) - [C++] Aggregation over scalars fails autobrew R job +* [ARROW-13384](https://issues.apache.org/jira/browse/ARROW-13384) - [C++] Specify minimum required zstd version in cmake +* [ARROW-13391](https://issues.apache.org/jira/browse/ARROW-13391) - [C++] CSV streaming reader does not include same error information as table reader +* [ARROW-13417](https://issues.apache.org/jira/browse/ARROW-13417) - [C++] The merged generator can sometimes pull from source sync-reentrant +* [ARROW-13419](https://issues.apache.org/jira/browse/ARROW-13419) - [JS] Fix perf tests +* [ARROW-13428](https://issues.apache.org/jira/browse/ARROW-13428) - [C++][Flight] -lssl is missing with bundled gRPC and system shared OpenSSL +* [ARROW-13431](https://issues.apache.org/jira/browse/ARROW-13431) - [Release] Bump go version to 1.15; don't verify rust source anymore +* [ARROW-13432](https://issues.apache.org/jira/browse/ARROW-13432) - [Release] Fix ssh connection to the binary uploader container + + +## New Features and Improvements + +* [ARROW-2665](https://issues.apache.org/jira/browse/ARROW-2665) - [Python/C++] Add index() method to find first occurrence of Python scalar
+* [ARROW-3014](https://issues.apache.org/jira/browse/ARROW-3014) - [C++] Minimal writer adapter for ORC file format +* [ARROW-3316](https://issues.apache.org/jira/browse/ARROW-3316) - [R] Multi-threaded conversion from R data.frame to Arrow table / record batch +* [ARROW-5385](https://issues.apache.org/jira/browse/ARROW-5385) - [Go] implement EXTENSION datatype +* [ARROW-5640](https://issues.apache.org/jira/browse/ARROW-5640) - [Go] implement Map array +* [ARROW-6513](https://issues.apache.org/jira/browse/ARROW-6513) - [CI] The conda environment files arrow/ci/conda\_env\_\*.yml should have .txt extension +* [ARROW-7001](https://issues.apache.org/jira/browse/ARROW-7001) - [C++] Develop threading APIs to accommodate nested parallelism +* [ARROW-7114](https://issues.apache.org/jira/browse/ARROW-7114) - [JS][CI] NodeJS build fails on Github Actions Windows node +* [ARROW-7252](https://issues.apache.org/jira/browse/ARROW-7252) - [Rust] [Parquet] Reading UTF-8/JSON/ENUM field results in a lot of vec allocation +* [ARROW-7396](https://issues.apache.org/jira/browse/ARROW-7396) - [Format] Register media types (MIME types) for Apache Arrow formats to IANA +* [ARROW-8421](https://issues.apache.org/jira/browse/ARROW-8421) - [Rust] [Parquet] Implement parquet writer +* [ARROW-8459](https://issues.apache.org/jira/browse/ARROW-8459) - [Dev][Archery] Use a more recent cmake-format +* [ARROW-8527](https://issues.apache.org/jira/browse/ARROW-8527) - [C++][CSV] Add support for ReadOptions::skip\_rows \>= block\_size +* [ARROW-8655](https://issues.apache.org/jira/browse/ARROW-8655) - [C++][Dataset][Python][R] Preserve partitioning information for a discovered Dataset +* [ARROW-8676](https://issues.apache.org/jira/browse/ARROW-8676) - [Rust] Create implementation of IPC RecordBatch body buffer compression from ARROW-300 +* [ARROW-9054](https://issues.apache.org/jira/browse/ARROW-9054) - [C++] Add ScalarAggregateOptions +* [ARROW-9056](https://issues.apache.org/jira/browse/ARROW-9056) - [C++] Support scalar aggregation over scalars +* [ARROW-9140](https://issues.apache.org/jira/browse/ARROW-9140) - [R] Zero-copy Arrow to R where possible +* [ARROW-9295](https://issues.apache.org/jira/browse/ARROW-9295) - [Archery] Support rust clippy in the lint command +* [ARROW-9299](https://issues.apache.org/jira/browse/ARROW-9299) - [Python] Expose ORC metadata() in Python ORCFile +* [ARROW-9313](https://issues.apache.org/jira/browse/ARROW-9313) - [Rust] Use feature enum +* [ARROW-9421](https://issues.apache.org/jira/browse/ARROW-9421) - [C++][Parquet] Redundancies SchemaManifest::GetFieldIndices +* [ARROW-9430](https://issues.apache.org/jira/browse/ARROW-9430) - [C++/Python] Kernel for SetItem(BooleanArray, values) +* [ARROW-9697](https://issues.apache.org/jira/browse/ARROW-9697) - [C++][Dataset] num\_rows method for Dataset/Scanner +* [ARROW-10031](https://issues.apache.org/jira/browse/ARROW-10031) - [Java] Support Java benchmark in Archery +* [ARROW-10115](https://issues.apache.org/jira/browse/ARROW-10115) - [C++] CSV empty quoted string is treated as NULL +* [ARROW-10316](https://issues.apache.org/jira/browse/ARROW-10316) - [Python] Consider using \_\_wrapped\_\_ for compute function introspection +* [ARROW-10391](https://issues.apache.org/jira/browse/ARROW-10391) - [Rust] [Parquet] Nested Arrow reader +*
[ARROW-10440](https://issues.apache.org/jira/browse/ARROW-10440) - [C++][Dataset][Python] Add a callback to visit file writers just before Finish() +* [ARROW-10550](https://issues.apache.org/jira/browse/ARROW-10550) - [Rust] [Parquet] Write nested types (struct, list) +* [ARROW-10557](https://issues.apache.org/jira/browse/ARROW-10557) - [C++] Add scalar string slicing/substring extract kernel +* [ARROW-10640](https://issues.apache.org/jira/browse/ARROW-10640) - [C++] An "if\_else" kernel to combine two arrays based on a mask +* [ARROW-10658](https://issues.apache.org/jira/browse/ARROW-10658) - [Python][Packaging] Wheel builds for Apple Silicon +* [ARROW-10675](https://issues.apache.org/jira/browse/ARROW-10675) - [C++][Python] Support AWS S3 Web identity credentials +* [ARROW-10797](https://issues.apache.org/jira/browse/ARROW-10797) - [C++] Investigate faster random generation for tests and benchmarks +* [ARROW-10926](https://issues.apache.org/jira/browse/ARROW-10926) - [Rust] Add parquet reader / writer for decimal types +* [ARROW-10959](https://issues.apache.org/jira/browse/ARROW-10959) - [C++] Add scalar string join kernel +* [ARROW-11061](https://issues.apache.org/jira/browse/ARROW-11061) - [Rust] Validate array properties against schema +* [ARROW-11173](https://issues.apache.org/jira/browse/ARROW-11173) - Add Map type as reader / writer in FieldReader / FieldWriter +* [ARROW-11199](https://issues.apache.org/jira/browse/ARROW-11199) - [C++][Python] Fix the unit tests for the ORC reader +* [ARROW-11206](https://issues.apache.org/jira/browse/ARROW-11206) - [C++][Compute][Python] Rename "project" kernel to "make\_struct" +* [ARROW-11342](https://issues.apache.org/jira/browse/ARROW-11342) - [Python] [Gandiva] Expose ToString and result type information +* [ARROW-11499](https://issues.apache.org/jira/browse/ARROW-11499) - [Packaging] Remove all use of bintray +* [ARROW-11514](https://issues.apache.org/jira/browse/ARROW-11514) - [R][C++] Bindings for paste(), paste0(), str\_c() +* [ARROW-11515](https://issues.apache.org/jira/browse/ARROW-11515) - [R] Bindings for strsplit +* [ARROW-11565](https://issues.apache.org/jira/browse/ARROW-11565) - [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT\_CAP function +* [ARROW-11581](https://issues.apache.org/jira/browse/ARROW-11581) - [Packaging][C++] Formalize distribution through vcpkg +* [ARROW-11608](https://issues.apache.org/jira/browse/ARROW-11608) - [CI] turbodbc integration tests are failing (build issue) +* [ARROW-11660](https://issues.apache.org/jira/browse/ARROW-11660) - [C++] Move RecordBatch::SelectColumns method from R to C++ library +* [ARROW-11673](https://issues.apache.org/jira/browse/ARROW-11673) - [C++] Casting dictionary type to use different index type +* [ARROW-11675](https://issues.apache.org/jira/browse/ARROW-11675) - [CI][C++] Resolve ctest failures on VS 2019 builds +* [ARROW-11705](https://issues.apache.org/jira/browse/ARROW-11705) - [R] Support scalar value recycling in RecordBatch/Table$create() +* [ARROW-11759](https://issues.apache.org/jira/browse/ARROW-11759) - [C++] Kernel to extract datetime components (year, month, day, etc) from timestamp type +* [ARROW-11769](https://issues.apache.org/jira/browse/ARROW-11769) - [R] Pull groups from grouped\_df into RecordBatch or Table +* [ARROW-11772](https://issues.apache.org/jira/browse/ARROW-11772) - [C++] Add asynchronous read to ipc::RecordBatchFileReader +* [ARROW-11782](https://issues.apache.org/jira/browse/ARROW-11782) - [GLib][Ruby][Dataset] Remove bindings
for internal classes +* [ARROW-11787](https://issues.apache.org/jira/browse/ARROW-11787) - [R] Implement write csv +* [ARROW-11843](https://issues.apache.org/jira/browse/ARROW-11843) - [C++] Add asynchronous read to parquet::arrow::FileReader +* [ARROW-11849](https://issues.apache.org/jira/browse/ARROW-11849) - [R] Use roxygen @examplesIf tag in R docs +* [ARROW-11889](https://issues.apache.org/jira/browse/ARROW-11889) - [C++] Add parallelism to streaming CSV reader +* [ARROW-11909](https://issues.apache.org/jira/browse/ARROW-11909) - [C++] Get rid of MakeIteratorGenerator +* [ARROW-11926](https://issues.apache.org/jira/browse/ARROW-11926) - [R] Pass on the new UCRT CRAN windows builds +* [ARROW-11928](https://issues.apache.org/jira/browse/ARROW-11928) - [C++][Compute] Add ExecNode hierarchy +* [ARROW-11929](https://issues.apache.org/jira/browse/ARROW-11929) - [C++][Compute] Promote Expression to the compute namespace +* [ARROW-11930](https://issues.apache.org/jira/browse/ARROW-11930) - [C++][Dataset][Compute] Refactor Dataset scans to use an ExecNode graph +* [ARROW-11932](https://issues.apache.org/jira/browse/ARROW-11932) - [C++] Provide ArrayBuilder::AppendScalar +* [ARROW-11950](https://issues.apache.org/jira/browse/ARROW-11950) - [C++][Compute] Add unary negative kernel +* [ARROW-11960](https://issues.apache.org/jira/browse/ARROW-11960) - [C++][Gandiva] Support escape in LIKE +* [ARROW-11980](https://issues.apache.org/jira/browse/ARROW-11980) - [Python] Remove "experimental" status from Table.replace\_schema\_metadata +* [ARROW-11986](https://issues.apache.org/jira/browse/ARROW-11986) - [C++][Gandiva] Implement IN expressions for doubles and floats +* [ARROW-11990](https://issues.apache.org/jira/browse/ARROW-11990) - [C++][Compute] Use Status/Result return consistently to indicate errors +* [ARROW-12004](https://issues.apache.org/jira/browse/ARROW-12004) - [C++] Result is annoying +* [ARROW-12010](https://issues.apache.org/jira/browse/ARROW-12010) - [C++][Compute] Improve performance of the hash table used in GroupIdentifier +* [ARROW-12016](https://issues.apache.org/jira/browse/ARROW-12016) - [C++] Implement array\_sort\_indices and sort\_indices for BOOL type +* [ARROW-12050](https://issues.apache.org/jira/browse/ARROW-12050) - [C++][Python][FlightRPC] Use StopToken to enable interrupting long Flight operations +* [ARROW-12074](https://issues.apache.org/jira/browse/ARROW-12074) - [C++][Compute] Add scalar arithmetic kernels for decimal inputs +* [ARROW-12083](https://issues.apache.org/jira/browse/ARROW-12083) - [R] schema use in open\_dataset +* [ARROW-12092](https://issues.apache.org/jira/browse/ARROW-12092) - [R] Make expect\_dplyr\_equal() a bit stricter +* [ARROW-12166](https://issues.apache.org/jira/browse/ARROW-12166) - [C++][Gandiva] Implements CONVERT\_TO(value, type) function +* [ARROW-12184](https://issues.apache.org/jira/browse/ARROW-12184) - [R] Bindings for na.fail, na.omit, na.exclude, na.pass +* [ARROW-12185](https://issues.apache.org/jira/browse/ARROW-12185) - [R] Bindings for any, all +* [ARROW-12198](https://issues.apache.org/jira/browse/ARROW-12198) - [R] bindings for strptime +* [ARROW-12199](https://issues.apache.org/jira/browse/ARROW-12199) - [R] bindings for stddev, variance +* [ARROW-12205](https://issues.apache.org/jira/browse/ARROW-12205) - [C++][Gandiva] Implement TO\_TIME([number] secs) and TO\_TIMESTAMP([number] secs) function +*
[ARROW-12231](https://issues.apache.org/jira/browse/ARROW-12231) - [C++][Dataset] Separate datasets backed by readers from InMemoryDataset +* [ARROW-12253](https://issues.apache.org/jira/browse/ARROW-12253) - [Rust] [Ballista] Implement scalable joins +* [ARROW-12255](https://issues.apache.org/jira/browse/ARROW-12255) - [Rust] [Ballista] Integrate scheduler with DataFusion +* [ARROW-12256](https://issues.apache.org/jira/browse/ARROW-12256) - [Rust] [Ballista] Add DataFrame support +* [ARROW-12257](https://issues.apache.org/jira/browse/ARROW-12257) - [Rust] [Ballista] Publish user guide to Arrow site +* [ARROW-12261](https://issues.apache.org/jira/browse/ARROW-12261) - [Rust] [Ballista] Ballista should not have its own DataFrame API +* [ARROW-12291](https://issues.apache.org/jira/browse/ARROW-12291) - [R] Determine the type of an unevaluated expression +* [ARROW-12310](https://issues.apache.org/jira/browse/ARROW-12310) - [Java] ValueVector\#getObject should support covariance for complex types +* [ARROW-12355](https://issues.apache.org/jira/browse/ARROW-12355) - [C++] Implement efficient async CSV scanning +* [ARROW-12362](https://issues.apache.org/jira/browse/ARROW-12362) - [Rust] [DataFusion] topk\_query test failure +* [ARROW-12364](https://issues.apache.org/jira/browse/ARROW-12364) - [Python] [Dataset] Add metadata\_collector option to ds.write\_dataset() +* [ARROW-12378](https://issues.apache.org/jira/browse/ARROW-12378) - [C++][Gandiva] Implement castVARBINARY functions +* [ARROW-12386](https://issues.apache.org/jira/browse/ARROW-12386) - [C++] Support file parallelism in AsyncScanner +* [ARROW-12391](https://issues.apache.org/jira/browse/ARROW-12391) - [Rust][DataFusion] Implement date\_trunc() function +* [ARROW-12392](https://issues.apache.org/jira/browse/ARROW-12392) - [C++] Restore asynchronous streaming CSV reader +* [ARROW-12393](https://issues.apache.org/jira/browse/ARROW-12393) - [JS] Optimally use closure compiler +* [ARROW-12403](https://issues.apache.org/jira/browse/ARROW-12403) - [Rust] [Ballista] Integration tests should check that query results are correct +* [ARROW-12415](https://issues.apache.org/jira/browse/ARROW-12415) - [CI] [Python] ERROR: Failed building wheel for pygit2 on ARM64 +* [ARROW-12424](https://issues.apache.org/jira/browse/ARROW-12424) - [Go][Parquet] Add Schema Package +* [ARROW-12428](https://issues.apache.org/jira/browse/ARROW-12428) - [Python] pyarrow.parquet.read\_\* should use pre\_buffer=True +* [ARROW-12434](https://issues.apache.org/jira/browse/ARROW-12434) - [Rust] [Ballista] Show executed plans with metrics +* [ARROW-12442](https://issues.apache.org/jira/browse/ARROW-12442) - [CI] Set job timeouts on GitHub Actions +* [ARROW-12443](https://issues.apache.org/jira/browse/ARROW-12443) - [C++][Gandiva] Implement castVARCHAR function for binary input +* [ARROW-12444](https://issues.apache.org/jira/browse/ARROW-12444) - [RUST] [CI] Remove Rust and point integration tests to arrow-rs repo +* [ARROW-12445](https://issues.apache.org/jira/browse/ARROW-12445) - [Rust] Design and implement packaging process to bundle Rust in signed tar +* [ARROW-12468](https://issues.apache.org/jira/browse/ARROW-12468) - [Python][R] Expose UseAsync to python/R +* [ARROW-12478](https://issues.apache.org/jira/browse/ARROW-12478) - [C++] Support LLVM 12 +* [ARROW-12484](https://issues.apache.org/jira/browse/ARROW-12484) - [CI] Change jinja macros to not require CROSSBOW\_TOKEN to upload artifacts in Github Actions +* 
[ARROW-12489](https://issues.apache.org/jira/browse/ARROW-12489) - [Developer] autotune is broken +* [ARROW-12490](https://issues.apache.org/jira/browse/ARROW-12490) - [Dev] Use miniforge for all platforms +* [ARROW-12492](https://issues.apache.org/jira/browse/ARROW-12492) - [Python] Add a helper method to decode a DictionaryArray back to a plain Array +* [ARROW-12496](https://issues.apache.org/jira/browse/ARROW-12496) - [C++][Dataset] Ensure Scanner tests fully cover async +* [ARROW-12499](https://issues.apache.org/jira/browse/ARROW-12499) - [C++][Compute][R] Add ScalarAggregateOptions to Any and All kernels +* [ARROW-12500](https://issues.apache.org/jira/browse/ARROW-12500) - [C++][Dataset] Consolidate similar tests for file formats +* [ARROW-12501](https://issues.apache.org/jira/browse/ARROW-12501) - [CI][Ruby] Remove needless workaround for MinGW build +* [ARROW-12507](https://issues.apache.org/jira/browse/ARROW-12507) - [CI] Remove duplicated cron/nightly builds +* [ARROW-12512](https://issues.apache.org/jira/browse/ARROW-12512) - [C++][Dataset] Implement CSV writing support +* [ARROW-12514](https://issues.apache.org/jira/browse/ARROW-12514) - [Release] Don't run Gandiva related Ruby test with ARROW\_GANDIVA=OFF +* [ARROW-12517](https://issues.apache.org/jira/browse/ARROW-12517) - [Go] Expose App Metadata in Flight client +* [ARROW-12518](https://issues.apache.org/jira/browse/ARROW-12518) - [Python] Expose Parquet statistics has\_null\_count / has\_distinct\_count +* [ARROW-12520](https://issues.apache.org/jira/browse/ARROW-12520) - [R] Minor docs updates +* [ARROW-12522](https://issues.apache.org/jira/browse/ARROW-12522) - [C++] Implement asynchronous/"lazy" variants of ReadRangeCache +* [ARROW-12525](https://issues.apache.org/jira/browse/ARROW-12525) - [JS] Vector toJSON returns an array +* [ARROW-12527](https://issues.apache.org/jira/browse/ARROW-12527) - [Dev] Don't try getting JIRA information for MINOR PR +* [ARROW-12528](https://issues.apache.org/jira/browse/ARROW-12528) - [JS] Support typed arrays in Table.new +* [ARROW-12530](https://issues.apache.org/jira/browse/ARROW-12530) - [C++] Remove Buffer::mutable\_data\_ member and use const\_cast on data\_ only if is\_mutable\_ is true +* [ARROW-12533](https://issues.apache.org/jira/browse/ARROW-12533) - [C++] Random real generator is slow on Arm64 Linux when built with clang +* [ARROW-12534](https://issues.apache.org/jira/browse/ARROW-12534) - [C++][Gandiva] Implement LEFT and RIGHT functions on Gandiva for string input values +* [ARROW-12537](https://issues.apache.org/jira/browse/ARROW-12537) - [JS] Docs build should not include test sources +* [ARROW-12541](https://issues.apache.org/jira/browse/ARROW-12541) - [Docs] Improve styling/readability of tables in the new doc theme +* [ARROW-12551](https://issues.apache.org/jira/browse/ARROW-12551) - [Java][Release] Java post-release tests fail due to missing testing data +* [ARROW-12554](https://issues.apache.org/jira/browse/ARROW-12554) - Allow duplicates in the value\_set for compute::is\_in +* [ARROW-12555](https://issues.apache.org/jira/browse/ARROW-12555) - [Java][Release] Java post-release script misses dataset JNI bindings +* [ARROW-12556](https://issues.apache.org/jira/browse/ARROW-12556) - [C++][Gandiva] Implement BYTESUBSTRING functions on Gandiva +* [ARROW-12560](https://issues.apache.org/jira/browse/ARROW-12560) - [C++] Investigate utilizing aggressive thread task creation when adding callback to finished future +*
[ARROW-12567](https://issues.apache.org/jira/browse/ARROW-12567) - [C++][Gandiva] Implement LPAD and RPAD functions for string input values +* [ARROW-12571](https://issues.apache.org/jira/browse/ARROW-12571) - [R][CI] Run nightly R with valgrind +* [ARROW-12575](https://issues.apache.org/jira/browse/ARROW-12575) - [R] Use unary negative kernel +* [ARROW-12577](https://issues.apache.org/jira/browse/ARROW-12577) - [Website] Use Artifactory instead of Bintray in all places +* [ARROW-12578](https://issues.apache.org/jira/browse/ARROW-12578) - [JS] Simplify UTF8 handling in NodeJS +* [ARROW-12581](https://issues.apache.org/jira/browse/ARROW-12581) - [C++][FlightRPC] Benchmark compression with real data +* [ARROW-12584](https://issues.apache.org/jira/browse/ARROW-12584) - [C++][Python] Expose method for benchmarking tools to release unused memory from the allocators +* [ARROW-12591](https://issues.apache.org/jira/browse/ARROW-12591) - [Java][Gandiva] Create single Gandiva jar for MacOS and Linux +* [ARROW-12593](https://issues.apache.org/jira/browse/ARROW-12593) - [Packaging][Ubuntu] Add support for Ubuntu 21.04 +* [ARROW-12597](https://issues.apache.org/jira/browse/ARROW-12597) - [C++] Implement OptionalParallelForAsync +* [ARROW-12598](https://issues.apache.org/jira/browse/ARROW-12598) - [C++][Dataset] Implement row-count for CSV or allow selecting 0 columns from CSV +* [ARROW-12599](https://issues.apache.org/jira/browse/ARROW-12599) - [Doc][Python] Documentation missing for pyarrow.Table +* [ARROW-12600](https://issues.apache.org/jira/browse/ARROW-12600) - [CI] Push docker images from crossbow tasks +* [ARROW-12602](https://issues.apache.org/jira/browse/ARROW-12602) - [R] Add BuildInfo from C++ to arrow\_info +* [ARROW-12608](https://issues.apache.org/jira/browse/ARROW-12608) - [C++] Add split\_pattern\_regex function +* [ARROW-12612](https://issues.apache.org/jira/browse/ARROW-12612) - [C++][Compute] Add Expression to type\_fwd.h +* [ARROW-12619](https://issues.apache.org/jira/browse/ARROW-12619) - [Python] pyarrow sdist should not require git +* [ARROW-12621](https://issues.apache.org/jira/browse/ARROW-12621) - [C++][Gandiva] Add alias to sha1 and sha256 functions +* [ARROW-12631](https://issues.apache.org/jira/browse/ARROW-12631) - [Python] pyarrow.dataset.write\_table should accept a Scanner to write +* [ARROW-12643](https://issues.apache.org/jira/browse/ARROW-12643) - Add documentation for experimental repos +* [ARROW-12645](https://issues.apache.org/jira/browse/ARROW-12645) - [Python] Fix numpydoc validation +* [ARROW-12648](https://issues.apache.org/jira/browse/ARROW-12648) - [C++][FlightRPC] Allow using TLS in benchmark +* [ARROW-12649](https://issues.apache.org/jira/browse/ARROW-12649) - [Python/Packaging] Move conda-aarch64 to Azure with cross-compilation +* [ARROW-12653](https://issues.apache.org/jira/browse/ARROW-12653) - [Archery] allow me to add a comment to crossbow requests +* [ARROW-12658](https://issues.apache.org/jira/browse/ARROW-12658) - [C++] Bump aws-c-common to v0.5.10 +* [ARROW-12660](https://issues.apache.org/jira/browse/ARROW-12660) - [R] Post-4.0 adjustments for CRAN +* [ARROW-12661](https://issues.apache.org/jira/browse/ARROW-12661) - [C++] CSV add skip rows after column names +* [ARROW-12662](https://issues.apache.org/jira/browse/ARROW-12662) - [Website] Force to use squash merge +*
[ARROW-12667](https://issues.apache.org/jira/browse/ARROW-12667) - [Python] Ensure test coverage for conversion of strided numpy arrays +* [ARROW-12675](https://issues.apache.org/jira/browse/ARROW-12675) - [C++] CSV should include line/row numbers in parsing error messages +* [ARROW-12677](https://issues.apache.org/jira/browse/ARROW-12677) - [Python] Add a mask argument to pyarrow.StructArray.from\_arrays +* [ARROW-12685](https://issues.apache.org/jira/browse/ARROW-12685) - [C++][Compute] Add unary absolute value kernel +* [ARROW-12686](https://issues.apache.org/jira/browse/ARROW-12686) - [C++][Python][FlightRPC] Support export\_to\_c in DoGet/inherit from RecordBatchReader +* [ARROW-12687](https://issues.apache.org/jira/browse/ARROW-12687) - [C++][Python][Dataset] Support C Data Interface with Scanner +* [ARROW-12689](https://issues.apache.org/jira/browse/ARROW-12689) - [R] Implement ArrowArrayStream C interface +* [ARROW-12692](https://issues.apache.org/jira/browse/ARROW-12692) - [R] Improve tests and comments for strsplit() bindings +* [ARROW-12694](https://issues.apache.org/jira/browse/ARROW-12694) - [R][CI] rtools35 job failing on 32-bit build tests +* [ARROW-12696](https://issues.apache.org/jira/browse/ARROW-12696) - [R] Improve testing of error messages converted to warnings +* [ARROW-12699](https://issues.apache.org/jira/browse/ARROW-12699) - [CI][Packaging][Java] Generate a jar compatible with Linux and MacOS for all Arrow components +* [ARROW-12702](https://issues.apache.org/jira/browse/ARROW-12702) - [JS] Upgrade Webpack and terser +* [ARROW-12703](https://issues.apache.org/jira/browse/ARROW-12703) - [JS] Separate Table from DataFrame +* [ARROW-12704](https://issues.apache.org/jira/browse/ARROW-12704) - [JS] use optional chaining +* [ARROW-12709](https://issues.apache.org/jira/browse/ARROW-12709) - [C++] Add variadic string join kernel +* [ARROW-12713](https://issues.apache.org/jira/browse/ARROW-12713) - [C++] String reverse kernel +* [ARROW-12715](https://issues.apache.org/jira/browse/ARROW-12715) - [C++] SQL-style glob string match kernel +* [ARROW-12716](https://issues.apache.org/jira/browse/ARROW-12716) - [C++] Left/right/center string padding kernels +* [ARROW-12717](https://issues.apache.org/jira/browse/ARROW-12717) - [C++] Substring find position kernel +* [ARROW-12719](https://issues.apache.org/jira/browse/ARROW-12719) - [C++][Python] pyarrow.fs.S3FileSystem pass extra kwargs i.e ACL +* [ARROW-12721](https://issues.apache.org/jira/browse/ARROW-12721) - [CI] Fix path for uploading aarch64 conda artifacts from the nightly builds +* [ARROW-12722](https://issues.apache.org/jira/browse/ARROW-12722) - [R] Raise error when attempting to print table with duplicated naming +* [ARROW-12730](https://issues.apache.org/jira/browse/ARROW-12730) - [MATLAB] Update featherreadmex and featherwritemex to build against latest arrow c++ APIs +* [ARROW-12731](https://issues.apache.org/jira/browse/ARROW-12731) - [R] Use InMemoryDataset for Table/RecordBatch in dplyr code +* [ARROW-12736](https://issues.apache.org/jira/browse/ARROW-12736) - [C++] Eliminate unnecessary copy in FieldPath::Get() +* [ARROW-12738](https://issues.apache.org/jira/browse/ARROW-12738) - [CI] [Gandiva] Nightly build error in azure-conda-osx-clang-py38 (and py39, py\*-r\*) +* [ARROW-12741](https://issues.apache.org/jira/browse/ARROW-12741) - [CI] Configure GitHub Token for Nightly Builds +* [ARROW-12745](https://issues.apache.org/jira/browse/ARROW-12745) - [C++][Compute] Add floor, ceiling, and truncate kernels +*
[ARROW-12749](https://issues.apache.org/jira/browse/ARROW-12749) - [C++] Unnecessary copy caused by constructing RecordBatch/Table/Schema from lvalues +* [ARROW-12750](https://issues.apache.org/jira/browse/ARROW-12750) - [CI] [R] Actually pass parameterized docker options to the templates +* [ARROW-12751](https://issues.apache.org/jira/browse/ARROW-12751) - [C++] Add variadic row-wise min/max kernels (least/greatest) +* [ARROW-12758](https://issues.apache.org/jira/browse/ARROW-12758) - [R] Add examples to more function documentation +* [ARROW-12760](https://issues.apache.org/jira/browse/ARROW-12760) - [C++][Python][R] S3FileSystem: IO thread parallelism limited to 8 threads +* [ARROW-12761](https://issues.apache.org/jira/browse/ARROW-12761) - [R] Better error handling for write\_to\_raw +* [ARROW-12764](https://issues.apache.org/jira/browse/ARROW-12764) - [CI] Fix arguments in Conda Windows builds +* [ARROW-12777](https://issues.apache.org/jira/browse/ARROW-12777) - [R] Convert all inputs to Arrow objects in match\_arrow and is\_in +* [ARROW-12781](https://issues.apache.org/jira/browse/ARROW-12781) - [R] Implement is.type() functions for dplyr +* [ARROW-12785](https://issues.apache.org/jira/browse/ARROW-12785) - [CI] the r-devdocs build errors when brew installing gcc +* [ARROW-12791](https://issues.apache.org/jira/browse/ARROW-12791) - [R] Better error handling for DatasetFactory$Finish() when no format specified +* [ARROW-12796](https://issues.apache.org/jira/browse/ARROW-12796) - [JS] Support JSON output from benchmarks +* [ARROW-12800](https://issues.apache.org/jira/browse/ARROW-12800) - [JS] Drop IE Support and remove text encoder and decoder polyfills +* [ARROW-12801](https://issues.apache.org/jira/browse/ARROW-12801) - [CI][Packaging][Java] Include all modules in script that generate Arrow jars +* [ARROW-12806](https://issues.apache.org/jira/browse/ARROW-12806) - [Python] test\_write\_to\_dataset\_filesystem missing a dataset mark +* [ARROW-12808](https://issues.apache.org/jira/browse/ARROW-12808) - [JS] Document browser support +* [ARROW-12810](https://issues.apache.org/jira/browse/ARROW-12810) - [Python] Run tests with AWS\_EC2\_METADATA\_DISABLED=true +* [ARROW-12812](https://issues.apache.org/jira/browse/ARROW-12812) - [Packaging][Java] Improve JNI jars build +* [ARROW-12824](https://issues.apache.org/jira/browse/ARROW-12824) - [R][CI] Upgrade builds for R 4.1 release +* [ARROW-12827](https://issues.apache.org/jira/browse/ARROW-12827) - [C++] [Dataset] Review error pass-through in the datasets API +* [ARROW-12829](https://issues.apache.org/jira/browse/ARROW-12829) - [GLib][Ruby] Add support for Apache Arrow Flight +* [ARROW-12831](https://issues.apache.org/jira/browse/ARROW-12831) - [CI][macOS] Remove needless Homebrew workaround +* [ARROW-12832](https://issues.apache.org/jira/browse/ARROW-12832) - [JS] Write benchmarks in TypeScript +* [ARROW-12833](https://issues.apache.org/jira/browse/ARROW-12833) - [JS] Construct perf data in JS +* [ARROW-12835](https://issues.apache.org/jira/browse/ARROW-12835) - [C++] Implement case insensitive match in match\_substring(\_regex) and match\_like +* [ARROW-12836](https://issues.apache.org/jira/browse/ARROW-12836) - [C++] Installation on IBM i fails because of CxxFlags +* [ARROW-12841](https://issues.apache.org/jira/browse/ARROW-12841) - [R] Add examples to more function documentation - part 2 +* [ARROW-12843](https://issues.apache.org/jira/browse/ARROW-12843) - [C++][Compute] Add is\_inf kernel for floating point arrays +*
[ARROW-12848](https://issues.apache.org/jira/browse/ARROW-12848) - [Release] Mail template points to 404 +* [ARROW-12851](https://issues.apache.org/jira/browse/ARROW-12851) - [Go][Parquet] Add Encoding Package Part 1 +* [ARROW-12856](https://issues.apache.org/jira/browse/ARROW-12856) - [C++][Gandiva] Implement castBIT and castBOOLEAN functions on Gandiva +* [ARROW-12859](https://issues.apache.org/jira/browse/ARROW-12859) - [C++] Add ScalarFromJSON for easier testing +* [ARROW-12861](https://issues.apache.org/jira/browse/ARROW-12861) - [C++][Compute] Add sign function kernels +* [ARROW-12867](https://issues.apache.org/jira/browse/ARROW-12867) - [R] Bindings for abs() +* [ARROW-12868](https://issues.apache.org/jira/browse/ARROW-12868) - [R] Bindings for find\_substring and find\_substring\_regex +* [ARROW-12869](https://issues.apache.org/jira/browse/ARROW-12869) - [R] Bindings for utf8\_reverse and ascii\_reverse +* [ARROW-12870](https://issues.apache.org/jira/browse/ARROW-12870) - [R] Bindings for stringr::str\_like +* [ARROW-12875](https://issues.apache.org/jira/browse/ARROW-12875) - [JS] Upgrade Jest and other minor updates +* [ARROW-12883](https://issues.apache.org/jira/browse/ARROW-12883) - [R] [CI] version compatibility fails on R 4.1 +* [ARROW-12891](https://issues.apache.org/jira/browse/ARROW-12891) - [C++][Compute][Dataset] Extract subtree pruning logic to compute:: +* [ARROW-12894](https://issues.apache.org/jira/browse/ARROW-12894) - [R] Bump R version +* [ARROW-12895](https://issues.apache.org/jira/browse/ARROW-12895) - [CI] Use "concurrency" setting on Github Actions +* [ARROW-12898](https://issues.apache.org/jira/browse/ARROW-12898) - [Release][C\#] Package upload script is broken +* [ARROW-12900](https://issues.apache.org/jira/browse/ARROW-12900) - [Python][Documentation] an np import in Reading Datasets docs +* [ARROW-12901](https://issues.apache.org/jira/browse/ARROW-12901) - [R] Follow on to more examples +* [ARROW-12909](https://issues.apache.org/jira/browse/ARROW-12909) - [R][Release] Build of ubuntu-docs is failing +* [ARROW-12912](https://issues.apache.org/jira/browse/ARROW-12912) - [Website] Use .asf.yaml for publishing +* [ARROW-12915](https://issues.apache.org/jira/browse/ARROW-12915) - [Release] Build of ubuntu-docs is failing on thrift +* [ARROW-12936](https://issues.apache.org/jira/browse/ARROW-12936) - [C++][Gandiva] Implement ASCII Hive function on Gandiva +* [ARROW-12937](https://issues.apache.org/jira/browse/ARROW-12937) - [C++] Allow specifying default metadata for new S3 files +* [ARROW-12939](https://issues.apache.org/jira/browse/ARROW-12939) - [R] Simplify RTask stop handling +* [ARROW-12940](https://issues.apache.org/jira/browse/ARROW-12940) - [R] Expose C interface as R6 methods +* [ARROW-12948](https://issues.apache.org/jira/browse/ARROW-12948) - [C++] Add string slice replace kernel +* [ARROW-12949](https://issues.apache.org/jira/browse/ARROW-12949) - [C++] Add string starts-with/ends-with kernels +* [ARROW-12950](https://issues.apache.org/jira/browse/ARROW-12950) - [C++] Add substring count kernel +* [ARROW-12951](https://issues.apache.org/jira/browse/ARROW-12951) - [C++] Refactor StringTransform +* [ARROW-12952](https://issues.apache.org/jira/browse/ARROW-12952) - [C++] Add regex count kernel +* [ARROW-12955](https://issues.apache.org/jira/browse/ARROW-12955) - [C++] Add additional type support for if\_else kernel +* [ARROW-12957](https://issues.apache.org/jira/browse/ARROW-12957) - [R] rchk issues on cran +* 
[ARROW-12961](https://issues.apache.org/jira/browse/ARROW-12961) - [C++] MSVC issues warning building PyArrow on Windows +* [ARROW-12962](https://issues.apache.org/jira/browse/ARROW-12962) - [GLib][Ruby] Add Arrow:Scalar +* [ARROW-12964](https://issues.apache.org/jira/browse/ARROW-12964) - [R] Add bindings for ifelse() and if\_else() +* [ARROW-12966](https://issues.apache.org/jira/browse/ARROW-12966) - [Python] Expose Python binding for ElementWiseAggregateOptions +* [ARROW-12967](https://issues.apache.org/jira/browse/ARROW-12967) - [R] Add bindings for pmin() and pmax() +* [ARROW-12968](https://issues.apache.org/jira/browse/ARROW-12968) - [R] [CI] Add an rchk job to our nightlies +* [ARROW-12972](https://issues.apache.org/jira/browse/ARROW-12972) - [CI][C++] archive\_write\_add\_filter\_zstd error on CentOS + ARM64 +* [ARROW-12975](https://issues.apache.org/jira/browse/ARROW-12975) - [C++][Python] if\_else kernel doesn't support upcasting +* [ARROW-12982](https://issues.apache.org/jira/browse/ARROW-12982) - [C++] Re-enable unused-variable warning +* [ARROW-12984](https://issues.apache.org/jira/browse/ARROW-12984) - [C++] Passing options parameter of Count/Index aggregation by reference +* [ARROW-12985](https://issues.apache.org/jira/browse/ARROW-12985) - [Python][Packaging] Unable to install pygit2 in the arm64 wheel builds +* [ARROW-12986](https://issues.apache.org/jira/browse/ARROW-12986) - [C++][Gandiva] Implement new cache eviction policy in Gandiva +* [ARROW-12992](https://issues.apache.org/jira/browse/ARROW-12992) - [R] bindings for substr(), substring(), str\_sub() +* [ARROW-12994](https://issues.apache.org/jira/browse/ARROW-12994) - [R] Fix tests that assume UTC local tz +* [ARROW-12996](https://issues.apache.org/jira/browse/ARROW-12996) - [C++] CSV stream reader has no progress indication +* [ARROW-13002](https://issues.apache.org/jira/browse/ARROW-13002) - [C++] Add a check for the utf8proc's version in CMake +* [ARROW-13005](https://issues.apache.org/jira/browse/ARROW-13005) - [C++] Support filter/take for union data type.
+* [ARROW-13006](https://issues.apache.org/jira/browse/ARROW-13006) - [C++][Gandiva] Implement BASE64 and UNBASE64 Hive functions on Gandiva +* [ARROW-13009](https://issues.apache.org/jira/browse/ARROW-13009) - [Doc][Dev] Document builds mailing-list +* [ARROW-13022](https://issues.apache.org/jira/browse/ARROW-13022) - [R] bindings for lubridate's year, isoyear, quarter, month, day, wday, yday, isoweek, hour, minute, and second functions +* [ARROW-13025](https://issues.apache.org/jira/browse/ARROW-13025) - [C++][Compute] Enhance FunctionOptions with equality, debug representability, and serializability +* [ARROW-13027](https://issues.apache.org/jira/browse/ARROW-13027) - [C++] Fix ASAN stack traces in CI +* [ARROW-13030](https://issues.apache.org/jira/browse/ARROW-13030) - [CI][Go] Setup Arm64 golang CI +* [ARROW-13031](https://issues.apache.org/jira/browse/ARROW-13031) - [JS] Support arm in closure compiler on macOS +* [ARROW-13032](https://issues.apache.org/jira/browse/ARROW-13032) - [Java] Update guava version +* [ARROW-13034](https://issues.apache.org/jira/browse/ARROW-13034) - [Python][Docs] Update outdated examples for hdfs/azure on the Parquet doc page +* [ARROW-13036](https://issues.apache.org/jira/browse/ARROW-13036) - [Doc] Mention recommended file extension(s) for Arrow IPC +* [ARROW-13042](https://issues.apache.org/jira/browse/ARROW-13042) - [C++] Automatic checks that kernels don't leave uninitialized data in output +* [ARROW-13043](https://issues.apache.org/jira/browse/ARROW-13043) - [GLib][Ruby] Add GArrowEqualOptions +* [ARROW-13044](https://issues.apache.org/jira/browse/ARROW-13044) - [Java] Union vectors should extend ValueVector +* [ARROW-13045](https://issues.apache.org/jira/browse/ARROW-13045) - [Packaging][RPM][deb] Don't install system utf8proc if it's old +* [ARROW-13047](https://issues.apache.org/jira/browse/ARROW-13047) - [Website] Add kiszk to committer list +* [ARROW-13049](https://issues.apache.org/jira/browse/ARROW-13049) - [C++][Gandiva] Implement BIN Hive function on Gandiva +* [ARROW-13050](https://issues.apache.org/jira/browse/ARROW-13050) - [C++][Gandiva] Implement SPACE Hive function on Gandiva +* [ARROW-13054](https://issues.apache.org/jira/browse/ARROW-13054) - [C++] Add option to specify the first day of the week for the "day\_of\_week" temporal kernel +* [ARROW-13064](https://issues.apache.org/jira/browse/ARROW-13064) - [C++] Add a general "if, ifelse, ..., else" kernel ("CASE WHEN") +* [ARROW-13065](https://issues.apache.org/jira/browse/ARROW-13065) - [Packaging][RPM] Add missing required LZ4 version information +* [ARROW-13068](https://issues.apache.org/jira/browse/ARROW-13068) - [GLib][Dataset] Change prefix to gadataset\_ from gad\_ +* [ARROW-13070](https://issues.apache.org/jira/browse/ARROW-13070) - [R] bindings for sd and var +* [ARROW-13072](https://issues.apache.org/jira/browse/ARROW-13072) - [C++] Add bitwise arithmetic compute functions +* [ARROW-13074](https://issues.apache.org/jira/browse/ARROW-13074) - [Python] Start with deprecating ParquetDataset custom attributes +* [ARROW-13075](https://issues.apache.org/jira/browse/ARROW-13075) - [Python] Expose C data interface API for pyarrow.Field +* [ARROW-13076](https://issues.apache.org/jira/browse/ARROW-13076) - [Java] Enable ExtensionType to use StructVector and UnionVector for underlying storage +* [ARROW-13082](https://issues.apache.org/jira/browse/ARROW-13082) - [CI] Forward R argument to ubuntu-docs build +* [ARROW-13086](https://issues.apache.org/jira/browse/ARROW-13086) - [Python] Expose Parquet ArrowReaderProperties::coerce\_int96\_timestamp\_unit\_ +* [ARROW-13091](https://issues.apache.org/jira/browse/ARROW-13091) - [Python] Add compression\_level argument to IpcWriteOptions constructor +* [ARROW-13092](https://issues.apache.org/jira/browse/ARROW-13092) - [C++] CreateDir should fail if the target exists and is not a directory +* [ARROW-13095](https://issues.apache.org/jira/browse/ARROW-13095) - [C++] Implement trigonometric compute functions +* [ARROW-13096](https://issues.apache.org/jira/browse/ARROW-13096) - [C++] Implement logarithm compute functions +* [ARROW-13097](https://issues.apache.org/jira/browse/ARROW-13097) - [C++] Provide a simple reflection utility for {{struct}}s +* [ARROW-13098](https://issues.apache.org/jira/browse/ARROW-13098) - [Dev][Archery] Reorganize docker submodule to its own subpackage +* [ARROW-13100](https://issues.apache.org/jira/browse/ARROW-13100) - [MATLAB] Integrate GoogleTest with MATLAB Interface C++ Code +* [ARROW-13101](https://issues.apache.org/jira/browse/ARROW-13101) - [Python][Doc] pyarrow.FixedSizeListArray does not appear in the documentation +* [ARROW-13110](https://issues.apache.org/jira/browse/ARROW-13110) - [C++] Deadlock can happen when using BackgroundGenerator without transferring callbacks +* [ARROW-13113](https://issues.apache.org/jira/browse/ARROW-13113) - [R] use RTasks to manage parallel in converting arrow to R +* [ARROW-13117](https://issues.apache.org/jira/browse/ARROW-13117) - [R] Retain schema in new Expressions +* [ARROW-13119](https://issues.apache.org/jira/browse/ARROW-13119) - [R] Set empty schema in scalar Expressions +* [ARROW-13124](https://issues.apache.org/jira/browse/ARROW-13124) - [Ruby] Add support for memory view +* [ARROW-13127](https://issues.apache.org/jira/browse/ARROW-13127) - [R] Valgrind nightly errors +* [ARROW-13136](https://issues.apache.org/jira/browse/ARROW-13136) - [C++] Add a "coalesce" variadic scalar kernel +* [ARROW-13137](https://issues.apache.org/jira/browse/ARROW-13137) - [C++][Documentation] Make in-table references consistent +* [ARROW-13140](https://issues.apache.org/jira/browse/ARROW-13140) - [C++/Python] Upgrade libthrift pin in the nightlies +* [ARROW-13142](https://issues.apache.org/jira/browse/ARROW-13142) - [Python] Use vector append when converting from list of non-strided numpy arrays +* [ARROW-13147](https://issues.apache.org/jira/browse/ARROW-13147) - [Java] Respect the rounding policy when allocating vector buffers +* [ARROW-13157](https://issues.apache.org/jira/browse/ARROW-13157) - [C++] Add find\_substring\_regex kernel and implement ignore\_case for find\_substring +* [ARROW-13158](https://issues.apache.org/jira/browse/ARROW-13158) - [Python] Fix repr and contains of StructScalar with duplicate field names +* [ARROW-13162](https://issues.apache.org/jira/browse/ARROW-13162) - [C++][Gandiva] Add new alias for extract date functions in Gandiva registry +* [ARROW-13171](https://issues.apache.org/jira/browse/ARROW-13171) - [R] Add binding for str\_pad() +* [ARROW-13190](https://issues.apache.org/jira/browse/ARROW-13190) - [C++] [Gandiva] Change behavior of INITCAP function +* [ARROW-13194](https://issues.apache.org/jira/browse/ARROW-13194) - [Java][Document] Create prose document about Java algorithms +* [ARROW-13195](https://issues.apache.org/jira/browse/ARROW-13195) - [R] Problem with rlang reverse dependency
checks +* [ARROW-13199](https://issues.apache.org/jira/browse/ARROW-13199) - [R] add ubuntu 21.04 to nightly builds +* [ARROW-13200](https://issues.apache.org/jira/browse/ARROW-13200) - [R] Add binding for case\_when() +* [ARROW-13201](https://issues.apache.org/jira/browse/ARROW-13201) - [R] Add binding for coalesce() +* [ARROW-13210](https://issues.apache.org/jira/browse/ARROW-13210) - [Python][CI] Fix vcpkg caching mechanism for the macOS wheels +* [ARROW-13211](https://issues.apache.org/jira/browse/ARROW-13211) - [C++][CI] Remove outdated Github Actions ARM builds +* [ARROW-13212](https://issues.apache.org/jira/browse/ARROW-13212) - [Release] Support deploying to test PyPI in the python post release script +* [ARROW-13215](https://issues.apache.org/jira/browse/ARROW-13215) - [R] [CI] Add ENV TZ to docker files +* [ARROW-13218](https://issues.apache.org/jira/browse/ARROW-13218) - [Doc] Document/clarify conventions for timestamp storage +* [ARROW-13219](https://issues.apache.org/jira/browse/ARROW-13219) - [C++][GLib] Demote/deprecate CompareOptions +* [ARROW-13224](https://issues.apache.org/jira/browse/ARROW-13224) - [Python][Doc] Documentation missing for pyarrow.dataset.write\_dataset +* [ARROW-13226](https://issues.apache.org/jira/browse/ARROW-13226) - [Python] Add a general purpose cython trampolining utility +* [ARROW-13228](https://issues.apache.org/jira/browse/ARROW-13228) - [C++] S3 CreateBucket fails because AWS treats us-east-1 differently than other regions +* [ARROW-13230](https://issues.apache.org/jira/browse/ARROW-13230) - Add CSV Writer documentation +* [ARROW-13234](https://issues.apache.org/jira/browse/ARROW-13234) - [C++] Add string padding option to determine which side the extra space goes on +* [ARROW-13235](https://issues.apache.org/jira/browse/ARROW-13235) - [C++] Make type\_name equal to options class name for all FunctionOptionTypes +* [ARROW-13236](https://issues.apache.org/jira/browse/ARROW-13236) - [Python] Improve repr of pyarrow.compute.FunctionOptions +* [ARROW-13238](https://issues.apache.org/jira/browse/ARROW-13238) - [C++][Dataset][Compute] Substitute ExecPlan impl for dataset scans +* [ARROW-13242](https://issues.apache.org/jira/browse/ARROW-13242) - [C++] Improve decimal random generation +* [ARROW-13244](https://issues.apache.org/jira/browse/ARROW-13244) - [C++] Add facility to get current thread id +* [ARROW-13258](https://issues.apache.org/jira/browse/ARROW-13258) - [Python] Improve the repr of ParquetFileFragment +* [ARROW-13262](https://issues.apache.org/jira/browse/ARROW-13262) - [R] transmute() fails after pulling data into R +* [ARROW-13273](https://issues.apache.org/jira/browse/ARROW-13273) - [C++] Don't use .pc only in CMake paths for Requires.private +* [ARROW-13274](https://issues.apache.org/jira/browse/ARROW-13274) - [JS] Remove Webpack +* [ARROW-13275](https://issues.apache.org/jira/browse/ARROW-13275) - [JS] Fix perf tests +* [ARROW-13276](https://issues.apache.org/jira/browse/ARROW-13276) - [GLib][Ruby][Flight] Add support for ListFlights +* [ARROW-13277](https://issues.apache.org/jira/browse/ARROW-13277) - [JS] Add declaration maps +* [ARROW-13280](https://issues.apache.org/jira/browse/ARROW-13280) - [R] Bindings for log and trig functions +* [ARROW-13282](https://issues.apache.org/jira/browse/ARROW-13282) - [C++] Remove obsolete generated files +* [ARROW-13283](https://issues.apache.org/jira/browse/ARROW-13283) - [Developer Tools] Support passing through memory limits in archery docker run +* 
[ARROW-13286](https://issues.apache.org/jira/browse/ARROW-13286) - [CI] Require docker-compose 1.27.0 or later +* [ARROW-13289](https://issues.apache.org/jira/browse/ARROW-13289) - [C++] Log functions don't have int kernels +* [ARROW-13291](https://issues.apache.org/jira/browse/ARROW-13291) - [GLib][CI] Require gobject-introspection 3.4.5 or later +* [ARROW-13296](https://issues.apache.org/jira/browse/ARROW-13296) - [C++] Provide reflection-compatible enum replacement +* [ARROW-13299](https://issues.apache.org/jira/browse/ARROW-13299) - [JS] Upgrade ix and rxjs +* [ARROW-13303](https://issues.apache.org/jira/browse/ARROW-13303) - [JS] Revise bundles +* [ARROW-13306](https://issues.apache.org/jira/browse/ARROW-13306) - [Java][JDBC] use ResultSetMetaData.getColumnLabel instead of ResultSetMetaData.getColumnName +* [ARROW-13313](https://issues.apache.org/jira/browse/ARROW-13313) - [C++][Compute] Add ScalarAggregateNode +* [ARROW-13320](https://issues.apache.org/jira/browse/ARROW-13320) - [Website] Add MIME types to FAQ +* [ARROW-13323](https://issues.apache.org/jira/browse/ARROW-13323) - [Archery] Validate docker compose configuration +* [ARROW-13343](https://issues.apache.org/jira/browse/ARROW-13343) - [R] Update NEWS.md for 5.0 +* [ARROW-13346](https://issues.apache.org/jira/browse/ARROW-13346) - [C++] Remove compile time parsing from EnumType +* [ARROW-13355](https://issues.apache.org/jira/browse/ARROW-13355) - [R] ensure that sf is installed in our revdep job +* [ARROW-13357](https://issues.apache.org/jira/browse/ARROW-13357) - [R] bindings for sign() +* [ARROW-13365](https://issues.apache.org/jira/browse/ARROW-13365) - [R] bindings for floor/ceiling/truncate +* [ARROW-13385](https://issues.apache.org/jira/browse/ARROW-13385) - [C++][Compute] Document out-of-source addition to the FunctionRegistry +* [ARROW-13386](https://issues.apache.org/jira/browse/ARROW-13386) - [R][C++] CSV streaming changes break Rtools 35 32-bit build +* [ARROW-13418](https://issues.apache.org/jira/browse/ARROW-13418) - [R] typo in python.r +* [ARROW-13461](https://issues.apache.org/jira/browse/ARROW-13461) - [Python][Packaging] Build M1 wheels for python 3.8 +* [PARQUET-1798](https://issues.apache.org/jira/browse/PARQUET-1798) - [C++] Review logic around automatic assignment of field\_id's +* [PARQUET-1998](https://issues.apache.org/jira/browse/PARQUET-1998) - [C++] Implement LZ4\_RAW compression +* [PARQUET-2056](https://issues.apache.org/jira/browse/PARQUET-2056) - [C++] Add ability for retrieving dictionary and indices separately for ColumnReader + + + +# Apache Arrow 4.0.1 (2021-05-26) + +## Bug Fixes + +* [ARROW-12568](https://issues.apache.org/jira/browse/ARROW-12568) - [Python][C++] Segfault when casting a sliced ListArray of int64 in v4.0.0 +* [ARROW-12601](https://issues.apache.org/jira/browse/ARROW-12601) - [R][Packaging] Fix pkg-config check in r/configure +* [ARROW-12603](https://issues.apache.org/jira/browse/ARROW-12603) - [R] open\_dataset ignoring provided schema when using select +* [ARROW-12604](https://issues.apache.org/jira/browse/ARROW-12604) - [R][Packaging] Dataset, Parquet off in autobrew and CRAN Mac builds +* [ARROW-12617](https://issues.apache.org/jira/browse/ARROW-12617) - [Python] pyarrow.orc.write\_table signature reverses that of pyarrow.parquet.write\_table +* [ARROW-12622](https://issues.apache.org/jira/browse/ARROW-12622) - [Python] Segfault when reading CSV inside Flight server +* [ARROW-12642](https://issues.apache.org/jira/browse/ARROW-12642) - [R] LIBARROW\_MINIMAL, 
LIBARROW\_DOWNLOAD, NOT\_CRAN env vars should not be case-sensitive +* [ARROW-12663](https://issues.apache.org/jira/browse/ARROW-12663) - [C++] segfault when arrow header is compiled with nvcc 11.2 +* [ARROW-12670](https://issues.apache.org/jira/browse/ARROW-12670) - [C++] extract\_regex gives bizarre behavior after nulls or non-matches +* [ARROW-12746](https://issues.apache.org/jira/browse/ARROW-12746) - [Go][Flight] Client Auth handler overwrites outgoing metadata +* [ARROW-12769](https://issues.apache.org/jira/browse/ARROW-12769) - [Python] Negative out of range slices yield invalid arrays +* [ARROW-12774](https://issues.apache.org/jira/browse/ARROW-12774) - [C++][Compute] replace\_substring\_regex() creates invalid arrays =\> crash +* [ARROW-12776](https://issues.apache.org/jira/browse/ARROW-12776) - [Archery][Integration] Fix decimal case generation in write\_js\_test\_json +* [ARROW-12855](https://issues.apache.org/jira/browse/ARROW-12855) - error: no member named 'TableReader' in namespace during compilation + + +## New Features and Improvements + +* [ARROW-11926](https://issues.apache.org/jira/browse/ARROW-11926) - [R] Pass on the new UCRT CRAN windows builds +* [ARROW-12520](https://issues.apache.org/jira/browse/ARROW-12520) - [R] Minor docs updates +* [ARROW-12571](https://issues.apache.org/jira/browse/ARROW-12571) - [R][CI] Run nightly R with valgrind +* [ARROW-12578](https://issues.apache.org/jira/browse/ARROW-12578) - [JS] Simplify UTF8 handling in NodeJS +* [ARROW-12619](https://issues.apache.org/jira/browse/ARROW-12619) - [Python] pyarrow sdist should not require git +* [ARROW-12806](https://issues.apache.org/jira/browse/ARROW-12806) - [Python] test\_write\_to\_dataset\_filesystem missing a dataset mark + + + +# Apache Arrow 4.0.0 (2021-04-26) + +## Bug Fixes + +* [ARROW-4784](https://issues.apache.org/jira/browse/ARROW-4784) - [C++][CI] Re-enable flaky mingw tests.
+* [ARROW-6818](https://issues.apache.org/jira/browse/ARROW-6818) - [Doc] Format docs confusing +* [ARROW-7288](https://issues.apache.org/jira/browse/ARROW-7288) - [C++][R] read\_parquet() freezes on Windows with Japanese locale +* [ARROW-7830](https://issues.apache.org/jira/browse/ARROW-7830) - [C++] Parquet library version doesn't change with releases +* [ARROW-9451](https://issues.apache.org/jira/browse/ARROW-9451) - [Python] Unsigned integer types will accept string values in pyarrow.array +* [ARROW-9634](https://issues.apache.org/jira/browse/ARROW-9634) - [C++][Python] Restore non-UTC time zones when reading Parquet file that was previously Arrow +* [ARROW-9878](https://issues.apache.org/jira/browse/ARROW-9878) - [Python] table to\_pandas self\_destruct=True + split\_blocks=True cannot prevent doubling memory +* [ARROW-10038](https://issues.apache.org/jira/browse/ARROW-10038) - [C++] SetCpuThreadPoolCapacity(1) spins up nCPUs threads +* [ARROW-10056](https://issues.apache.org/jira/browse/ARROW-10056) - [C++] Increase flatbuffers max\_tables parameter in order to read wide tables +* [ARROW-10364](https://issues.apache.org/jira/browse/ARROW-10364) - [Dev][Archery] Test is failed with semver 2.13.0 +* [ARROW-10370](https://issues.apache.org/jira/browse/ARROW-10370) - [Python] Spurious s3fs-related test failures +* [ARROW-10403](https://issues.apache.org/jira/browse/ARROW-10403) - [C++] Implement unique kernel for dictionary type +* [ARROW-10405](https://issues.apache.org/jira/browse/ARROW-10405) - [C++] IsIn kernel should be able to lookup dictionary in string +* [ARROW-10457](https://issues.apache.org/jira/browse/ARROW-10457) - [CI] Fix Spark branch-3.0 integration tests +* [ARROW-10489](https://issues.apache.org/jira/browse/ARROW-10489) - [C++] Unable to configure or make with intel compiler +* [ARROW-10514](https://issues.apache.org/jira/browse/ARROW-10514) - [C++][Parquet] Data inconsistency in parquet-reader output modes +* [ARROW-10953](https://issues.apache.org/jira/browse/ARROW-10953) - [R] Validate when creating Table with schema +* [ARROW-11066](https://issues.apache.org/jira/browse/ARROW-11066) - [Java] Is there a bug in flight AddWritableBuffer +* [ARROW-11134](https://issues.apache.org/jira/browse/ARROW-11134) - [C++][CI] ARM64 job on Travis-CI doesn't run tests +* [ARROW-11147](https://issues.apache.org/jira/browse/ARROW-11147) - [Python][CI] Parquet tests failing in nightly build with Dask master +* [ARROW-11180](https://issues.apache.org/jira/browse/ARROW-11180) - [Developer] cmake-format pre-commit hook doesn't run +* [ARROW-11192](https://issues.apache.org/jira/browse/ARROW-11192) - [Documentation] Describe opening Visual Studio so it inherits a working env +* [ARROW-11223](https://issues.apache.org/jira/browse/ARROW-11223) - [Java] BaseVariableWidthVector/BaseLargeVariableWidthVector setNull and getBufferSizeFor is buggy +* [ARROW-11235](https://issues.apache.org/jira/browse/ARROW-11235) - [Python] S3 test failures inside non-default regions +*
[ARROW-11239](https://issues.apache.org/jira/browse/ARROW-11239) - [Rust] array::transform::tests::test\_struct failed +* [ARROW-11269](https://issues.apache.org/jira/browse/ARROW-11269) - [Rust] Unable to read Parquet file because of mismatch in column-derived and embedded schemas +* [ARROW-11277](https://issues.apache.org/jira/browse/ARROW-11277) - [C++] Fix compilation error in dataset expressions on macOS 10.11 +* [ARROW-11299](https://issues.apache.org/jira/browse/ARROW-11299) - [Python] build warning in python +* [ARROW-11303](https://issues.apache.org/jira/browse/ARROW-11303) - [Release][C++] Enable mimalloc in the windows verification script +* [ARROW-11305](https://issues.apache.org/jira/browse/ARROW-11305) - [Rust]: parquet-rowcount binary tries to open itself as a parquet file +* [ARROW-11311](https://issues.apache.org/jira/browse/ARROW-11311) - [Rust] unset\_bit is toggling bits, not unsetting them +* [ARROW-11313](https://issues.apache.org/jira/browse/ARROW-11313) - [Rust] Size hint of iterators is incorrect +* [ARROW-11315](https://issues.apache.org/jira/browse/ARROW-11315) - [Packaging][APT][arm64] Add missing gir1.2 files +* [ARROW-11320](https://issues.apache.org/jira/browse/ARROW-11320) - [C++] Spurious test failure when creating temporary dir +* [ARROW-11322](https://issues.apache.org/jira/browse/ARROW-11322) - [Rust] Arrow \`memory\` made private is a breaking API change +* [ARROW-11323](https://issues.apache.org/jira/browse/ARROW-11323) - [Rust][DataFusion] ComputeError("concat requires input of at least one array")) with queries with ORDER BY or GROUP BY that return no +* [ARROW-11328](https://issues.apache.org/jira/browse/ARROW-11328) - [R] Collecting zero columns from a dataset returns entire dataset +* [ARROW-11334](https://issues.apache.org/jira/browse/ARROW-11334) - [Python][CI] Nightly pandas builds failing because of internal pandas change +* [ARROW-11337](https://issues.apache.org/jira/browse/ARROW-11337) - [C++] Compilation error with ThreadSanitizer +* [ARROW-11357](https://issues.apache.org/jira/browse/ARROW-11357) - [Rust] take primitive implementation is unsound +* [ARROW-11376](https://issues.apache.org/jira/browse/ARROW-11376) - [C++] ThreadedTaskGroup failure with Thread Sanitizer enabled +* [ARROW-11379](https://issues.apache.org/jira/browse/ARROW-11379) - [C++][Dataset] Reading dataset with filtering on timestamp partition field crashes +* [ARROW-11387](https://issues.apache.org/jira/browse/ARROW-11387) - [Rust] Arrow 3.0.0 release with simd feature doesn't compile without feature=avx512. 
[ARROW-11391](https://issues.apache.org/jira/browse/ARROW-11391) - [C++] HdfsOutputStream::Write unsafely truncates integers exceeding INT32\_MAX +* [ARROW-11394](https://issues.apache.org/jira/browse/ARROW-11394) - [Rust] Slice + Concat incorrect for structs +* [ARROW-11400](https://issues.apache.org/jira/browse/ARROW-11400) - [Python] Pickled ParquetFileFragment has invalid partition\_expression with dictionary type in pyarrow 2.0 +* [ARROW-11403](https://issues.apache.org/jira/browse/ARROW-11403) - [Developer] archery benchmark list: unexpected keyword 'benchmark\_filter' +* [ARROW-11412](https://issues.apache.org/jira/browse/ARROW-11412) - [Python] Expressions not working with logical boolean operators (and, or, not) +* [ARROW-11427](https://issues.apache.org/jira/browse/ARROW-11427) - [C++] Arrow uses AVX512 instructions even when not supported by the OS +* [ARROW-11448](https://issues.apache.org/jira/browse/ARROW-11448) - [C++] tdigest build failure on Windows with Visual Studio +* [ARROW-11451](https://issues.apache.org/jira/browse/ARROW-11451) - [C++] Fix gcc-4.8 build error +* [ARROW-11452](https://issues.apache.org/jira/browse/ARROW-11452) - [Rust] Parquet reader cannot read file where a struct column has the same name as struct member columns +* [ARROW-11461](https://issues.apache.org/jira/browse/ARROW-11461) - [Flight][Go] GetSchema does not work with Java Flight Server +* [ARROW-11464](https://issues.apache.org/jira/browse/ARROW-11464) - [Python] pyarrow.parquet.read\_pandas doesn't conform to its docs +* [ARROW-11470](https://issues.apache.org/jira/browse/ARROW-11470) - [C++] Overflow occurs on integer multiplications in ComputeRowMajorStrides, ComputeColumnMajorStrides, and CheckTensorStridesValidity +* [ARROW-11472](https://issues.apache.org/jira/browse/ARROW-11472) - [Python][CI] Kartothek integrations build is failing with numpy 1.20 +* [ARROW-11480](https://issues.apache.org/jira/browse/ARROW-11480) - [Python] Segmentation fault reading parquet with date filter with INT96 column +* [ARROW-11483](https://issues.apache.org/jira/browse/ARROW-11483) - [Java][C++][Integration] C++ integration test creates JSON files incompatible with Java +* [ARROW-11488](https://issues.apache.org/jira/browse/ARROW-11488) - [Rust]: StructBuilder's Drop impl leaks memory +* [ARROW-11490](https://issues.apache.org/jira/browse/ARROW-11490) - [C++] BM\_ArrowBinaryDict/EncodeLowLevel is not deterministic +* [ARROW-11494](https://issues.apache.org/jira/browse/ARROW-11494) - [Rust] Fix take bench +* [ARROW-11497](https://issues.apache.org/jira/browse/ARROW-11497) - [Python] pyarrow parquet writer for list does not conform with Apache Parquet specification +* [ARROW-11538](https://issues.apache.org/jira/browse/ARROW-11538) - [Python] Segfault reading Parquet dataset with Timestamp filter +* [ARROW-11547](https://issues.apache.org/jira/browse/ARROW-11547) - [Packaging][Conda][Drone] Nightly builds are failed by undefined variable error +* [ARROW-11548](https://issues.apache.org/jira/browse/ARROW-11548) - [C++] RandomArrayGenerator::List size mismatch +* [ARROW-11551](https://issues.apache.org/jira/browse/ARROW-11551) - [C++][Gandiva] castTIMESTAMP(utf8) function doesn't show error out for invalid inputs +*
[ARROW-11560](https://issues.apache.org/jira/browse/ARROW-11560) - [FlightRPC][C++][Python] Interrupting a Flight server results in abort +* [ARROW-11567](https://issues.apache.org/jira/browse/ARROW-11567) - [C++][Compute] Variance kernel has precision issue +* [ARROW-11577](https://issues.apache.org/jira/browse/ARROW-11577) - [Rust] Concat kernel panics on slices of string arrays +* [ARROW-11582](https://issues.apache.org/jira/browse/ARROW-11582) - [R] write\_dataset "format" argument default and validation could be better +* [ARROW-11586](https://issues.apache.org/jira/browse/ARROW-11586) - [Rust] [Datafusion] Invalid SQL sometimes panics +* [ARROW-11595](https://issues.apache.org/jira/browse/ARROW-11595) - [C++][NIGHTLY:test-conda-cpp-valgrind] GenerateBitsUnrolled triggers valgrind on uninit inputs +* [ARROW-11596](https://issues.apache.org/jira/browse/ARROW-11596) - [Python][Dataset] SIGSEGV when executing scan tasks with Python executors +* [ARROW-11603](https://issues.apache.org/jira/browse/ARROW-11603) - [Rust] Fix clippy error +* [ARROW-11607](https://issues.apache.org/jira/browse/ARROW-11607) - [Python] Error when reading table with list values from parquet +* [ARROW-11614](https://issues.apache.org/jira/browse/ARROW-11614) - [C++][Gandiva] Fix round() logic to return positive zero when argument is zero +* [ARROW-11617](https://issues.apache.org/jira/browse/ARROW-11617) - [C++][Gandiva] Fix nested if-else optimisation in gandiva +* [ARROW-11620](https://issues.apache.org/jira/browse/ARROW-11620) - [Rust] [DataFusion] Inconsistent use of Box and Arc for TableProvider +* [ARROW-11630](https://issues.apache.org/jira/browse/ARROW-11630) - [Rust] Introduce partial\_sort and limit option for sort kernel +* [ARROW-11632](https://issues.apache.org/jira/browse/ARROW-11632) - [Rust] csv::Reader doesn't propagate schema metadata to RecordBatches +* [ARROW-11639](https://issues.apache.org/jira/browse/ARROW-11639) - [C++][Gandiva] Fix signbit compilation issue in Ubuntu nightly build +* [ARROW-11642](https://issues.apache.org/jira/browse/ARROW-11642) - [C++] Incorrect preprocessor directive for Windows in JVM detection +* [ARROW-11657](https://issues.apache.org/jira/browse/ARROW-11657) - [R] group\_by with .drop specified errors +* [ARROW-11658](https://issues.apache.org/jira/browse/ARROW-11658) - [R] Handle mutate/rename inside group\_by +* [ARROW-11663](https://issues.apache.org/jira/browse/ARROW-11663) - [DataFusion] Master does not compile +* [ARROW-11668](https://issues.apache.org/jira/browse/ARROW-11668) - [C++] Sporadic UBSAN error in FutureStressTest.TryAddCallback +* [ARROW-11672](https://issues.apache.org/jira/browse/ARROW-11672) - [R] Fix string function test failure on R 3.3 +* [ARROW-11681](https://issues.apache.org/jira/browse/ARROW-11681) - [Rust] IPC writers shouldn't unwrap in destructors +* [ARROW-11686](https://issues.apache.org/jira/browse/ARROW-11686) - [C++] flight-test-integration-client sometimes exits by SIGABRT but does not print the stack trace +* [ARROW-11687](https://issues.apache.org/jira/browse/ARROW-11687) - [Rust][DataFusion] RepartitionExec Hanging +* [ARROW-11694](https://issues.apache.org/jira/browse/ARROW-11694) - [C++] Array Take may dereference absent null bitmap +* [ARROW-11695](https://issues.apache.org/jira/browse/ARROW-11695) - [C++][FlightRPC][Packaging] Update support for disabling TLS server verification for recent gRPC versions +* [ARROW-11717](https://issues.apache.org/jira/browse/ARROW-11717) - [Integration] Intermittent (but frequent) flight
integration failures with auth:basic\_proto +* [ARROW-11718](https://issues.apache.org/jira/browse/ARROW-11718) - [Rust] IPC writers shouldn't implicitly finish on drop +* [ARROW-11741](https://issues.apache.org/jira/browse/ARROW-11741) - [C++] Decimal cast failure on big-endian +* [ARROW-11743](https://issues.apache.org/jira/browse/ARROW-11743) - [R] Use pkgdown's new found ability to autolink Jiras +* [ARROW-11746](https://issues.apache.org/jira/browse/ARROW-11746) - [Developer][Archery] Fix prefer real time check +* [ARROW-11756](https://issues.apache.org/jira/browse/ARROW-11756) - [R] passing a partition as a schema leads to segfaults +* [ARROW-11758](https://issues.apache.org/jira/browse/ARROW-11758) - [C++][Compute] Summation kernel round-off error +* [ARROW-11767](https://issues.apache.org/jira/browse/ARROW-11767) - [C++] Scalar::hash may segfault for null scalars +* [ARROW-11771](https://issues.apache.org/jira/browse/ARROW-11771) - [Developer][Archery] Move benchmark tests (so CI runs them) +* [ARROW-11781](https://issues.apache.org/jira/browse/ARROW-11781) - [Python] Reading small amount of files from a partitioned dataset is unexpectedly slow +* [ARROW-11784](https://issues.apache.org/jira/browse/ARROW-11784) - [Rust][DataFusion] CoalesceBatchesStream doesn't honor Stream interface +* [ARROW-11785](https://issues.apache.org/jira/browse/ARROW-11785) - [R] Fallback when filtering Table with unsupported expression fails +* [ARROW-11786](https://issues.apache.org/jira/browse/ARROW-11786) - [C++] CMake output noisy +* [ARROW-11788](https://issues.apache.org/jira/browse/ARROW-11788) - [Java] Appending Empty List Vector yields NPE +* [ARROW-11791](https://issues.apache.org/jira/browse/ARROW-11791) - [Rust][DataFusion] RepartitionExec Blocking +* [ARROW-11802](https://issues.apache.org/jira/browse/ARROW-11802) - [Rust][DataFusion] Mixing of crossbeam channel and async tasks can lead to deadlock +* [ARROW-11819](https://issues.apache.org/jira/browse/ARROW-11819) - [Rust] Add link to the doc +* [ARROW-11821](https://issues.apache.org/jira/browse/ARROW-11821) - [Rust] Edit Rust README +* [ARROW-11830](https://issues.apache.org/jira/browse/ARROW-11830) - [C++] gRPC compilation tests occur every time +* [ARROW-11832](https://issues.apache.org/jira/browse/ARROW-11832) - [R] Handle conversion of extra nested struct column +* [ARROW-11836](https://issues.apache.org/jira/browse/ARROW-11836) - Target libarrow\_bundled\_dependencies.a is not already created but is already required.
+* [ARROW-11845](https://issues.apache.org/jira/browse/ARROW-11845) - [Rust] Debug implementation of Date32Array panics if array contains negative values +* [ARROW-11850](https://issues.apache.org/jira/browse/ARROW-11850) - [GLib] GARROW\_VERSION\_0\_16 macro is missing +* [ARROW-11855](https://issues.apache.org/jira/browse/ARROW-11855) - [C++] [Python] Memory leak in to\_pandas when converting chunked struct array +* [ARROW-11857](https://issues.apache.org/jira/browse/ARROW-11857) - [Python] Resource temporarily unavailable when using the new Dataset API with Pandas +* [ARROW-11860](https://issues.apache.org/jira/browse/ARROW-11860) - [Rust] [DataFusion] Add DataFusion logos +* [ARROW-11866](https://issues.apache.org/jira/browse/ARROW-11866) - [C++] Arrow Flight SetShutdownOnSignals causes potential mutex deadlock in gRPC +* [ARROW-11872](https://issues.apache.org/jira/browse/ARROW-11872) - [C++] Array Validation of GPU buffers fails due to incorrect validation check +* [ARROW-11880](https://issues.apache.org/jira/browse/ARROW-11880) - [R] Handle empty or NULL transmute() args properly +* [ARROW-11881](https://issues.apache.org/jira/browse/ARROW-11881) - [Rust][DataFusion] Fix Clippy Lint +* [ARROW-11896](https://issues.apache.org/jira/browse/ARROW-11896) - [Rust] Hang / failure in CI on AMD64 Debian 10 Rust stable test workspace +* [ARROW-11904](https://issues.apache.org/jira/browse/ARROW-11904) - [C++] "pure virtual method called" crash at the end of arrow-csv-test +* [ARROW-11905](https://issues.apache.org/jira/browse/ARROW-11905) - [C++] SIMD info always returning none on macOS +* [ARROW-11914](https://issues.apache.org/jira/browse/ARROW-11914) - [R] [CI] r-sanitizer nightly is broken +* [ARROW-11918](https://issues.apache.org/jira/browse/ARROW-11918) - [R] [Documentation] Docs cleanups +* [ARROW-11923](https://issues.apache.org/jira/browse/ARROW-11923) - [CI] Update branch name for dask dev integration tests +* [ARROW-11937](https://issues.apache.org/jira/browse/ARROW-11937) - [C++] GZip codec hangs if flushed twice +* [ARROW-11941](https://issues.apache.org/jira/browse/ARROW-11941) - [Dev] "DEBUG=1 merge\_arrow\_pr.py" updates Jira issue +* [ARROW-11942](https://issues.apache.org/jira/browse/ARROW-11942) - [C++] If tasks are submitted quickly the thread pool may fail to spin up new threads +* [ARROW-11945](https://issues.apache.org/jira/browse/ARROW-11945) - [R] filter doesn't accept negative numbers as valid +* [ARROW-11956](https://issues.apache.org/jira/browse/ARROW-11956) - [C++] Fix system re2 dependency detection for static library +* [ARROW-11965](https://issues.apache.org/jira/browse/ARROW-11965) - [R][Docs] Fix install.packages command in R dev docs +* [ARROW-11970](https://issues.apache.org/jira/browse/ARROW-11970) - [C++][CI] Fix Valgrind failures +* [ARROW-11971](https://issues.apache.org/jira/browse/ARROW-11971) - [Packaging] Vcpkg patch doesn't apply on Windows due to line endings +* [ARROW-11975](https://issues.apache.org/jira/browse/ARROW-11975) - [CI][GLib] Failed to update gcc +* [ARROW-11976](https://issues.apache.org/jira/browse/ARROW-11976) - [C++] Sporadic TSAN error in TestThreadPool.SetCapacity +* [ARROW-11983](https://issues.apache.org/jira/browse/ARROW-11983) - [Python] ImportError calling pyarrow from\_pandas within ThreadPool +* [ARROW-11997](https://issues.apache.org/jira/browse/ARROW-11997) - [Python] concat\_tables crashes Python interpreter +* [ARROW-12003](https://issues.apache.org/jira/browse/ARROW-12003) - [R] Fix NOTE re undefined global function
group\_by\_drop\_default +* [ARROW-12006](https://issues.apache.org/jira/browse/ARROW-12006) - [Java] Fix checkstyle config to work on Windows +* [ARROW-12012](https://issues.apache.org/jira/browse/ARROW-12012) - [Java] [JDBC] BinaryConsumer cannot reallocate memory correctly +* [ARROW-12013](https://issues.apache.org/jira/browse/ARROW-12013) - [C++][FlightRPC] Failed to detect gRPC version +* [ARROW-12015](https://issues.apache.org/jira/browse/ARROW-12015) - [Rust] [DataFusion] Integrate doc-comment crate to ensure readme examples remain valid +* [ARROW-12028](https://issues.apache.org/jira/browse/ARROW-12028) - [Rust][DataFusion] Unsupported GROUP BY for Timestamp(Millisecond, None) +* [ARROW-12029](https://issues.apache.org/jira/browse/ARROW-12029) - Remove args from FeatherReader$create v2 +* [ARROW-12033](https://issues.apache.org/jira/browse/ARROW-12033) - [Docs] Fix link in developers/benchmarks.html +* [ARROW-12041](https://issues.apache.org/jira/browse/ARROW-12041) - [C++] Fix string description of tensor IPC messages +* [ARROW-12051](https://issues.apache.org/jira/browse/ARROW-12051) - [GLib] Intermittent CI failure in test\_add\_column\_type(TestCSVReader::\#read::options) +* [ARROW-12057](https://issues.apache.org/jira/browse/ARROW-12057) - [Python] Remove direct usage of pandas' Block subclasses +* [ARROW-12065](https://issues.apache.org/jira/browse/ARROW-12065) - [C++][Python] Segfault reading JSON file +* [ARROW-12067](https://issues.apache.org/jira/browse/ARROW-12067) - [Python][Doc] Document pyarrow\_(un)wrap\_scalar +* [ARROW-12073](https://issues.apache.org/jira/browse/ARROW-12073) - [R] Fix R CMD check NOTE about ‘X\_\_\_\_\_X’ +* [ARROW-12076](https://issues.apache.org/jira/browse/ARROW-12076) - [Rust] Fix build +* [ARROW-12077](https://issues.apache.org/jira/browse/ARROW-12077) - [C++] Out-of-bounds write in ListArray::FromArrays +* [ARROW-12086](https://issues.apache.org/jira/browse/ARROW-12086) - [C++] offline builds do not use ARROW\_$LIBRARY\_URL to search for packages +* [ARROW-12088](https://issues.apache.org/jira/browse/ARROW-12088) - [Python][C++] Warning about offsetof in pyarrow.dataset.RecordBatchIterator +* [ARROW-12089](https://issues.apache.org/jira/browse/ARROW-12089) - [Doc] Fix warnings when building Sphinx docs +* [ARROW-12100](https://issues.apache.org/jira/browse/ARROW-12100) - [C\#] Cannot round-trip record batch with PyArrow +* [ARROW-12103](https://issues.apache.org/jira/browse/ARROW-12103) - [C++] "load of misaligned address" in Parquet reader +* [ARROW-12112](https://issues.apache.org/jira/browse/ARROW-12112) - [CI] No space left on device - AMD64 Conda Integration test +* [ARROW-12113](https://issues.apache.org/jira/browse/ARROW-12113) - [R] Fix rlang deprecation warning from check\_select\_helpers() +* [ARROW-12130](https://issues.apache.org/jira/browse/ARROW-12130) - [C++] Arm64 build failed if -DARROW\_SIMD\_LEVEL=NONE +* [ARROW-12138](https://issues.apache.org/jira/browse/ARROW-12138) - [Go][IPC] +* [ARROW-12140](https://issues.apache.org/jira/browse/ARROW-12140) - [C++][CI] Valgrind failure on Grouper tests +* [ARROW-12145](https://issues.apache.org/jira/browse/ARROW-12145) - [Developer][Archery] Flaky test: test\_static\_runner\_from\_json +* [ARROW-12149](https://issues.apache.org/jira/browse/ARROW-12149) - [Dev] Archery benchmark test case is failing +*
[ARROW-12154](https://issues.apache.org/jira/browse/ARROW-12154) - [C++][Gandiva] Fix gandiva crash in certain OS/CPU combinations +* [ARROW-12155](https://issues.apache.org/jira/browse/ARROW-12155) - [R] Require Table columns to be same length +* [ARROW-12161](https://issues.apache.org/jira/browse/ARROW-12161) - [C++][R] Async streaming CSV reader deadlocking when being run synchronously from datasets +* [ARROW-12169](https://issues.apache.org/jira/browse/ARROW-12169) - [C++] Fix compressed file reading with an empty stream at end of file +* [ARROW-12171](https://issues.apache.org/jira/browse/ARROW-12171) - [Rust] Clippy error +* [ARROW-12172](https://issues.apache.org/jira/browse/ARROW-12172) - [Python][Packaging] Pass python version as setuptools pretend version in the macOS wheel builds +* [ARROW-12178](https://issues.apache.org/jira/browse/ARROW-12178) - [CI] Update setuptools in the ubuntu images +* [ARROW-12186](https://issues.apache.org/jira/browse/ARROW-12186) - [Rust][DataFusion] Fix regexp\_match test +* [ARROW-12209](https://issues.apache.org/jira/browse/ARROW-12209) - [JS] Neither @apache-arrow/ts nor apache-arrow compiles +* [ARROW-12220](https://issues.apache.org/jira/browse/ARROW-12220) - [C++][CI] Thread sanitizer failure +* [ARROW-12226](https://issues.apache.org/jira/browse/ARROW-12226) - [C++] ASAN error in s3fs\_test.cc +* [ARROW-12227](https://issues.apache.org/jira/browse/ARROW-12227) - [R] Fix RE2 and median nightly build failures +* [ARROW-12235](https://issues.apache.org/jira/browse/ARROW-12235) - [Rust][DataFusion] LIMIT returns incorrect results when used with several small partitions +* [ARROW-12241](https://issues.apache.org/jira/browse/ARROW-12241) - [Python] Parallel csv reader cancellation test kills pytest +* [ARROW-12250](https://issues.apache.org/jira/browse/ARROW-12250) - [Rust] Failing test arrow::arrow\_writer::tests::fixed\_size\_binary\_single\_column +* [ARROW-12254](https://issues.apache.org/jira/browse/ARROW-12254) - [Rust][DataFusion] Limit keeps polling input after limit is reached +* [ARROW-12258](https://issues.apache.org/jira/browse/ARROW-12258) - [R] Never do as.data.frame() on collect(as\_data\_frame = FALSE) +* [ARROW-12262](https://issues.apache.org/jira/browse/ARROW-12262) - [Doc][C++][Python] Docs built and pushed with S3 and Flight disabled +* [ARROW-12267](https://issues.apache.org/jira/browse/ARROW-12267) - [Rust] JSON writer does not support timestamp types +* [ARROW-12273](https://issues.apache.org/jira/browse/ARROW-12273) - [JS] Coveralls does not work anymore +* [ARROW-12279](https://issues.apache.org/jira/browse/ARROW-12279) - [Rust][DataFusion] Add test for null handling in hash join (ARROW-12266) +* [ARROW-12294](https://issues.apache.org/jira/browse/ARROW-12294) - [Rust] Fix Boolean Kleene Kernels with no Remainder +* [ARROW-12299](https://issues.apache.org/jira/browse/ARROW-12299) - [Python] pq.write\_to\_dataset does not recognize S3FileSystem +* [ARROW-12300](https://issues.apache.org/jira/browse/ARROW-12300) - [C++] ArrowCUDA erroneously links to CUDA Runtime while only using CUDA Driver API +* [ARROW-12313](https://issues.apache.org/jira/browse/ARROW-12313) - [Rust] [Ballista] Benchmark documentation out of date +* [ARROW-12314](https://issues.apache.org/jira/browse/ARROW-12314) - [Python] pq.read\_pandas with use\_legacy\_dataset=False does not accept columns
as a set (kartothek integration failure) +* [ARROW-12327](https://issues.apache.org/jira/browse/ARROW-12327) - [Dev] Use pull request's head remote when submitting crossbow jobs via the comment bot +* [ARROW-12330](https://issues.apache.org/jira/browse/ARROW-12330) - [Developer] Restore values in counters column of Archery benchmark +* [ARROW-12334](https://issues.apache.org/jira/browse/ARROW-12334) - [Rust] [Ballista] Aggregate queries producing incorrect results +* [ARROW-12342](https://issues.apache.org/jira/browse/ARROW-12342) - [Packaging] Fix tabulation in crossbow templates for submitting nightly builds +* [ARROW-12357](https://issues.apache.org/jira/browse/ARROW-12357) - [Archery] Error running "crossbow submit ..." +* [ARROW-12379](https://issues.apache.org/jira/browse/ARROW-12379) - [C++][CI] Thread sanitizer failure in SerialExecutor +* [ARROW-12382](https://issues.apache.org/jira/browse/ARROW-12382) - [C++][CI] Conda nightly jobs fail due to not bundling xsimd +* [ARROW-12385](https://issues.apache.org/jira/browse/ARROW-12385) - [R] [CI] fix cran picking in CI +* [ARROW-12390](https://issues.apache.org/jira/browse/ARROW-12390) - [Rust] Inline from\_trusted\_len\_iter, try\_from\_trusted\_len\_iter, extend\_from\_slice +* [ARROW-12401](https://issues.apache.org/jira/browse/ARROW-12401) - [R] Fix guard around dataset\_\_\_Scanner\_\_TakeRows +* [ARROW-12405](https://issues.apache.org/jira/browse/ARROW-12405) - [Packaging] Fix apt artifact patterns and artifact uploading from travis +* [ARROW-12408](https://issues.apache.org/jira/browse/ARROW-12408) - [R] Delete Scan() bindings +* [ARROW-12421](https://issues.apache.org/jira/browse/ARROW-12421) - [Rust] [DataFusion] topk\_query test fails in master +* [ARROW-12429](https://issues.apache.org/jira/browse/ARROW-12429) - [C++] MergedGeneratorTestFixture is incorrectly instantiated +* [ARROW-12433](https://issues.apache.org/jira/browse/ARROW-12433) - [Rust] Builds failing due to new flatbuffer release introducing const generics +* [ARROW-12437](https://issues.apache.org/jira/browse/ARROW-12437) - [Rust] [Ballista] Ballista plans must not include RepartitionExec +* [ARROW-12440](https://issues.apache.org/jira/browse/ARROW-12440) - [Release] Various packaging, release script and release verification script fixes +* [ARROW-12466](https://issues.apache.org/jira/browse/ARROW-12466) - [Python] Comparing array to None raises error +* [ARROW-12475](https://issues.apache.org/jira/browse/ARROW-12475) - [C++] Build warning from thread\_pool\_benchmark.cc +* [ARROW-12487](https://issues.apache.org/jira/browse/ARROW-12487) - [C++][Dataset] ScanBatches() hangs if there's an error during scanning +* [ARROW-12495](https://issues.apache.org/jira/browse/ARROW-12495) - [C++][Python] NumPy buffer sets is\_mutable\_ to true but does not set mutable\_data\_ when the NumPy array is writable +* [ARROW-12794](https://issues.apache.org/jira/browse/ARROW-12794) - C++/R: read\_parquet halts process when accessed multiple times +* [PARQUET-1655](https://issues.apache.org/jira/browse/PARQUET-1655) - [C++] Decimal comparisons used for min/max statistics are not correct +* [PARQUET-2008](https://issues.apache.org/jira/browse/PARQUET-2008) - [C++] Wrong information written in RowGroup::total\_byte\_size + + +## New Features and Improvements + +* [ARROW-951](https://issues.apache.org/jira/browse/ARROW-951) - [JS] Fix generated API
documentation +* [ARROW-2229](https://issues.apache.org/jira/browse/ARROW-2229) - [C++] Write CSV files from RecordBatch, Table +* [ARROW-3690](https://issues.apache.org/jira/browse/ARROW-3690) - [Rust] Add Rust to the format integration testing +* [ARROW-6103](https://issues.apache.org/jira/browse/ARROW-6103) - [Java] Stop using the maven release plugin +* [ARROW-6248](https://issues.apache.org/jira/browse/ARROW-6248) - [Python] Use FileNotFoundError in HadoopFileSystem.open() in Python 3 +* [ARROW-6455](https://issues.apache.org/jira/browse/ARROW-6455) - [C++] Implement ExtensionType for non-UTF8 Unicode data +* [ARROW-6604](https://issues.apache.org/jira/browse/ARROW-6604) - [C++] Add support for nested types to MakeArrayFromScalar +* [ARROW-7215](https://issues.apache.org/jira/browse/ARROW-7215) - [C++][Gandiva] Implement castVARCHAR(numeric\_type) functions in Gandiva +* [ARROW-7364](https://issues.apache.org/jira/browse/ARROW-7364) - [Rust] Add cast options to cast kernel +* [ARROW-7633](https://issues.apache.org/jira/browse/ARROW-7633) - [C++][CI] Create fuzz targets for tensors and sparse tensors +* [ARROW-7808](https://issues.apache.org/jira/browse/ARROW-7808) - [Java][Dataset] Implement Datasets Java API +* [ARROW-7906](https://issues.apache.org/jira/browse/ARROW-7906) - [C++][Python] Full functionality for ORC format +* [ARROW-8049](https://issues.apache.org/jira/browse/ARROW-8049) - [C++] Upgrade bundled Thrift version to 0.13.0 +* [ARROW-8282](https://issues.apache.org/jira/browse/ARROW-8282) - [C++/Python][Dataset] Support schema evolution for integer columns +* [ARROW-8284](https://issues.apache.org/jira/browse/ARROW-8284) - [C++][Dataset] Schema evolution for timestamp columns +* [ARROW-8630](https://issues.apache.org/jira/browse/ARROW-8630) - [C++][Dataset] Pass schema including all materialized fields to catch CSV edge cases +* [ARROW-8631](https://issues.apache.org/jira/browse/ARROW-8631) - [C++][Dataset] Add ConvertOptions and ReadOptions to CsvFileFormat +* [ARROW-8658](https://issues.apache.org/jira/browse/ARROW-8658) - [C++][Dataset] Implement subtree pruning for FileSystemDataset::GetFragments +* [ARROW-8672](https://issues.apache.org/jira/browse/ARROW-8672) - [Java] Implement RecordBatch IPC buffer compression from ARROW-300 +* [ARROW-8732](https://issues.apache.org/jira/browse/ARROW-8732) - [C++] Let Futures support cancellation +* [ARROW-8771](https://issues.apache.org/jira/browse/ARROW-8771) - [C++] Add boost/process library to build support +* [ARROW-8796](https://issues.apache.org/jira/browse/ARROW-8796) - [Rust] Allow parquet to be written directly to memory +* [ARROW-8797](https://issues.apache.org/jira/browse/ARROW-8797) - [C++] Support Flight RPC among different endian platforms +* [ARROW-8900](https://issues.apache.org/jira/browse/ARROW-8900) - [C++] Respect HTTP(S)\_PROXY for S3 Filesystems and/or expose proxy options as parameters +* [ARROW-8919](https://issues.apache.org/jira/browse/ARROW-8919) - [C++] Add "DispatchBest" APIs to compute::Function that select a kernel that may require implicit casts to invoke +* [ARROW-9128](https://issues.apache.org/jira/browse/ARROW-9128) - [C++] Implement string space trimming kernels: trim, ltrim, and rtrim +* [ARROW-9149](https://issues.apache.org/jira/browse/ARROW-9149) - [C++] Improve configurability of RandomArrayGenerator::ArrayOf +* [ARROW-9196](https://issues.apache.org/jira/browse/ARROW-9196) - [C++] Make temporal casts work on Scalar inputs +* [ARROW-9318](https://issues.apache.org/jira/browse/ARROW-9318) -
[C++][Parquet] Encryption key management tools +* [ARROW-9731](https://issues.apache.org/jira/browse/ARROW-9731) - [C++][Dataset] Port "head" method from R to C++ Dataset Scanner +* [ARROW-9749](https://issues.apache.org/jira/browse/ARROW-9749) - [C++][Dataset] Extract format-specific scan options from FileFormat +* [ARROW-9777](https://issues.apache.org/jira/browse/ARROW-9777) - [Rust] Implement IPC changes to catch up to 1.0.0 format +* [ARROW-9856](https://issues.apache.org/jira/browse/ARROW-9856) - [R] Add bindings for string compute functions +* [ARROW-10014](https://issues.apache.org/jira/browse/ARROW-10014) - [C++] TaskGroup::Finish should execute tasks +* [ARROW-10089](https://issues.apache.org/jira/browse/ARROW-10089) - [R] inject base class for Array, ChunkedArray and Scalar +* [ARROW-10183](https://issues.apache.org/jira/browse/ARROW-10183) - [C++] Create a ForEach library function that runs on an iterator of futures +* [ARROW-10195](https://issues.apache.org/jira/browse/ARROW-10195) - [C++] Add string struct extract kernel using re2 +* [ARROW-10250](https://issues.apache.org/jira/browse/ARROW-10250) - [FlightRPC][C++] Remove default constructor for FlightClientOptions +* [ARROW-10255](https://issues.apache.org/jira/browse/ARROW-10255) - [JS] Reorganize imports and exports to be more friendly to ESM tree-shaking +* [ARROW-10297](https://issues.apache.org/jira/browse/ARROW-10297) - [Rust] Parameter for parquet-read to output data in json format +* [ARROW-10299](https://issues.apache.org/jira/browse/ARROW-10299) - [Rust] Support reading and writing V5 of IPC metadata +* [ARROW-10305](https://issues.apache.org/jira/browse/ARROW-10305) - [R] Filter with regular expressions +* [ARROW-10306](https://issues.apache.org/jira/browse/ARROW-10306) - [C++] Add string replacement kernel +* [ARROW-10349](https://issues.apache.org/jira/browse/ARROW-10349) - [Python] Build and publish aarch64 wheels +* [ARROW-10354](https://issues.apache.org/jira/browse/ARROW-10354) - [Rust] [DataFusion] Add support for regex extract +* [ARROW-10360](https://issues.apache.org/jira/browse/ARROW-10360) - [CI] Bump github actions cache version +* [ARROW-10372](https://issues.apache.org/jira/browse/ARROW-10372) - [C++][Dataset] Read compressed CSVs +* [ARROW-10406](https://issues.apache.org/jira/browse/ARROW-10406) - [C++] Unify dictionaries when writing IPC file in a single shot +* [ARROW-10420](https://issues.apache.org/jira/browse/ARROW-10420) - [C++] FileSystem::OpenInput{File,Stream} should accept a MemoryPool +* [ARROW-10421](https://issues.apache.org/jira/browse/ARROW-10421) - [R] Feather reader/writer should accept a MemoryPool +* [ARROW-10438](https://issues.apache.org/jira/browse/ARROW-10438) - [C++][Dataset] Partitioning::Format on nulls +* [ARROW-10520](https://issues.apache.org/jira/browse/ARROW-10520) - [C++][R] Implement add/remove/replace for RecordBatch +* [ARROW-10570](https://issues.apache.org/jira/browse/ARROW-10570) - [R] Use Converter API to convert SEXP to Array/ChunkedArray +* [ARROW-10580](https://issues.apache.org/jira/browse/ARROW-10580) - [C++] When Validating, ensure DenseUnionArray offsets are increasing +* [ARROW-10606](https://issues.apache.org/jira/browse/ARROW-10606) - [C++][Compute] Support casts to and from Decimal256 type. 
+* [ARROW-10655](https://issues.apache.org/jira/browse/ARROW-10655) - [C++] Add LRU cache facility +* [ARROW-10734](https://issues.apache.org/jira/browse/ARROW-10734) - [R] Build and test on Solaris +* [ARROW-10735](https://issues.apache.org/jira/browse/ARROW-10735) - [R] Remove arrow-without-arrow wrapping +* [ARROW-10766](https://issues.apache.org/jira/browse/ARROW-10766) - [Rust] Compute nested definition and repetition for list arrays +* [ARROW-10816](https://issues.apache.org/jira/browse/ARROW-10816) - [Rust] [DataFusion] Implement INTERVAL +* [ARROW-10831](https://issues.apache.org/jira/browse/ARROW-10831) - [C++][Compute] Implement quantile kernel +* [ARROW-10846](https://issues.apache.org/jira/browse/ARROW-10846) - [C++] Add async filesystem operations +* [ARROW-10880](https://issues.apache.org/jira/browse/ARROW-10880) - [Java] Support compressing RecordBatch IPC buffers by LZ4 +* [ARROW-10882](https://issues.apache.org/jira/browse/ARROW-10882) - [Python][Dataset] Writing dataset from Python iterator of record batches +* [ARROW-10895](https://issues.apache.org/jira/browse/ARROW-10895) - [C++][Gandiva] Implement bool to varchar cast function in Gandiva +* [ARROW-10903](https://issues.apache.org/jira/browse/ARROW-10903) - [Rust] Implement FromIter\>\> constructor for FixedSizeBinaryArray +* [ARROW-11022](https://issues.apache.org/jira/browse/ARROW-11022) - [Rust] [DataFusion] Upgrade to tokio 1.0 +* [ARROW-11070](https://issues.apache.org/jira/browse/ARROW-11070) - [C++] Implement power / exponentiation compute kernel +* [ARROW-11074](https://issues.apache.org/jira/browse/ARROW-11074) - [Rust][DataFusion] Implement predicate push-down for parquet tables +* [ARROW-11081](https://issues.apache.org/jira/browse/ARROW-11081) - [Java] Make IPC option immutable +* [ARROW-11108](https://issues.apache.org/jira/browse/ARROW-11108) - [Rust] Improve performance of MutableBuffer +* [ARROW-11141](https://issues.apache.org/jira/browse/ARROW-11141) - [Rust] Miri checks +* [ARROW-11149](https://issues.apache.org/jira/browse/ARROW-11149) - [Rust] create\_batch\_empty - support List, LargeList +* [ARROW-11150](https://issues.apache.org/jira/browse/ARROW-11150) - [Rust] Set up bi-weekly Rust sync call and update website +* [ARROW-11154](https://issues.apache.org/jira/browse/ARROW-11154) - [CI][C++] Move homebrew crossbow tests off of Travis-CI +* [ARROW-11156](https://issues.apache.org/jira/browse/ARROW-11156) - [Rust][DataFusion] Create hashes vectorized in hash join +* [ARROW-11174](https://issues.apache.org/jira/browse/ARROW-11174) - [C++][Dataset] Make Expressions available for projection +* [ARROW-11179](https://issues.apache.org/jira/browse/ARROW-11179) - [Format] Make comments in fb files friendly to rust doc +* [ARROW-11183](https://issues.apache.org/jira/browse/ARROW-11183) - [Rust] [Parquet] LogicalType::TIMESTAMP\_NANOS missing +* [ARROW-11191](https://issues.apache.org/jira/browse/ARROW-11191) - [C++] Use FnOnce for TaskGroup's tasks instead of std::function +* [ARROW-11216](https://issues.apache.org/jira/browse/ARROW-11216) - [Rust] Improve documentation for StringDictionaryBuilder +* [ARROW-11220](https://issues.apache.org/jira/browse/ARROW-11220) - [Rust] DF Implement GROUP BY support for Boolean +* [ARROW-11222](https://issues.apache.org/jira/browse/ARROW-11222) - [Rust] [Arrow] catch up with flatbuffers 0.8.1 +* [ARROW-11246](https://issues.apache.org/jira/browse/ARROW-11246) - DF - Add type to Unexpected accumulator state message +*
[ARROW-11254](https://issues.apache.org/jira/browse/ARROW-11254) - [Rust][DataFusion] Add SIMD and snmalloc flags as options to benchmarks +* [ARROW-11260](https://issues.apache.org/jira/browse/ARROW-11260) - [C++][Dataset] Don't require dictionaries for reading dataset with schema-based Partitioning +* [ARROW-11265](https://issues.apache.org/jira/browse/ARROW-11265) - [Rust] Made bool not convertible to bytes +* [ARROW-11268](https://issues.apache.org/jira/browse/ARROW-11268) - [Rust][DataFusion] Support specifying repartitions in MemTable +* [ARROW-11270](https://issues.apache.org/jira/browse/ARROW-11270) - [Rust] Use slices for simple array data buffer access +* [ARROW-11279](https://issues.apache.org/jira/browse/ARROW-11279) - [Rust][Parquet] ArrowWriter Definition Levels Memory Usage +* [ARROW-11284](https://issues.apache.org/jira/browse/ARROW-11284) - [R] Support dplyr verb transmute() +* [ARROW-11289](https://issues.apache.org/jira/browse/ARROW-11289) - [Rust] [DataFusion] Support GROUP BY for Dictionary columns +* [ARROW-11290](https://issues.apache.org/jira/browse/ARROW-11290) - [Rust][DataFusion] Address hash aggregate performance with high number of groups +* [ARROW-11291](https://issues.apache.org/jira/browse/ARROW-11291) - [Rust] implement extend for MutableBuffer (from iterator) +* [ARROW-11300](https://issues.apache.org/jira/browse/ARROW-11300) - [Rust][DataFusion] Improve hash aggregate performance with large number of groups +* [ARROW-11308](https://issues.apache.org/jira/browse/ARROW-11308) - [Rust] [Parquet] Add Arrow decimal array writer +* [ARROW-11309](https://issues.apache.org/jira/browse/ARROW-11309) - [Release][C\#] Use .NET 3.1 for verification +* [ARROW-11310](https://issues.apache.org/jira/browse/ARROW-11310) - [Rust] Implement arrow JSON writer +* [ARROW-11314](https://issues.apache.org/jira/browse/ARROW-11314) - [Release][APT][Yum] Add support for verifying arm64 packages +* [ARROW-11317](https://issues.apache.org/jira/browse/ARROW-11317) - [Rust] Test the prettyprint feature in CI +* [ARROW-11318](https://issues.apache.org/jira/browse/ARROW-11318) - [Rust] Support pretty printing timestamp, date, and time types +* [ARROW-11319](https://issues.apache.org/jira/browse/ARROW-11319) - [Rust] [DataFusion] Improve test comparisons to record batch +* [ARROW-11321](https://issues.apache.org/jira/browse/ARROW-11321) - [Rust][DataFusion] Fix DataFusion compilation error +* [ARROW-11325](https://issues.apache.org/jira/browse/ARROW-11325) - [Packaging][C\#] Release Apache.Arrow.Flight and Apache.Arrow.Flight.AspNetCore +* [ARROW-11329](https://issues.apache.org/jira/browse/ARROW-11329) - [Rust] Do not rebuild the library on every change +* [ARROW-11330](https://issues.apache.org/jira/browse/ARROW-11330) - [Rust][DataFusion] Add ExpressionVisitor pattern +* [ARROW-11332](https://issues.apache.org/jira/browse/ARROW-11332) - [Rust] Use MutableBuffer in take\_string instead of Vec +* [ARROW-11333](https://issues.apache.org/jira/browse/ARROW-11333) - [Rust] Support creating arbitrary nested empty arrays +* [ARROW-11336](https://issues.apache.org/jira/browse/ARROW-11336) - [C++][Doc] Improve Developing on Windows docs +* [ARROW-11338](https://issues.apache.org/jira/browse/ARROW-11338) - [R] Bindings for quantile and median +* [ARROW-11340](https://issues.apache.org/jira/browse/ARROW-11340) - [C++] Add vcpkg.json manifest to cpp project root +* [ARROW-11343](https://issues.apache.org/jira/browse/ARROW-11343) - [DataFusion] Simplified example +*
[ARROW-11346](https://issues.apache.org/jira/browse/ARROW-11346) - [C++][Compute] Implement quantile kernel benchmark +* [ARROW-11349](https://issues.apache.org/jira/browse/ARROW-11349) - [Rust] Add from\_iter\_values to create arrays from T instead of Option +* [ARROW-11350](https://issues.apache.org/jira/browse/ARROW-11350) - [C++] Bump dependency versions +* [ARROW-11354](https://issues.apache.org/jira/browse/ARROW-11354) - [Rust] Speed-up casts of dates and times +* [ARROW-11355](https://issues.apache.org/jira/browse/ARROW-11355) - [Rust] Align Date type with spec +* [ARROW-11358](https://issues.apache.org/jira/browse/ARROW-11358) - [Rust] Add benchmark for concatenating small arrays +* [ARROW-11360](https://issues.apache.org/jira/browse/ARROW-11360) - [Rust] [DataFusion] Improve CSV "No files found" error message +* [ARROW-11361](https://issues.apache.org/jira/browse/ARROW-11361) - [Rust] Build buffers from iterator of booleans +* [ARROW-11362](https://issues.apache.org/jira/browse/ARROW-11362) - [Rust][DataFusion] Use iterator APIs in to\_array\_of\_size to improve performance +* [ARROW-11365](https://issues.apache.org/jira/browse/ARROW-11365) - [Rust] [Parquet] Implement parsers for v2 of the text schema +* [ARROW-11366](https://issues.apache.org/jira/browse/ARROW-11366) - [Rust][DataFusion] Add Constant Folding / Support boolean literal in equality expression +* [ARROW-11367](https://issues.apache.org/jira/browse/ARROW-11367) - [C++] Implement approximate quantile utility +* [ARROW-11369](https://issues.apache.org/jira/browse/ARROW-11369) - [DataFusion] Split expressions.rs +* [ARROW-11372](https://issues.apache.org/jira/browse/ARROW-11372) - Support RC verification on macOS-ARM64 +* [ARROW-11373](https://issues.apache.org/jira/browse/ARROW-11373) - [Python][Docs] Add example of specifying type for a column when reading csv file +* [ARROW-11374](https://issues.apache.org/jira/browse/ARROW-11374) - [Python] Make legacy pyarrow.filesystem / pyarrow.serialize warnings more visible +* [ARROW-11375](https://issues.apache.org/jira/browse/ARROW-11375) - [Rust] CI fails due to deprecation warning in clippy +* [ARROW-11377](https://issues.apache.org/jira/browse/ARROW-11377) - [C++][CI] Add ThreadSanitizer nightly build +* [ARROW-11383](https://issues.apache.org/jira/browse/ARROW-11383) - [Rust] use trusted len on bit ops +* [ARROW-11386](https://issues.apache.org/jira/browse/ARROW-11386) - [Release] Fix post documents update script +* [ARROW-11389](https://issues.apache.org/jira/browse/ARROW-11389) - [Rust] Inconsistent comments for datatypes +* [ARROW-11395](https://issues.apache.org/jira/browse/ARROW-11395) - [DataFusion] Support custom optimizations +* [ARROW-11401](https://issues.apache.org/jira/browse/ARROW-11401) - [Rust][DataFusion] Pass slices instead of Vec in DataFrame API +* [ARROW-11404](https://issues.apache.org/jira/browse/ARROW-11404) - [Rust][DataFusion] Upgrade to aHash 0.7 +* [ARROW-11405](https://issues.apache.org/jira/browse/ARROW-11405) - [DataFusion] Support multiple custom nodes +* [ARROW-11406](https://issues.apache.org/jira/browse/ARROW-11406) - [CI][C++] Fix caching on Travis-CI builds +* [ARROW-11408](https://issues.apache.org/jira/browse/ARROW-11408) - Add window support to datafusion readme +* [ARROW-11411](https://issues.apache.org/jira/browse/ARROW-11411) - [Packaging][Linux] Disable arm64 nightly builds +* [ARROW-11414](https://issues.apache.org/jira/browse/ARROW-11414) - [Rust] Reduce copies in Schema::try\_merge +*
[ARROW-11417](https://issues.apache.org/jira/browse/ARROW-11417) - [Integration] Add integration test for buffer compression +* [ARROW-11418](https://issues.apache.org/jira/browse/ARROW-11418) - [Doc] Add IPC buffer compression to support matrix +* [ARROW-11421](https://issues.apache.org/jira/browse/ARROW-11421) - [Rust][DataFusion] Support group by Date32 +* [ARROW-11422](https://issues.apache.org/jira/browse/ARROW-11422) - [C\#] Add support for decimals +* [ARROW-11423](https://issues.apache.org/jira/browse/ARROW-11423) - [R] value\_counts and some StructArray methods +* [ARROW-11425](https://issues.apache.org/jira/browse/ARROW-11425) - [C++][Compute] Improve quantile kernel for integers +* [ARROW-11426](https://issues.apache.org/jira/browse/ARROW-11426) - [Rust][DataFusion] EXTRACT support +* [ARROW-11428](https://issues.apache.org/jira/browse/ARROW-11428) - [Rust] Add power kernel +* [ARROW-11429](https://issues.apache.org/jira/browse/ARROW-11429) - Make string comparison kernels generic over Utf8 and LargeUtf8 +* [ARROW-11430](https://issues.apache.org/jira/browse/ARROW-11430) - [Rust] Kernel to combine two arrays based on boolean mask +* [ARROW-11431](https://issues.apache.org/jira/browse/ARROW-11431) - [Rust] [DataFusion] Add support for the SQL HAVING clause +* [ARROW-11435](https://issues.apache.org/jira/browse/ARROW-11435) - Allow creating ParquetPartition from external crate +* [ARROW-11436](https://issues.apache.org/jira/browse/ARROW-11436) - [Rust] Allow non-sized iterators in Primitive::from\_iter +* [ARROW-11437](https://issues.apache.org/jira/browse/ARROW-11437) - [Rust] Simplify benches +* [ARROW-11438](https://issues.apache.org/jira/browse/ARROW-11438) - Unsupported ast node Value(Boolean(true)) in sqltorel +* [ARROW-11439](https://issues.apache.org/jira/browse/ARROW-11439) - [Rust] Add year support to temporal kernel +* [ARROW-11440](https://issues.apache.org/jira/browse/ARROW-11440) - [Rust] [DataFusion] Add method to CsvExec to get CSV schema +* [ARROW-11442](https://issues.apache.org/jira/browse/ARROW-11442) - [Rust] Expose the logic used to interpret date/times +* [ARROW-11443](https://issues.apache.org/jira/browse/ARROW-11443) - [Rust] Write datetime information for Date64 Type in csv writer +* [ARROW-11444](https://issues.apache.org/jira/browse/ARROW-11444) - [Rust][DataFusion] Pass slices instead of &Vec to functions +* [ARROW-11446](https://issues.apache.org/jira/browse/ARROW-11446) - [DataFusion] Support scalars in builtin functions +* [ARROW-11447](https://issues.apache.org/jira/browse/ARROW-11447) - [Rust] Add shift kernel +* [ARROW-11449](https://issues.apache.org/jira/browse/ARROW-11449) - [CI][R][Windows] Use ccache +* [ARROW-11457](https://issues.apache.org/jira/browse/ARROW-11457) - [Rust] Make string comparison kernels generic over Utf8 and LargeUtf8 +* [ARROW-11459](https://issues.apache.org/jira/browse/ARROW-11459) - [Rust] Allow ListArray of primitives to be built from iterator +* [ARROW-11462](https://issues.apache.org/jira/browse/ARROW-11462) - [Developer] Remove needless quote from the default DOCKER\_VOLUME\_PREFIX +* [ARROW-11463](https://issues.apache.org/jira/browse/ARROW-11463) - [Python] Allow configuration of IpcWriterOptions 64Bit from PyArrow +* [ARROW-11466](https://issues.apache.org/jira/browse/ARROW-11466) - [Flight][Go] Add BasicAuth and BearerToken handlers for Go +* [ARROW-11467](https://issues.apache.org/jira/browse/ARROW-11467) - [R] Fix reference to json\_table\_reader() in R docs +*
[ARROW-11468](https://issues.apache.org/jira/browse/ARROW-11468) - [R] Allow user to pass schema to read\_json\_arrow() +* [ARROW-11474](https://issues.apache.org/jira/browse/ARROW-11474) - [C++] Update bundled re2 version +* [ARROW-11476](https://issues.apache.org/jira/browse/ARROW-11476) - [Rust][DataFusion] Test running of TPCH benchmarks in CI +* [ARROW-11477](https://issues.apache.org/jira/browse/ARROW-11477) - [R][Doc] Reorganize and improve README and vignette content +* [ARROW-11478](https://issues.apache.org/jira/browse/ARROW-11478) - [R] Consider ways to make arrow.skip\_nul option more user-friendly +* [ARROW-11479](https://issues.apache.org/jira/browse/ARROW-11479) - [Rust][Parquet] Add method to return compressed size of row group +* [ARROW-11481](https://issues.apache.org/jira/browse/ARROW-11481) - [Rust] More cast implementations +* [ARROW-11484](https://issues.apache.org/jira/browse/ARROW-11484) - [Rust] Derive Clone for ExecutionContext +* [ARROW-11486](https://issues.apache.org/jira/browse/ARROW-11486) - [Website] Use Jekyll 4 and webpack to support Ruby 3.0 or later +* [ARROW-11489](https://issues.apache.org/jira/browse/ARROW-11489) - [Rust][DataFusion] Make DataFrame Send+Sync +* [ARROW-11491](https://issues.apache.org/jira/browse/ARROW-11491) - [Rust] Support json schema inference for nested list and struct +* [ARROW-11493](https://issues.apache.org/jira/browse/ARROW-11493) - [CI][Packaging][deb][RPM] Test built packages +* [ARROW-11500](https://issues.apache.org/jira/browse/ARROW-11500) - [R] Allow bundled build script to run on Solaris +* [ARROW-11501](https://issues.apache.org/jira/browse/ARROW-11501) - [C++] endianness check does not work on Solaris +* [ARROW-11504](https://issues.apache.org/jira/browse/ARROW-11504) - [Rust] verify Datatype in ListArray::from(ArrayDataRef) +* [ARROW-11505](https://issues.apache.org/jira/browse/ARROW-11505) - [Rust] Add support for LargeUtf8 in csv-writer +* [ARROW-11507](https://issues.apache.org/jira/browse/ARROW-11507) - [R] Bindings for GetRuntimeInfo +* [ARROW-11510](https://issues.apache.org/jira/browse/ARROW-11510) - [Python] Add note that pip \>= 19.0 is required to get binary packages +* [ARROW-11511](https://issues.apache.org/jira/browse/ARROW-11511) - [Rust] Replace Arc by ArrayData +* [ARROW-11512](https://issues.apache.org/jira/browse/ARROW-11512) - [Packaging][deb] Add missing gRPC dependency for Ubuntu 21.04 +* [ARROW-11513](https://issues.apache.org/jira/browse/ARROW-11513) - [R] Bindings for sub/gsub +* [ARROW-11516](https://issues.apache.org/jira/browse/ARROW-11516) - [R] Allow all C++ compute functions to be called by name in dplyr +* [ARROW-11539](https://issues.apache.org/jira/browse/ARROW-11539) - [Developer][Archery] Change items\_per\_seconds units +* [ARROW-11541](https://issues.apache.org/jira/browse/ARROW-11541) - [C++][Compute] Implement approximate quantile kernel +* [ARROW-11542](https://issues.apache.org/jira/browse/ARROW-11542) - [Rust] json reader should not crash when reading nested list +* [ARROW-11544](https://issues.apache.org/jira/browse/ARROW-11544) - [Rust] [DataFusion] Implement as\_any for AggregateExpr +* [ARROW-11545](https://issues.apache.org/jira/browse/ARROW-11545) - [Rust] [DataFusion] SendableRecordBatchStream should implement Sync +* [ARROW-11556](https://issues.apache.org/jira/browse/ARROW-11556) - [C++] Minor benchmark improvements +* [ARROW-11557](https://issues.apache.org/jira/browse/ARROW-11557) - [Rust] Add table de-registration to DataFusion ExecutionContext +* 
[ARROW-11559](https://issues.apache.org/jira/browse/ARROW-11559) - [C++] Improve flatbuffers verification limits +* [ARROW-11561](https://issues.apache.org/jira/browse/ARROW-11561) - [Rust][DataFusion] Add Send + Sync to MemTable::load +* [ARROW-11563](https://issues.apache.org/jira/browse/ARROW-11563) - [Rust] Support Cast(Utf8, TimeStamp(Nanoseconds, None)) +* [ARROW-11568](https://issues.apache.org/jira/browse/ARROW-11568) - [C++][Compute] Mode kernel performance is bad in some conditions +* [ARROW-11570](https://issues.apache.org/jira/browse/ARROW-11570) - [Rust] ScalarValue - support Date64 +* [ARROW-11571](https://issues.apache.org/jira/browse/ARROW-11571) - [CI] Cancel stale GitHub Actions workflow runs +* [ARROW-11572](https://issues.apache.org/jira/browse/ARROW-11572) - [Rust] Add a kernel for division by single scalar +* [ARROW-11573](https://issues.apache.org/jira/browse/ARROW-11573) - [Developer][Archery] Google benchmark now reports run type +* [ARROW-11574](https://issues.apache.org/jira/browse/ARROW-11574) - [Rust][DataFusion] Upgrade sqlparser to 0.8 to support parsing all TPC-H queries +* [ARROW-11575](https://issues.apache.org/jira/browse/ARROW-11575) - [Developer][Archery] Expose execution time in benchmark results +* [ARROW-11576](https://issues.apache.org/jira/browse/ARROW-11576) - [Rust] Remove unused variable in example +* [ARROW-11580](https://issues.apache.org/jira/browse/ARROW-11580) - [C++] Add CMake option ARROW\_DEPENDENCY\_SOURCE=VCPKG +* [ARROW-11581](https://issues.apache.org/jira/browse/ARROW-11581) - [Packaging][C++] Formalize distribution through vcpkg +* [ARROW-11589](https://issues.apache.org/jira/browse/ARROW-11589) - [R] Add methods for modifying Schemas +* [ARROW-11590](https://issues.apache.org/jira/browse/ARROW-11590) - [C++] Move CSV background generator to IO thread pool +* [ARROW-11591](https://issues.apache.org/jira/browse/ARROW-11591) - [C++][Compute] Prototype version of hash aggregation +* [ARROW-11592](https://issues.apache.org/jira/browse/ARROW-11592) - [Rust] Typo in comment +* [ARROW-11594](https://issues.apache.org/jira/browse/ARROW-11594) - [Rust] Support pretty printing with NullArrays +* [ARROW-11597](https://issues.apache.org/jira/browse/ARROW-11597) - [Rust] Split datatypes in a module +* [ARROW-11598](https://issues.apache.org/jira/browse/ARROW-11598) - [Rust] Split buffer.rs in smaller files +* [ARROW-11599](https://issues.apache.org/jira/browse/ARROW-11599) - [Rust] Add function to create array with all nulls +* [ARROW-11601](https://issues.apache.org/jira/browse/ARROW-11601) - [C++][Dataset] Expose pre-buffering in ParquetFileFormatReaderOptions +* [ARROW-11606](https://issues.apache.org/jira/browse/ARROW-11606) - [Rust] [DataFusion] Need guidance on HashAggregateExec reconstruction +* [ARROW-11610](https://issues.apache.org/jira/browse/ARROW-11610) - [C++] Download boost from sourceforge instead of bintray +* [ARROW-11611](https://issues.apache.org/jira/browse/ARROW-11611) - [C++] Update third party dependency mirrors +* [ARROW-11612](https://issues.apache.org/jira/browse/ARROW-11612) - [C++] Rebuild trimmed boost bundle for 1.75.0 +* [ARROW-11613](https://issues.apache.org/jira/browse/ARROW-11613) - [R] Move nightly C++ builds off of bintray +* [ARROW-11616](https://issues.apache.org/jira/browse/ARROW-11616) - [Rust][DataFusion] Expose collect\_partitioned for DataFrame +*
[ARROW-11621](https://issues.apache.org/jira/browse/ARROW-11621) - [CI][Gandiva][Linux] Fix Crossbow setup failure +* [ARROW-11626](https://issues.apache.org/jira/browse/ARROW-11626) - [Rust][DataFusion] Move DataFusion examples to own project to reduce the number of dependencies +* [ARROW-11627](https://issues.apache.org/jira/browse/ARROW-11627) - [Rust] Typed allocator +* [ARROW-11637](https://issues.apache.org/jira/browse/ARROW-11637) - [CI][Conda] Update nightly clean target platforms and packages list +* [ARROW-11641](https://issues.apache.org/jira/browse/ARROW-11641) - [CI] Use docker buildkit's inline cache to reuse build cache across different hosts +* [ARROW-11649](https://issues.apache.org/jira/browse/ARROW-11649) - [R] Add support for null\_fallback to R +* [ARROW-11651](https://issues.apache.org/jira/browse/ARROW-11651) - [Rust][DataFusion] Implement Postgres Length Functions +* [ARROW-11653](https://issues.apache.org/jira/browse/ARROW-11653) - Ascii/unicode functions +* [ARROW-11655](https://issues.apache.org/jira/browse/ARROW-11655) - Pad/trim functions +* [ARROW-11656](https://issues.apache.org/jira/browse/ARROW-11656) - Left over functions/fixes +* [ARROW-11659](https://issues.apache.org/jira/browse/ARROW-11659) - [R] Preserve group\_by .drop argument +* [ARROW-11662](https://issues.apache.org/jira/browse/ARROW-11662) - [C++] Support sorting for decimal data type. +* [ARROW-11664](https://issues.apache.org/jira/browse/ARROW-11664) - [Rust] Cast to LargeUtf8 +* [ARROW-11665](https://issues.apache.org/jira/browse/ARROW-11665) - [Python] Document precision and scale parameters of decimal128() +* [ARROW-11666](https://issues.apache.org/jira/browse/ARROW-11666) - [Integration] Add endianness "gold" integration file for decimal256 +* [ARROW-11667](https://issues.apache.org/jira/browse/ARROW-11667) - [Rust] Add docs for utf8 comparison functions +* [ARROW-11669](https://issues.apache.org/jira/browse/ARROW-11669) - [Rust] [DataFusion] Remove concurrency field from GlobalLimitExec +* [ARROW-11671](https://issues.apache.org/jira/browse/ARROW-11671) - [Rust][DataFusion] Clean up docs on Expr +* [ARROW-11677](https://issues.apache.org/jira/browse/ARROW-11677) - [C++][Dataset] Write documentation +* [ARROW-11680](https://issues.apache.org/jira/browse/ARROW-11680) - [C++] Add vendored version of folly's spsc queue +* [ARROW-11683](https://issues.apache.org/jira/browse/ARROW-11683) - [R] Support dplyr::mutate() +* [ARROW-11685](https://issues.apache.org/jira/browse/ARROW-11685) - [C++] Typo in future\_test.cc +* [ARROW-11688](https://issues.apache.org/jira/browse/ARROW-11688) - [Rust] Casts between utf8 and large-utf8 +* [ARROW-11690](https://issues.apache.org/jira/browse/ARROW-11690) - [Rust][DataFusion] Avoid Expr::clone in Expr builder methods +* [ARROW-11692](https://issues.apache.org/jira/browse/ARROW-11692) - [Rust][DataFusion] Improve documentation on Optimizer +* [ARROW-11693](https://issues.apache.org/jira/browse/ARROW-11693) - [C++] Add string length kernel +* [ARROW-11700](https://issues.apache.org/jira/browse/ARROW-11700) - [R] Internationalize error handling in tidy eval +* [ARROW-11701](https://issues.apache.org/jira/browse/ARROW-11701) - [R] Implement dplyr::relocate() +* [ARROW-11703](https://issues.apache.org/jira/browse/ARROW-11703) - [R] Implement dplyr::arrange() +* [ARROW-11704](https://issues.apache.org/jira/browse/ARROW-11704) - [R] Wire up dplyr::mutate() for datasets +* [ARROW-11707](https://issues.apache.org/jira/browse/ARROW-11707) - Support CSV schema inference without IO
+* [ARROW-11708](https://issues.apache.org/jira/browse/ARROW-11708) - Clean up Rust 2021 linting warning +* [ARROW-11709](https://issues.apache.org/jira/browse/ARROW-11709) - [Rust][DataFusion] Move \`expressions\` and \`inputs\` into LogicalPlan rather than helpers in util +* [ARROW-11710](https://issues.apache.org/jira/browse/ARROW-11710) - [Rust][DataFusion] Implement ExprRewriter to avoid tree traversal redundancy +* [ARROW-11719](https://issues.apache.org/jira/browse/ARROW-11719) - Support merged schema for memory table +* [ARROW-11721](https://issues.apache.org/jira/browse/ARROW-11721) - json schema inference should return Schema type instead of SchemaRef +* [ARROW-11722](https://issues.apache.org/jira/browse/ARROW-11722) - Improve error message in FFI +* [ARROW-11724](https://issues.apache.org/jira/browse/ARROW-11724) - [C++] Namespace collisions with protobuf 3.15 +* [ARROW-11725](https://issues.apache.org/jira/browse/ARROW-11725) - [Rust][DataFusion] Make use of the new divide\_scalar kernel in arrow +* [ARROW-11727](https://issues.apache.org/jira/browse/ARROW-11727) - [C++][FlightRPC] Use TDigest to estimate latency quantiles in benchmark +* [ARROW-11730](https://issues.apache.org/jira/browse/ARROW-11730) - [C++] Add implicit Future(Status) constructor for convenience +* [ARROW-11733](https://issues.apache.org/jira/browse/ARROW-11733) - [Rust][DataFusion] Support hash repartitioning +* [ARROW-11734](https://issues.apache.org/jira/browse/ARROW-11734) - [C++] vendored safe-math.h does not compile on Solaris +* [ARROW-11735](https://issues.apache.org/jira/browse/ARROW-11735) - [R] Allow Parquet and Arrow Dataset to be optional components +* [ARROW-11736](https://issues.apache.org/jira/browse/ARROW-11736) - [R] Allow string compute functions to be optional +* [ARROW-11737](https://issues.apache.org/jira/browse/ARROW-11737) - [C++] Patch vendored xxhash for Solaris +* [ARROW-11738](https://issues.apache.org/jira/browse/ARROW-11738) - [Rust][DataFusion] Concat Functions +* [ARROW-11740](https://issues.apache.org/jira/browse/ARROW-11740) - [C++] posix\_memalign not declared in scope on Solaris +* [ARROW-11742](https://issues.apache.org/jira/browse/ARROW-11742) - [Rust] [DataFusion] Add Expr::is\_null and Expr::is\_not\_null functions +* [ARROW-11744](https://issues.apache.org/jira/browse/ARROW-11744) - [C++] Add xsimd dependency +* [ARROW-11745](https://issues.apache.org/jira/browse/ARROW-11745) - [C++] Improve configurability of random data generation +* [ARROW-11750](https://issues.apache.org/jira/browse/ARROW-11750) - [Python][Dataset] Add support for project expressions +* [ARROW-11752](https://issues.apache.org/jira/browse/ARROW-11752) - [R] Replace usage of testthat::expect\_is() +* [ARROW-11753](https://issues.apache.org/jira/browse/ARROW-11753) - [Rust][DataFusion] Add test for Join Statement: Schema contains duplicate unqualified field name +* [ARROW-11754](https://issues.apache.org/jira/browse/ARROW-11754) - [R] Support dplyr::compute() +* [ARROW-11761](https://issues.apache.org/jira/browse/ARROW-11761) - [C++] Increase public API testing +* [ARROW-11766](https://issues.apache.org/jira/browse/ARROW-11766) - [R] Better handling for missing compression codecs on Linux +* [ARROW-11768](https://issues.apache.org/jira/browse/ARROW-11768) - [C++][CI] Make s390x build non-optional +* [ARROW-11773](https://issues.apache.org/jira/browse/ARROW-11773) - [Rust] Allow json writer to write out JSON arrays as well as newline formatted objects +* 
[ARROW-11774](https://issues.apache.org/jira/browse/ARROW-11774) - [R] one-line install from source on macOS +* [ARROW-11775](https://issues.apache.org/jira/browse/ARROW-11775) - [Rust][DataFusion] Feature Flags for Dependencies +* [ARROW-11777](https://issues.apache.org/jira/browse/ARROW-11777) - [Rust] impl AsRef for StringBuilder/BinaryBuilder +* [ARROW-11778](https://issues.apache.org/jira/browse/ARROW-11778) - Cast from large-utf8 to numerical arrays +* [ARROW-11779](https://issues.apache.org/jira/browse/ARROW-11779) - [Rust] make alloc module public +* [ARROW-11790](https://issues.apache.org/jira/browse/ARROW-11790) - [Rust][DataFusion] Change plan builder signature to take Vec rather than &[Expr] +* [ARROW-11794](https://issues.apache.org/jira/browse/ARROW-11794) - [Go] Add concurrent-safe ipc.FileReader.RecordAt(i) +* [ARROW-11795](https://issues.apache.org/jira/browse/ARROW-11795) - [MATLAB] Migrate MATLAB Interface for Apache Arrow design doc to Markdown +* [ARROW-11797](https://issues.apache.org/jira/browse/ARROW-11797) - [C++][Dataset] Provide Scanner methods to yield/visit scanned batches +* [ARROW-11798](https://issues.apache.org/jira/browse/ARROW-11798) - [Integration] Update testing submodule +* [ARROW-11799](https://issues.apache.org/jira/browse/ARROW-11799) - [Rust] String and Binary arrays created with incorrect length from unbound iterator +* [ARROW-11801](https://issues.apache.org/jira/browse/ARROW-11801) - [C++] Remove bad header guard in filesystem/type\_fwd.h +* [ARROW-11803](https://issues.apache.org/jira/browse/ARROW-11803) - [Rust] [Parquet] Support v2 LogicalType +* [ARROW-11806](https://issues.apache.org/jira/browse/ARROW-11806) - [Rust][DataFusion] Optimize inner join creation of indices +* [ARROW-11820](https://issues.apache.org/jira/browse/ARROW-11820) - Added macro create\_native to construct impl +* [ARROW-11822](https://issues.apache.org/jira/browse/ARROW-11822) - Support case sensitivity for functions +* [ARROW-11824](https://issues.apache.org/jira/browse/ARROW-11824) - [Rust] [Parquet] Use logical types in Arrow writer +* [ARROW-11825](https://issues.apache.org/jira/browse/ARROW-11825) - [Rust][DataFusion] Add mimalloc as option to benchmarks +* [ARROW-11833](https://issues.apache.org/jira/browse/ARROW-11833) - [C++] Vendored fast\_float errors for emscripten (architecture flag missing) +* [ARROW-11837](https://issues.apache.org/jira/browse/ARROW-11837) - [C++][Dataset] Expose originating fragment as a property of ScanTask +* [ARROW-11838](https://issues.apache.org/jira/browse/ARROW-11838) - [C++] Support reading IPC data with shared dictionaries +* [ARROW-11839](https://issues.apache.org/jira/browse/ARROW-11839) - [C++] Rewrite bit-unpacking optimizations using xsimd +* [ARROW-11842](https://issues.apache.org/jira/browse/ARROW-11842) - [Rust][Parquet] Use more efficient clone\_from in get\_batch\_with\_dict +* [ARROW-11852](https://issues.apache.org/jira/browse/ARROW-11852) - [Documentation] Update CONTRIBUTING to explain Contributor role +* [ARROW-11856](https://issues.apache.org/jira/browse/ARROW-11856) - [C++] Remove unused reference to RecordBatchStreamWriter +* [ARROW-11858](https://issues.apache.org/jira/browse/ARROW-11858) - [GLib] Gandiva Filter in GLib +* [ARROW-11859](https://issues.apache.org/jira/browse/ARROW-11859) - [GLib] GArrowArray: concatenate is missing +* [ARROW-11861](https://issues.apache.org/jira/browse/ARROW-11861) - [R][Packaging] Apply changes in r/tools/autobrew upstream +*
[ARROW-11864](https://issues.apache.org/jira/browse/ARROW-11864) - [R] Document arrow.int64\_downcast option +* [ARROW-11870](https://issues.apache.org/jira/browse/ARROW-11870) - [Dev] Automatically run merge script in venv +* [ARROW-11876](https://issues.apache.org/jira/browse/ARROW-11876) - [Website] Update governance page +* [ARROW-11877](https://issues.apache.org/jira/browse/ARROW-11877) - [C++] Add initial microbenchmarks for Dataset internals +* [ARROW-11879](https://issues.apache.org/jira/browse/ARROW-11879) - [Rust][DataFusion] ExecutionContext::sql should optimize query plan +* [ARROW-11883](https://issues.apache.org/jira/browse/ARROW-11883) - [C++] Add ConcatMap, MergeMap, and an async-reentrant version of Map +* [ARROW-11887](https://issues.apache.org/jira/browse/ARROW-11887) - [C++] Add asynchronous read to streaming CSV reader +* [ARROW-11894](https://issues.apache.org/jira/browse/ARROW-11894) - [Rust][DataFusion] Change flight server example to use DataFrame API +* [ARROW-11895](https://issues.apache.org/jira/browse/ARROW-11895) - [Rust][DataFusion] Add support for extra column statistics +* [ARROW-11898](https://issues.apache.org/jira/browse/ARROW-11898) - [Rust] Pretty print columns +* [ARROW-11899](https://issues.apache.org/jira/browse/ARROW-11899) - [Java] Refactor the compression codec implementation into core/Arrow specific parts +* [ARROW-11900](https://issues.apache.org/jira/browse/ARROW-11900) - [Website] Add Yibo to committer list +* [ARROW-11906](https://issues.apache.org/jira/browse/ARROW-11906) - [R] Make FeatherReader print method more informative +* [ARROW-11907](https://issues.apache.org/jira/browse/ARROW-11907) - [C++] Use our own executor in S3FileSystem +* [ARROW-11910](https://issues.apache.org/jira/browse/ARROW-11910) - [Packaging][Ubuntu] Drop support for 16.04 +* [ARROW-11911](https://issues.apache.org/jira/browse/ARROW-11911) - [Website] Add protobuf vs arrow to FAQ +* [ARROW-11912](https://issues.apache.org/jira/browse/ARROW-11912) - [R] Remove args from FeatherReader$create +* [ARROW-11913](https://issues.apache.org/jira/browse/ARROW-11913) - [Rust] Improve performance of StringBuilder +* [ARROW-11920](https://issues.apache.org/jira/browse/ARROW-11920) - [R] Add r/libarrow to make clean +* [ARROW-11921](https://issues.apache.org/jira/browse/ARROW-11921) - [R] Set LC\_COLLATE in r/data-raw/codegen.R +* [ARROW-11924](https://issues.apache.org/jira/browse/ARROW-11924) - [C++] Provide streaming output from GetFileInfo +* [ARROW-11925](https://issues.apache.org/jira/browse/ARROW-11925) - [R] Add \`between\` method for arrow\_dplyr\_query +* [ARROW-11927](https://issues.apache.org/jira/browse/ARROW-11927) - [Rust][DataFusion] Support limit push down +* [ARROW-11931](https://issues.apache.org/jira/browse/ARROW-11931) - [Go][CI] Bump CI to use Go 1.15 +* [ARROW-11935](https://issues.apache.org/jira/browse/ARROW-11935) - [C++] Add push generator +* [ARROW-11944](https://issues.apache.org/jira/browse/ARROW-11944) - [Developer] Archery benchmark diff regression: cannot compare jsons +* [ARROW-11949](https://issues.apache.org/jira/browse/ARROW-11949) - [Ruby] Accept raw Ruby objects as sort key and options +* [ARROW-11951](https://issues.apache.org/jira/browse/ARROW-11951) - [Rust] Remove OffsetSize::prefix +* [ARROW-11952](https://issues.apache.org/jira/browse/ARROW-11952) - [Rust] Make ArrayData --\> GenericListArray fallible instead of \`panic!\` +* [ARROW-11954](https://issues.apache.org/jira/browse/ARROW-11954) - [C++] arrow/util/io\_util.cc does not compile
on Solaris +* [ARROW-11955](https://issues.apache.org/jira/browse/ARROW-11955) - [Rust][DataFusion] Support Union +* [ARROW-11958](https://issues.apache.org/jira/browse/ARROW-11958) - [GLib] GArrowChunkedArray: combine is missing +* [ARROW-11959](https://issues.apache.org/jira/browse/ARROW-11959) - [Rust][DataFusion] Fix logging of optimized plan +* [ARROW-11962](https://issues.apache.org/jira/browse/ARROW-11962) - [Rust][DataFusion] Update Datafusion Docs / readme +* [ARROW-11969](https://issues.apache.org/jira/browse/ARROW-11969) - [Rust][DataFusion] Improve Examples in documentation +* [ARROW-11972](https://issues.apache.org/jira/browse/ARROW-11972) - [C++][Dataset] Extract IpcFragmentScanOptions, ParquetFragmentScanOptions +* [ARROW-11973](https://issues.apache.org/jira/browse/ARROW-11973) - [Rust] Boolean AND/OR kernels should follow SQL behaviour regarding null values +* [ARROW-11977](https://issues.apache.org/jira/browse/ARROW-11977) - [Rust] Add documentation examples for sort kernel +* [ARROW-11982](https://issues.apache.org/jira/browse/ARROW-11982) - [Rust] Donate Ballista Distributed Compute Platform +* [ARROW-11984](https://issues.apache.org/jira/browse/ARROW-11984) - [C++][Gandiva] Implement SHA1 and SHA256 functions +* [ARROW-11987](https://issues.apache.org/jira/browse/ARROW-11987) - [C++][Gandiva] Implement trigonometric functions on Gandiva +* [ARROW-11988](https://issues.apache.org/jira/browse/ARROW-11988) - [C++][Gandiva] Implements the last\_day function +* [ARROW-11992](https://issues.apache.org/jira/browse/ARROW-11992) - [Rust][Parquet] Add upgrade notes on 4.0 rename of LogicalType \#9731 +* [ARROW-11993](https://issues.apache.org/jira/browse/ARROW-11993) - [C++] Don't download xsimd if ARROW\_SIMD\_LEVEL=NONE +* [ARROW-11996](https://issues.apache.org/jira/browse/ARROW-11996) - [R] Make r/configure run successfully on Solaris +* [ARROW-11999](https://issues.apache.org/jira/browse/ARROW-11999) - [Java] Support parallel vector element search with user-specified comparator +* [ARROW-12000](https://issues.apache.org/jira/browse/ARROW-12000) - [Documentation] Add note about deviation from style guide on struct/classes +* [ARROW-12005](https://issues.apache.org/jira/browse/ARROW-12005) - [R] Fix a bash typo in configure +* [ARROW-12017](https://issues.apache.org/jira/browse/ARROW-12017) - [R] [Documentation] Make proper developing arrow docs +* [ARROW-12019](https://issues.apache.org/jira/browse/ARROW-12019) - [Rust] [Parquet] Update README for 2.6.0 support +* [ARROW-12020](https://issues.apache.org/jira/browse/ARROW-12020) - [Rust][DataFusion] Adding SHOW TABLES and SHOW COLUMNS + partial information\_schema support to DataFusion +* [ARROW-12031](https://issues.apache.org/jira/browse/ARROW-12031) - [C++][CSV] infer CSV timestamp columns with fractional seconds +* [ARROW-12032](https://issues.apache.org/jira/browse/ARROW-12032) - [Rust] Optimize comparison kernels using trusted\_len iterator for bools +* [ARROW-12034](https://issues.apache.org/jira/browse/ARROW-12034) - [Docs] Formalize Minor PRs +* [ARROW-12037](https://issues.apache.org/jira/browse/ARROW-12037) - [Rust] [DataFusion] Support catalogs and schemas for table namespacing +* [ARROW-12038](https://issues.apache.org/jira/browse/ARROW-12038) - [Rust][DataFusion] Upgrade hashbrown to 0.11 +* [ARROW-12039](https://issues.apache.org/jira/browse/ARROW-12039) - [CI][C++][Gandiva] Fix gandiva nightly linux build failure +* [ARROW-12040](https://issues.apache.org/jira/browse/ARROW-12040) - [R] [CI] [C++]
test-r-rstudio-r-base-3.6-opensuse15 timing out during tests +* [ARROW-12043](https://issues.apache.org/jira/browse/ARROW-12043) - [Rust] [Parquet] Write fixed size binary arrays +* [ARROW-12045](https://issues.apache.org/jira/browse/ARROW-12045) - First Chunk of ported Parquet Code +* [ARROW-12047](https://issues.apache.org/jira/browse/ARROW-12047) - [Rust] Clippy parquet +* [ARROW-12048](https://issues.apache.org/jira/browse/ARROW-12048) - [Rust][DataFusion] Support Common Table Expressions +* [ARROW-12052](https://issues.apache.org/jira/browse/ARROW-12052) - [Rust] Implement child data in C FFI +* [ARROW-12056](https://issues.apache.org/jira/browse/ARROW-12056) - [C++] Create sequencing AsyncGenerator +* [ARROW-12058](https://issues.apache.org/jira/browse/ARROW-12058) - [Python] Enable arithmetic operations on Expressions +* [ARROW-12068](https://issues.apache.org/jira/browse/ARROW-12068) - [Python] Stop using distutils +* [ARROW-12069](https://issues.apache.org/jira/browse/ARROW-12069) - [C++][Gandiva] Implement IN expressions for Decimal types +* [ARROW-12070](https://issues.apache.org/jira/browse/ARROW-12070) - [GLib] Drop support for GNU Autotools +* [ARROW-12071](https://issues.apache.org/jira/browse/ARROW-12071) - [GLib] Keep input stream reference of GArrowJSONReader +* [ARROW-12075](https://issues.apache.org/jira/browse/ARROW-12075) - [Rust][DataFusion] Add CTE to list of supported features +* [ARROW-12081](https://issues.apache.org/jira/browse/ARROW-12081) - [R] Bindings for utf8\_length +* [ARROW-12082](https://issues.apache.org/jira/browse/ARROW-12082) - [R][Dataset] Allow creating a dataset from a vector of file paths +* [ARROW-12094](https://issues.apache.org/jira/browse/ARROW-12094) - [C++][R] Fix/workaround re2 building on clang/libc++ +* [ARROW-12097](https://issues.apache.org/jira/browse/ARROW-12097) - [C++] Modify BackgroundGenerator so it creates fewer threads +* [ARROW-12098](https://issues.apache.org/jira/browse/ARROW-12098) - [R] Catch cpp build failures on linux +* [ARROW-12104](https://issues.apache.org/jira/browse/ARROW-12104) - Next Chunk of ported Code +* [ARROW-12106](https://issues.apache.org/jira/browse/ARROW-12106) - [Rust][DataFusion] Support \`SELECT \* from information\_schema.tables\` +* [ARROW-12107](https://issues.apache.org/jira/browse/ARROW-12107) - [Rust][DataFusion] Support \`SELECT \* from information\_schema.columns\` +* [ARROW-12108](https://issues.apache.org/jira/browse/ARROW-12108) - [Rust][DataFusion] Support \`SHOW TABLES\` +* [ARROW-12109](https://issues.apache.org/jira/browse/ARROW-12109) - [Rust][DataFusion] Support \`SHOW COLUMNS\` +* [ARROW-12110](https://issues.apache.org/jira/browse/ARROW-12110) - [Java] Implement ZSTD buffer compression for java +* [ARROW-12111](https://issues.apache.org/jira/browse/ARROW-12111) - [Java] place files generated by flatc under source control +* [ARROW-12116](https://issues.apache.org/jira/browse/ARROW-12116) - [Rust] Fix or ignore 1.51 clippy lints +* [ARROW-12119](https://issues.apache.org/jira/browse/ARROW-12119) - [Rust][DataFusion] Improve performance of to\_array\_of\_size +* [ARROW-12120](https://issues.apache.org/jira/browse/ARROW-12120) - [Rust] Generate random arrays and batches +* [ARROW-12121](https://issues.apache.org/jira/browse/ARROW-12121) - [Rust] [Parquet] Arrow writer benchmarks +* [ARROW-12123](https://issues.apache.org/jira/browse/ARROW-12123) - [Rust][DataFusion] Use smallvec for indices for better join performance +* [ARROW-12128](https://issues.apache.org/jira/browse/ARROW-12128) - 
[CI][Crossbow] Remove (or fix) test-ubuntu-16.04-cpp job +* [ARROW-12131](https://issues.apache.org/jira/browse/ARROW-12131) - [CI][GLib] Ensure upgrading MSYS2 +* [ARROW-12133](https://issues.apache.org/jira/browse/ARROW-12133) - [C++][Gandiva] Add option to disable setting mcpu flag to host cpu during llvm ir compilation +* [ARROW-12134](https://issues.apache.org/jira/browse/ARROW-12134) - [C++] Add regex string match kernel +* [ARROW-12136](https://issues.apache.org/jira/browse/ARROW-12136) - [Rust][DataFusion] Reduce default batch\_size to 8192 +* [ARROW-12139](https://issues.apache.org/jira/browse/ARROW-12139) - [Python][Packaging] Use vcpkg to build macOS wheels +* [ARROW-12141](https://issues.apache.org/jira/browse/ARROW-12141) - [R] Bindings for grepl +* [ARROW-12143](https://issues.apache.org/jira/browse/ARROW-12143) - [CI] R builds should timeout and fail after some threshold and dump the output. +* [ARROW-12146](https://issues.apache.org/jira/browse/ARROW-12146) - [C++][Gandiva] Implement CONVERT\_FROM(expression, ‘UTF8’, replacement char) function +* [ARROW-12151](https://issues.apache.org/jira/browse/ARROW-12151) - [Docs] Add Jira component + summary conventions to the docs +* [ARROW-12153](https://issues.apache.org/jira/browse/ARROW-12153) - [Rust] [Parquet] Return file metadata after writing Parquet file +* [ARROW-12160](https://issues.apache.org/jira/browse/ARROW-12160) - [Rust] Add an \`into\_inner()\` method to ipc::writer::StreamWriter +* [ARROW-12164](https://issues.apache.org/jira/browse/ARROW-12164) - [Java] Make BaseAllocator.Config public +* [ARROW-12165](https://issues.apache.org/jira/browse/ARROW-12165) - [Rust] Inline append functions in builders for performance +* [ARROW-12168](https://issues.apache.org/jira/browse/ARROW-12168) - [Go][IPC] Implement Compression handling for IPC +* [ARROW-12170](https://issues.apache.org/jira/browse/ARROW-12170) - [Rust][DataFusion] Introduce repartition optimization +* [ARROW-12173](https://issues.apache.org/jira/browse/ARROW-12173) - [GLib] Remove \#include +* [ARROW-12176](https://issues.apache.org/jira/browse/ARROW-12176) - parquet/low-level-api/reader-writer.cc has some typos. 
+* [ARROW-12187](https://issues.apache.org/jira/browse/ARROW-12187) - [C++][FlightRPC] Enable compression in Flight benchmark +* [ARROW-12188](https://issues.apache.org/jira/browse/ARROW-12188) - [Docs] Switch to pydata-sphinx-theme for the main sphinx docs +* [ARROW-12190](https://issues.apache.org/jira/browse/ARROW-12190) - [Rust][DataFusion] Implement partitioned hash join +* [ARROW-12192](https://issues.apache.org/jira/browse/ARROW-12192) - [Website] Use downloadable URL for archive download +* [ARROW-12193](https://issues.apache.org/jira/browse/ARROW-12193) - [Dev][Release] Use downloadable URL for archive download +* [ARROW-12194](https://issues.apache.org/jira/browse/ARROW-12194) - [Rust] [Parquet] Update zstd version +* [ARROW-12197](https://issues.apache.org/jira/browse/ARROW-12197) - [R] dplyr bindings for cast, dictionary\_encode +* [ARROW-12200](https://issues.apache.org/jira/browse/ARROW-12200) - [R] Export and document list\_compute\_functions +* [ARROW-12204](https://issues.apache.org/jira/browse/ARROW-12204) - [Rust][CI] Reduce size of rust build artifacts in integration test +* [ARROW-12206](https://issues.apache.org/jira/browse/ARROW-12206) - [Python] Fix Table docstrings +* [ARROW-12208](https://issues.apache.org/jira/browse/ARROW-12208) - [C++] Add the ability to run async tasks without using the CPU thread pool +* [ARROW-12210](https://issues.apache.org/jira/browse/ARROW-12210) - [Rust][DataFusion] Document SHOW TABLES / SHOW COLUMNS / InformationSchema +* [ARROW-12214](https://issues.apache.org/jira/browse/ARROW-12214) - [Rust][DataFusion] Add some tests for limit +* [ARROW-12215](https://issues.apache.org/jira/browse/ARROW-12215) - [C++] fixed size binary columns cannot be null in CSV reader +* [ARROW-12217](https://issues.apache.org/jira/browse/ARROW-12217) - [C++] Cleanup cpp examples source file names +* [ARROW-12222](https://issues.apache.org/jira/browse/ARROW-12222) - [Dev][Packaging] Include build URL in the crossbow console report +* [ARROW-12224](https://issues.apache.org/jira/browse/ARROW-12224) - [Rust] Use stable rust for no default test, clean up CI tests +* [ARROW-12228](https://issues.apache.org/jira/browse/ARROW-12228) - [CI] Create base image for conda environments +* [ARROW-12236](https://issues.apache.org/jira/browse/ARROW-12236) - [R][CI] Add check that all docs pages are listed in \_pkgdown.yml +* [ARROW-12237](https://issues.apache.org/jira/browse/ARROW-12237) - [Packaging][Debian] Add support for bullseye +* [ARROW-12238](https://issues.apache.org/jira/browse/ARROW-12238) - [JS] Remove trailing spaces +* [ARROW-12239](https://issues.apache.org/jira/browse/ARROW-12239) - [JS] Switch to yarn +* [ARROW-12242](https://issues.apache.org/jira/browse/ARROW-12242) - [Python][Doc] Tweak nightly build instructions +* [ARROW-12246](https://issues.apache.org/jira/browse/ARROW-12246) - [CI] Sync conda recipes with upstream feedstock +* [ARROW-12248](https://issues.apache.org/jira/browse/ARROW-12248) - [C++] Allow static builds to change memory allocators +* [ARROW-12249](https://issues.apache.org/jira/browse/ARROW-12249) - [R] [CI] Fix test-r-install-local nightlies +* [ARROW-12251](https://issues.apache.org/jira/browse/ARROW-12251) - [Rust] [Ballista] Add Ballista tests to CI +* [ARROW-12263](https://issues.apache.org/jira/browse/ARROW-12263) - [Dev][Packaging] Move Crossbow to Archery +* [ARROW-12269](https://issues.apache.org/jira/browse/ARROW-12269) - [JS] Move to eslint +* [ARROW-12274](https://issues.apache.org/jira/browse/ARROW-12274) - [JS] Document 
how to run tests without building +* [ARROW-12277](https://issues.apache.org/jira/browse/ARROW-12277) - [Rust][DataFusion] Min/Max are not supported for timestamp types +* [ARROW-12278](https://issues.apache.org/jira/browse/ARROW-12278) - [Rust][DataFusion] Use Timestamp(Nanosecond, None) for SQL TIMESTAMP Type +* [ARROW-12280](https://issues.apache.org/jira/browse/ARROW-12280) - [Developer] Remove @-mentions from commit messages in merge tool +* [ARROW-12281](https://issues.apache.org/jira/browse/ARROW-12281) - [JS] Remove shx, trash, and rimraf +* [ARROW-12283](https://issues.apache.org/jira/browse/ARROW-12283) - [R] Bindings for basic type convert functions in dplyr verbs +* [ARROW-12286](https://issues.apache.org/jira/browse/ARROW-12286) - [C++] Create AsyncGenerator from Future\> +* [ARROW-12287](https://issues.apache.org/jira/browse/ARROW-12287) - [C++] Create enumerating generator +* [ARROW-12288](https://issues.apache.org/jira/browse/ARROW-12288) - [C++] Create Scanner interface +* [ARROW-12289](https://issues.apache.org/jira/browse/ARROW-12289) - [C++] Create basic AsyncScanner implementation +* [ARROW-12303](https://issues.apache.org/jira/browse/ARROW-12303) - [JS] Use iterators instead of generators in critical code paths +* [ARROW-12304](https://issues.apache.org/jira/browse/ARROW-12304) - [R] Update news and polish docs for 4.0 +* [ARROW-12305](https://issues.apache.org/jira/browse/ARROW-12305) - [JS] Benchmark test data generate.py assumes python 2 +* [ARROW-12309](https://issues.apache.org/jira/browse/ARROW-12309) - [JS] Make es2015 bundles the default +* [ARROW-12316](https://issues.apache.org/jira/browse/ARROW-12316) - [C++] Switch default memory allocator from jemalloc to mimalloc on macOS +* [ARROW-12317](https://issues.apache.org/jira/browse/ARROW-12317) - [Rust] JSON writer does not support time, date or interval types +* [ARROW-12320](https://issues.apache.org/jira/browse/ARROW-12320) - [CI] REPO arg missing from conda-cpp-valgrind +* [ARROW-12323](https://issues.apache.org/jira/browse/ARROW-12323) - [C++][Gandiva] Implement castTIME(timestamp) function +* [ARROW-12325](https://issues.apache.org/jira/browse/ARROW-12325) - [C++] [CI] Nightly gandiva build failing due to failure of compiler to move return value +* [ARROW-12326](https://issues.apache.org/jira/browse/ARROW-12326) - [C++] Avoid needless c-ares detection +* [ARROW-12328](https://issues.apache.org/jira/browse/ARROW-12328) - [Rust] [Ballista] Fix code formatting +* [ARROW-12329](https://issues.apache.org/jira/browse/ARROW-12329) - [Rust] [Ballista] Add README +* [ARROW-12332](https://issues.apache.org/jira/browse/ARROW-12332) - [Rust] [Ballista] API server for scheduler +* [ARROW-12333](https://issues.apache.org/jira/browse/ARROW-12333) - [JS] Remove jest-environment-node-debug and do not emit from typescript by default +* [ARROW-12335](https://issues.apache.org/jira/browse/ARROW-12335) - [Rust] [Ballista] Bump DataFusion version +* [ARROW-12337](https://issues.apache.org/jira/browse/ARROW-12337) - add DoubleEndedIterator and ExactSizeIterator traits +* [ARROW-12351](https://issues.apache.org/jira/browse/ARROW-12351) - [CI][Ruby] Use ruby/setup-ruby instead of actions/setup-ruby +* [ARROW-12352](https://issues.apache.org/jira/browse/ARROW-12352) - [CI][R][Windows] Remove needless workaround for MSYS2 +* [ARROW-12353](https://issues.apache.org/jira/browse/ARROW-12353) - [Packaging][deb] Rename -archive-keyring to -apt-source +* [ARROW-12354](https://issues.apache.org/jira/browse/ARROW-12354) - [Packaging][RPM] 
Use apache.jfrog.io/artifactory/ instead of apache.bintray.com/ +* [ARROW-12356](https://issues.apache.org/jira/browse/ARROW-12356) - [Website] Update install page instructions to point to artifactory +* [ARROW-12361](https://issues.apache.org/jira/browse/ARROW-12361) - [Rust] [DataFusion] Allow users to override physical optimization rules +* [ARROW-12367](https://issues.apache.org/jira/browse/ARROW-12367) - [C++] Stop producing when PushGenerator was destroyed +* [ARROW-12370](https://issues.apache.org/jira/browse/ARROW-12370) - [R] Bindings for power kernel +* [ARROW-12374](https://issues.apache.org/jira/browse/ARROW-12374) - [CI][C++][cron] Use Ubuntu 20.04 instead of 16.04 +* [ARROW-12375](https://issues.apache.org/jira/browse/ARROW-12375) - [Release] Remove rebase post-release scripts +* [ARROW-12376](https://issues.apache.org/jira/browse/ARROW-12376) - [Dev] archery trigger-bot should use logger.exception +* [ARROW-12380](https://issues.apache.org/jira/browse/ARROW-12380) - [Rust][Ballista] Add scheduler ui +* [ARROW-12381](https://issues.apache.org/jira/browse/ARROW-12381) - [Packaging][Python] macOS wheels are built with wrong package kind +* [ARROW-12383](https://issues.apache.org/jira/browse/ARROW-12383) - [JS] Update direct deps +* [ARROW-12384](https://issues.apache.org/jira/browse/ARROW-12384) - [JS] Improve code style +* [ARROW-12389](https://issues.apache.org/jira/browse/ARROW-12389) - [R] [Docs] Add note about autocasting +* [ARROW-12395](https://issues.apache.org/jira/browse/ARROW-12395) - [C++]: Create RunInSerialExecutor benchmark +* [ARROW-12396](https://issues.apache.org/jira/browse/ARROW-12396) - [Python][Docs] Clarify serialization docstrings about deprecated status +* [ARROW-12397](https://issues.apache.org/jira/browse/ARROW-12397) - [Rust] [DataFusion] Simplify readme example \#10038 +* [ARROW-12398](https://issues.apache.org/jira/browse/ARROW-12398) - [Rust] Remove double bound checks in iterators +* [ARROW-12400](https://issues.apache.org/jira/browse/ARROW-12400) - [Rust] Re-enable transform module tests +* [ARROW-12402](https://issues.apache.org/jira/browse/ARROW-12402) - [Rust] [DataFusion] Implement SQL metrics framework +* [ARROW-12406](https://issues.apache.org/jira/browse/ARROW-12406) - [R] fix checkbashisms violation in configure +* [ARROW-12409](https://issues.apache.org/jira/browse/ARROW-12409) - [R] Remove LazyData from DESCRIPTION +* [ARROW-12419](https://issues.apache.org/jira/browse/ARROW-12419) - [Java] flatc is not used in mvn +* [ARROW-12420](https://issues.apache.org/jira/browse/ARROW-12420) - [C++/Dataset] Reading null columns as dictionary no longer possible +* [ARROW-12423](https://issues.apache.org/jira/browse/ARROW-12423) - [Docs] Codecov badge in main Readme only applies to Rust +* [ARROW-12425](https://issues.apache.org/jira/browse/ARROW-12425) - [Rust] new\_null\_array doesn't allocate keys buffer for dictionary arrays +* [ARROW-12432](https://issues.apache.org/jira/browse/ARROW-12432) - [Rust] [DataFusion] Add metrics for SortExec +* [ARROW-12436](https://issues.apache.org/jira/browse/ARROW-12436) - [Rust][Ballista] Add watch capabilities to config backend trait +* [ARROW-12467](https://issues.apache.org/jira/browse/ARROW-12467) - [C++][Gandiva] Add support for LLVM12 +* [ARROW-12477](https://issues.apache.org/jira/browse/ARROW-12477) - [Release] Download linux aarch64 miniforge in verify-release-candidate.sh +* [ARROW-12485](https://issues.apache.org/jira/browse/ARROW-12485) - [C++] Use mimalloc as the default memory allocator on macOS 
+* [ARROW-12488](https://issues.apache.org/jira/browse/ARROW-12488) - [GLib] Use g\_memdup2() with GLib 2.68 or later +* [ARROW-12494](https://issues.apache.org/jira/browse/ARROW-12494) - [C++] ORC adapter fails to compile on GCC 4.8 +* [ARROW-12506](https://issues.apache.org/jira/browse/ARROW-12506) - [Python] Improve modularity of pyarrow codebase to speedup compile time +* [ARROW-12652](https://issues.apache.org/jira/browse/ARROW-12652) - disable conda arm64 in nightly +* [PARQUET-1846](https://issues.apache.org/jira/browse/PARQUET-1846) - [C++] Remove deprecated IO classes and related functions +* [PARQUET-1899](https://issues.apache.org/jira/browse/PARQUET-1899) - [C++] Deprecated ReadBatchSpaced in parquet/column\_reader +* [PARQUET-1990](https://issues.apache.org/jira/browse/PARQUET-1990) - [C++] ConvertedType::NA is written out in some cases +* [PARQUET-1993](https://issues.apache.org/jira/browse/PARQUET-1993) - [C++] Expose when prefetching completes + + + +# Apache Arrow 3.0.0 (2021-01-25) ## New Features and Improvements * [ARROW-1846](https://issues.apache.org/jira/browse/ARROW-1846) - [C++] Implement "any" reduction kernel for boolean data -* [ARROW-3850](https://issues.apache.org/jira/browse/ARROW-3850) - [Python] Support MapType and StructType for enhanced PySpark integration * [ARROW-4193](https://issues.apache.org/jira/browse/ARROW-4193) - [Rust] Add support for decimal data type * [ARROW-4544](https://issues.apache.org/jira/browse/ARROW-4544) - [Rust] Read nested JSON structs into StructArrays * [ARROW-4804](https://issues.apache.org/jira/browse/ARROW-4804) - [Rust] Read temporal values from CSV - Parse Date32 and Date64 in CSV reader @@ -39,6 +2017,7 @@ * [ARROW-9296](https://issues.apache.org/jira/browse/ARROW-9296) - [CI][Rust] Enable more clippy lint checks * [ARROW-9304](https://issues.apache.org/jira/browse/ARROW-9304) - [C++] Add "AppendEmptyValue" builder APIs for use inside StructBuilder::AppendNull * [ARROW-9361](https://issues.apache.org/jira/browse/ARROW-9361) - [Rust] Move other array types into their own modules +* [ARROW-9367](https://issues.apache.org/jira/browse/ARROW-9367) - [Python] Sorting on pyarrow data structures ? 
* [ARROW-9400](https://issues.apache.org/jira/browse/ARROW-9400) - [Python] Do not depend on conda-forge static libraries in Windows wheel builds * [ARROW-9475](https://issues.apache.org/jira/browse/ARROW-9475) - [Java] Clean up usages of BaseAllocator, use BufferAllocator instead * [ARROW-9489](https://issues.apache.org/jira/browse/ARROW-9489) - [C++] Add fill\_null kernel implementation for (array[string], scalar[string]) @@ -326,9 +2305,9 @@ * [ARROW-10908](https://issues.apache.org/jira/browse/ARROW-10908) - [Rust] [DataFusion] Update relevant tpch-queries with BETWEEN * [ARROW-10917](https://issues.apache.org/jira/browse/ARROW-10917) - [Rust][Doc] Update feature matrix * [ARROW-10918](https://issues.apache.org/jira/browse/ARROW-10918) - [C++][Doc] Document supported Parquet features -* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary -* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary -* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary +* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - [Rust] [Parquet] Add Decimal to ArrayBuilderReader for physical type fixed size binary +* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - [Rust] [Parquet] Add Decimal to ArrayBuilderReader for physical type fixed size binary +* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - [Rust] [Parquet] Add Decimal to ArrayBuilderReader for physical type fixed size binary * [ARROW-10929](https://issues.apache.org/jira/browse/ARROW-10929) - [Rust] Migrate CI tests to stable rust * [ARROW-10933](https://issues.apache.org/jira/browse/ARROW-10933) - [Rust] Update docs in regard to stable rust * [ARROW-10934](https://issues.apache.org/jira/browse/ARROW-10934) - [Python] Tests are failed with fsspec-0.8.5 @@ -458,6 +2437,7 @@ * [ARROW-11292](https://issues.apache.org/jira/browse/ARROW-11292) - [Release][JS] Use Node.JS LTS * [ARROW-11293](https://issues.apache.org/jira/browse/ARROW-11293) - [C++] Don't require Boost and gflags with find\_package(Arrow) * [ARROW-11307](https://issues.apache.org/jira/browse/ARROW-11307) - [Release][Ubuntu][20.10] Add workaround for dependency issue +* [ARROW-11454](https://issues.apache.org/jira/browse/ARROW-11454) - [Website] [Rust] 3.0.0 Blog Post * [PARQUET-1566](https://issues.apache.org/jira/browse/PARQUET-1566) - [C++] Indicate if null count, distinct count are present in column statistics @@ -471,6 +2451,7 @@ * [ARROW-9027](https://issues.apache.org/jira/browse/ARROW-9027) - [Python] Split in multiple files + clean-up pyarrow.parquet tests * [ARROW-9479](https://issues.apache.org/jira/browse/ARROW-9479) - [JS] Table.from fails for zero-item Lists, FixedSizeLists, Maps. 
ditto Table.empty * [ARROW-9636](https://issues.apache.org/jira/browse/ARROW-9636) - [Python] Update documentation about 'LZO' compression in parquet.write\_table +* [ARROW-9690](https://issues.apache.org/jira/browse/ARROW-9690) - [Go] tests failing on s390x * [ARROW-9776](https://issues.apache.org/jira/browse/ARROW-9776) - [R] read\_feather causes segfault in R if file doesn't exist * [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern * [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern @@ -492,6 +2473,7 @@ * [ARROW-10283](https://issues.apache.org/jira/browse/ARROW-10283) - [Python] Python deprecation warning for "PY\_SSIZE\_T\_CLEAN will be required for '\#' formats" * [ARROW-10293](https://issues.apache.org/jira/browse/ARROW-10293) - [Rust] [DataFusion] Fix benchmarks * [ARROW-10294](https://issues.apache.org/jira/browse/ARROW-10294) - [Java] Resolve problems of DecimalVector APIs on ArrowBufs +* [ARROW-10298](https://issues.apache.org/jira/browse/ARROW-10298) - [Rust] Incorrect offset handling in iterator over dictionary keys * [ARROW-10321](https://issues.apache.org/jira/browse/ARROW-10321) - [C++] Building AVX512 code when we should not * [ARROW-10333](https://issues.apache.org/jira/browse/ARROW-10333) - [Java] Remove split packages in arrow-memory-core and arrow-vectors * [ARROW-10345](https://issues.apache.org/jira/browse/ARROW-10345) - [C++] NaN breaks sorting @@ -521,7 +2503,6 @@ * [ARROW-10446](https://issues.apache.org/jira/browse/ARROW-10446) - [C++][Python] Timezone aware pd.Timestamp's are incorrectly converted to Timestamp arrys * [ARROW-10448](https://issues.apache.org/jira/browse/ARROW-10448) - [Rust] PrimitiveArray::new can create arrays not in spec * [ARROW-10453](https://issues.apache.org/jira/browse/ARROW-10453) - [Rust] [DataFusion] Performance degredation after removing specialization -* [ARROW-10457](https://issues.apache.org/jira/browse/ARROW-10457) - [CI] Fix Spark branch-3.0 integration tests * [ARROW-10461](https://issues.apache.org/jira/browse/ARROW-10461) - [Rust] Offset related bug in BitChunks::remainder\_bits * [ARROW-10462](https://issues.apache.org/jira/browse/ARROW-10462) - [Python] ParquetDatasetPiece's path broken when using fsspec fs on Windows * [ARROW-10463](https://issues.apache.org/jira/browse/ARROW-10463) - [R] Better messaging for currently unsupported CSV options in open\_dataset @@ -531,7 +2512,6 @@ * [ARROW-10475](https://issues.apache.org/jira/browse/ARROW-10475) - [С++][FlightRPC] Arrow Flight Server / Client cannot be initialized with Ipv6 host * [ARROW-10480](https://issues.apache.org/jira/browse/ARROW-10480) - [Python] Parquet write\_table creates gzipped Parquet file, not Parquet with gzip compression * [ARROW-10482](https://issues.apache.org/jira/browse/ARROW-10482) - [Python] Specifying compression type on a column basis when writing Parquet not working -* [ARROW-10489](https://issues.apache.org/jira/browse/ARROW-10489) - [C++] Unable to configure or make with intel compiler * [ARROW-10491](https://issues.apache.org/jira/browse/ARROW-10491) - [FlightRPC][Java] Fix NPE when using FlightProducer without interceptors * [ARROW-10493](https://issues.apache.org/jira/browse/ARROW-10493) - [C++][Parquet] Writing nullable nested strings results in wrong data in file * [ARROW-10495](https://issues.apache.org/jira/browse/ARROW-10495) - [C++] find\_package(Arrow) is broken on Ubuntu 18 @@ -582,6 +2562,7 @@ * 
[ARROW-10684](https://issues.apache.org/jira/browse/ARROW-10684) - [Rust] Logical equality should consider parent array nullability * [ARROW-10690](https://issues.apache.org/jira/browse/ARROW-10690) - [Java] ComplexCopier gives incorrect result for list vector if target vector is non-empty * [ARROW-10692](https://issues.apache.org/jira/browse/ARROW-10692) - [Rust] Segfault while array buffer append +* [ARROW-10694](https://issues.apache.org/jira/browse/ARROW-10694) - [Python] ds.write\_dataset() generates empty files for each final partition * [ARROW-10699](https://issues.apache.org/jira/browse/ARROW-10699) - [C++] BitmapUInt64Reader doesn't work on big-endian * [ARROW-10701](https://issues.apache.org/jira/browse/ARROW-10701) - [Rust] [Datafusion] Benchmark sort\_limit\_query\_sql fails because order by clause specifies column index instead of expression * [ARROW-10705](https://issues.apache.org/jira/browse/ARROW-10705) - [Rust] Lifetime annotations in the IPC writer are too strict, preventing code reuse @@ -682,9 +2663,11 @@ * [ARROW-11232](https://issues.apache.org/jira/browse/ARROW-11232) - [C++] Table::CombineChunks() returns incorrect results if Table has no column * [ARROW-11233](https://issues.apache.org/jira/browse/ARROW-11233) - [C++][Flight] Fail to link with bundled gRPC and Abseil * [ARROW-11237](https://issues.apache.org/jira/browse/ARROW-11237) - [C++] Compiler error with GLog and unity build enabled +* [ARROW-11250](https://issues.apache.org/jira/browse/ARROW-11250) - [Python] Inconsistent behavior calling ds.dataset() * [ARROW-11251](https://issues.apache.org/jira/browse/ARROW-11251) - [CI] Make sure that devtoolset-8 is really installed + being used * [ARROW-11253](https://issues.apache.org/jira/browse/ARROW-11253) - [R] Make sure that large metadata tests are reproducible * [ARROW-11255](https://issues.apache.org/jira/browse/ARROW-11255) - [Packaging][Conda][macOS] Fix Python version +* [ARROW-11257](https://issues.apache.org/jira/browse/ARROW-11257) - [C++][Parquet] PyArrow Table contains different data after writing and reloading from Parquet * [ARROW-11271](https://issues.apache.org/jira/browse/ARROW-11271) - [Rust] [Parquet] List schema to Arrow parser misinterpreting child nullability * [ARROW-11274](https://issues.apache.org/jira/browse/ARROW-11274) - [Packaging][wheel][Windows] Fix wheels path for Gemfury * [ARROW-11275](https://issues.apache.org/jira/browse/ARROW-11275) - [Packaging][wheel][Linux] Fix paths for Gemfury @@ -694,11 +2677,18 @@ * [ARROW-11301](https://issues.apache.org/jira/browse/ARROW-11301) - [C++] Fix reading LZ4-compressed Parquet files produced by Java Parquet implementation * [ARROW-11302](https://issues.apache.org/jira/browse/ARROW-11302) - [Release][Python] Remove verification of python 3.5 wheel on macOS * [ARROW-11306](https://issues.apache.org/jira/browse/ARROW-11306) - [Packaging][Ubuntu][16.04] Add missing libprotobuf-dev dependency +* [ARROW-11363](https://issues.apache.org/jira/browse/ARROW-11363) - C++ Library Build Failure with gRPC 1.34+ +* [ARROW-11390](https://issues.apache.org/jira/browse/ARROW-11390) - [Python] pyarrow 3.0 issues with turbodbc +* [ARROW-11445](https://issues.apache.org/jira/browse/ARROW-11445) - Type conversion failure on numpy 0.1.20 +* [ARROW-11450](https://issues.apache.org/jira/browse/ARROW-11450) - [Python] pyarrow<3 incompatible with numpy\>=1.20.0 +* [ARROW-11487](https://issues.apache.org/jira/browse/ARROW-11487) - [Python] Can't create array from Categorical with numpy 1.20 +* 
[ARROW-11835](https://issues.apache.org/jira/browse/ARROW-11835) - [Python] PyArrow 3.0/Pip installation errors on Big Sur. +* [ARROW-12399](https://issues.apache.org/jira/browse/ARROW-12399) - Unable to load libhdfs * [PARQUET-1935](https://issues.apache.org/jira/browse/PARQUET-1935) - [C++][Parquet] nullptr access violation when writing arrays of non-nullable values -# Apache Arrow 2.0.0 (2020-10-13) +# Apache Arrow 2.0.0 (2020-10-19) ## Bug Fixes @@ -773,6 +2763,7 @@ * [ARROW-9660](https://issues.apache.org/jira/browse/ARROW-9660) - [C++] IPC - dictionaries in maps * [ARROW-9666](https://issues.apache.org/jira/browse/ARROW-9666) - [Python][wheel][Windows] library missing failure by ARROW-9412 * [ARROW-9670](https://issues.apache.org/jira/browse/ARROW-9670) - [C++][FlightRPC] Close()ing a DoPut with an ongoing read locks up the client +* [ARROW-9676](https://issues.apache.org/jira/browse/ARROW-9676) - [R] Error converting Table with nested structs * [ARROW-9684](https://issues.apache.org/jira/browse/ARROW-9684) - [C++] Fix undefined behaviour on invalid IPC / Parquet input (OSS-Fuzz) * [ARROW-9692](https://issues.apache.org/jira/browse/ARROW-9692) - [Python] distutils import warning * [ARROW-9693](https://issues.apache.org/jira/browse/ARROW-9693) - [CI][Docs] Nightly docs build fails @@ -803,6 +2794,7 @@ * [ARROW-9797](https://issues.apache.org/jira/browse/ARROW-9797) - [Rust] AMD64 Conda Integration Tests is failing for the Master branch * [ARROW-9799](https://issues.apache.org/jira/browse/ARROW-9799) - [Rust] [DataFusion] Implementation of physical binary expression get\_type method is incorrect * [ARROW-9800](https://issues.apache.org/jira/browse/ARROW-9800) - [Rust] [Parquet] "min" and "max" written to standard out when writing columns +* [ARROW-9801](https://issues.apache.org/jira/browse/ARROW-9801) - DictionaryArray with non-unique values are silently corrupted when written to a Parquet file * [ARROW-9809](https://issues.apache.org/jira/browse/ARROW-9809) - [Rust] [DataFusion] logical schema = physical schema is not true * [ARROW-9814](https://issues.apache.org/jira/browse/ARROW-9814) - [Python] Crash in test\_parquet.py::test\_read\_partitioned\_directory\_s3fs * [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs @@ -827,7 +2819,6 @@ * [ARROW-9883](https://issues.apache.org/jira/browse/ARROW-9883) - [R] Fix linuxlibs.R install script for R < 3.6 * [ARROW-9888](https://issues.apache.org/jira/browse/ARROW-9888) - [Rust] [DataFusion] ExecutionContext can not be shared between threads * [ARROW-9889](https://issues.apache.org/jira/browse/ARROW-9889) - [Rust][DataFusion] Datafusion CLI: CREATE EXTERNAL TABLE errors with "Unsupported logical plan variant" -* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern * [ARROW-9906](https://issues.apache.org/jira/browse/ARROW-9906) - [Python] Crash in test\_parquet.py::test\_parquet\_writer\_filesystem\_s3\_uri (closing NativeFile from S3FileSystem) * [ARROW-9913](https://issues.apache.org/jira/browse/ARROW-9913) - [C++] Outputs of Decimal128::FromString depend on presence of one another * [ARROW-9920](https://issues.apache.org/jira/browse/ARROW-9920) - [Python] pyarrow.concat\_arrays segfaults when passing it a chunked array @@ -915,18 +2906,22 @@ * [ARROW-10286](https://issues.apache.org/jira/browse/ARROW-10286) - [C++][Flight] Misleading CMake errors * 
[ARROW-10288](https://issues.apache.org/jira/browse/ARROW-10288) - [C++] Compilation fails on i386 * [ARROW-10290](https://issues.apache.org/jira/browse/ARROW-10290) - [C++] List POP\_BACK is not available in older CMake versions +* [ARROW-10296](https://issues.apache.org/jira/browse/ARROW-10296) - [R] Data saved as integer64 loaded as integer +* [ARROW-10517](https://issues.apache.org/jira/browse/ARROW-10517) - [Python] Unable to read/write Parquet datasets with fsspec on Azure Blob +* [ARROW-11062](https://issues.apache.org/jira/browse/ARROW-11062) - [Java] When writing to flight stream, Spark's mapPartitions is not working ## New Features and Improvements * [ARROW-983](https://issues.apache.org/jira/browse/ARROW-983) - [C++] Implement InputStream and OutputStream classes for interacting with socket connections -* [ARROW-1105](https://issues.apache.org/jira/browse/ARROW-1105) - [C++] SQLite record batch reader * [ARROW-1509](https://issues.apache.org/jira/browse/ARROW-1509) - [Python] Write serialized object as a stream of encapsulated IPC messages +* [ARROW-1644](https://issues.apache.org/jira/browse/ARROW-1644) - [C++][Parquet] Read and write nested Parquet data with a mix of struct and list nesting levels * [ARROW-1669](https://issues.apache.org/jira/browse/ARROW-1669) - [C++] Consider adding Abseil (Google C++11 standard library extensions) to toolchain * [ARROW-1797](https://issues.apache.org/jira/browse/ARROW-1797) - [C++] Implement binary arithmetic kernels for numeric arrays * [ARROW-2164](https://issues.apache.org/jira/browse/ARROW-2164) - [C++] Clean up unnecessary decimal module refs * [ARROW-3080](https://issues.apache.org/jira/browse/ARROW-3080) - [Python] Unify Arrow to Python object conversion paths * [ARROW-3757](https://issues.apache.org/jira/browse/ARROW-3757) - [R] R bindings for Flight RPC client +* [ARROW-3850](https://issues.apache.org/jira/browse/ARROW-3850) - [Python] Support MapType and StructType for enhanced PySpark integration * [ARROW-3872](https://issues.apache.org/jira/browse/ARROW-3872) - [R] Add ad hoc test of feather compatibility * [ARROW-4046](https://issues.apache.org/jira/browse/ARROW-4046) - [Python/CI] Exercise large memory tests * [ARROW-4248](https://issues.apache.org/jira/browse/ARROW-4248) - [C++][Plasma] Build on Windows / Visual Studio @@ -957,14 +2952,11 @@ * [ARROW-8205](https://issues.apache.org/jira/browse/ARROW-8205) - [Rust] [DataFusion] DataFusion should enforce unique field names in a schema * [ARROW-8253](https://issues.apache.org/jira/browse/ARROW-8253) - [Rust] [DataFusion] Improve ergonomics of registering UDFs * [ARROW-8262](https://issues.apache.org/jira/browse/ARROW-8262) - [Rust] [DataFusion] Add example that uses LogicalPlanBuilder -* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer * [ARROW-8296](https://issues.apache.org/jira/browse/ARROW-8296) - [C++][Dataset] IpcFileFormat should support writing files with compressed buffers * [ARROW-8355](https://issues.apache.org/jira/browse/ARROW-8355) - [Python] Reduce the number of pandas dependent test cases in test\_feather * [ARROW-8359](https://issues.apache.org/jira/browse/ARROW-8359) - [C++/Python] Enable aarch64/ppc64le build in conda recipes * [ARROW-8383](https://issues.apache.org/jira/browse/ARROW-8383) - [Rust] Easier random access to DictionaryArray keys and values * [ARROW-8402](https://issues.apache.org/jira/browse/ARROW-8402) - [Java] Support ValidateFull methods 
in Java -* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata when writing parquet -* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types * [ARROW-8493](https://issues.apache.org/jira/browse/ARROW-8493) - [C++] Create unified schema resolution code for Array reconstruction. * [ARROW-8494](https://issues.apache.org/jira/browse/ARROW-8494) - [C++] Implement basic array-by-array reassembly logic * [ARROW-8581](https://issues.apache.org/jira/browse/ARROW-8581) - [C\#] Date32/64Array.Builder should accept DateTime, not DateTimeOffset @@ -1043,7 +3035,7 @@ * [ARROW-9640](https://issues.apache.org/jira/browse/ARROW-9640) - [C++][Gandiva] Implement round() for integers and long integers * [ARROW-9641](https://issues.apache.org/jira/browse/ARROW-9641) - [C++][Gandiva] Implement round() for floating point and double floating point numbers * [ARROW-9645](https://issues.apache.org/jira/browse/ARROW-9645) - [Python] Deprecate the legacy pyarrow.filesystem interface -* [ARROW-9646](https://issues.apache.org/jira/browse/ARROW-9646) - [C++][Dataset] Add support for writing parquet datasets +* [ARROW-9646](https://issues.apache.org/jira/browse/ARROW-9646) - [C++][Dataset] Add support for writing parquet datasets * [ARROW-9650](https://issues.apache.org/jira/browse/ARROW-9650) - [Packaging][APT] Drop support for Ubuntu 19.10 * [ARROW-9654](https://issues.apache.org/jira/browse/ARROW-9654) - [Rust][DataFusion] Add an EXPLAIN command to the datafusion CLI * [ARROW-9656](https://issues.apache.org/jira/browse/ARROW-9656) - [Rust][DataFusion] Slightly confusing error message when unsupported type is provided to CREATE EXTERNAL TABLE @@ -1224,7 +3216,6 @@ * [ARROW-10090](https://issues.apache.org/jira/browse/ARROW-10090) - [C++][Compute] Improve mode kernel * [ARROW-10092](https://issues.apache.org/jira/browse/ARROW-10092) - [Dev][Go] Add grpc generated go files to rat exclusion list * [ARROW-10093](https://issues.apache.org/jira/browse/ARROW-10093) - [R] Add ability to opt-out of int64 -\> int demotion -* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes * [ARROW-10096](https://issues.apache.org/jira/browse/ARROW-10096) - [Rust] [DataFusion] Remove unused code * [ARROW-10099](https://issues.apache.org/jira/browse/ARROW-10099) - [C++][Dataset] Also allow integer partition fields to be dictionary encoded * [ARROW-10100](https://issues.apache.org/jira/browse/ARROW-10100) - [C++][Dataset] Ability to read/subset a ParquetFileFragment with given set of row group ids @@ -1246,10 +3237,8 @@ * [ARROW-10162](https://issues.apache.org/jira/browse/ARROW-10162) - [Rust] Support display of DictionaryArrays in pretty printing * [ARROW-10164](https://issues.apache.org/jira/browse/ARROW-10164) - [Rust] Add support for DictionaryArray types to cast kernels * [ARROW-10167](https://issues.apache.org/jira/browse/ARROW-10167) - [Rust] Support display of DictionaryArrays in sql.rs -* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields * [ARROW-10171](https://issues.apache.org/jira/browse/ARROW-10171) - [Rust] [DataFusion] Add \`ExecutionContext::from\` * [ARROW-10190](https://issues.apache.org/jira/browse/ARROW-10190) - [Website] Add Jorge to list of committers -* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip 
tests for single column batches * [ARROW-10196](https://issues.apache.org/jira/browse/ARROW-10196) - [C++] Add Future::DeferNotOk() * [ARROW-10199](https://issues.apache.org/jira/browse/ARROW-10199) - [Rust][Parquet] Release Parquet at crates.io to remove debug prints * [ARROW-10201](https://issues.apache.org/jira/browse/ARROW-10201) - [C++][CI] Disable S3 in arm64 job on Travis CI @@ -1258,7 +3247,6 @@ * [ARROW-10206](https://issues.apache.org/jira/browse/ARROW-10206) - [Python][C++][FlightRPC] Add client option to disable server validation * [ARROW-10215](https://issues.apache.org/jira/browse/ARROW-10215) - [Rust] [DataFusion] Rename "Source" typedef * [ARROW-10217](https://issues.apache.org/jira/browse/ARROW-10217) - [CI] Run fewer GitHub Actions jobs -* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests * [ARROW-10227](https://issues.apache.org/jira/browse/ARROW-10227) - [Ruby] Use a table size as the default for parquet chunk\_size * [ARROW-10229](https://issues.apache.org/jira/browse/ARROW-10229) - [C++][Parquet] Remove left over ARROW\_LOG statement. * [ARROW-10231](https://issues.apache.org/jira/browse/ARROW-10231) - [CI] Unable to download minio in arm32v7 docker image @@ -1282,7 +3270,55 @@ -# Apache Arrow 1.0.0 (2020-07-20) +# Apache Arrow 1.0.1 (2020-08-21) + +## Bug Fixes + +* [ARROW-9535](https://issues.apache.org/jira/browse/ARROW-9535) - [Python] Remove symlink fixes from conda recipe +* [ARROW-9536](https://issues.apache.org/jira/browse/ARROW-9536) - Missing parameters in PlasmaOutOfMemoryException.java +* [ARROW-9544](https://issues.apache.org/jira/browse/ARROW-9544) - [R] version argument of write\_parquet not working +* [ARROW-9549](https://issues.apache.org/jira/browse/ARROW-9549) - [Rust] Parquet no longer builds +* [ARROW-9556](https://issues.apache.org/jira/browse/ARROW-9556) - [Python][C++] Segfaults in UnionArray with null values +* [ARROW-9560](https://issues.apache.org/jira/browse/ARROW-9560) - [Packaging] conda recipes failing due to missing conda-forge.yml +* [ARROW-9569](https://issues.apache.org/jira/browse/ARROW-9569) - [CI][R] Fix rtools35 builds for msys2 key change +* [ARROW-9570](https://issues.apache.org/jira/browse/ARROW-9570) - [Doc] Clean up sphinx sidebar +* [ARROW-9573](https://issues.apache.org/jira/browse/ARROW-9573) - [Python] Parquet doesn't load when partitioned column starts with '\_' +* [ARROW-9574](https://issues.apache.org/jira/browse/ARROW-9574) - [R] Cleanups for CRAN 1.0.0 release +* [ARROW-9575](https://issues.apache.org/jira/browse/ARROW-9575) - [R] gcc-UBSAN failure on CRAN +* [ARROW-9577](https://issues.apache.org/jira/browse/ARROW-9577) - [Python][C++] posix\_madvise error on Debian in pyarrow 1.0.0 +* [ARROW-9589](https://issues.apache.org/jira/browse/ARROW-9589) - [C++/R] arrow\_exports.h contains structs declared as class +* [ARROW-9592](https://issues.apache.org/jira/browse/ARROW-9592) - [CI] Update homebrew before calling brew bundle +* [ARROW-9596](https://issues.apache.org/jira/browse/ARROW-9596) - [CI][Crossbow] Fix homebrew-cpp again, again +* [ARROW-9598](https://issues.apache.org/jira/browse/ARROW-9598) - [C++][Parquet] Spaced definition levels is not assigned correctly. 
+* [ARROW-9599](https://issues.apache.org/jira/browse/ARROW-9599) - [CI] Appveyor toolchain build fails because CMake detects different C and C++ compilers +* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build +* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build +* [ARROW-9602](https://issues.apache.org/jira/browse/ARROW-9602) - [R] Improve cmake detection in Linux build +* [ARROW-9606](https://issues.apache.org/jira/browse/ARROW-9606) - [C++][Dataset] in expressions don't work with \>1 partition levels +* [ARROW-9609](https://issues.apache.org/jira/browse/ARROW-9609) - [C++] CSV datasets don't materialize virtual columns +* [ARROW-9621](https://issues.apache.org/jira/browse/ARROW-9621) - [Python] test\_move\_file() is failed with fsspec 0.8.0 +* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight +* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight +* [ARROW-9644](https://issues.apache.org/jira/browse/ARROW-9644) - [C++][Dataset] Do not check for ignore\_prefixes in the base path +* [ARROW-9659](https://issues.apache.org/jira/browse/ARROW-9659) - [C++] RecordBatchStreamReader throws on CUDA device buffers +* [ARROW-9684](https://issues.apache.org/jira/browse/ARROW-9684) - [C++] Fix undefined behaviour on invalid IPC / Parquet input (OSS-Fuzz) +* [ARROW-9700](https://issues.apache.org/jira/browse/ARROW-9700) - [Python] create\_library\_symlinks doesn't work in macos +* [ARROW-9712](https://issues.apache.org/jira/browse/ARROW-9712) - [Rust] [DataFusion] ParquetScanExec panics on error +* [ARROW-9743](https://issues.apache.org/jira/browse/ARROW-9743) - [R] Sanitize paths in open\_dataset +* [ARROW-10126](https://issues.apache.org/jira/browse/ARROW-10126) - [Python] Impossible to import pyarrow module in python. Generates this "ImportError: DLL load failed: The specified procedure could not be found." 
+* [ARROW-10460](https://issues.apache.org/jira/browse/ARROW-10460) - [FlightRPC][Python] FlightRPC authentication mechanism changed and is undocumented, breaking current working code + + +## New Features and Improvements + +* [ARROW-9402](https://issues.apache.org/jira/browse/ARROW-9402) - [C++] Add portable wrappers for \_\_builtin\_add\_overflow and friends +* [ARROW-9563](https://issues.apache.org/jira/browse/ARROW-9563) - [Dev][Release] Use archery's changelog generator when creating release notes for the website +* [ARROW-9715](https://issues.apache.org/jira/browse/ARROW-9715) - [R] changelog/doc updates for 1.0.1 +* [ARROW-9845](https://issues.apache.org/jira/browse/ARROW-9845) - [Rust] [Parquet] serde\_json is only used in tests but isn't in dev-dependencies + + + +# Apache Arrow 1.0.0 (2020-07-24) ## Bug Fixes @@ -1315,6 +3351,7 @@ * [ARROW-7702](https://issues.apache.org/jira/browse/ARROW-7702) - [C++][Dataset] Provide (optional) deterministic order of batches * [ARROW-7782](https://issues.apache.org/jira/browse/ARROW-7782) - [Python] Losing index information when using write\_to\_dataset with partition\_cols * [ARROW-7840](https://issues.apache.org/jira/browse/ARROW-7840) - [Java] [Integration] Java executables fail +* [ARROW-7843](https://issues.apache.org/jira/browse/ARROW-7843) - [Ruby] MSYS2 packages needed for Gandiva * [ARROW-7925](https://issues.apache.org/jira/browse/ARROW-7925) - [C++][Documentation] Instructions about running IWYU and other tasks in cpp/development.rst have gone stale * [ARROW-7939](https://issues.apache.org/jira/browse/ARROW-7939) - [Python] crashes when reading parquet file compressed with snappy * [ARROW-7967](https://issues.apache.org/jira/browse/ARROW-7967) - [CI][Crossbow] Pin macOS version in autobrew job to match CRAN @@ -1449,6 +3486,7 @@ * [ARROW-9024](https://issues.apache.org/jira/browse/ARROW-9024) - [C++/Python] Install anaconda-client in conda-clean job * [ARROW-9026](https://issues.apache.org/jira/browse/ARROW-9026) - [C++/Python] Force package removal from arrow-nightlies conda repository * [ARROW-9037](https://issues.apache.org/jira/browse/ARROW-9037) - [C++][C] unable to import array with null count == -1 (which could be exported) +* [ARROW-9040](https://issues.apache.org/jira/browse/ARROW-9040) - [Python][Parquet]"\_ParquetDatasetV2" fail to read with columns and use\_pandas\_metadata=True * [ARROW-9057](https://issues.apache.org/jira/browse/ARROW-9057) - [Rust] Projection should work on InMemoryScan without error * [ARROW-9059](https://issues.apache.org/jira/browse/ARROW-9059) - [Rust] Documentation for slicing array data has the wrong sign * [ARROW-9066](https://issues.apache.org/jira/browse/ARROW-9066) - [Python] Raise correct error in isnull() @@ -1557,7 +3595,7 @@ * [ARROW-9512](https://issues.apache.org/jira/browse/ARROW-9512) - [C++] Variadic template unpack inside lambda doesn't compile with gcc * [ARROW-9524](https://issues.apache.org/jira/browse/ARROW-9524) - [CI][Gandiva] C++ unit test arrow-ipc-read-write failing in gandiva nightly build * [ARROW-9527](https://issues.apache.org/jira/browse/ARROW-9527) - [Rust] Remove un-needed dev-dependencies -* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow +* [ARROW-10126](https://issues.apache.org/jira/browse/ARROW-10126) - [Python] Impossible to import pyarrow module in python. Generates this "ImportError: DLL load failed: The specified procedure could not be found." 
* [PARQUET-1839](https://issues.apache.org/jira/browse/PARQUET-1839) - [C++] values\_read not updated in ReadBatchSpaced * [PARQUET-1857](https://issues.apache.org/jira/browse/PARQUET-1857) - [C++][Parquet] ParquetFileReader unable to read files with more than 32767 row groups * [PARQUET-1865](https://issues.apache.org/jira/browse/PARQUET-1865) - [C++] Failure from C++17 feature used in parquet/encoding\_benchmark.cc @@ -1592,6 +3630,7 @@ * [ARROW-3509](https://issues.apache.org/jira/browse/ARROW-3509) - [C++] Inconsistent child accessor naming * [ARROW-3520](https://issues.apache.org/jira/browse/ARROW-3520) - [C++] Implement List Flatten kernel * [ARROW-3688](https://issues.apache.org/jira/browse/ARROW-3688) - [Rust] Implement PrimitiveArrayBuilder.push\_values +* [ARROW-3764](https://issues.apache.org/jira/browse/ARROW-3764) - [C++] Port Python "ParquetDataset" business logic to C++ * [ARROW-3827](https://issues.apache.org/jira/browse/ARROW-3827) - [Rust] Implement UnionArray * [ARROW-4022](https://issues.apache.org/jira/browse/ARROW-4022) - [C++] Promote Datum variant out of compute namespace * [ARROW-4221](https://issues.apache.org/jira/browse/ARROW-4221) - [Format] Add canonical flag in COO sparse index @@ -1599,6 +3638,7 @@ * [ARROW-4412](https://issues.apache.org/jira/browse/ARROW-4412) - [DOCUMENTATION] Add explicit version numbers to the arrow specification documents. * [ARROW-4427](https://issues.apache.org/jira/browse/ARROW-4427) - [Doc] Move Confluence Wiki pages to the Sphinx docs * [ARROW-4429](https://issues.apache.org/jira/browse/ARROW-4429) - [Doc] Add git rebase tips to the 'Contributing' page in the developer docs +* [ARROW-4526](https://issues.apache.org/jira/browse/ARROW-4526) - [Java] Remove Netty references from ArrowBuf and move Allocator out of vector package * [ARROW-5035](https://issues.apache.org/jira/browse/ARROW-5035) - [C\#] ArrowBuffer.Builder is broken * [ARROW-5082](https://issues.apache.org/jira/browse/ARROW-5082) - [Python][Packaging] Reduce size of macOS and manylinux1 wheels * [ARROW-5143](https://issues.apache.org/jira/browse/ARROW-5143) - [Flight] Enable integration testing of batches with dictionaries @@ -1709,6 +3749,7 @@ * [ARROW-8440](https://issues.apache.org/jira/browse/ARROW-8440) - [C++] Refine simd header files * [ARROW-8443](https://issues.apache.org/jira/browse/ARROW-8443) - [Gandiva][C++] Fix round/truncate to no-op for special cases * [ARROW-8447](https://issues.apache.org/jira/browse/ARROW-8447) - [C++][Dataset] Ensure Scanner::ToTable preserve ordering of ScanTasks +* [ARROW-8456](https://issues.apache.org/jira/browse/ARROW-8456) - [Release] Add python script to help curating JIRA * [ARROW-8467](https://issues.apache.org/jira/browse/ARROW-8467) - [C++] Test cases using ArrayFromJSON assume only a little-endian platform * [ARROW-8474](https://issues.apache.org/jira/browse/ARROW-8474) - [CI][Crossbow] Skip some nightlies we don't need to run * [ARROW-8477](https://issues.apache.org/jira/browse/ARROW-8477) - [C++] Enable reading and writing of long filenames for Windows @@ -1787,7 +3828,6 @@ * [ARROW-8648](https://issues.apache.org/jira/browse/ARROW-8648) - [Rust] Optimize Rust CI Build Times * [ARROW-8650](https://issues.apache.org/jira/browse/ARROW-8650) - [Rust] [Website] Add documentation to Arrow website * [ARROW-8651](https://issues.apache.org/jira/browse/ARROW-8651) - [Python][Dataset] Support pickling of Dataset objects -* [ARROW-8655](https://issues.apache.org/jira/browse/ARROW-8655) - [C++][Dataset][Python][R] Preserve 
partitioning information for a discovered Dataset * [ARROW-8656](https://issues.apache.org/jira/browse/ARROW-8656) - [Python] Switch to VS2017 in the windows wheel builds * [ARROW-8659](https://issues.apache.org/jira/browse/ARROW-8659) - [Rust] ListBuilder and FixedSizeListBuilder capacity * [ARROW-8660](https://issues.apache.org/jira/browse/ARROW-8660) - [C++][Gandiva] Reduce dependence on Boost @@ -1889,6 +3929,7 @@ * [ARROW-8867](https://issues.apache.org/jira/browse/ARROW-8867) - [R] Support converting POSIXlt type * [ARROW-8875](https://issues.apache.org/jira/browse/ARROW-8875) - [C++] use AWS SDK SetResponseStreamFactory to avoid a copy of bytes * [ARROW-8877](https://issues.apache.org/jira/browse/ARROW-8877) - [Rust] add CSV read option struct to simplify datafusion interface +* [ARROW-8879](https://issues.apache.org/jira/browse/ARROW-8879) - [FlightRPC][Java] FlightStream should unwrap ExecutionExceptions * [ARROW-8880](https://issues.apache.org/jira/browse/ARROW-8880) - [R][Linux] Make R Binary Install Friendlier * [ARROW-8881](https://issues.apache.org/jira/browse/ARROW-8881) - [Rust] Add large list and binary support * [ARROW-8885](https://issues.apache.org/jira/browse/ARROW-8885) - [R] Don't include everything everywhere @@ -1947,6 +3988,7 @@ * [ARROW-9004](https://issues.apache.org/jira/browse/ARROW-9004) - [C++][Gandiva] Support building with LLVM 10 * [ARROW-9005](https://issues.apache.org/jira/browse/ARROW-9005) - [Rust] [DataFusion] Support sort expression * [ARROW-9007](https://issues.apache.org/jira/browse/ARROW-9007) - [Rust] Support appending arrays by merging array data +* [ARROW-9011](https://issues.apache.org/jira/browse/ARROW-9011) - [Python][Packaging] Move the anaconda cleanup script to crossbow * [ARROW-9014](https://issues.apache.org/jira/browse/ARROW-9014) - [Packaging] Bump the minor part of the automatically generated version in crossbow * [ARROW-9015](https://issues.apache.org/jira/browse/ARROW-9015) - [Java] Make BaseAllocator package private * [ARROW-9016](https://issues.apache.org/jira/browse/ARROW-9016) - [Java] Remove direct references to Netty/Unsafe Allocators @@ -2048,7 +4090,6 @@ * [ARROW-9202](https://issues.apache.org/jira/browse/ARROW-9202) - [GLib] Add GArrowDatum * [ARROW-9203](https://issues.apache.org/jira/browse/ARROW-9203) - [Packaging][deb] Add missing gir1.2-arrow-dataset-1.0.install * [ARROW-9204](https://issues.apache.org/jira/browse/ARROW-9204) - [C++][Flight] change records\_per\_stream to int64 in flight benchmark -* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst * [ARROW-9206](https://issues.apache.org/jira/browse/ARROW-9206) - [C++][Flight] measure latency in flight benchmark * [ARROW-9207](https://issues.apache.org/jira/browse/ARROW-9207) - [Python][Dataset] Clean-up internal FileSource class * [ARROW-9210](https://issues.apache.org/jira/browse/ARROW-9210) - [C++] Use OptionalBitBlockCounter in ArrayDataInlineVisitor @@ -2145,8 +4186,8 @@ * [ARROW-9493](https://issues.apache.org/jira/browse/ARROW-9493) - [Python][Dataset] Dictionary encode string partition columns by default * [ARROW-9509](https://issues.apache.org/jira/browse/ARROW-9509) - [Release] Don't test Gandiva in the windows wheel verification script * [ARROW-9511](https://issues.apache.org/jira/browse/ARROW-9511) - [Packaging][Release] Set conda packages' build number to 0 +* [ARROW-9514](https://issues.apache.org/jira/browse/ARROW-9514) - [Python] The new Dataset API will not work with files on Azure Blob * 
[ARROW-9519](https://issues.apache.org/jira/browse/ARROW-9519) - [Rust] Improve error message when getting a field by name from schema -* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel * [ARROW-9529](https://issues.apache.org/jira/browse/ARROW-9529) - [Dev][Release] Improvements to release verification scripts * [ARROW-9531](https://issues.apache.org/jira/browse/ARROW-9531) - [Packaging][Release] Update conda forge dependency pins * [PARQUET-1820](https://issues.apache.org/jira/browse/PARQUET-1820) - [C++] Use a column filter hint to inform read prefetching in Arrow reads @@ -2179,6 +4220,7 @@ ## New Features and Improvements +* [ARROW-7731](https://issues.apache.org/jira/browse/ARROW-7731) - [C++][Parquet] Support LargeListArray * [ARROW-8501](https://issues.apache.org/jira/browse/ARROW-8501) - [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6 * [ARROW-8549](https://issues.apache.org/jira/browse/ARROW-8549) - [R] Assorted post-0.17 release cleanups * [ARROW-8699](https://issues.apache.org/jira/browse/ARROW-8699) - [R] Fix automatic r\_to\_py conversion @@ -7094,7 +9136,7 @@ * [ARROW-2305](https://issues.apache.org/jira/browse/ARROW-2305) - [Python] Cython 0.25.2 compilation failure * [ARROW-2314](https://issues.apache.org/jira/browse/ARROW-2314) - [Python] Union array slicing is defective * [ARROW-2326](https://issues.apache.org/jira/browse/ARROW-2326) - [Python] cannot import pip installed pyarrow on OS X (10.9) -* [ARROW-2328](https://issues.apache.org/jira/browse/ARROW-2328) - Writing a slice with feather ignores the offset +* [ARROW-2328](https://issues.apache.org/jira/browse/ARROW-2328) - [C++] Writing a slice with feather ignores the offset * [ARROW-2331](https://issues.apache.org/jira/browse/ARROW-2331) - [Python] Fix indexing implementations * [ARROW-2333](https://issues.apache.org/jira/browse/ARROW-2333) - [Python] boost bundling fails in setup.py * [ARROW-2342](https://issues.apache.org/jira/browse/ARROW-2342) - [Python] Aware timestamp type fails pickling @@ -7586,6 +9628,7 @@ * [ARROW-1463](https://issues.apache.org/jira/browse/ARROW-1463) - [JAVA] Restructure ValueVector hierarchy to minimize compile-time generated code * [ARROW-1579](https://issues.apache.org/jira/browse/ARROW-1579) - [Java] Add dockerized test setup to validate Spark integration * [ARROW-1580](https://issues.apache.org/jira/browse/ARROW-1580) - [Python] Instructions for setting up nightly builds on Linux +* [ARROW-1621](https://issues.apache.org/jira/browse/ARROW-1621) - [JAVA] Reduce Heap Usage per Vector * [ARROW-1623](https://issues.apache.org/jira/browse/ARROW-1623) - [C++] Add convenience method to construct Buffer from a string that owns its memory * [ARROW-1632](https://issues.apache.org/jira/browse/ARROW-1632) - [Python] Permit categorical conversions in Table.to\_pandas on a per-column basis * [ARROW-1643](https://issues.apache.org/jira/browse/ARROW-1643) - [Python] Accept hdfs:// prefixes in parquet.read\_table and attempt to connect to HDFS diff --git a/LICENSE.txt b/LICENSE.txt index 519a73f04f247..d285caa4ff2c9 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1488,42 +1488,6 @@ Other dependencies and licenses: OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Open Source Software Licensed Under the JSON License: - -------------------------------------------------------------------- - - json.org - Copyright (c) 2002 JSON.org - All Rights Reserved. 
- - JSON_checker - Copyright (c) 2002 JSON.org - All Rights Reserved. - - - Terms of the JSON License: - --------------------------------------------------- - - Permission is hereby granted, free of charge, to any person obtaining a - copy of this software and associated documentation files (the "Software"), - to deal in the Software without restriction, including without limitation - the rights to use, copy, modify, merge, publish, distribute, sublicense, - and/or sell copies of the Software, and to permit persons to whom the - Software is furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - The Software shall be used for Good, not Evil. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - DEALINGS IN THE SOFTWARE. - - Terms of the MIT License: -------------------------------------------------------------------- diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 47ff79e616fb5..888b7143fbf6f 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1287,20 +1287,29 @@ garrow_decimal_data_type_class_init(GArrowDecimalDataTypeClass *klass) * garrow_decimal_data_type_new: * @precision: The precision of decimal data. * @scale: The scale of decimal data. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: The newly created decimal data type. + * Returns: (nullable): + * The newly created decimal data type on success, %NULL on error. * - * Since: 0.10.0 + * #GArrowDecimal256DataType is used if @precision is larger than + * garrow_decimal128_data_type_max_precision(), + * #GArrowDecimal128DataType is used otherwise. * - * Deprecated: 0.12.0: - * Use garrow_decimal128_data_type_new() instead. + * Since: 0.10.0 */ GArrowDecimalDataType * garrow_decimal_data_type_new(gint32 precision, - gint32 scale) + gint32 scale, + GError **error) { - auto decimal128_data_type = garrow_decimal128_data_type_new(precision, scale); - return GARROW_DECIMAL_DATA_TYPE(decimal128_data_type); + if (precision <= garrow_decimal128_data_type_max_precision()) { + return GARROW_DECIMAL_DATA_TYPE( + garrow_decimal128_data_type_new(precision, scale, error)); + } else { + return GARROW_DECIMAL_DATA_TYPE( + garrow_decimal256_data_type_new(precision, scale, error)); + } } /** @@ -1371,22 +1380,30 @@ garrow_decimal128_data_type_max_precision() * garrow_decimal128_data_type_new: * @precision: The precision of decimal data. * @scale: The scale of decimal data. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: The newly created 128-bit decimal data type. + * Returns: (nullable): + * The newly created 128-bit decimal data type on success, %NULL on error. 
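With this change the GLib decimal-type constructors validate precision and report failures through a GError instead of deferring the problem to Arrow C++, and garrow_decimal_data_type_new() picks the storage width from the requested precision. A minimal caller sketch against the new 7.0.0 signatures (the wrapper function and the diagnostic text are illustrative, not part of the patch):

```c
#include <arrow-glib/arrow-glib.h>

/* Sketch: create a decimal type; the precision decides the storage width. */
static GArrowDecimalDataType *
create_decimal_type(gint32 precision, gint32 scale)
{
  GError *error = NULL;
  /* Up to garrow_decimal128_data_type_max_precision() (38 digits) this
     returns a GArrowDecimal128DataType; above that, a
     GArrowDecimal256DataType. An invalid precision yields NULL + error. */
  GArrowDecimalDataType *data_type =
    garrow_decimal_data_type_new(precision, scale, &error);
  if (!data_type) {
    g_printerr("decimal type creation failed: %s\n", error->message);
    g_clear_error(&error);
  }
  return data_type;
}
```

Because the return type is the common GArrowDecimalDataType, callers that need to know the concrete width can test the result with GARROW_IS_DECIMAL256_DATA_TYPE().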
* * Since: 0.12.0 */ GArrowDecimal128DataType * garrow_decimal128_data_type_new(gint32 precision, - gint32 scale) -{ - auto arrow_data_type = arrow::decimal128(precision, scale); - - auto data_type = - GARROW_DECIMAL128_DATA_TYPE(g_object_new(GARROW_TYPE_DECIMAL128_DATA_TYPE, - "data-type", &arrow_data_type, - NULL)); - return data_type; + gint32 scale, + GError **error) +{ + auto arrow_data_type_result = arrow::Decimal128Type::Make(precision, scale); + if (garrow::check(error, + arrow_data_type_result, + "[decimal128-data-type][new]")) { + auto arrow_data_type = *arrow_data_type_result; + return GARROW_DECIMAL128_DATA_TYPE( + g_object_new(GARROW_TYPE_DECIMAL128_DATA_TYPE, + "data-type", &arrow_data_type, + NULL)); + } else { + return NULL; + } } @@ -1421,22 +1438,30 @@ garrow_decimal256_data_type_max_precision() * garrow_decimal256_data_type_new: * @precision: The precision of decimal data. * @scale: The scale of decimal data. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: The newly created 256-bit decimal data type. + * Returns: (nullable): + * The newly created 256-bit decimal data type on success, %NULL on error. * * Since: 3.0.0 */ GArrowDecimal256DataType * garrow_decimal256_data_type_new(gint32 precision, - gint32 scale) -{ - auto arrow_data_type = arrow::decimal256(precision, scale); - - auto data_type = - GARROW_DECIMAL256_DATA_TYPE(g_object_new(GARROW_TYPE_DECIMAL256_DATA_TYPE, - "data-type", &arrow_data_type, - NULL)); - return data_type; + gint32 scale, + GError **error) +{ + auto arrow_data_type_result = arrow::Decimal256Type::Make(precision, scale); + if (garrow::check(error, + arrow_data_type_result, + "[decimal256-data-type][new]")) { + auto arrow_data_type = *arrow_data_type_result; + return GARROW_DECIMAL256_DATA_TYPE( + g_object_new(GARROW_TYPE_DECIMAL256_DATA_TYPE, + "data-type", &arrow_data_type, + NULL)); + } else { + return NULL; + } } diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index f56a8b2d990ae..b498583e26529 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -456,11 +456,8 @@ struct _GArrowDecimalDataTypeClass GArrowFixedSizeBinaryDataTypeClass parent_class; }; -#ifndef GARROW_DISABLE_DEPRECATED -GARROW_DEPRECATED_IN_0_12_FOR(garrow_decimal128_data_type_new) GArrowDecimalDataType * -garrow_decimal_data_type_new(gint32 precision, gint32 scale); -#endif +garrow_decimal_data_type_new(gint32 precision, gint32 scale, GError **error); gint32 garrow_decimal_data_type_get_precision(GArrowDecimalDataType *decimal_data_type); gint32 garrow_decimal_data_type_get_scale(GArrowDecimalDataType *decimal_data_type); @@ -482,7 +479,7 @@ garrow_decimal128_data_type_max_precision(); GARROW_AVAILABLE_IN_0_12 GArrowDecimal128DataType * -garrow_decimal128_data_type_new(gint32 precision, gint32 scale); +garrow_decimal128_data_type_new(gint32 precision, gint32 scale, GError **error); #define GARROW_TYPE_DECIMAL256_DATA_TYPE (garrow_decimal256_data_type_get_type()) @@ -502,7 +499,7 @@ garrow_decimal256_data_type_max_precision(); GARROW_AVAILABLE_IN_3_0 GArrowDecimal256DataType * -garrow_decimal256_data_type_new(gint32 precision, gint32 scale); +garrow_decimal256_data_type_new(gint32 precision, gint32 scale, GError **error); #define GARROW_TYPE_EXTENSION_DATA_TYPE (garrow_extension_data_type_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowExtensionDataType, diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index cdfc96a5a4506..e98a3e7262712 100644 --- 
a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -106,7 +106,7 @@ namespace { garrow_sort_key_equal_raw(const arrow::compute::SortKey &sort_key, const arrow::compute::SortKey &other_sort_key) { return - (sort_key.name == other_sort_key.name) && + (sort_key.target == other_sort_key.target) && (sort_key.order == other_sort_key.order); } @@ -2299,7 +2299,7 @@ typedef struct GArrowSortKeyPrivate_ { } GArrowSortKeyPrivate; enum { - PROP_SORT_KEY_NAME = 1, + PROP_SORT_KEY_TARGET = 1, PROP_SORT_KEY_ORDER, }; @@ -2329,9 +2329,6 @@ garrow_sort_key_set_property(GObject *object, auto priv = GARROW_SORT_KEY_GET_PRIVATE(object); switch (prop_id) { - case PROP_SORT_KEY_NAME: - priv->sort_key.name = g_value_get_string(value); - break; case PROP_SORT_KEY_ORDER: priv->sort_key.order = static_cast<arrow::compute::SortOrder>(g_value_get_enum(value)); @@ -2351,8 +2348,15 @@ garrow_sort_key_get_property(GObject *object, auto priv = GARROW_SORT_KEY_GET_PRIVATE(object); switch (prop_id) { - case PROP_SORT_KEY_NAME: - g_value_set_string(value, priv->sort_key.name.c_str()); + case PROP_SORT_KEY_TARGET: + { + auto name = priv->sort_key.target.name(); + if (name) { + g_value_set_string(value, name->c_str()); + } else { + g_value_set_string(value, priv->sort_key.target.ToDotPath().c_str()); + } + } break; case PROP_SORT_KEY_ORDER: g_value_set_enum(value, static_cast<GArrowSortOrder>(priv->sort_key.order)); @@ -2381,18 +2385,22 @@ garrow_sort_key_class_init(GArrowSortKeyClass *klass) GParamSpec *spec; /** - * GArrowSortKey:name: + * GArrowSortKey:target: * - * The column name to be used. + * A name or dot path for the sort target. * - * Since: 3.0.0 + * dot_path = '.' name + * | '[' digit+ ']' + * | dot_path+ + * + * Since: 7.0.0 */ - spec = g_param_spec_string("name", - "Name", - "The column name to be used", + spec = g_param_spec_string("target", + "Target", + "The sort target", NULL, - static_cast<GParamFlags>(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_SORT_KEY_NAME, spec); + static_cast<GParamFlags>(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_SORT_KEY_TARGET, spec); /** * GArrowSortKey:order: @@ -2406,13 +2414,14 @@ garrow_sort_key_class_init(GArrowSortKeyClass *klass) "How to order values", GARROW_TYPE_SORT_ORDER, 0, - static_cast<GParamFlags>(G_PARAM_READWRITE)); + static_cast<GParamFlags>(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_SORT_KEY_ORDER, spec); } /** * garrow_sort_key_new: - * @name: A column name to be used. + * @target: A name or dot path for the sort target. * @order: How to order by this sort key. * * Returns: A newly created #GArrowSortKey.
@@ -2420,12 +2429,21 @@ garrow_sort_key_class_init(GArrowSortKeyClass *klass) * Since: 3.0.0 */ GArrowSortKey * -garrow_sort_key_new(const gchar *name, GArrowSortOrder order) +garrow_sort_key_new(const gchar *target, + GArrowSortOrder order, + GError **error) { + auto arrow_reference_result = garrow_field_reference_resolve_raw(target); + if (!garrow::check(error, + arrow_reference_result, + "[sort-key][new]")) { + return NULL; + } auto sort_key = g_object_new(GARROW_TYPE_SORT_KEY, - "name", name, "order", order, NULL); + auto priv = GARROW_SORT_KEY_GET_PRIVATE(sort_key); + priv->sort_key.target = *arrow_reference_result; return GARROW_SORT_KEY(sort_key); } @@ -2531,9 +2549,7 @@ garrow_sort_options_get_sort_keys(GArrowSortOptions *options) auto arrow_options = garrow_sort_options_get_raw(options); GList *sort_keys = NULL; for (const auto &arrow_sort_key : arrow_options->sort_keys) { - auto sort_key = - garrow_sort_key_new(arrow_sort_key.name.c_str(), - static_cast<GArrowSortOrder>(arrow_sort_key.order)); + auto sort_key = garrow_sort_key_new_raw(arrow_sort_key); sort_keys = g_list_prepend(sort_keys, sort_key); } return g_list_reverse(sort_keys); @@ -4065,6 +4081,19 @@ garrow_record_batch_filter(GArrowRecordBatch *record_batch, G_END_DECLS + +arrow::Result<arrow::FieldRef> +garrow_field_reference_resolve_raw(const gchar *reference) +{ + if (reference && reference[0] == '.') { + return arrow::FieldRef::FromDotPath(reference); + } else { + arrow::FieldRef arrow_reference(reference); + return arrow_reference; + } +} + + arrow::compute::ExecContext * garrow_execute_context_get_raw(GArrowExecuteContext *context) { @@ -4228,6 +4257,16 @@ garrow_array_sort_options_get_raw(GArrowArraySortOptions *options) garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); } + +GArrowSortKey * +garrow_sort_key_new_raw(const arrow::compute::SortKey &arrow_sort_key) +{ + auto sort_key = g_object_new(GARROW_TYPE_SORT_KEY, NULL); + auto priv = GARROW_SORT_KEY_GET_PRIVATE(sort_key); + priv->sort_key = arrow_sort_key; + return GARROW_SORT_KEY(sort_key); +} + arrow::compute::SortKey * garrow_sort_key_get_raw(GArrowSortKey *sort_key) { @@ -4235,6 +4274,7 @@ garrow_sort_key_get_raw(GArrowSortKey *sort_key) return &(priv->sort_key); } + arrow::compute::SortOptions * garrow_sort_options_get_raw(GArrowSortOptions *options) { diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 2171d6abd9a99..2416b03c12a96 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -426,7 +426,9 @@ struct _GArrowSortKeyClass GARROW_AVAILABLE_IN_3_0 GArrowSortKey * -garrow_sort_key_new(const gchar *name, GArrowSortOrder order); +garrow_sort_key_new(const gchar *target, + GArrowSortOrder order, + GError **error); GARROW_AVAILABLE_IN_3_0 gboolean diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 88f55d5329c30..b9a127155171c 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -24,6 +24,10 @@ #include <arrow-glib/compute.h> +arrow::Result<arrow::FieldRef> +garrow_field_reference_resolve_raw(const gchar *reference); + + arrow::compute::ExecContext * garrow_execute_context_get_raw(GArrowExecuteContext *context); @@ -88,6 +92,8 @@ arrow::compute::ArraySortOptions * garrow_array_sort_options_get_raw(GArrowArraySortOptions *options); +GArrowSortKey * +garrow_sort_key_new_raw(const arrow::compute::SortKey &arrow_sort_key); arrow::compute::SortKey * garrow_sort_key_get_raw(GArrowSortKey *sort_key); diff --git a/c_glib/arrow-glib/decimal.cpp b/c_glib/arrow-glib/decimal.cpp index 497d76fcfaa36..ebda68e0ff7cb 100644
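Since GArrowSortKey:target replaces the old name property, a sort key can now point at nested values: a bare string is taken as a top-level field name, while a string starting with '.' is parsed via arrow::FieldRef::FromDotPath(). A sketch of both spellings, assuming hypothetical columns fare_amount and a struct column address:

```c
#include <arrow-glib/arrow-glib.h>

static void
build_sort_keys(void)
{
  GError *error = NULL;

  /* A bare name resolves to a top-level column. */
  GArrowSortKey *by_column =
    garrow_sort_key_new("fare_amount", GARROW_SORT_ORDER_ASCENDING, &error);

  /* A leading '.' is parsed as a dot path, so nested targets such as
     the first child of the "address" struct column become addressable. */
  GArrowSortKey *by_path =
    garrow_sort_key_new(".address[0]", GARROW_SORT_ORDER_DESCENDING, &error);

  if (!by_column || !by_path) {
    g_printerr("invalid sort target: %s\n", error->message);
    g_clear_error(&error);
  }
}
```

Because the order property becomes G_PARAM_CONSTRUCT_ONLY and target is read-only, both are fixed at construction time; keys returned by garrow_sort_options_get_sort_keys() are built through the new garrow_sort_key_new_raw() helper and keep working unchanged.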
--- a/c_glib/arrow-glib/decimal.cpp +++ b/c_glib/arrow-glib/decimal.cpp @@ -60,11 +60,18 @@ struct DecimalConverter { template <typename Decimal> typename DecimalConverter<Decimal>::GArrowType * -garrow_decimal_new_string(const gchar *data) -{ - auto arrow_decimal = std::make_shared<Decimal>(data); - DecimalConverter<Decimal> converter; - return converter.new_raw(&arrow_decimal); +garrow_decimal_new_string(const gchar *data, + GError **error, + const gchar *tag) +{ + auto arrow_decimal_result = Decimal::FromString(data); + if (garrow::check(error, arrow_decimal_result, tag)) { + auto arrow_decimal = std::make_shared<Decimal>(*arrow_decimal_result); + DecimalConverter<Decimal> converter; + return converter.new_raw(&arrow_decimal); + } else { + return NULL; + } } template <typename Decimal> @@ -375,15 +382,18 @@ garrow_decimal128_class_init(GArrowDecimal128Class *klass) /** * garrow_decimal128_new_string: * @data: The data of the decimal. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: A newly created #GArrowDecimal128. + * Returns: (nullable): + * A newly created #GArrowDecimal128 on success, %NULL on error. * * Since: 0.10.0 */ GArrowDecimal128 * -garrow_decimal128_new_string(const gchar *data) +garrow_decimal128_new_string(const gchar *data, GError **error) { - return garrow_decimal_new_string<arrow::Decimal128>(data); + return garrow_decimal_new_string<arrow::Decimal128>( + data, error, "[decimal128][new][string]"); } /** @@ -780,15 +790,18 @@ garrow_decimal256_class_init(GArrowDecimal256Class *klass) /** * garrow_decimal256_new_string: * @data: The data of the decimal. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: A newly created #GArrowDecimal256. + * Returns: (nullable): + * A newly created #GArrowDecimal256 on success, %NULL on error. * * Since: 3.0.0 */ GArrowDecimal256 * -garrow_decimal256_new_string(const gchar *data) +garrow_decimal256_new_string(const gchar *data, GError **error) { - return garrow_decimal_new_string<arrow::Decimal256>(data); + return garrow_decimal_new_string<arrow::Decimal256>( + data, error, "[decimal256][new][string]"); } /** diff --git a/c_glib/arrow-glib/decimal.h b/c_glib/arrow-glib/decimal.h index 61f849234933b..97b521f3fbe94 100644 --- a/c_glib/arrow-glib/decimal.h +++ b/c_glib/arrow-glib/decimal.h @@ -37,7 +37,9 @@ struct _GArrowDecimal128Class GObjectClass parent_class; }; -GArrowDecimal128 *garrow_decimal128_new_string(const gchar *data); +GArrowDecimal128 * +garrow_decimal128_new_string(const gchar *data, + GError **error); GArrowDecimal128 *garrow_decimal128_new_integer(const gint64 data); GARROW_AVAILABLE_IN_3_0 GArrowDecimal128 *garrow_decimal128_copy(GArrowDecimal128 *decimal); @@ -99,7 +101,9 @@ struct _GArrowDecimal256Class }; GARROW_AVAILABLE_IN_3_0 -GArrowDecimal256 *garrow_decimal256_new_string(const gchar *data); +GArrowDecimal256 * +garrow_decimal256_new_string(const gchar *data, + GError **error); GARROW_AVAILABLE_IN_3_0 GArrowDecimal256 *garrow_decimal256_new_integer(const gint64 data); GARROW_AVAILABLE_IN_3_0 diff --git a/c_glib/arrow-glib/expression.cpp b/c_glib/arrow-glib/expression.cpp index 406e121cdb80e..eaa8bcd5ddc6e 100644 --- a/c_glib/arrow-glib/expression.cpp +++ b/c_glib/arrow-glib/expression.cpp @@ -175,20 +175,14 @@ GArrowFieldExpression * garrow_field_expression_new(const gchar *reference, GError **error) { - if (reference && reference[0] == '.') { - auto arrow_reference_result = arrow::FieldRef::FromDotPath(reference); - if (!garrow::check(error, - arrow_reference_result, - "[field-expression][new]")) { - return NULL; - } - auto arrow_expression = arrow::compute::field_ref(*arrow_reference_result); - return
GARROW_FIELD_EXPRESSION(garrow_expression_new_raw(arrow_expression)); - } else { - arrow::FieldRef arrow_reference(reference); - auto arrow_expression = arrow::compute::field_ref(arrow_reference); - return GARROW_FIELD_EXPRESSION(garrow_expression_new_raw(arrow_expression)); + auto arrow_reference_result = garrow_field_reference_resolve_raw(reference); + if (!garrow::check(error, + arrow_reference_result, + "[field-expression][new]")) { + return NULL; } + auto arrow_expression = arrow::compute::field_ref(*arrow_reference_result); + return GARROW_FIELD_EXPRESSION(garrow_expression_new_raw(arrow_expression)); } diff --git a/c_glib/arrow-glib/file-system.cpp b/c_glib/arrow-glib/file-system.cpp index 2c2c36e74bb4c..bb2e19513bb13 100644 --- a/c_glib/arrow-glib/file-system.cpp +++ b/c_glib/arrow-glib/file-system.cpp @@ -51,6 +51,8 @@ G_BEGIN_DECLS * * #GArrowHDFSFileSystem is a class for HDFS-backed file system. * + * #GArrowS3GlobalOptions is a class for options to initialize S3 APIs. + * * #GArrowS3FileSystem is a class for S3-backed file system. */ @@ -72,10 +74,10 @@ enum { G_DEFINE_TYPE_WITH_PRIVATE(GArrowFileInfo, garrow_file_info, G_TYPE_OBJECT) -#define GARROW_FILE_INFO_GET_PRIVATE(obj) \ +#define GARROW_FILE_INFO_GET_PRIVATE(object) \ static_cast<GArrowFileInfoPrivate *>( \ - garrow_file_info_get_instance_private( \ - GARROW_FILE_INFO(obj))) + garrow_file_info_get_instance_private( \ + GARROW_FILE_INFO(object))) static void garrow_file_info_finalize(GObject *object) @@ -1364,6 +1366,218 @@ garrow_hdfs_file_system_class_init(GArrowHDFSFileSystemClass *klass) } + +#ifndef ARROW_S3 +namespace arrow { + namespace fs { + enum class S3LogLevel : int8_t { Off, Fatal, Error, Warn, Info, Debug, Trace }; + + struct ARROW_EXPORT S3GlobalOptions { + S3LogLevel log_level; + }; + } +} +#endif + +typedef struct GArrowS3GlobalOptionsPrivate_ { + arrow::fs::S3GlobalOptions options; +} GArrowS3GlobalOptionsPrivate; + +enum { + PROP_S3_GLOBAL_OPTIONS_LOG_LEVEL = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowS3GlobalOptions, + garrow_s3_global_options, + G_TYPE_OBJECT) + +#define GARROW_S3_GLOBAL_OPTIONS_GET_PRIVATE(object) \ + static_cast<GArrowS3GlobalOptionsPrivate *>( \ + garrow_s3_global_options_get_instance_private( \ + GARROW_S3_GLOBAL_OPTIONS(object))) + +static void +garrow_s3_global_options_finalize(GObject *object) +{ + auto priv = GARROW_S3_GLOBAL_OPTIONS_GET_PRIVATE(object); + priv->options.~S3GlobalOptions(); + G_OBJECT_CLASS(garrow_s3_global_options_parent_class)->finalize(object); +} + +static void +garrow_s3_global_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ +#ifdef ARROW_S3 + auto arrow_options = + garrow_s3_global_options_get_raw(GARROW_S3_GLOBAL_OPTIONS(object)); + + switch (prop_id) { + case PROP_S3_GLOBAL_OPTIONS_LOG_LEVEL: + arrow_options->log_level = + static_cast<arrow::fs::S3LogLevel>(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +#else + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); +#endif +} + +static void +garrow_s3_global_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ +#ifdef ARROW_S3 + auto arrow_options = + garrow_s3_global_options_get_raw(GARROW_S3_GLOBAL_OPTIONS(object)); + + switch (prop_id) { + case PROP_S3_GLOBAL_OPTIONS_LOG_LEVEL: + g_value_set_enum(value, + static_cast<gint>(arrow_options->log_level)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +#else + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+#endif +} + +static void +garrow_s3_global_options_init(GArrowS3GlobalOptions *object) +{ + auto priv = GARROW_S3_GLOBAL_OPTIONS_GET_PRIVATE(object); + new(&priv->options) arrow::fs::S3GlobalOptions; +} + +static void +garrow_s3_global_options_class_init(GArrowS3GlobalOptionsClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_s3_global_options_finalize; + gobject_class->set_property = garrow_s3_global_options_set_property; + gobject_class->get_property = garrow_s3_global_options_get_property; + + /** + * GArrowS3GlobalOptions:log-level: + * + * The log level of S3 APIs. + * + * Since: 7.0.0 + */ + spec = g_param_spec_enum("log-level", + "Log level", + "The log level of S3 APIs", + GARROW_TYPE_S3_LOG_LEVEL, + GARROW_S3_LOG_LEVEL_FATAL, + static_cast<GParamFlags>(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT)); + g_object_class_install_property(gobject_class, + PROP_S3_GLOBAL_OPTIONS_LOG_LEVEL, + spec); +} + +/** + * garrow_s3_global_options_new: + * + * Returns: A newly created #GArrowS3GlobalOptions. + * + * Since: 7.0.0 + */ +GArrowS3GlobalOptions * +garrow_s3_global_options_new(void) +{ + return GARROW_S3_GLOBAL_OPTIONS( + g_object_new(GARROW_TYPE_S3_GLOBAL_OPTIONS, NULL)); +} + + +/** + * garrow_s3_is_enabled: + * + * Returns: %TRUE if Apache Arrow C++ is built with S3 support, %FALSE + * otherwise. + * + * Since: 7.0.0 + */ +gboolean +garrow_s3_is_enabled(void) +{ +#ifdef ARROW_S3 + return TRUE; +#else + return FALSE; +#endif +} + +/** + * garrow_s3_initialize: + * @options: (nullable): Options to initialize the S3 APIs. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Normally, you don't need to call this function because the S3 APIs + * are initialized with the default options automatically. If you want + * to call this function, you must do so before you use + * any #GArrowS3FileSystem related APIs. + * + * Returns: %TRUE on success, %FALSE on error.
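Taken together, these entry points give the S3 support an explicit lifecycle from GLib code: probe for support, initialize once up front, finalize at shutdown. A usage sketch built only from the functions added above (the log level chosen here is arbitrary):

```c
#include <arrow-glib/arrow-glib.h>

int
main(void)
{
  GError *error = NULL;

  if (!garrow_s3_is_enabled()) {
    g_print("Apache Arrow C++ was built without S3 support\n");
    return 0;
  }

  /* "log-level" is G_PARAM_CONSTRUCT, so it could also be passed
     straight to g_object_new(); the default is FATAL. */
  GArrowS3GlobalOptions *options = garrow_s3_global_options_new();
  g_object_set(options, "log-level", GARROW_S3_LOG_LEVEL_DEBUG, NULL);

  /* Must run before any GArrowS3FileSystem related API is used. */
  if (!garrow_s3_initialize(options, &error)) {
    g_printerr("S3 initialization failed: %s\n", error->message);
    g_clear_error(&error);
    g_object_unref(options);
    return 1;
  }

  /* ... create and use GArrowS3FileSystem instances here ... */

  garrow_s3_finalize(&error);
  g_object_unref(options);
  return 0;
}
```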
+ * + * Since: 7.0.0 + */ +gboolean +garrow_s3_finalize(GError **error) +{ +#ifdef ARROW_S3 + return garrow::check(error, + arrow::fs::FinalizeS3(), + "[s3][finalize]"); +#else + return garrow::check(error, + arrow::Status::NotImplemented( + "Apache Arrow C++ isn't built with S3 support"), + "[s3][finalize]"); +#endif +} + + G_DEFINE_TYPE(GArrowS3FileSystem, garrow_s3_file_system, GARROW_TYPE_FILE_SYSTEM) @@ -1448,3 +1662,12 @@ garrow_slow_file_system_new_raw( "base-file-system", base_file_system, NULL)); } + +#ifdef ARROW_S3 +arrow::fs::S3GlobalOptions * +garrow_s3_global_options_get_raw(GArrowS3GlobalOptions *options) +{ + auto priv = GARROW_S3_GLOBAL_OPTIONS_GET_PRIVATE(options); + return &(priv->options); +} +#endif diff --git a/c_glib/arrow-glib/file-system.h b/c_glib/arrow-glib/file-system.h index dc9fba7dd50a2..7ab356e42402b 100644 --- a/c_glib/arrow-glib/file-system.h +++ b/c_glib/arrow-glib/file-system.h @@ -268,6 +268,59 @@ struct _GArrowHDFSFileSystemClass }; + +/** + * GArrowS3LogLevel: + * @GARROW_S3_LOG_LEVEL_OFF: Off. + * @GARROW_S3_LOG_LEVEL_FATAL: Fatal. This is the default. + * @GARROW_S3_LOG_LEVEL_ERROR: Error. + * @GARROW_S3_LOG_LEVEL_WARN: Warn. + * @GARROW_S3_LOG_LEVEL_INFO: Info. + * @GARROW_S3_LOG_LEVEL_DEBUG: Debug. + * @GARROW_S3_LOG_LEVEL_TRACE: Trace. + * + * They are corresponding to `arrow::fs::S3LogLevel` values. + * + * Since: 7.0.0 + */ +typedef enum { + GARROW_S3_LOG_LEVEL_OFF, + GARROW_S3_LOG_LEVEL_FATAL, + GARROW_S3_LOG_LEVEL_ERROR, + GARROW_S3_LOG_LEVEL_WARN, + GARROW_S3_LOG_LEVEL_INFO, + GARROW_S3_LOG_LEVEL_DEBUG, + GARROW_S3_LOG_LEVEL_TRACE, +} GArrowS3LogLevel; + + +#define GARROW_TYPE_S3_GLOBAL_OPTIONS (garrow_s3_global_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowS3GlobalOptions, + garrow_s3_global_options, + GARROW, + S3_GLOBAL_OPTIONS, + GObject) +struct _GArrowS3GlobalOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_7_0 +GArrowS3GlobalOptions * +garrow_s3_global_options_new(void); + + +GARROW_AVAILABLE_IN_7_0 +gboolean +garrow_s3_is_enabled(void); +GARROW_AVAILABLE_IN_7_0 +gboolean +garrow_s3_initialize(GArrowS3GlobalOptions *options, + GError **error); +GARROW_AVAILABLE_IN_7_0 +gboolean +garrow_s3_finalize(GError **error); + + #define GARROW_TYPE_S3_FILE_SYSTEM (garrow_s3_file_system_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowS3FileSystem, garrow_s3_file_system, diff --git a/c_glib/arrow-glib/file-system.hpp b/c_glib/arrow-glib/file-system.hpp index 6130d2df52f2d..6d33ba74fb199 100644 --- a/c_glib/arrow-glib/file-system.hpp +++ b/c_glib/arrow-glib/file-system.hpp @@ -46,3 +46,8 @@ garrow_slow_file_system_new_raw( std::shared_ptr<arrow::fs::FileSystem> *arrow_file_system, GArrowFileSystem *base_file_system); + +#ifdef ARROW_S3 +arrow::fs::S3GlobalOptions * +garrow_s3_global_options_get_raw(GArrowS3GlobalOptions *options); +#endif diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 7b7174e66bdd6..57978d3eb1033 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_7_0: + * + * You can use this macro value for compile time API version check.
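The macro joins the usual GLib-style version gates (defined just below for 7.0): a consumer pins the API range it targets by defining the min/max macros before including the headers, after which too-old or too-new symbols produce compiler warnings. A sketch of that convention:

```c
/* Declare the arrow-glib API range this source targets: using a symbol
   deprecated in 6.0, or one introduced after 7.0, now triggers a
   compiler warning. The tokens expand lazily, so defining them before
   the include is fine. */
#define GARROW_VERSION_MIN_REQUIRED GARROW_VERSION_6_0
#define GARROW_VERSION_MAX_ALLOWED GARROW_VERSION_7_0
#include <arrow-glib/arrow-glib.h>
```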
+ * + * Since: 7.0.0 + */ +#define GARROW_VERSION_7_0 G_ENCODE_VERSION(7, 0) + /** * GARROW_VERSION_6_0: * @@ -274,6 +283,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_7_0 +# define GARROW_DEPRECATED_IN_7_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_7_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_7_0 +# define GARROW_DEPRECATED_IN_7_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_7_0 +# define GARROW_AVAILABLE_IN_7_0 GARROW_UNAVAILABLE(7, 0) +#else +# define GARROW_AVAILABLE_IN_7_0 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_6_0 # define GARROW_DEPRECATED_IN_6_0 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_6_0_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 43f6a7edcd83c..4c37028bb487e 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -184,6 +184,10 @@ <title>Index of deprecated API</title> </index> + <index id="api-index-7-0-0" role="7.0.0"> + <title>Index of new symbols in 7.0.0</title> + <xi:include href="xml/api-index-7.0.0.xml"><xi:fallback /></xi:include> + </index> <index id="api-index-6-0-0" role="6.0.0"> <title>Index of new symbols in 6.0.0</title> diff --git a/c_glib/meson.build b/c_glib/meson.build index 0e090c979688e..d5f8d66a5b1a9 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -23,7 +23,7 @@ project('arrow-glib', 'c', 'cpp', 'cpp_std=c++11', ]) -version = '6.0.0-SNAPSHOT' +version = '7.0.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/test/test-decimal128-data-type.rb b/c_glib/test/test-decimal128-data-type.rb index b27e1cad1ea3f..bcee24187f4c2 100644 --- a/c_glib/test/test-decimal128-data-type.rb +++ b/c_glib/test/test-decimal128-data-type.rb @@ -40,4 +40,17 @@ def test_scale data_type = Arrow::Decimal128DataType.new(8, 2) assert_equal(2, data_type.scale) end + + def test_decimal_data_type_new + assert_equal(Arrow::Decimal128DataType.new(8, 2), + Arrow::DecimalDataType.new(8, 2)) + end + + def test_invalid_precision + message = + "[decimal128-data-type][new]: Invalid: Decimal precision out of range: 39" + assert_raise(Arrow::Error::Invalid.new(message)) do + Arrow::Decimal128DataType.new(39, 1) + end + end end diff --git a/c_glib/test/test-decimal128.rb b/c_glib/test/test-decimal128.rb index 8f14cfbe52092..d032afd510db7 100644 --- a/c_glib/test/test-decimal128.rb +++ b/c_glib/test/test-decimal128.rb @@ -18,6 +18,17 @@ class TestDecimal128 < Test::Unit::TestCase include Helper::Omittable + def test_new_string_invalid + message = + "[decimal128][new][string]: Invalid: " + + "The string '1,1' is not a valid decimal128 number" + error = assert_raise(Arrow::Error::Invalid) do + Arrow::Decimal128.new("1,1") + end + assert_equal(message, + error.message.lines.first.chomp) + end + def test_copy decimal = Arrow::Decimal128.new("234.23445") assert_equal(decimal, decimal.copy) diff --git a/c_glib/test/test-decimal256-data-type.rb b/c_glib/test/test-decimal256-data-type.rb index 596c3dab92998..3070a4e4c6ca4 100644 --- a/c_glib/test/test-decimal256-data-type.rb +++ b/c_glib/test/test-decimal256-data-type.rb @@ -40,4 +40,17 @@ def test_scale data_type = Arrow::Decimal256DataType.new(8, 2) assert_equal(2, data_type.scale) end + + def test_decimal_data_type_new + assert_equal(Arrow::Decimal256DataType.new(39, 1), + Arrow::DecimalDataType.new(39, 1)) + end + + def test_invalid_precision + message = + "[decimal256-data-type][new]: Invalid: Decimal precision out of range: 77" +
assert_raise(Arrow::Error::Invalid.new(message)) do + Arrow::Decimal256DataType.new(77, 1) + end + end end diff --git a/c_glib/test/test-decimal256.rb b/c_glib/test/test-decimal256.rb index d422aef339e2e..24fd3b5552b2f 100644 --- a/c_glib/test/test-decimal256.rb +++ b/c_glib/test/test-decimal256.rb @@ -18,6 +18,17 @@ class TestDecimal256 < Test::Unit::TestCase include Helper::Omittable + def test_new_string_invalid + message = + "[decimal256][new][string]: Invalid: " + + "The string '1,1' is not a valid decimal256 number" + error = assert_raise(Arrow::Error::Invalid) do + Arrow::Decimal256.new("1,1") + end + assert_equal(message, + error.message.lines.first.chomp) + end + def test_copy decimal = Arrow::Decimal256.new("234.23445") assert_equal(decimal, decimal.copy) diff --git a/c_glib/test/test-s3-global-options.rb b/c_glib/test/test-s3-global-options.rb new file mode 100644 index 0000000000000..b1270e42faf1a --- /dev/null +++ b/c_glib/test/test-s3-global-options.rb @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestS3GlobalOptions < Test::Unit::TestCase + def setup + omit("S3 enabled Apache Arrow C++ is needed") unless Arrow.s3_is_enabled? 
+ @options = Arrow::S3GlobalOptions.new + end + + sub_test_case("#log_level") do + test("default") do + assert_equal(Arrow::S3LogLevel::FATAL, + @options.log_level) + end + end + + test("#log_level=") do + @options.log_level = :trace + assert_equal(Arrow::S3LogLevel::TRACE, + @options.log_level) + end +end diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index d2ccb66a26b42..6d1ebf353415f 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -26,7 +26,7 @@ gflags glog gmock>=1.10.0 grpc-cpp>=1.27.3 -gtest=1.10.0 +gtest>=1.10.0 libprotobuf libutf8proc lz4-c @@ -40,3 +40,4 @@ snappy thrift-cpp>=0.11.0 zlib zstd +flatbuffers diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 49388e2b437f1..64e1c16a551f2 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -19,6 +19,5 @@ breathe doxygen ipython -# Pinned per ARROW-9693 -sphinx=3.1.2 +sphinx=4.2 pydata-sphinx-theme diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index ff31930c06cf9..40a855b5dd277 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -48,6 +48,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + GTest_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ PARQUET_BUILD_EXECUTABLES=ON \ PARQUET_HOME=$CONDA_PREFIX diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 2dba1d10bfa92..d40973330a6da 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -29,6 +29,7 @@ ARG go=1.15 COPY ci/conda_env_archery.txt /arrow/ci/ RUN conda install -q \ --file arrow/ci/conda_env_archery.txt \ + "python>=3.7" \ numpy \ compilers \ maven=${maven} \ @@ -40,6 +41,7 @@ RUN conda install -q \ # Install Rust with only the needed components # (rustfmt is needed for tonic-build to compile the protobuf definitions) RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --profile=minimal -y && \ + $HOME/.cargo/bin/rustup toolchain install stable && \ $HOME/.cargo/bin/rustup component add rustfmt ENV GOROOT=/opt/go \ @@ -51,7 +53,7 @@ RUN wget -nv -O - https://dl.google.com/go/go${go}.linux-${arch}.tar.gz | tar -x ENV DOTNET_ROOT=/opt/dotnet \ PATH=/opt/dotnet:$PATH RUN curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Channel 3.1 -InstallDir /opt/dotnet - + ENV ARROW_BUILD_INTEGRATION=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ diff --git a/ci/docker/debian-10-cpp.dockerfile b/ci/docker/debian-10-cpp.dockerfile index 16e867fc3fbd6..f85408f03489a 100644 --- a/ci/docker/debian-10-cpp.dockerfile +++ b/ci/docker/debian-10-cpp.dockerfile @@ -74,8 +74,6 @@ RUN apt-get update -y -q && \ COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ diff --git a/ci/docker/r-fedora-clang-devel-san.dockerfile b/ci/docker/r-fedora-clang-devel-san.dockerfile new file mode 100644 index 0000000000000..7c3aa536b7967 --- /dev/null +++ b/ci/docker/r-fedora-clang-devel-san.dockerfile @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Fedora-clang-devel with the sanitizer enabled, this should/will +# be upstreamed to rhub, so separated out like this + +# start with the Docker 'base R' Debian-based image +FROM rhub/fedora-clang:latest + +# TODO: rhub maintainer when we upstream + +ENV CRAN http://cran.r-project.org + +RUN cd /tmp \ + && svn co https://svn.r-project.org/R/trunk R-devel + +ENV RPREFIX /opt/R-devel + +ENV ROPTIONS --with-x --with-recommended-packages --enable-R-shlib --enable-R-static-lib + +ENV CC /usr/bin/clang +ENV CXX /usr/bin/clang++ +ENV F77 gfortran +ENV CPP cpp + +RUN yum -y install rsync +RUN dnf install -y libcxx-devel + +RUN cd /tmp/R-devel \ + && ./tools/rsync-recommended \ + && R_PAPERSIZE=letter \ + R_BATCHSAVE="--no-save --no-restore" \ + CC="clang -fsanitize=address,undefined -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" \ + CXX="clang++ -stdlib=libc++ -fsanitize=address,undefined -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" \ + CFLAGS="-g -O3 -Wall -pedantic -mtune=native" \ + FFLAGS="-g -O2 -mtune=native" \ + FCFLAGS="-g -O2 -mtune=native" \ + CXXFLAGS="-g -O3 -Wall -pedantic -mtune=native" \ + MAIN_LD="clang++ -stdlib=libc++ -fsanitize=undefined,address" \ + R_OPENMP_CFLAGS=-fopenmp \ + ./configure --prefix=${RPREFIX} ${ROPTIONS} \ + && make \ + && make install + +# TODO: re-enable when upstreamed? 
+# COPY xvfb-run /usr/local/bin/xvfb-run + +# RUN chmod +x /usr/local/bin/xvfb-run && \ +# rm -f /bin/xvfb-run /usr/bin/xvfb-run + +ENV RHUB_PLATFORM linux-x86_64-fedora-clang + +# More verbose UBSAN/SAN output (cf #3) -- this is still somewhat speculative +# Entry copied from Prof Ripley's setup described at http://www.stats.ox.ac.uk/pub/bdr/memtests/README.txt +ENV ASAN_OPTIONS 'alloc_dealloc_mismatch=0:detect_leaks=0:detect_odr_violation=0' + +ENV PATH=${RPREFIX}/bin:$PATH + +RUN cd $RPREFIX/bin \ + && mv R Rdevel \ + && cp Rscript Rscriptdevel \ + && ln -s Rdevel RDsan \ + && ln -s Rscriptdevel RDscriptsan diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index de872da9a8f75..5a48c648e3bfe 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -78,6 +78,7 @@ RUN apt-get update -y -q && \ liblz4-dev \ libprotobuf-dev \ libprotoc-dev \ + libradospp-dev \ libre2-dev \ libsnappy-dev \ libssl-dev \ @@ -89,6 +90,8 @@ RUN apt-get update -y -q && \ pkg-config \ protobuf-compiler \ python3-pip \ + python3-rados \ + rados-objclass-dev \ rapidjson-dev \ tzdata \ wget && \ @@ -99,6 +102,8 @@ COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default +COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_ceph.sh # Prioritize system packages and local installation # The following dependencies will be downloaded due to missing/invalid packages diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 246b679129a38..284d35b9f63a6 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=5.0.0.9000 +pkgver=6.0.1.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index a11dd23b7f7fe..0ea9b1b89dc47 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -91,6 +91,7 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \ -DARROW_PYTHON=${ARROW_PYTHON:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ diff --git a/ci/scripts/generate_dataset.py b/ci/scripts/generate_dataset.py new file mode 100644 index 0000000000000..42ee0763a1b25 --- /dev/null +++ b/ci/scripts/generate_dataset.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + + +import os +import shutil +import random + +import pandas as pd + +if __name__ == "__main__": + # generate the test dataframe + data = { + "total_amount": list(), + "fare_amount": list() + } + for i in range(0, 500): + data['total_amount'].append(random.randint(1,11)*5) + data['fare_amount'].append(random.randint(1,11)*3) + df = pd.DataFrame(data) + + # dump the dataframe to a parquet file + df.to_parquet("skyhook_test_data.parquet") + + # create the dataset by copying the parquet files + shutil.rmtree("nyc", ignore_errors=True) + payment_type = ["1", "2", "3", "4"] + vendor_id = ["1", "2"] + for p in payment_type: + for v in vendor_id: + path = f"nyc/payment_type={p}/VendorID={v}" + os.makedirs(path, exist_ok=True) + shutil.copyfile("skyhook_test_data.parquet", os.path.join(path, f"{p}.{v}.parquet")) diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index 267f78e594697..0b71e376a765a 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -24,7 +24,12 @@ source_dir=${1}/go pushd ${source_dir}/arrow if [[ -n "${ARROW_GO_TESTCGO}" ]]; then - TAGS="-tags ccalloc" + if [[ "${MSYSTEM}" = "MINGW64" ]]; then + export PATH=${MINGW_PREFIX}/bin:$PATH + go clean -cache + go clean -testcache + fi + TAGS="-tags assert,test,ccalloc" fi go get -d -t -v ./... diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh index f7b2cd963fc4d..eee156fd77cc9 100755 --- a/ci/scripts/go_test.sh +++ b/ci/scripts/go_test.sh @@ -36,7 +36,10 @@ fi pushd ${source_dir}/arrow TAGS="assert,test" -if [[ -n "${ARROW_GO_TESTCGO}" ]]; then +if [[ -n "${ARROW_GO_TESTCGO}" ]]; then + if [[ "${MSYSTEM}" = "MINGW64" ]]; then + export PATH=${MINGW_PREFIX}/bin:$PATH + fi TAGS="${TAGS},ccalloc" fi diff --git a/dev/benchmarking/Dockerfile b/ci/scripts/install_ceph.sh old mode 100644 new mode 100755 similarity index 75% rename from dev/benchmarking/Dockerfile rename to ci/scripts/install_ceph.sh index f470333979ca4..d9abef0619408 --- a/dev/benchmarking/Dockerfile +++ b/ci/scripts/install_ceph.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -5,19 +7,22 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at - +# # http://www.apache.org/licenses/LICENSE-2.0 - +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# -FROM postgres:11-alpine -# Any `.sh` and `.sql` files copied to the entrypoint directory -# will be run during startup. 
See `docker-entrypoint.sh` in -# https://github.com/docker-library/postgres/blob/master/11/alpine/ -COPY ddl/* /docker-entrypoint-initdb.d/ +set -ex + +ARCH=$(uname -m) +if [ "$ARCH" != "x86_64" ]; then + exit 0 +fi + +apt update +apt install -y attr ceph-common ceph-fuse ceph-mds ceph-mgr ceph-mon ceph-osd diff --git a/ci/scripts/integration_skyhook.sh b/ci/scripts/integration_skyhook.sh new file mode 100755 index 0000000000000..6c3011f9c63ed --- /dev/null +++ b/ci/scripts/integration_skyhook.sh @@ -0,0 +1,141 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script spawns a single-node Ceph cluster, creates a CephFS mount, +# generates a Parquet dataset, and runs the SkyhookDM integration tests. +# Taken from https://github.com/ceph/go-ceph/blob/master/micro-osd.sh + +set -e +set -x +set -u + +if [ "${ARROW_SKYHOOK:-OFF}" != "ON" ]; then + exit 0 +fi + +ARROW_BUILD_DIR=${1}/cpp +DIR=/tmp/integration_skyhook + +# set environment variables +pkill ceph || true +rm -rf ${DIR}/* +LOG_DIR=${DIR}/log +MON_DATA=${DIR}/mon +MDS_DATA=${DIR}/mds +MOUNTPT=${MDS_DATA}/mnt +OSD_DATA=${DIR}/osd +mkdir -p ${LOG_DIR} ${MON_DATA} ${OSD_DATA} ${MDS_DATA} ${MOUNTPT} +MDS_NAME="Z" +MON_NAME="a" +MGR_NAME="x" +MIRROR_ID="m" + +# cluster wide parameters +cat >> ${DIR}/ceph.conf < ${MDS_DATA}/keyring +ceph osd pool create cephfs_data 8 +ceph osd pool create cephfs_metadata 8 +ceph fs new cephfs cephfs_metadata cephfs_data +ceph fs ls +ceph-mds -i ${MDS_NAME} +ceph status +while [[ ! 
$(ceph mds stat | grep "up:active") ]]; do sleep 1; done + +# start a manager +ceph-mgr --id ${MGR_NAME} + +# test the setup +ceph --version +ceph status + +apt update +apt install -y python3-pip + +pushd ${ARROW_BUILD_DIR} + # create the rados-classes, if not there already + mkdir -p /usr/lib/x86_64-linux-gnu/rados-classes/ + cp debug/libcls_skyhook* /usr/lib/x86_64-linux-gnu/rados-classes/ + + # mount a ceph filesystem to /mnt/cephfs in the user-space using ceph-fuse + mkdir -p /mnt/cephfs + ceph-fuse /mnt/cephfs + sleep 5 + + # download an example dataset and copy into the mounted dir + pip3 install pyarrow pandas + python3 /arrow/ci/scripts/generate_dataset.py + cp -r nyc /mnt/cephfs/ + sleep 10 + + # run the tests + SKYHOOK_CLS_TEST=debug/skyhook-cls-test + if [ -f "$SKYHOOK_CLS_TEST" ]; then + debug/skyhook-cls-test + fi + + SKYHOOK_PROTOCOL_TEST=debug/skyhook-protocol-test + if [ -f "$SKYHOOK_PROTOCOL_TEST" ]; then + debug/skyhook-protocol-test + fi +popd diff --git a/ci/scripts/java_cdata_integration.sh b/ci/scripts/java_cdata_integration.sh new file mode 100755 index 0000000000000..86ea7cf155350 --- /dev/null +++ b/ci/scripts/java_cdata_integration.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex + +arrow_dir=${1} +export ARROW_SOURCE_DIR=${arrow_dir} + +pushd ${arrow_dir}/java/c/src/test/python + +python integration_tests.py + +popd diff --git a/ci/scripts/java_full_build.sh b/ci/scripts/java_full_build.sh index fb1d2b5f535d2..6f12a2fd3d18f 100755 --- a/ci/scripts/java_full_build.sh +++ b/ci/scripts/java_full_build.sh @@ -26,11 +26,37 @@ export ARROW_TEST_DATA=${arrow_dir}/testing/data pushd ${arrow_dir}/java +# generate a dummy GPG key for -Papache-release. +# -Papache-release generates signatures (*.asc) of artifacts. +# We don't use these signatures in our release process. +(echo "Key-Type: RSA"; \ + echo "Key-Length: 4096"; \ + echo "Name-Real: Build"; \ + echo "Name-Email: build@example.com"; \ + echo "%no-protection") | \ + gpg --full-generate-key --batch + # build the entire project -mvn clean install -Parrow-c-data -Parrow-jni -Darrow.cpp.build.dir=$dist_dir -Darrow.c.jni.dist.dir=$dist_dir +mvn clean \ + install \ + assembly:single \ + source:jar \ + javadoc:jar \ + -Papache-release \ + -Parrow-c-data \ + -Parrow-jni \ + -Darrow.cpp.build.dir=$dist_dir \ + -Darrow.c.jni.dist.dir=$dist_dir \ + -DdescriptorId=source-release -# copy all jars and pom files to the distribution folder -find . -name "*.jar" -exec echo {} \; -exec cp {} $dist_dir \; -find . -name "*.pom" -exec echo {} \; -exec cp {} $dist_dir \; +# copy all jar, zip and pom files to the distribution folder +find . 
\ + "(" -name "*-javadoc.jar" -o -name "*-sources.jar" ")" \ + -exec echo {} ";" \ + -exec cp {} $dist_dir ";" +find ~/.m2/repository/org/apache/arrow \ + "(" -name "*.jar" -o -name "*.zip" -o -name "*.pom" ")" \ + -exec echo {} ";" \ + -exec cp {} $dist_dir ";" popd diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index 10ceb41ee6589..12f58d54bb8a7 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -30,7 +30,16 @@ yarn lint:ci yarn build if [ "${with_docs}" == "true" ]; then - yarn doc + if [ "$(git config --get remote.origin.url)" == "https://github.com/apache/arrow.git" ]; then + yarn doc + elif [ "$(git config --get remote.upstream.url)" == "https://github.com/apache/arrow.git" ]; then + yarn doc --gitRemote upstream + elif [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + yarn doc --gitRemote apache + else + echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." + exit 0 + fi fi popd diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index 6f6012c879adf..01cd5fa9ee360 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -65,6 +65,9 @@ case "${target}" in cgo) packages+=(${MINGW_PACKAGE_PREFIX}-arrow) packages+=(${MINGW_PACKAGE_PREFIX}-gcc) + packages+=(${MINGW_PACKAGE_PREFIX}-go) + packages+=(${MINGW_PACKAGE_PREFIX}-toolchain) + packages+=(base-devel) ;; esac diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index fda53a8da528f..ec703abfc0320 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -80,5 +80,5 @@ if [ "${CHECK_UNITTESTS}" == "ON" ]; then pip install -U -r ${source_dir}/python/requirements-wheel-test.txt # Execute unittest, test dependencies must be installed python -c 'import pyarrow; pyarrow.create_library_symlinks()' - pytest -r s --pyargs pyarrow + python -m pytest -r s --pyargs pyarrow fi diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index 6c79c085180a7..33b4361163031 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -21,6 +21,7 @@ set -ex : ${R_BIN:=RDsan} source_dir=${1}/r +rhome=$(${R_BIN} RHOME) pushd ${source_dir} @@ -28,19 +29,33 @@ pushd ${source_dir} export CMAKE_UNITY_BUILD=OFF # Make installation verbose so that the CI job doesn't time out due to silence export ARROW_R_DEV=TRUE +# Get line numbers in sanitizer tracebacks +export CMAKE_BUILD_TYPE=RelWithDebInfo + +ncores=$(${R_BIN} -s -e 'cat(parallel::detectCores())') +echo "MAKEFLAGS=-j${ncores}" >> ${rhome}/etc/Renviron.site + ${R_BIN} CMD INSTALL ${INSTALL_ARGS} . 
# But unset the env var so that it doesn't cause us to run extra dev tests unset ARROW_R_DEV export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" +# run tests pushd tests ${R_BIN} < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } -popd -${R_BIN} -e 'library(arrow); testthat::test_examples(".")' >> testthat.out 2>&1 || { cat testthat.out; exit 1; } cat testthat.out if grep -q "runtime error" testthat.out; then exit 1 fi + +# run examples +popd +${R_BIN} -e 'library(arrow); testthat::test_examples(".")' >> examples.out 2>&1 || { cat examples.out; exit 1; } + +cat examples.out +if grep -q "runtime error" examples.out; then + exit 1 +fi popd diff --git a/cpp/.gitignore b/cpp/.gitignore index 03c03a401a552..e1e921762f9ce 100644 --- a/cpp/.gitignore +++ b/cpp/.gitignore @@ -18,6 +18,7 @@ thirdparty/*.tar* CMakeFiles/ CMakeCache.txt +CMakeUserPresets.json CTestTestfile.cmake Makefile cmake_install.cmake @@ -25,6 +26,7 @@ build/ *-build/ Testing/ build-support/boost_* +vcpkg_installed/ # Build directories created by Clion cmake-build-*/ diff --git a/cpp/Brewfile b/cpp/Brewfile index 78ee5e64c8fac..039833c274093 100644 --- a/cpp/Brewfile +++ b/cpp/Brewfile @@ -16,6 +16,7 @@ # under the License. brew "automake" +brew "aws-sdk-cpp" brew "boost" brew "brotli" brew "c-ares" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c787794d39de6..f3d6b24c48f45 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -47,7 +47,7 @@ if(POLICY CMP0074) cmake_policy(SET CMP0074 NEW) endif() -set(ARROW_VERSION "6.0.0-SNAPSHOT") +set(ARROW_VERSION "7.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -233,15 +233,22 @@ find_program(CPPLINT_BIN HINTS ${BUILD_SUPPORT_DIR}) message(STATUS "Found cpplint executable at ${CPPLINT_BIN}") +set(COMMON_LINT_OPTIONS + --exclude_globs + ${LINT_EXCLUSIONS_FILE} + --source_dir + ${CMAKE_CURRENT_SOURCE_DIR}/src + --source_dir + ${CMAKE_CURRENT_SOURCE_DIR}/examples + --source_dir + ${CMAKE_CURRENT_SOURCE_DIR}/tools) + add_custom_target(lint ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/run_cpplint.py --cpplint_binary ${CPPLINT_BIN} - --exclude_globs - ${LINT_EXCLUSIONS_FILE} - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR} + ${COMMON_LINT_OPTIONS} ${ARROW_LINT_QUIET}) # @@ -254,10 +261,7 @@ if(${CLANG_FORMAT_FOUND}) ${BUILD_SUPPORT_DIR}/run_clang_format.py --clang_format_binary ${CLANG_FORMAT_BIN} - --exclude_globs - ${LINT_EXCLUSIONS_FILE} - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR} + ${COMMON_LINT_OPTIONS} --fix ${ARROW_LINT_QUIET}) @@ -267,10 +271,7 @@ if(${CLANG_FORMAT_FOUND}) ${BUILD_SUPPORT_DIR}/run_clang_format.py --clang_format_binary ${CLANG_FORMAT_BIN} - --exclude_globs - ${LINT_EXCLUSIONS_FILE} - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR} + ${COMMON_LINT_OPTIONS} ${ARROW_LINT_QUIET}) endif() @@ -294,12 +295,9 @@ if(${CLANG_TIDY_FOUND}) ${BUILD_SUPPORT_DIR}/run_clang_tidy.py --clang_tidy_binary ${CLANG_TIDY_BIN} - --exclude_globs - ${LINT_EXCLUSIONS_FILE} --compile_commands ${CMAKE_BINARY_DIR}/compile_commands.json - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR} + ${COMMON_LINT_OPTIONS} --fix ${ARROW_LINT_QUIET}) @@ -309,12 +307,9 @@ if(${CLANG_TIDY_FOUND}) ${BUILD_SUPPORT_DIR}/run_clang_tidy.py --clang_tidy_binary ${CLANG_TIDY_BIN} - --exclude_globs - ${LINT_EXCLUSIONS_FILE} --compile_commands ${CMAKE_BINARY_DIR}/compile_commands.json - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR} + ${COMMON_LINT_OPTIONS} ${ARROW_LINT_QUIET}) endif() @@ -351,6 +346,13 @@ if(ARROW_ENGINE) 
set(ARROW_COMPUTE ON) endif() +if(ARROW_SKYHOOK) + set(ARROW_DATASET ON) + set(ARROW_PARQUET ON) + set(ARROW_WITH_LZ4 ON) + set(ARROW_WITH_SNAPPY ON) +endif() + if(ARROW_DATASET) set(ARROW_COMPUTE ON) set(ARROW_FILESYSTEM ON) @@ -938,6 +940,10 @@ if(ARROW_GANDIVA) add_subdirectory(src/gandiva) endif() +if(ARROW_SKYHOOK) + add_subdirectory(src/skyhook) +endif() + if(ARROW_BUILD_EXAMPLES) add_custom_target(runexample ctest -L example) add_subdirectory(examples/arrow) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json new file mode 100644 index 0000000000000..a9ca585abdc0a --- /dev/null +++ b/cpp/CMakePresets.json @@ -0,0 +1,165 @@ +{ + "version": 2, + "cmakeMinimumRequired": { + "major": 3, + "minor": 20, + "patch": 0 + }, + "configurePresets": [ + { + "name": "ninja-benchmarks", + "description": "Build for benchmarks", + "inherits": "ninja-release", + "cacheVariables": { + "ARROW_BUILD_BENCHMARKS": "ON", + "ARROW_BUILD_BENCHMARKS_REFERENCE": "ON", + "ARROW_BUILD_TESTS": "OFF" + } + }, + { + "name": "ninja-debug", + "description": "Debug configuration with basic build", + "binaryDir": "${sourceDir}/build/${presetName}", + "generator": "Ninja", + "cacheVariables": { + "ARROW_BUILD_BENCHMARKS": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_BUILD_TESTS": { + "type": "BOOL", + "value": "ON" + }, + "ARROW_COMPUTE": { + "type": "BOOL", + "value": "ON" + }, + "ARROW_CSV": { + "type": "BOOL", + "value": "ON" + }, + "ARROW_CUDA": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_DATASET": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_GANDIVA": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_GANDIVA_JAVA": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_GANDIVA_JNI": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_FILESYSTEM": { + "type": "BOOL", + "value": "ON" + }, + "ARROW_IPC": { + "type": "BOOL", + "value": "ON" + }, + "ARROW_PARQUET": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_PLASMA_JAVA_CLIENT": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_PYTHON": { + "type": "BOOL", + "value": "ON" + }, + "ARROW_SKYHOOK": { + "type": "BOOL", + "value": "OFF" + }, + "ARROW_WITH_RE2": { + "type": "BOOL", + "value": "ON" + }, + "CMAKE_BUILD_TYPE": { + "type": "String", + "value": "Debug" + }, + "CMAKE_INSTALL_PREFIX": { + "type": "PATH", + "value": "${sourceDir}/build/${presetName}/pkg" + } + } + }, + { + "name": "ninja-debug-cuda", + "description": "Debug Arrow build with CUDA extensions (requires CUDA toolkit)", + "inherits": "ninja-debug", + "cacheVariables": { + "ARROW_CUDA": "ON" + } + }, + { + "name": "ninja-debug-dataset", + "description": "Builds Arrow Dataset modules", + "inherits": "ninja-debug", + "cacheVariables": { + "ARROW_DATASET": "ON" + } + }, + { + "name": "ninja-debug-gandiva", + "description": "Builds Gandiva libraries", + "inherits": "ninja-debug", + "cacheVariables": { + "ARROW_GANDIVA": "ON" + } + }, + { + "name": "ninja-debug-parquet", + "description": "Builds Parquet libraries", + "inherits": "ninja-debug", + "cacheVariables": { + "ARROW_PARQUET": "ON" + } + }, + { + "name": "ninja-debug-skyhook", + "description": "Builds Skyhook libraries", + + "inherits": "ninja-debug", + "cacheVariables": { + "ARROW_SKYHOOK": "ON" + } + }, + { + "name": "ninja-release", + "description": "Release configuration", + "inherits": "ninja-debug", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + }, + { + "name": "ninja-release-gandiva", + "description": "Release configuration with Gandiva", + "inherits": "ninja-release", + "cacheVariables": { + 
"ARROW_GANDIVA": "ON" + } + }, + { + "name": "ninja-release-parquet", + "description": "Release configuration with Parquet", + "inherits": "ninja-release", + "cacheVariables": { + "ARROW_PARQUET": "ON" + } + } + ] +} diff --git a/cpp/build-support/lint_exclusions.txt b/cpp/build-support/lint_exclusions.txt index 4feb8fbe13861..73cbd884f44e3 100644 --- a/cpp/build-support/lint_exclusions.txt +++ b/cpp/build-support/lint_exclusions.txt @@ -1,8 +1,6 @@ *_generated* *.grpc.fb.* -*apidoc/* *arrowExports.cpp* -*build_support/* *parquet_constants.* *parquet_types.* *pyarrow_api.h diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py index fd653a530711e..96487251d0070 100755 --- a/cpp/build-support/run_clang_format.py +++ b/cpp/build-support/run_clang_format.py @@ -61,6 +61,7 @@ def _check_one_file(filename, formatted): "that should be excluded from the checks") parser.add_argument("--source_dir", required=True, + action="append", help="Root directory of the source code") parser.add_argument("--fix", default=False, action="store_true", @@ -78,8 +79,9 @@ def _check_one_file(filename, formatted): exclude_globs.extend(line.strip() for line in f) formatted_filenames = [] - for path in lintutils.get_sources(arguments.source_dir, exclude_globs): - formatted_filenames.append(str(path)) + for source_dir in arguments.source_dir: + for path in lintutils.get_sources(source_dir, exclude_globs): + formatted_filenames.append(str(path)) if arguments.fix: if not arguments.quiet: diff --git a/cpp/build-support/run_clang_tidy.py b/cpp/build-support/run_clang_tidy.py index e5211be84e554..863c5bd70ab2c 100755 --- a/cpp/build-support/run_clang_tidy.py +++ b/cpp/build-support/run_clang_tidy.py @@ -83,6 +83,7 @@ def _check_all(cmd, filenames): help="compile_commands.json to pass clang-tidy") parser.add_argument("--source_dir", required=True, + action="append", help="Root directory of the source code") parser.add_argument("--fix", default=False, action="store_true", @@ -100,8 +101,9 @@ def _check_all(cmd, filenames): exclude_globs.append(line.strip()) linted_filenames = [] - for path in lintutils.get_sources(arguments.source_dir, exclude_globs): - linted_filenames.append(path) + for source_dir in arguments.source_dir: + for path in lintutils.get_sources(source_dir, exclude_globs): + linted_filenames.append(path) if not arguments.quiet: msg = 'Tidying {}' if arguments.fix else 'Checking {}' diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py index cc98e094e6ce9..76c0fe0aefaca 100755 --- a/cpp/build-support/run_cpplint.py +++ b/cpp/build-support/run_cpplint.py @@ -67,6 +67,7 @@ def _check_some_files(completed_processes, filenames): "that should be excluded from the checks") parser.add_argument("--source_dir", required=True, + action="append", help="Root directory of the source code") parser.add_argument("--quiet", default=False, action="store_true", @@ -79,8 +80,9 @@ def _check_some_files(completed_processes, filenames): exclude_globs.extend(line.strip() for line in f) linted_filenames = [] - for path in lintutils.get_sources(arguments.source_dir, exclude_globs): - linted_filenames.append(str(path)) + for source_dir in arguments.source_dir: + for path in lintutils.get_sources(source_dir, exclude_globs): + linted_filenames.append(str(path)) cmd = [ arguments.cpplint_binary, diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index cd8290d1bbb67..38c35d7e79e70 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ 
b/cpp/cmake_modules/BuildUtils.cmake @@ -652,6 +652,7 @@ function(ADD_TEST_CASE REL_TEST_NAME) EXTRA_DEPENDENCIES LABELS EXTRA_LABELS + TEST_ARGUMENTS PREFIX) cmake_parse_arguments(ARG "${options}" @@ -730,15 +731,16 @@ function(ADD_TEST_CASE REL_TEST_NAME) "cd '${CMAKE_SOURCE_DIR}'; \ valgrind --suppressions=valgrind.supp --tool=memcheck --gen-suppressions=all \ --num-callers=500 --leak-check=full --leak-check-heuristics=stdstring \ - --error-exitcode=1 ${TEST_PATH}") + --error-exitcode=1 ${TEST_PATH} ${ARG_TEST_ARGUMENTS}") elseif(WIN32) - add_test(${TEST_NAME} ${TEST_PATH}) + add_test(${TEST_NAME} ${TEST_PATH} ${ARG_TEST_ARGUMENTS}) else() add_test(${TEST_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test - ${TEST_PATH}) + ${TEST_PATH} + ${ARG_TEST_ARGUMENTS}) endif() # Add test as dependency of relevant targets @@ -798,7 +800,12 @@ endfunction() function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) set(options) set(one_value_args) - set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX) + set(multi_value_args + EXTRA_INCLUDES + EXTRA_LINK_LIBS + EXTRA_SOURCES + DEPENDENCIES + PREFIX) cmake_parse_arguments(ARG "${options}" "${one_value_args}" @@ -820,7 +827,7 @@ function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) if(EXISTS ${CMAKE_SOURCE_DIR}/examples/arrow/${REL_EXAMPLE_NAME}.cc) # This example has a corresponding .cc file, set it up as an executable. set(EXAMPLE_PATH "${EXECUTABLE_OUTPUT_PATH}/${EXAMPLE_NAME}") - add_executable(${EXAMPLE_NAME} "${REL_EXAMPLE_NAME}.cc") + add_executable(${EXAMPLE_NAME} "${REL_EXAMPLE_NAME}.cc" ${ARG_EXTRA_SOURCES}) target_link_libraries(${EXAMPLE_NAME} ${ARROW_EXAMPLE_LINK_LIBS}) add_dependencies(runexample ${EXAMPLE_NAME}) set(NO_COLOR "--color_print=false") @@ -834,6 +841,10 @@ function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) add_dependencies(${EXAMPLE_NAME} ${ARG_DEPENDENCIES}) endif() + if(ARG_EXTRA_INCLUDES) + target_include_directories(${EXAMPLE_NAME} SYSTEM PUBLIC ${ARG_EXTRA_INCLUDES}) + endif() + add_test(${EXAMPLE_NAME} ${EXAMPLE_PATH}) set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example") endfunction() diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 3568887fa261f..f81a1b1577901 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -266,6 +266,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_SKYHOOK "Build the Skyhook libraries" OFF) + define_option(ARROW_TENSORFLOW "Build Arrow with TensorFlow support enabled" OFF) define_option(ARROW_TESTING "Build the Arrow testing libraries" OFF) diff --git a/cpp/cmake_modules/FindgRPCAlt.cmake b/cpp/cmake_modules/FindgRPCAlt.cmake index 18b23f32269b5..9bef477c13d0b 100644 --- a/cpp/cmake_modules/FindgRPCAlt.cmake +++ b/cpp/cmake_modules/FindgRPCAlt.cmake @@ -56,6 +56,20 @@ if(GRPCPP_PC_FOUND) list(APPEND gRPCAlt_FIND_PACKAGE_ARGS VERSION_VAR gRPCAlt_VERSION) endif() find_package_handle_standard_args(${gRPCAlt_FIND_PACKAGE_ARGS}) + + # gRPC does not expose the reflection library via pkg-config, but it should be alongside the main library + get_filename_component(GRPCPP_IMPORTED_DIRECTORY ${GRPCPP_IMPORTED_LOCATION} DIRECTORY) + if(ARROW_GRPC_USE_SHARED) + set(GRPCPP_REFLECTION_LIB_NAME + "${CMAKE_SHARED_LIBRARY_PREFIX}grpc++_reflection${CMAKE_SHARED_LIBRARY_SUFFIX}") + else() + set(GRPCPP_REFLECTION_LIB_NAME + 
"${CMAKE_STATIC_LIBRARY_PREFIX}grpc++_reflection${CMAKE_STATIC_LIBRARY_SUFFIX}") + endif() + find_library(GRPCPP_REFLECTION_IMPORTED_LOCATION + NAMES grpc++_reflection ${GRPCPP_REFLECTION_LIB_NAME} + PATHS ${GRPCPP_IMPORTED_DIRECTORY} + NO_DEFAULT_PATH) else() set(gRPCAlt_FOUND FALSE) endif() @@ -70,6 +84,12 @@ if(gRPCAlt_FOUND) INTERFACE_LINK_LIBRARIES "${GRPCPP_LINK_LIBRARIES}" INTERFACE_LINK_OPTIONS "${GRPCPP_LINK_OPTIONS}") + add_library(gRPC::grpc++_reflection UNKNOWN IMPORTED) + set_target_properties(gRPC::grpc++_reflection + PROPERTIES IMPORTED_LOCATION + "${GRPCPP_REFLECTION_IMPORTED_LOCATION}" + INTERFACE_LINK_LIBRARIES gRPC::grpc++) + add_executable(gRPC::grpc_cpp_plugin IMPORTED) set_target_properties(gRPC::grpc_cpp_plugin PROPERTIES IMPORTED_LOCATION ${GRPC_CPP_PLUGIN}) diff --git a/cpp/cmake_modules/Findlibrados.cmake b/cpp/cmake_modules/Findlibrados.cmake new file mode 100644 index 0000000000000..695d73fae1cb8 --- /dev/null +++ b/cpp/cmake_modules/Findlibrados.cmake @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +find_path(LIBRADOS_INCLUDE_DIR rados/librados.hpp) + +find_library(LIBRADOS_LIBRARY NAMES rados) + +mark_as_advanced(LIBRADOS_LIBRARY LIBRADOS_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(librados DEFAULT_MSG LIBRADOS_LIBRARY + LIBRADOS_INCLUDE_DIR) + +if(librados_FOUND) + add_library(librados::rados UNKNOWN IMPORTED) + set_target_properties(librados::rados + PROPERTIES IMPORTED_LOCATION "${LIBRADOS_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES + "${LIBRADOS_INCLUDE_DIR}") +endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index c1a1ba043664d..a04eba91d0e10 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -275,6 +275,7 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-conversion") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-declarations") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-sign-conversion") + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") if(WIN32) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall") @@ -305,6 +306,7 @@ elseif("${BUILD_WARNING_LEVEL}" STREQUAL "EVERYTHING") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wpedantic") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-parameter") + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") if(WIN32) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 673a58eedad8a..085e8d43f5c47 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -579,10 +579,20 @@ endif() if(DEFINED ENV{ARROW_SNAPPY_URL}) set(SNAPPY_SOURCE_URL "$ENV{ARROW_SNAPPY_URL}") else() - set_urls(SNAPPY_SOURCE_URL - "https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" - ) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS + "4.9") + # There is a bug in GCC < 4.9 with Snappy 1.1.9, so revert to 1.1.8 "SNAPPY_OLD" for those (ARROW-14661) + set_urls(SNAPPY_SOURCE_URL + "https://github.com/google/snappy/archive/${ARROW_SNAPPY_OLD_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/snappy-${ARROW_SNAPPY_OLD_BUILD_VERSION}.tar.gz" + ) + set(ARROW_SNAPPY_BUILD_SHA256_CHECKSUM ${ARROW_SNAPPY_OLD_BUILD_SHA256_CHECKSUM}) + else() + set_urls(SNAPPY_SOURCE_URL + "https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" + ) + endif() endif() if(DEFINED ENV{ARROW_THRIFT_URL}) @@ -904,7 +914,13 @@ if(ARROW_USE_UBSAN) set(ARROW_USE_NATIVE_INT128 FALSE) else() include(CheckCXXSymbolExists) - check_cxx_symbol_exists("__SIZEOF_INT128__" "" ARROW_USE_NATIVE_INT128) + check_cxx_symbol_exists("_M_ARM64" "" WIN32_ARM64_TARGET) + if(WIN32_ARM64_TARGET AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") + # NOTE: For clang/win-arm64, native int128_t produce linker errors + set(ARROW_USE_NATIVE_INT128 FALSE) + else() + check_cxx_symbol_exists("__SIZEOF_INT128__" "" ARROW_USE_NATIVE_INT128) + endif() endif() # - Gandiva has a 
compile-time (header-only) dependency on Boost, not runtime. @@ -965,7 +981,10 @@ macro(build_snappy) ) set(SNAPPY_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} -DCMAKE_INSTALL_LIBDIR=lib -DSNAPPY_BUILD_TESTS=OFF + ${EP_COMMON_CMAKE_ARGS} + -DCMAKE_INSTALL_LIBDIR=lib + -DSNAPPY_BUILD_TESTS=OFF + -DSNAPPY_BUILD_BENCHMARKS=OFF "-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") externalproject_add(snappy_ep @@ -1464,6 +1483,7 @@ macro(build_protobuf) add_dependencies(toolchain protobuf_ep) add_dependencies(arrow::protobuf::libprotobuf protobuf_ep) + add_dependencies(arrow::protobuf::protoc protobuf_ep) list(APPEND ARROW_BUNDLED_STATIC_LIBS arrow::protobuf::libprotobuf) endmacro() @@ -3300,6 +3320,9 @@ macro(build_grpc) set(GRPC_STATIC_LIBRARY_ADDRESS_SORTING "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}address_sorting${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + set(GRPC_STATIC_LIBRARY_GRPCPP_REFLECTION + "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}grpc++_reflection${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(GRPC_STATIC_LIBRARY_UPB "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}upb${CMAKE_STATIC_LIBRARY_SUFFIX}" ) @@ -3394,6 +3417,7 @@ macro(build_grpc) ${GRPC_STATIC_LIBRARY_GRPC} ${GRPC_STATIC_LIBRARY_GRPCPP} ${GRPC_STATIC_LIBRARY_ADDRESS_SORTING} + ${GRPC_STATIC_LIBRARY_GRPCPP_REFLECTION} ${GRPC_STATIC_LIBRARY_UPB} ${GRPC_CPP_PLUGIN} CMAKE_ARGS ${GRPC_CMAKE_ARGS} ${EP_LOG_OPTIONS} @@ -3427,6 +3451,12 @@ macro(build_grpc) "${GRPC_STATIC_LIBRARY_ADDRESS_SORTING}" INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") + add_library(gRPC::grpc++_reflection STATIC IMPORTED) + set_target_properties(gRPC::grpc++_reflection + PROPERTIES IMPORTED_LOCATION + "${GRPC_STATIC_LIBRARY_GRPCPP_REFLECTION}" + INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") + add_library(gRPC::grpc STATIC IMPORTED) set(GRPC_LINK_LIBRARIES gRPC::gpr @@ -3506,6 +3536,8 @@ if(ARROW_WITH_GRPC) if(GRPC_VENDORED) set(GRPCPP_PP_INCLUDE TRUE) + # Examples need to link to static Arrow if we're using static gRPC + set(ARROW_GRPC_USE_SHARED OFF) else() # grpc++ headers may reside in ${GRPC_INCLUDE_DIR}/grpc++ or ${GRPC_INCLUDE_DIR}/grpcpp # depending on the gRPC version. @@ -3937,11 +3969,19 @@ macro(build_awssdk) DEPENDS aws_checksums_ep) add_dependencies(AWS::aws-c-event-stream aws_c_event_stream_ep) + set(AWSSDK_PATCH_COMMAND) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER + "10") + # Workaround for https://github.com/aws/aws-sdk-cpp/issues/1750 + set(AWSSDK_PATCH_COMMAND "sed" "-i.bak" "-e" "s/\"-Werror\"//g" + "<SOURCE_DIR>/cmake/compiler_settings.cmake") + endif() externalproject_add(awssdk_ep ${EP_LOG_OPTIONS} URL ${AWSSDK_SOURCE_URL} URL_HASH "SHA256=${ARROW_AWSSDK_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${AWSSDK_CMAKE_ARGS} + PATCH_COMMAND ${AWSSDK_PATCH_COMMAND} BUILD_BYPRODUCTS ${AWS_CPP_SDK_COGNITO_IDENTITY_STATIC_LIBRARY} ${AWS_CPP_SDK_CORE_STATIC_LIBRARY} ${AWS_CPP_SDK_IDENTITY_MANAGEMENT_STATIC_LIBRARY} diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index ac758b92d81cd..e46cc7a6fe58a 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -15,30 +15,89 @@ # specific language governing permissions and limitations # under the License.
-ADD_ARROW_EXAMPLE(row_wise_conversion_example) +add_arrow_example(row_wise_conversion_example) -if (ARROW_COMPUTE) - ADD_ARROW_EXAMPLE(compute_register_example) +if(ARROW_COMPUTE) + add_arrow_example(compute_register_example) endif() -if (ARROW_COMPUTE AND ARROW_CSV) - ADD_ARROW_EXAMPLE(compute_and_write_csv_example) +if(ARROW_COMPUTE AND ARROW_CSV) + add_arrow_example(compute_and_write_csv_example) endif() -if (ARROW_PARQUET AND ARROW_DATASET) - if (ARROW_BUILD_SHARED) +if(ARROW_FLIGHT) + # Static gRPC means we cannot link to shared Arrow, since then + # we'll violate ODR for gRPC symbols + if(ARROW_GRPC_USE_SHARED) + set(FLIGHT_EXAMPLES_LINK_LIBS arrow_flight_shared) + # We don't directly use symbols from the reflection library, so + # ensure the linker still links to it + set(GRPC_REFLECTION_LINK_LIBS -Wl,--no-as-needed gRPC::grpc++_reflection + -Wl,--as-needed) + elseif(NOT ARROW_BUILD_STATIC) + message(FATAL_ERROR "Statically built gRPC requires ARROW_BUILD_STATIC=ON") + else() + set(FLIGHT_EXAMPLES_LINK_LIBS arrow_flight_static) + set(GRPC_REFLECTION_LINK_LIBS -Wl,--whole-archive gRPC::grpc++_reflection + -Wl,--no-whole-archive) + endif() + + set(FLIGHT_EXAMPLE_GENERATED_PROTO_FILES + "${CMAKE_CURRENT_BINARY_DIR}/helloworld.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/helloworld.pb.h" + "${CMAKE_CURRENT_BINARY_DIR}/helloworld.grpc.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/helloworld.grpc.pb.h") + set_source_files_properties(${FLIGHT_EXAMPLE_GENERATED_PROTO_FILES} PROPERTIES GENERATED + TRUE) + + set(FLIGHT_EXAMPLE_PROTO "helloworld.proto") + set(FLIGHT_EXAMPLE_PROTO_PATH "${CMAKE_CURRENT_LIST_DIR}") + set(FLIGHT_EXAMPLE_PROTO_DEPENDS ${FLIGHT_EXAMPLE_PROTO} gRPC::grpc_cpp_plugin) + + add_custom_command(OUTPUT ${FLIGHT_EXAMPLE_GENERATED_PROTO_FILES} + COMMAND ${ARROW_PROTOBUF_PROTOC} "-I${FLIGHT_EXAMPLE_PROTO_PATH}" + "--cpp_out=${CMAKE_CURRENT_BINARY_DIR}" + "${FLIGHT_EXAMPLE_PROTO}" + COMMAND ${ARROW_PROTOBUF_PROTOC} "-I${FLIGHT_EXAMPLE_PROTO_PATH}" + "--grpc_out=${CMAKE_CURRENT_BINARY_DIR}" + "--plugin=protoc-gen-grpc=$<TARGET_FILE:gRPC::grpc_cpp_plugin>" + "${FLIGHT_EXAMPLE_PROTO}" + DEPENDS ${FLIGHT_EXAMPLE_PROTO_DEPENDS}) + + add_custom_target(flight_grpc_example_gen ALL + DEPENDS ${FLIGHT_EXAMPLE_GENERATED_PROTO_FILES}) + + add_arrow_example(flight_grpc_example + DEPENDENCIES + flight_grpc_example_gen + # Not CMAKE_CURRENT_BINARY_DIR so we can #include + # "examples/arrow/helloworld.pb.h" instead of + # "helloworld.pb.h" (which fails lint) + EXTRA_INCLUDES + ${CMAKE_BINARY_DIR} + EXTRA_LINK_LIBS + ${FLIGHT_EXAMPLES_LINK_LIBS} + gRPC::grpc++ + ${GRPC_REFLECTION_LINK_LIBS} + ${ARROW_PROTOBUF_LIBPROTOBUF} + ${GFLAGS_LIBRARIES} + EXTRA_SOURCES + "${CMAKE_CURRENT_BINARY_DIR}/helloworld.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/helloworld.grpc.pb.cc") +endif() + +if(ARROW_PARQUET AND ARROW_DATASET) + if(ARROW_BUILD_SHARED) set(DATASET_EXAMPLES_LINK_LIBS arrow_dataset_shared) else() set(DATASET_EXAMPLES_LINK_LIBS arrow_dataset_static) endif() - ADD_ARROW_EXAMPLE(dataset_parquet_scan_example - EXTRA_LINK_LIBS - ${DATASET_EXAMPLES_LINK_LIBS}) + add_arrow_example(dataset_parquet_scan_example EXTRA_LINK_LIBS + ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(dataset_parquet_scan_example parquet) - ADD_ARROW_EXAMPLE(dataset_documentation_example - EXTRA_LINK_LIBS - ${DATASET_EXAMPLES_LINK_LIBS}) + add_arrow_example(dataset_documentation_example EXTRA_LINK_LIBS + ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(dataset_documentation_example parquet) endif() diff --git a/cpp/examples/arrow/flight_grpc_example.cc
b/cpp/examples/arrow/flight_grpc_example.cc new file mode 100644 index 0000000000000..db9cc177a5f74 --- /dev/null +++ b/cpp/examples/arrow/flight_grpc_example.cc @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <csignal> +#include <iostream> +#include <memory> + +#include <arrow/flight/server.h> +#include <arrow/flight/types.h> +#include <gflags/gflags.h> +#include <grpcpp/grpcpp.h> + +#include "examples/arrow/helloworld.grpc.pb.h" +#include "examples/arrow/helloworld.pb.h" + +// Demonstrate registering a gRPC service alongside a Flight service +// +// The gRPC service can be accessed with a gRPC client, on the same +// port as the Flight service. Additionally, the CMake config for this +// example links against the gRPC reflection library, enabling tools +// like grpc_cli and grpcurl to list and call RPCs on the server +// without needing local copies of the Protobuf definitions. +// For example, with grpcurl (https://github.com/fullstorydev/grpcurl): +// +// grpcurl -d '{"name": "Rakka"}' -plaintext localhost:31337 HelloWorldService/SayHello +// +// Note that for applications that wish to follow the example here, +// care must be taken to ensure that Protobuf and gRPC are not +// multiply linked, else the resulting program may crash or silently +// corrupt data. In particular: +// +// * If dynamically linking Arrow Flight, then your application and +// Arrow Flight must also dynamically link Protobuf and gRPC. (The +// same goes for static linking.) +// * The Flight packages on some platforms may make this difficult, +// because the Flight dynamic library will itself have statically +// linked Protobuf and gRPC since the platform does not ship a +// recent enough version of those dependencies. +// * The versions of Protobuf and gRPC must be the same between Flight +// and your application. +// +// See "Using Arrow C++ in your own project" in the documentation.
+ +DEFINE_int32(port, -1, "Server port to listen on"); + +namespace flight = ::arrow::flight; + +#define ABORT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + abort(); \ + } \ + } while (0); + +// Flight service +class SimpleFlightServer : public flight::FlightServerBase {}; + +// gRPC service +class HelloWorldServiceImpl : public HelloWorldService::Service { + grpc::Status SayHello(grpc::ServerContext* ctx, const HelloRequest* request, + HelloResponse* reply) override { + const std::string& name = request->name(); + if (name.empty()) { + return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Must provide a name!"); + } + reply->set_reply("Hello, " + name); + return grpc::Status::OK; + } +}; + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_port < 0) { + // For CI + std::cout << "Must specify a port with -port" << std::endl; + return EXIT_SUCCESS; + } + + std::unique_ptr<flight::FlightServerBase> server; + server.reset(new SimpleFlightServer()); + + flight::Location bind_location; + ABORT_ON_FAILURE(flight::Location::ForGrpcTcp("0.0.0.0", FLAGS_port, &bind_location)); + flight::FlightServerOptions options(bind_location); + + HelloWorldServiceImpl grpc_service; + int extra_port = 0; + + options.builder_hook = [&](void* raw_builder) { + auto* builder = reinterpret_cast<grpc::ServerBuilder*>(raw_builder); + builder->AddListeningPort("0.0.0.0:0", grpc::InsecureServerCredentials(), + &extra_port); + builder->RegisterService(&grpc_service); + }; + ABORT_ON_FAILURE(server->Init(options)); + std::cout << "Listening on ports " << FLAGS_port << " and " << extra_port << std::endl; + ABORT_ON_FAILURE(server->SetShutdownOnSignals({SIGTERM})); + ABORT_ON_FAILURE(server->Serve()); + return EXIT_SUCCESS; +} diff --git a/cpp/examples/arrow/helloworld.proto b/cpp/examples/arrow/helloworld.proto new file mode 100644 index 0000000000000..599f88b185a8e --- /dev/null +++ b/cpp/examples/arrow/helloworld.proto @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +syntax = "proto3"; + +service HelloWorldService { + rpc SayHello(HelloRequest) returns (HelloResponse); +} + +message HelloRequest { + string name = 1; +} + +message HelloResponse { + string reply = 1; +} \ No newline at end of file diff --git a/cpp/examples/minimal_build/CMakeLists.txt b/cpp/examples/minimal_build/CMakeLists.txt index 9fc20c70fe0d2..d3eea7ba0f08c 100644 --- a/cpp/examples/minimal_build/CMakeLists.txt +++ b/cpp/examples/minimal_build/CMakeLists.txt @@ -31,7 +31,7 @@ message(STATUS "Arrow SO version: ${ARROW_FULL_SO_VERSION}") add_executable(arrow_example example.cc) -if (ARROW_LINK_SHARED) +if(ARROW_LINK_SHARED) target_link_libraries(arrow_example PRIVATE arrow_shared) else() set(THREADS_PREFER_PTHREAD_FLAG ON) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 2d16948ae4ab8..e0d7432688a55 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -23,33 +23,33 @@ target_include_directories(parquet_low_level_example PRIVATE low_level_api/) target_include_directories(parquet_low_level_example2 PRIVATE low_level_api/) # The variables in these files are for illustration purposes -set(PARQUET_EXAMPLES_WARNING_SUPPRESSIONS - low_level_api/reader_writer.cc - low_level_api/reader_writer2.cc) +set(PARQUET_EXAMPLES_WARNING_SUPPRESSIONS low_level_api/reader_writer.cc + low_level_api/reader_writer2.cc) -if (PARQUET_REQUIRE_ENCRYPTION) +if(PARQUET_REQUIRE_ENCRYPTION) add_executable(parquet_encryption_example low_level_api/encryption_reader_writer.cc) - add_executable(parquet_encryption_example_all_crypto_options low_level_api/encryption_reader_writer_all_crypto_options.cc) + add_executable(parquet_encryption_example_all_crypto_options + low_level_api/encryption_reader_writer_all_crypto_options.cc) target_include_directories(parquet_encryption_example PRIVATE low_level_api/) - target_include_directories(parquet_encryption_example_all_crypto_options PRIVATE low_level_api/) + target_include_directories(parquet_encryption_example_all_crypto_options + PRIVATE low_level_api/) set(PARQUET_EXAMPLES_WARNING_SUPPRESSIONS - ${PARQUET_EXAMPLES_WARNING_SUPPRESSIONS} - low_level_api/encryption_reader_writer.cc - low_level_api/encryption_reader_writer_all_crypto_options.cc) + ${PARQUET_EXAMPLES_WARNING_SUPPRESSIONS} low_level_api/encryption_reader_writer.cc + low_level_api/encryption_reader_writer_all_crypto_options.cc) endif() if(UNIX) foreach(FILE ${PARQUET_EXAMPLES_WARNING_SUPPRESSIONS}) set_property(SOURCE ${FILE} - APPEND_STRING - PROPERTY COMPILE_FLAGS "-Wno-unused-variable") + APPEND_STRING + PROPERTY COMPILE_FLAGS "-Wno-unused-variable") endforeach() endif() # Prefer shared linkage but use static if shared build is deactivated -if (ARROW_BUILD_SHARED) +if(ARROW_BUILD_SHARED) set(PARQUET_EXAMPLE_LINK_LIBS parquet_shared) else() set(PARQUET_EXAMPLE_LINK_LIBS parquet_static) @@ -62,17 +62,17 @@ target_link_libraries(parquet_stream_api_example ${PARQUET_EXAMPLE_LINK_LIBS}) if(PARQUET_REQUIRE_ENCRYPTION) target_link_libraries(parquet_encryption_example ${PARQUET_EXAMPLE_LINK_LIBS}) - target_link_libraries(parquet_encryption_example_all_crypto_options ${PARQUET_EXAMPLE_LINK_LIBS}) + target_link_libraries(parquet_encryption_example_all_crypto_options + ${PARQUET_EXAMPLE_LINK_LIBS}) endif() add_dependencies(parquet - parquet_low_level_example - parquet_low_level_example2 - parquet_arrow_example - parquet_stream_api_example) + parquet_low_level_example + parquet_low_level_example2 + parquet_arrow_example + 
parquet_stream_api_example) -if (PARQUET_REQUIRE_ENCRYPTION) - add_dependencies(parquet - parquet_encryption_example - parquet_encryption_example_all_crypto_options) +if(PARQUET_REQUIRE_ENCRYPTION) + add_dependencies(parquet parquet_encryption_example + parquet_encryption_example_all_crypto_options) endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d7e433f48440b..8d9cbb32300a6 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -199,6 +199,7 @@ set(ARROW_SRCS util/bitmap_builders.cc util/bitmap_ops.cc util/bpacking.cc + util/byte_size.cc util/cancel.cc util/compression.cc util/counting_semaphore.cc @@ -376,10 +377,21 @@ if(ARROW_COMPUTE) compute/exec/exec_plan.cc compute/exec/expression.cc compute/exec/filter_node.cc + compute/exec/hash_join.cc + compute/exec/hash_join_dict.cc + compute/exec/hash_join_node.cc + compute/exec/ir_consumer.cc + compute/exec/key_compare.cc + compute/exec/key_encode.cc + compute/exec/key_hash.cc + compute/exec/key_map.cc + compute/exec/order_by_impl.cc compute/exec/project_node.cc - compute/exec/source_node.cc compute/exec/sink_node.cc - compute/exec/order_by_impl.cc + compute/exec/source_node.cc + compute/exec/task_util.cc + compute/exec/union_node.cc + compute/exec/util.cc compute/function.cc compute/function_internal.cc compute/kernel.cc @@ -391,6 +403,7 @@ if(ARROW_COMPUTE) compute/kernels/aggregate_var_std.cc compute/kernels/codegen_internal.cc compute/kernels/hash_aggregate.cc + compute/kernels/row_encoder.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_cast_boolean.cc @@ -401,38 +414,28 @@ if(ARROW_COMPUTE) compute/kernels/scalar_cast_string.cc compute/kernels/scalar_cast_temporal.cc compute/kernels/scalar_compare.cc + compute/kernels/scalar_if_else.cc compute/kernels/scalar_nested.cc compute/kernels/scalar_set_lookup.cc compute/kernels/scalar_string.cc compute/kernels/scalar_temporal_binary.cc compute/kernels/scalar_temporal_unary.cc compute/kernels/scalar_validity.cc - compute/kernels/scalar_if_else.cc compute/kernels/util_internal.cc compute/kernels/vector_array_sort.cc compute/kernels/vector_hash.cc compute/kernels/vector_nested.cc compute/kernels/vector_replace.cc compute/kernels/vector_selection.cc - compute/kernels/vector_sort.cc - compute/kernels/row_encoder.cc - compute/exec/union_node.cc - compute/exec/key_hash.cc - compute/exec/key_map.cc - compute/exec/key_compare.cc - compute/exec/key_encode.cc - compute/exec/util.cc - compute/exec/hash_join.cc - compute/exec/hash_join_node.cc - compute/exec/task_util.cc) + compute/kernels/vector_sort.cc) append_avx2_src(compute/kernels/aggregate_basic_avx2.cc) append_avx512_src(compute/kernels/aggregate_basic_avx512.cc) - append_avx2_src(compute/exec/key_hash_avx2.cc) - append_avx2_src(compute/exec/key_map_avx2.cc) append_avx2_src(compute/exec/key_compare_avx2.cc) append_avx2_src(compute/exec/key_encode_avx2.cc) + append_avx2_src(compute/exec/key_hash_avx2.cc) + append_avx2_src(compute/exec/key_map_avx2.cc) append_avx2_src(compute/exec/util_avx2.cc) list(APPEND ARROW_TESTING_SRCS compute/exec/test_util.cc) diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index dd3cec1d7e9e4..136e6cadb956b 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -305,9 +305,6 @@ Status Array::Accept(ArrayVisitor* visitor) const { Status Array::Validate() const { return internal::ValidateArray(*this); } -Status Array::ValidateFull() const { - 
RETURN_NOT_OK(internal::ValidateArray(*this)); - return internal::ValidateArrayFull(*this); -} +Status Array::ValidateFull() const { return internal::ValidateArrayFull(*this); } } // namespace arrow diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 6892e5f0a91f7..7840c60f8974d 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -324,7 +324,7 @@ class TestStringArray : public ::testing::Test { std::shared_ptr strings_; }; -TYPED_TEST_SUITE(TestStringArray, BinaryArrowTypes); +TYPED_TEST_SUITE(TestStringArray, BaseBinaryArrowTypes); TYPED_TEST(TestStringArray, TestArrayBasics) { this->TestArrayBasics(); } @@ -661,7 +661,7 @@ class TestStringBuilder : public TestBuilder { std::shared_ptr result_; }; -TYPED_TEST_SUITE(TestStringBuilder, BinaryArrowTypes); +TYPED_TEST_SUITE(TestStringBuilder, BaseBinaryArrowTypes); TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->TestScalarAppend(); } @@ -863,7 +863,7 @@ struct BinaryAppender { }; template <typename T> -class TestBinaryDataVisitor : public ::testing::Test { +class TestBaseBinaryDataVisitor : public ::testing::Test { public: using TypeClass = T; @@ -891,10 +891,10 @@ class TestBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBinaryDataVisitor, BinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); -TYPED_TEST(TestBinaryDataVisitor, Basics) { this->TestBasics(); } +TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } -TYPED_TEST(TestBinaryDataVisitor, Sliced) { this->TestSliced(); } +TYPED_TEST(TestBaseBinaryDataVisitor, Sliced) { this->TestSliced(); } } // namespace arrow diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index a503cbd516c66..34887ad26fc34 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -25,6 +25,8 @@ #include "arrow/array.h" #include "arrow/array/builder_nested.h" +#include "arrow/array/util.h" +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/status.h" #include "arrow/testing/gtest_common.h" @@ -621,6 +623,7 @@ TEST_F(TestMapArray, Equality) { } ASSERT_OK(ib.AppendValues(equal_values.data(), equal_values.size())); ASSERT_OK(builder_->Finish(out)); + ASSERT_OK((*out)->ValidateFull()); } // now an unequal one @@ -630,6 +633,7 @@ TEST_F(TestMapArray, Equality) { } ASSERT_OK(ib.AppendValues(unequal_values.data(), unequal_values.size())); ASSERT_OK(builder_->Finish(&unequal_array)); + ASSERT_OK(unequal_array->ValidateFull()); // Test array equality EXPECT_TRUE(array->Equals(array)); @@ -713,6 +717,57 @@ TEST_F(TestMapArray, BuildingStringToInt) { ASSERT_ARRAYS_EQUAL(*actual, expected); } +TEST_F(TestMapArray, ValidateErrorNullStruct) { + ASSERT_OK_AND_ASSIGN( + auto values, + MakeArrayOfNull(struct_({field("key", utf8()), field("value", int32())}), 1)); + + Int32Builder offset_builder; + ASSERT_OK(offset_builder.AppendNull()); + ASSERT_OK(offset_builder.Append(0)); + ASSERT_OK_AND_ASSIGN(auto offsets, offset_builder.Finish()); + + ASSERT_OK_AND_ASSIGN(auto lists, ListArray::FromArrays(*offsets, *values)); + ASSERT_OK(lists->ValidateFull()); + ASSERT_EQ(lists->length(), 1); + ASSERT_EQ(lists->null_count(), 1); + + // Make a Map ArrayData from the list array + // Note we can't construct a MapArray as that would crash with an assertion.
+ auto map_data = lists->data()->Copy(); + map_data->type = map(utf8(), int32()); + ASSERT_RAISES(Invalid, internal::ValidateArray(*map_data)); +} + +TEST_F(TestMapArray, ValidateErrorNullKey) { + StringBuilder key_builder; + ASSERT_OK(key_builder.AppendNull()); + ASSERT_OK_AND_ASSIGN(auto keys, key_builder.Finish()); + + Int32Builder item_builder; + ASSERT_OK(item_builder.Append(42)); + ASSERT_OK_AND_ASSIGN(auto items, item_builder.Finish()); + + ASSERT_OK_AND_ASSIGN( + auto values, + StructArray::Make({keys, items}, std::vector<std::string>{"key", "value"})); + + Int32Builder offset_builder; + ASSERT_OK(offset_builder.Append(0)); + ASSERT_OK(offset_builder.Append(1)); + ASSERT_OK_AND_ASSIGN(auto offsets, offset_builder.Finish()); + + // The list array contains: [[null, 42]] + ASSERT_OK_AND_ASSIGN(auto lists, ListArray::FromArrays(*offsets, *values)); + ASSERT_OK(lists->ValidateFull()); + + // Make a Map ArrayData from the list array + // Note we can't construct a MapArray as that would crash with an assertion. + auto map_data = lists->data()->Copy(); + map_data->type = map(keys->type(), items->type()); + ASSERT_RAISES(Invalid, internal::ValidateArray(*map_data)); +} + TEST_F(TestMapArray, FromArrays) { std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, keys, items; diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 22ad728a4ecdf..a3c1fab054e94 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -35,6 +35,7 @@ #include "arrow/type_traits.h" #include "arrow/util/atomic_shared_ptr.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -541,56 +542,63 @@ std::shared_ptr<Array> StructArray::GetFieldByName(const std::string& name) cons Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const { ArrayVector flattened; - flattened.reserve(data_->child_data.size()); + flattened.resize(data_->child_data.size()); std::shared_ptr<Buffer> null_bitmap = data_->buffers[0]; - for (const auto& child_data_ptr : data_->child_data) { - auto child_data = child_data_ptr->Copy(); + for (int i = 0; static_cast<size_t>(i) < data_->child_data.size(); i++) { + ARROW_ASSIGN_OR_RAISE(flattened[i], GetFlattenedField(i, pool)); + } - std::shared_ptr<Buffer> flattened_null_bitmap; - int64_t flattened_null_count = kUnknownNullCount; + return flattened; +} - // Need to adjust for parent offset - if (data_->offset != 0 || data_->length != child_data->length) { - child_data = child_data->Slice(data_->offset, data_->length); - } - std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0]; - const int64_t child_offset = child_data->offset; - - // The validity of a flattened datum is the logical AND of the struct - // element's validity and the individual field element's validity.
- if (null_bitmap && child_null_bitmap) { - ARROW_ASSIGN_OR_RAISE( - flattened_null_bitmap, - BitmapAnd(pool, child_null_bitmap->data(), child_offset, null_bitmap_data_, - data_->offset, data_->length, child_offset)); - } else if (child_null_bitmap) { - flattened_null_bitmap = child_null_bitmap; - flattened_null_count = child_data->null_count; - } else if (null_bitmap) { - if (child_offset == data_->offset) { - flattened_null_bitmap = null_bitmap; - } else { - // If the child has an offset, need to synthesize a validity - // buffer with an offset too - ARROW_ASSIGN_OR_RAISE(flattened_null_bitmap, - AllocateEmptyBitmap(child_offset + data_->length, pool)); - CopyBitmap(null_bitmap_data_, data_->offset, data_->length, - flattened_null_bitmap->mutable_data(), child_offset); - } - flattened_null_count = data_->null_count; - } else { - flattened_null_count = 0; - } +Result<std::shared_ptr<Array>> StructArray::GetFlattenedField(int index, + MemoryPool* pool) const { + std::shared_ptr<Buffer> null_bitmap = data_->buffers[0]; - auto flattened_data = child_data->Copy(); - flattened_data->buffers[0] = flattened_null_bitmap; - flattened_data->null_count = flattened_null_count; + auto child_data = data_->child_data[index]->Copy(); - flattened.push_back(MakeArray(flattened_data)); + std::shared_ptr<Buffer> flattened_null_bitmap; + int64_t flattened_null_count = kUnknownNullCount; + + // Need to adjust for parent offset + if (data_->offset != 0 || data_->length != child_data->length) { + child_data = child_data->Slice(data_->offset, data_->length); } + std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0]; + const int64_t child_offset = child_data->offset; - return flattened; + // The validity of a flattened datum is the logical AND of the struct + // element's validity and the individual field element's validity. + if (null_bitmap && child_null_bitmap) { + ARROW_ASSIGN_OR_RAISE( + flattened_null_bitmap, + BitmapAnd(pool, child_null_bitmap->data(), child_offset, null_bitmap_data_, + data_->offset, data_->length, child_offset)); + } else if (child_null_bitmap) { + flattened_null_bitmap = child_null_bitmap; + flattened_null_count = child_data->null_count; + } else if (null_bitmap) { + if (child_offset == data_->offset) { + flattened_null_bitmap = null_bitmap; + } else { + // If the child has an offset, need to synthesize a validity + // buffer with an offset too + ARROW_ASSIGN_OR_RAISE(flattened_null_bitmap, + AllocateEmptyBitmap(child_offset + data_->length, pool)); + CopyBitmap(null_bitmap_data_, data_->offset, data_->length, + flattened_null_bitmap->mutable_data(), child_offset); + } + flattened_null_count = data_->null_count; + } else { + flattened_null_count = 0; + } + + auto flattened_data = child_data->Copy(); + flattened_data->buffers[0] = flattened_null_bitmap; + flattened_data->null_count = flattened_null_count; + + return MakeArray(flattened_data); } // ---------------------------------------------------------------------- @@ -643,6 +651,43 @@ SparseUnionArray::SparseUnionArray(std::shared_ptr<DataType> type, int64_t lengt SetData(std::move(internal_data)); } +Result<std::shared_ptr<Array>> SparseUnionArray::GetFlattenedField( + int index, MemoryPool* pool) const { + if (index < 0 || index >= num_fields()) { + return Status::Invalid("Index out of range: ", index); + } + auto child_data = data_->child_data[index]->Copy(); + // Adjust the result offset/length to be absolute.
+ if (data_->offset != 0 || data_->length != child_data->length) { + child_data = child_data->Slice(data_->offset, data_->length); + } + std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0]; + const int64_t child_offset = child_data->offset; + + // Synthesize a null bitmap based on the union discriminant. + // Make sure the bitmap has extra bits corresponding to the child offset. + ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> flattened_null_bitmap, + AllocateEmptyBitmap(child_data->length + child_offset, pool)); + const int8_t type_code = union_type()->type_codes()[index]; + const int8_t* type_codes = raw_type_codes(); + int64_t offset = 0; + internal::GenerateBitsUnrolled(flattened_null_bitmap->mutable_data(), child_offset, + data_->length, + [&] { return type_codes[offset++] == type_code; }); + + // The validity of a flattened datum is the logical AND of the synthesized + // null bitmap buffer and the individual field element's validity. + if (child_null_bitmap) { + BitmapAnd(flattened_null_bitmap->data(), child_offset, child_null_bitmap->data(), + child_offset, child_data->length, child_offset, + flattened_null_bitmap->mutable_data()); + } + + child_data->buffers[0] = std::move(flattened_null_bitmap); + child_data->null_count = kUnknownNullCount; + return MakeArray(child_data); +} + DenseUnionArray::DenseUnionArray(const std::shared_ptr<ArrayData>& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 762ba24f27980..b89680b79c043 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -284,9 +284,9 @@ class ARROW_EXPORT FixedSizeListArray : public Array { std::shared_ptr<DataType> value_type() const; // The following functions will not perform boundschecking - int32_t value_offset(int64_t i) const { + int64_t value_offset(int64_t i) const { i += data_->offset; - return static_cast<int32_t>(list_size_ * i); + return list_size_ * i; } int32_t value_length(int64_t i = 0) const { ARROW_UNUSED(i); @@ -370,6 +370,14 @@ class ARROW_EXPORT StructArray : public Array { /// \param[in] pool The pool to allocate null bitmaps from, if necessary Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const; + /// \brief Get one of the child arrays, combining its null bitmap + /// with the parent struct array's bitmap. + /// + /// \param[in] index Which child array to get + /// \param[in] pool The pool to allocate null bitmaps from, if necessary + Result<std::shared_ptr<Array>> GetFlattenedField( + int index, MemoryPool* pool = default_memory_pool()) const; + private: // For caching boxed child data // XXX This is not handled in a thread-safe manner. @@ -456,6 +464,14 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { return internal::checked_cast<const SparseUnionType*>(union_type_); } + /// \brief Get one of the child arrays, adjusting its null bitmap + /// where the union array type code does not match. + /// + /// \param[in] index Which child array to get (i.e.
the physical index, not the type + // code) \param[in] pool The pool to allocate null bitmaps from, if necessary + Result<std::shared_ptr<Array>> GetFlattenedField( + int index, MemoryPool* pool = default_memory_pool()) const; + protected: void SetData(std::shared_ptr<ArrayData> data); }; diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 62ee032db706f..efe600f1223bb 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -117,7 +117,8 @@ TEST_F(TestArray, TestNullToString) { auto data = std::make_shared<Buffer>(nullptr, 400); std::unique_ptr<Array> arr(new Int32Array(100, data)); - ASSERT_EQ(arr->ToString(), ""); + ASSERT_EQ(arr->ToString(), + ""); } TEST_F(TestArray, TestSliceSafe) { @@ -332,6 +333,10 @@ TEST_F(TestArray, BuildLargeInMemoryArray) { } TEST_F(TestArray, TestMakeArrayOfNull) { + FieldVector union_fields1({field("a", utf8()), field("b", int32())}); + FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))}); + std::vector<int8_t> union_type_codes{7, 42}; + std::shared_ptr<DataType> types[] = { // clang-format off null(), @@ -354,19 +359,33 @@ TEST_F(TestArray, TestMakeArrayOfNull) { fixed_size_list(int64(), 4), dictionary(int32(), utf8()), struct_({field("a", utf8()), field("b", int32())}), + sparse_union(union_fields1, union_type_codes), + sparse_union(union_fields2, union_type_codes), + dense_union(union_fields1, union_type_codes), + dense_union(union_fields2, union_type_codes), smallint(), // extension type // clang-format on }; for (int64_t length : {0, 1, 16, 133}) { for (auto type : types) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(type, length)); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); - ASSERT_EQ(array->null_count(), length); - for (int64_t i = 0; i < length; ++i) { - ASSERT_TRUE(array->IsNull(i)); - ASSERT_FALSE(array->IsValid(i)); + if (is_union(type->id())) { + // For unions, MakeArrayOfNull places the nulls in the children + ASSERT_EQ(array->null_count(), 0); + const auto& union_array = checked_cast<const UnionArray&>(*array); + for (int i = 0; i < union_array.num_fields(); ++i) { + ASSERT_EQ(union_array.field(i)->null_count(), union_array.field(i)->length()); + } + } else { + ASSERT_EQ(array->null_count(), length); + for (int64_t i = 0; i < length; ++i) { + ASSERT_TRUE(array->IsNull(i)); + ASSERT_FALSE(array->IsValid(i)); + } } } } @@ -2668,8 +2687,9 @@ class DecimalTest : public ::testing::TestWithParam<int32_t> { } template - void TestCreate(int32_t precision, const DecimalVector& draw, - const std::vector<uint8_t>& valid_bytes, int64_t offset) const { + std::shared_ptr<Array> TestCreate(int32_t precision, const DecimalVector& draw, + const std::vector<uint8_t>& valid_bytes, + int64_t offset) const { auto type = std::make_shared(precision, 4); auto builder = std::make_shared(type); @@ -2677,20 +2697,20 @@ class DecimalTest : public ::testing::TestWithParam<int32_t> { const size_t size = draw.size(); - ASSERT_OK(builder->Reserve(size)); + ARROW_EXPECT_OK(builder->Reserve(size)); for (size_t i = 0; i < size; ++i) { if (valid_bytes[i]) { - ASSERT_OK(builder->Append(draw[i])); + ARROW_EXPECT_OK(builder->Append(draw[i])); } else { - ASSERT_OK(builder->AppendNull()); + ARROW_EXPECT_OK(builder->AppendNull()); ++null_count; } } std::shared_ptr<Array> out; FinishAndCheckPadding(builder.get(), &out); - ASSERT_EQ(builder->length(), 0); + EXPECT_EQ(builder->length(), 0); std::vector<uint8_t> raw_bytes; @@ -2699,7 +2719,7 @@ auto expected_data = std::make_shared<Buffer>(raw_bytes.data(),
BYTE_WIDTH); std::shared_ptr<Buffer> expected_null_bitmap; - ASSERT_OK_AND_ASSIGN(expected_null_bitmap, internal::BytesToBits(valid_bytes)); + EXPECT_OK_AND_ASSIGN(expected_null_bitmap, internal::BytesToBits(valid_bytes)); int64_t expected_null_count = CountNulls(valid_bytes); auto expected = std::make_shared( @@ -2708,6 +2728,8 @@ class DecimalTest : public ::testing::TestWithParam<int32_t> { std::shared_ptr<Array> lhs = out->Slice(offset); std::shared_ptr<Array> rhs = expected->Slice(offset); ASSERT_ARRAYS_EQUAL(*rhs, *lhs); + + return out; } }; @@ -2741,6 +2763,21 @@ TEST_P(Decimal128Test, WithNulls) { this->TestCreate(precision, draw, valid_bytes, 2); } +TEST_P(Decimal128Test, ValidateFull) { + int32_t precision = GetParam(); + std::vector<Decimal128> draw; + Decimal128 val = Decimal128::GetMaxValue(precision) + 1; + + draw = {Decimal128(), val}; + auto arr = this->TestCreate(precision, draw, {true, false}, 0); + ASSERT_OK(arr->ValidateFull()); + + draw = {val, Decimal128()}; + arr = this->TestCreate(precision, draw, {true, false}, 0); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("does not fit in precision of"), arr->ValidateFull()); +} + INSTANTIATE_TEST_SUITE_P(Decimal128Test, Decimal128Test, ::testing::Range(1, 38)); using Decimal256Test = DecimalTest<Decimal256Type>; @@ -2777,6 +2814,21 @@ TEST_P(Decimal256Test, WithNulls) { this->TestCreate(precision, draw, valid_bytes, 2); } +TEST_P(Decimal256Test, ValidateFull) { + int32_t precision = GetParam(); + std::vector<Decimal256> draw; + Decimal256 val = Decimal256::GetMaxValue(precision) + 1; + + draw = {Decimal256(), val}; + auto arr = this->TestCreate(precision, draw, {true, false}, 0); + ASSERT_OK(arr->ValidateFull()); + + draw = {val, Decimal256()}; + arr = this->TestCreate(precision, draw, {true, false}, 0); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("does not fit in precision of"), arr->ValidateFull()); +} + INSTANTIATE_TEST_SUITE_P(Decimal256Test, Decimal256Test, ::testing::Values(1, 2, 5, 10, 38, 39, 40, 75, 76)); @@ -2904,16 +2956,24 @@ TEST(TestSwapEndianArrayData, PrimitiveType) { expected_data = ArrayData::Make(uint64(), 1, {null_buffer, data_int64_buffer}, 0); AssertArrayDataEqualsWithSwapEndian(data, expected_data); - auto data_16byte_buffer = Buffer::FromString("0123456789abcdef"); + auto data_16byte_buffer = Buffer::FromString( + "\x01" + "123456789abcde\x01"); data = ArrayData::Make(decimal128(38, 10), 1, {null_buffer, data_16byte_buffer}); - auto data_decimal128_buffer = Buffer::FromString("fedcba9876543210"); + auto data_decimal128_buffer = Buffer::FromString( + "\x01" + "edcba987654321\x01"); expected_data = ArrayData::Make(decimal128(38, 10), 1, {null_buffer, data_decimal128_buffer}, 0); AssertArrayDataEqualsWithSwapEndian(data, expected_data); - auto data_32byte_buffer = Buffer::FromString("0123456789abcdef123456789ABCDEF0"); + auto data_32byte_buffer = Buffer::FromString( + "\x01" + "123456789abcdef123456789ABCDEF\x01"); data = ArrayData::Make(decimal256(76, 20), 1, {null_buffer, data_32byte_buffer}); - auto data_decimal256_buffer = Buffer::FromString("0FEDCBA987654321fedcba9876543210"); + auto data_decimal256_buffer = Buffer::FromString( + "\x01" + "FEDCBA987654321fedcba987654321\x01"); expected_data = ArrayData::Make(decimal256(76, 20), 1, {null_buffer, data_decimal256_buffer}, 0); AssertArrayDataEqualsWithSwapEndian(data, expected_data); diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index d3afe40df8ddc..3bd87a3438f4c 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++
b/cpp/src/arrow/array/array_union_test.cc @@ -32,6 +32,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; TEST(TestUnionArray, TestSliceEquals) { std::shared_ptr<RecordBatch> batch; @@ -68,6 +69,70 @@ TEST(TestUnionArray, TestSliceEquals) { CheckUnion(batch->column(1)); } +TEST(TestSparseUnionArray, GetFlattenedField) { + auto ty = sparse_union({field("ints", int64()), field("strs", utf8())}, {2, 7}); + auto ints = ArrayFromJSON(int64(), "[0, 1, 2, 3]"); + auto strs = ArrayFromJSON(utf8(), R"(["a", null, "c", "d"])"); + auto ids = ArrayFromJSON(int8(), "[2, 7, 2, 7]")->data()->buffers[1]; + const int length = 4; + + { + SparseUnionArray arr(ty, length, {ints, strs}, ids); + ASSERT_OK(arr.ValidateFull()); + + ASSERT_OK_AND_ASSIGN(auto flattened, arr.GetFlattenedField(0)); + AssertArraysEqual(*ArrayFromJSON(int64(), "[0, null, 2, null]"), *flattened, + /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN(flattened, arr.GetFlattenedField(1)); + AssertArraysEqual(*ArrayFromJSON(utf8(), R"([null, null, null, "d"])"), *flattened, + /*verbose=*/true); + + const auto sliced = checked_pointer_cast<SparseUnionArray>(arr.Slice(1, 2)); + + ASSERT_OK_AND_ASSIGN(flattened, sliced->GetFlattenedField(0)); + AssertArraysEqual(*ArrayFromJSON(int64(), "[null, 2]"), *flattened, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN(flattened, sliced->GetFlattenedField(1)); + AssertArraysEqual(*ArrayFromJSON(utf8(), R"([null, null])"), *flattened, + /*verbose=*/true); + + ASSERT_RAISES(Invalid, arr.GetFlattenedField(-1)); + ASSERT_RAISES(Invalid, arr.GetFlattenedField(2)); + } + { + SparseUnionArray arr(ty, length - 2, {ints->Slice(1, 2), strs->Slice(1, 2)}, ids); + ASSERT_OK(arr.ValidateFull()); + + ASSERT_OK_AND_ASSIGN(auto flattened, arr.GetFlattenedField(0)); + AssertArraysEqual(*ArrayFromJSON(int64(), "[1, null]"), *flattened, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN(flattened, arr.GetFlattenedField(1)); + AssertArraysEqual(*ArrayFromJSON(utf8(), R"([null, "c"])"), *flattened, + /*verbose=*/true); + + const auto sliced = checked_pointer_cast<SparseUnionArray>(arr.Slice(1, 1)); + + ASSERT_OK_AND_ASSIGN(flattened, sliced->GetFlattenedField(0)); + AssertArraysEqual(*ArrayFromJSON(int64(), "[null]"), *flattened, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN(flattened, sliced->GetFlattenedField(1)); + AssertArraysEqual(*ArrayFromJSON(utf8(), R"(["c"])"), *flattened, /*verbose=*/true); + } + { + SparseUnionArray arr(ty, /*length=*/0, {ints->Slice(length), strs->Slice(length)}, + ids); + ASSERT_OK(arr.ValidateFull()); + + ASSERT_OK_AND_ASSIGN(auto flattened, arr.GetFlattenedField(0)); + AssertArraysEqual(*ArrayFromJSON(int64(), "[]"), *flattened, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN(flattened, arr.GetFlattenedField(1)); + AssertArraysEqual(*ArrayFromJSON(utf8(), "[]"), *flattened, + /*verbose=*/true); + } +} + TEST(TestSparseUnionArray, Validate) { auto a = ArrayFromJSON(int32(), "[4, 5]"); auto type = sparse_union({field("a", int32())}); diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index d639830f469e0..2045b8f5c71bd 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -363,19 +363,28 @@ class NullArrayFactory { return Status::OK(); } - Status Visit(const UnionType& type) { + Status Visit(const SparseUnionType& type) { // type codes RETURN_NOT_OK(MaxOf(length_)); - if (type.mode() == UnionMode::DENSE) { - // offsets - RETURN_NOT_OK(MaxOf(sizeof(int32_t) * length_)); - } + // will create children of the same length as the union for (const auto& child : type.fields()) {
RETURN_NOT_OK(MaxOf(GetBufferLength(child->type(), length_))); } return Status::OK(); } + Status Visit(const DenseUnionType& type) { + // type codes + RETURN_NOT_OK(MaxOf(length_)); + // offsets + RETURN_NOT_OK(MaxOf(sizeof(int32_t) * length_)); + // will create children of length 1 + for (const auto& child : type.fields()) { + RETURN_NOT_OK(MaxOf(GetBufferLength(child->type(), 1))); + } + return Status::OK(); + } + Status Visit(const DictionaryType& type) { RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); return MaxOf(GetBufferLength(type.index_type(), length_)); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index c66c4f53b9dda..52fcad5e7eb2f 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -35,17 +35,101 @@ namespace arrow { namespace internal { -/////////////////////////////////////////////////////////////////////////// -// ValidateArray: cheap validation checks - namespace { +struct UTF8DataValidator { + const ArrayData& data; + + Status Visit(const DataType&) { + // Default, should be unreachable + return Status::NotImplemented(""); + } + + template + enable_if_string Visit(const StringType&) { + util::InitializeUTF8(); + + int64_t i = 0; + return VisitArrayDataInline( + data, + [&](util::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } +}; + +struct BoundsChecker { + const ArrayData& data; + int64_t min_value; + int64_t max_value; + + Status Visit(const DataType&) { + // Default, should be unreachable + return Status::NotImplemented(""); + } + + template + enable_if_integer Visit(const IntegerType&) { + using c_type = typename IntegerType::c_type; + + int64_t i = 0; + return VisitArrayDataInline( + data, + [&](c_type value) { + const auto v = static_cast(value); + if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) { + return Status::Invalid("Value at position ", i, " out of bounds: ", v, + " (should be in [", min_value, ", ", max_value, "])"); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } +}; + struct ValidateArrayImpl { const ArrayData& data; + const bool full_validation; - Status Validate() { return ValidateWithType(*data.type); } + Status Validate() { + if (data.type == nullptr) { + return Status::Invalid("Array type is absent"); + } - Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); } + // XXX should we unpack extension types here? + + RETURN_NOT_OK(ValidateLayout(*data.type)); + // Check nulls *after* validating the buffer sizes, to avoid + // reading out of bounds. 
+ RETURN_NOT_OK(ValidateNulls(*data.type)); + + // Run type-specific validations + return ValidateWithType(*data.type); + } + + Status ValidateWithType(const DataType& type) { + if (type.id() != Type::EXTENSION) { + if (data.child_data.size() != static_cast(type.num_fields())) { + return Status::Invalid("Expected ", type.num_fields(), + " child arrays in array " + "of type ", + type.ToString(), ", got ", data.child_data.size()); + } + } + return VisitTypeInline(type, this); + } Status Visit(const NullType&) { if (data.null_count != data.length) { @@ -54,28 +138,46 @@ struct ValidateArrayImpl { return Status::OK(); } - Status Visit(const FixedWidthType&) { - if (data.length > 0) { - if (!IsBufferValid(1)) { - return Status::Invalid("Missing values buffer in non-empty array"); - } + Status Visit(const FixedWidthType&) { return ValidateFixedWidthBuffers(); } + + Status Visit(const Decimal128Type& type) { + RETURN_NOT_OK(ValidateFixedWidthBuffers()); + return ValidateDecimals(type); + } + + Status Visit(const Decimal256Type& type) { + RETURN_NOT_OK(ValidateFixedWidthBuffers()); + return ValidateDecimals(type); + } + + Status Visit(const StringType& type) { + RETURN_NOT_OK(ValidateBinaryLike(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); } return Status::OK(); } - Status Visit(const StringType& type) { return ValidateBinaryLike(type); } + Status Visit(const LargeStringType& type) { + RETURN_NOT_OK(ValidateBinaryLike(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); } - Status Visit(const LargeStringType& type) { return ValidateBinaryLike(type); } - Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } - Status Visit(const MapType& type) { return ValidateListLike(type); } + Status Visit(const MapType& type) { + RETURN_NOT_OK(ValidateListLike(type)); + return MapArray::ValidateChildData(data.child_data); + } Status Visit(const FixedSizeListType& type) { const ArrayData& values = *data.child_data[0]; @@ -92,7 +194,7 @@ struct ValidateArrayImpl { ") multiplied by the value size (", list_size, ")"); } - const Status child_valid = ValidateArray(values); + const Status child_valid = RecurseInto(values); if (!child_valid.ok()) { return Status::Invalid("Fixed size list child array invalid: ", child_valid.ToString()); @@ -106,7 +208,7 @@ struct ValidateArrayImpl { const auto& field_data = *data.child_data[i]; // Validate child first, to catch nonsensical length / offset etc. - const Status field_valid = ValidateArray(field_data); + const Status field_valid = RecurseInto(field_data); if (!field_valid.ok()) { return Status::Invalid("Struct child array #", i, " invalid: ", field_valid.ToString()); @@ -132,8 +234,8 @@ struct ValidateArrayImpl { for (int i = 0; i < type.num_fields(); ++i) { const auto& field_data = *data.child_data[i]; - // Validate child first, to catch nonsensical length / offset etc. - const Status field_valid = ValidateArray(field_data); + // Validate children first, to catch nonsensical length / offset etc. 
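[Editor's note] Replacing the free-standing ValidateArray calls with RecurseInto threads full_validation down to children, so child defects surface with the same wrapped context on both paths. A hedged sketch of the observable behavior (ArrayFromJSON is the testing helper used elsewhere in this patch):

    #include <cassert>

    // A struct whose child is shorter than the parent fails validation; when a
    // child itself is malformed, the error arrives wrapped as
    // "Struct child array #i invalid: ...".
    auto child = arrow::ArrayFromJSON(arrow::int32(), "[1]");  // length 1
    auto ty = arrow::struct_({arrow::field("f", arrow::int32())});
    arrow::StructArray too_short(ty, /*length=*/2, {child});
    assert(!too_short.ValidateFull().ok());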
+ const Status field_valid = RecurseInto(field_data); if (!field_valid.ok()) { return Status::Invalid("Union child array #", i, " invalid: ", field_valid.ToString()); @@ -153,6 +255,57 @@ struct ValidateArrayImpl { field_type->ToString()); } } + + if (full_validation) { + // Validate all type codes + const auto& child_ids = type.child_ids(); + const auto& type_codes_map = type.type_codes(); + + const int8_t* type_codes = data.GetValues(1); + + for (int64_t i = 0; i < data.length; ++i) { + // Note that union arrays never have top-level nulls + const int32_t code = type_codes[i]; + if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) { + return Status::Invalid("Union value at position ", i, " has invalid type id ", + code); + } + } + + if (type.mode() == UnionMode::DENSE) { + // Validate all offsets + + // Map logical type id to child length + std::vector child_lengths(256); + for (int child_id = 0; child_id < type.num_fields(); ++child_id) { + child_lengths[type_codes_map[child_id]] = data.child_data[child_id]->length; + } + + // Check offsets are in bounds + std::vector last_child_offsets(256, 0); + const int32_t* offsets = data.GetValues(2); + for (int64_t i = 0; i < data.length; ++i) { + const int32_t code = type_codes[i]; + const int32_t offset = offsets[i]; + if (offset < 0) { + return Status::Invalid("Union value at position ", i, " has negative offset ", + offset); + } + if (offset >= child_lengths[code]) { + return Status::Invalid("Union value at position ", i, + " has offset larger " + "than child length (", + offset, " >= ", child_lengths[code], ")"); + } + if (offset < last_child_offsets[code]) { + return Status::Invalid("Union value at position ", i, + " has non-monotonic offset ", offset); + } + last_child_offsets[code] = offset; + } + } + } + return Status::OK(); } @@ -164,12 +317,23 @@ struct ValidateArrayImpl { if (!data.dictionary) { return Status::Invalid("Dictionary values must be non-null"); } - const Status dict_valid = ValidateArray(*data.dictionary); + // Validate dictionary + const Status dict_valid = RecurseInto(*data.dictionary); if (!dict_valid.ok()) { return Status::Invalid("Dictionary array invalid: ", dict_valid.ToString()); } - // Visit indices - return ValidateWithType(*type.index_type()); + // Validate indices + RETURN_NOT_OK(ValidateWithType(*type.index_type())); + + if (full_validation) { + // Check indices within dictionary bounds + const Status indices_status = + CheckBounds(*type.index_type(), 0, data.dictionary->length - 1); + if (!indices_status.ok()) { + return Status::Invalid("Dictionary indices invalid: ", indices_status.ToString()); + } + } + return Status::OK(); } Status Visit(const ExtensionType& type) { @@ -184,13 +348,126 @@ struct ValidateArrayImpl { return data.buffers[index] != nullptr && data.buffers[index]->address() != 0; } + Status RecurseInto(const ArrayData& related_data) { + ValidateArrayImpl impl{related_data, full_validation}; + return impl.Validate(); + } + + Status ValidateLayout(const DataType& type) { + // Check the data layout conforms to the spec + const auto layout = type.layout(); + + if (data.length < 0) { + return Status::Invalid("Array length is negative"); + } + + if (data.buffers.size() != layout.buffers.size()) { + return Status::Invalid("Expected ", layout.buffers.size(), + " buffers in array " + "of type ", + type.ToString(), ", got ", data.buffers.size()); + } + + // This check is required to avoid addition overflow below + int64_t length_plus_offset = -1; + if (AddWithOverflow(data.length, data.offset, 
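[Editor's note] The AddWithOverflow/MultiplyWithOverflow guards used below come from Arrow's internal integer utilities (assumed header: arrow/util/int_util_internal.h; internal API, subject to change) and return true when the operation overflows. Minimal sketch of the guard:

    bool LayoutFits(int64_t data_length, int64_t data_offset) {
      int64_t length_plus_offset = -1;
      // A true return signals overflow -- the "impossibly large length and
      // offset" case rejected by ValidateLayout.
      return !arrow::internal::AddWithOverflow(data_length, data_offset,
                                               &length_plus_offset);
    }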
&length_plus_offset)) { + return Status::Invalid("Array of type ", type.ToString(), + " has impossibly large length and offset"); + } + + for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + const auto& buffer = data.buffers[i]; + const auto& spec = layout.buffers[i]; + + if (buffer == nullptr) { + continue; + } + int64_t min_buffer_size = -1; + switch (spec.kind) { + case DataTypeLayout::BITMAP: + min_buffer_size = BitUtil::BytesForBits(length_plus_offset); + break; + case DataTypeLayout::FIXED_WIDTH: + if (MultiplyWithOverflow(length_plus_offset, spec.byte_width, + &min_buffer_size)) { + return Status::Invalid("Array of type ", type.ToString(), + " has impossibly large length and offset"); + } + break; + case DataTypeLayout::ALWAYS_NULL: + // XXX Should we raise on non-null buffer? + continue; + default: + continue; + } + if (buffer->size() < min_buffer_size) { + return Status::Invalid("Buffer #", i, " too small in array of type ", + type.ToString(), " and length ", data.length, + ": expected at least ", min_buffer_size, " byte(s), got ", + buffer->size()); + } + } + if (layout.has_dictionary && !data.dictionary) { + return Status::Invalid("Array of type ", type.ToString(), + " must have dictionary values"); + } + if (!layout.has_dictionary && data.dictionary) { + return Status::Invalid("Unexpected dictionary values in array of type ", + type.ToString()); + } + return Status::OK(); + } + + Status ValidateNulls(const DataType& type) { + if (type.id() != Type::NA && data.null_count > 0 && data.buffers[0] == nullptr) { + return Status::Invalid("Array of type ", type.ToString(), " has ", data.null_count, + " nulls but no null bitmap"); + } + if (data.null_count > data.length) { + return Status::Invalid("Null count exceeds array length"); + } + if (data.null_count < 0 && data.null_count != kUnknownNullCount) { + return Status::Invalid("Negative null count"); + } + + if (full_validation) { + if (data.null_count != kUnknownNullCount) { + int64_t actual_null_count; + if (HasValidityBitmap(data.type->id()) && data.buffers[0]) { + // Do not call GetNullCount() as it would also set the `null_count` member + actual_null_count = data.length - CountSetBits(data.buffers[0]->data(), + data.offset, data.length); + } else if (data.type->id() == Type::NA) { + actual_null_count = data.length; + } else { + actual_null_count = 0; + } + if (actual_null_count != data.null_count) { + return Status::Invalid("null_count value (", data.null_count, + ") doesn't match actual number of nulls in array (", + actual_null_count, ")"); + } + } + } + return Status::OK(); + } + + Status ValidateFixedWidthBuffers() { + if (data.length > 0 && !IsBufferValid(1)) { + return Status::Invalid("Missing values buffer in non-empty fixed-width array"); + } + return Status::OK(); + } + template Status ValidateBinaryLike(const BinaryType& type) { if (!IsBufferValid(2)) { return Status::Invalid("Value data buffer is null"); } + const Buffer& values = *data.buffers[2]; + // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type)); + RETURN_NOT_OK(ValidateOffsets(type, values.size())); if (data.length > 0 && data.buffers[1]->is_cpu()) { using offset_type = typename BinaryType::offset_type; @@ -224,10 +501,14 @@ struct ValidateArrayImpl { template Status ValidateListLike(const ListType& type) { - // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type)); - const ArrayData& values = *data.child_data[0]; + const Status child_valid = 
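[Editor's note] ValidateNulls above recomputes the physical null count rather than trusting the cached member, and deliberately avoids GetNullCount(), which would mutate it. The core arithmetic, sketched with placeholder variables (bitmap/offset/length stand in for the array's validity data):

    int64_t set_bits = arrow::internal::CountSetBits(bitmap, offset, length);
    int64_t actual_nulls = length - set_bits;
    // actual_nulls must equal ArrayData::null_count whenever the latter is not
    // kUnknownNullCount.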
RecurseInto(values); + if (!child_valid.ok()) { + return Status::Invalid("List child array invalid: ", child_valid.ToString()); + } + + // First validate offsets, to make sure the accesses below are valid + RETURN_NOT_OK(ValidateOffsets(type, values.offset + values.length)); // An empty list array can have 0 offsets if (data.length > 0 && data.buffers[1]->is_cpu()) { @@ -257,19 +538,14 @@ struct ValidateArrayImpl { } } - const Status child_valid = ValidateArray(values); - if (!child_valid.ok()) { - return Status::Invalid("List child array invalid: ", child_valid.ToString()); - } return Status::OK(); } template - Status ValidateOffsets(const TypeClass& type) { + Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) { using offset_type = typename TypeClass::offset_type; - const Buffer* offsets = data.buffers[1].get(); - if (offsets == nullptr) { + if (!IsBufferValid(1)) { // For length 0, an empty offsets buffer seems accepted as a special case // (ARROW-544) if (data.length > 0) { @@ -279,355 +555,58 @@ struct ValidateArrayImpl { } // An empty list array can have 0 offsets - auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0; - if (offsets->size() / static_cast(sizeof(offset_type)) < required_offsets) { - return Status::Invalid("Offsets buffer size (bytes): ", offsets->size(), - " isn't large enough for length: ", data.length); - } - - return Status::OK(); - } -}; - -} // namespace - -ARROW_EXPORT -Status ValidateArray(const ArrayData& data) { - if (data.type == nullptr) { - return Status::Invalid("Array type is absent"); - } - - // First check the data layout conforms to the spec - const DataType& type = *data.type; - const auto layout = type.layout(); - - if (data.length < 0) { - return Status::Invalid("Array length is negative"); - } - - if (data.buffers.size() != layout.buffers.size()) { - return Status::Invalid("Expected ", layout.buffers.size(), - " buffers in array " - "of type ", - type.ToString(), ", got ", data.buffers.size()); - } - - // This check is required to avoid addition overflow below - int64_t length_plus_offset = -1; - if (AddWithOverflow(data.length, data.offset, &length_plus_offset)) { - return Status::Invalid("Array of type ", type.ToString(), - " has impossibly large length and offset"); - } - - for (int i = 0; i < static_cast(data.buffers.size()); ++i) { - const auto& buffer = data.buffers[i]; - const auto& spec = layout.buffers[i]; - - if (buffer == nullptr) { - continue; - } - int64_t min_buffer_size = -1; - switch (spec.kind) { - case DataTypeLayout::BITMAP: - min_buffer_size = BitUtil::BytesForBits(length_plus_offset); - break; - case DataTypeLayout::FIXED_WIDTH: - if (MultiplyWithOverflow(length_plus_offset, spec.byte_width, &min_buffer_size)) { - return Status::Invalid("Array of type ", type.ToString(), - " has impossibly large length and offset"); - } - break; - case DataTypeLayout::ALWAYS_NULL: - // XXX Should we raise on non-null buffer? 
- continue; - default: - continue; - } - if (buffer->size() < min_buffer_size) { - return Status::Invalid("Buffer #", i, " too small in array of type ", - type.ToString(), " and length ", data.length, - ": expected at least ", min_buffer_size, " byte(s), got ", - buffer->size()); - } - } - if (type.id() != Type::NA && data.null_count > 0 && data.buffers[0] == nullptr) { - return Status::Invalid("Array of type ", type.ToString(), " has ", data.null_count, - " nulls but no null bitmap"); - } - - // Check null_count() *after* validating the buffer sizes, to avoid - // reading out of bounds. - if (data.null_count > data.length) { - return Status::Invalid("Null count exceeds array length"); - } - if (data.null_count < 0 && data.null_count != kUnknownNullCount) { - return Status::Invalid("Negative null count"); - } - - if (type.id() != Type::EXTENSION) { - if (data.child_data.size() != static_cast(type.num_fields())) { - return Status::Invalid("Expected ", type.num_fields(), - " child arrays in array " - "of type ", - type.ToString(), ", got ", data.child_data.size()); - } - } - if (layout.has_dictionary && !data.dictionary) { - return Status::Invalid("Array of type ", type.ToString(), - " must have dictionary values"); - } - if (!layout.has_dictionary && data.dictionary) { - return Status::Invalid("Unexpected dictionary values in array of type ", - type.ToString()); - } - - ValidateArrayImpl validator{data}; - return validator.Validate(); -} - -ARROW_EXPORT -Status ValidateArray(const Array& array) { return ValidateArray(*array.data()); } - -/////////////////////////////////////////////////////////////////////////// -// ValidateArrayFull: expensive validation checks - -namespace { - -struct UTF8DataValidator { - const ArrayData& data; - - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } - - template - enable_if_string Visit(const StringType&) { - util::InitializeUTF8(); - - int64_t i = 0; - return VisitArrayDataInline( - data, - [&](util::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); - } -}; - -struct BoundsChecker { - const ArrayData& data; - int64_t min_value; - int64_t max_value; - - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } - - template - enable_if_integer Visit(const IntegerType&) { - using c_type = typename IntegerType::c_type; - - int64_t i = 0; - return VisitArrayDataInline( - data, - [&](c_type value) { - const auto v = static_cast(value); - if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) { - return Status::Invalid("Value at position ", i, " out of bounds: ", v, - " (should be in [", min_value, ", ", max_value, "])"); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); - } -}; - -struct ValidateArrayFullImpl { - const ArrayData& data; - - Status Validate() { return ValidateWithType(*data.type); } - - Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); } - - Status Visit(const NullType& type) { return Status::OK(); } - - Status Visit(const FixedWidthType& type) { return Status::OK(); } - - Status Visit(const StringType& type) { - RETURN_NOT_OK(ValidateBinaryLike(type)); - return ValidateUTF8(data); - } - - Status Visit(const LargeStringType& type) { - RETURN_NOT_OK(ValidateBinaryLike(type)); - return 
ValidateUTF8(data); - } - - Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); } - - Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } - - Status Visit(const ListType& type) { return ValidateListLike(type); } - - Status Visit(const LargeListType& type) { return ValidateListLike(type); } - - Status Visit(const MapType& type) { return ValidateListLike(type); } - - Status Visit(const FixedSizeListType& type) { - const ArrayData& child = *data.child_data[0]; - const Status child_valid = ValidateArrayFull(child); - if (!child_valid.ok()) { - return Status::Invalid("Fixed size list child array invalid: ", - child_valid.ToString()); - } - return Status::OK(); - } - - Status Visit(const StructType& type) { - // Validate children - for (int64_t i = 0; i < type.num_fields(); ++i) { - const ArrayData& field = *data.child_data[i]; - const Status field_valid = ValidateArrayFull(field); - if (!field_valid.ok()) { - return Status::Invalid("Struct child array #", i, - " invalid: ", field_valid.ToString()); - } - } - return Status::OK(); - } - - Status Visit(const UnionType& type) { - const auto& child_ids = type.child_ids(); - const auto& type_codes_map = type.type_codes(); - - const int8_t* type_codes = data.GetValues(1); - - for (int64_t i = 0; i < data.length; ++i) { - // Note that union arrays never have top-level nulls - const int32_t code = type_codes[i]; - if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) { - return Status::Invalid("Union value at position ", i, " has invalid type id ", - code); - } - } - - if (type.mode() == UnionMode::DENSE) { - // Map logical type id to child length - std::vector child_lengths(256); - for (int child_id = 0; child_id < type.num_fields(); ++child_id) { - child_lengths[type_codes_map[child_id]] = data.child_data[child_id]->length; + const auto required_offsets = (data.length > 0) ? 
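[Editor's note] The replacement ValidateOffsets beginning here tightens the buffer-size check. The invariant: an offsets-based array of length N at offset K stores N + K + 1 offset slots (one past the last element). For example, with int32 offsets:

    // length = 3, offset = 2  =>  (3 + 2 + 1) slots * 4 bytes = 24 bytes minimum
    int64_t required_slots = (length > 0) ? length + offset + 1 : 0;
    int64_t required_bytes =
        required_slots * static_cast<int64_t>(sizeof(int32_t));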
data.length + data.offset + 1 : 0; + const auto offsets_byte_size = data.buffers[1]->size(); + if (offsets_byte_size / static_cast(sizeof(offset_type)) < + required_offsets) { + return Status::Invalid("Offsets buffer size (bytes): ", offsets_byte_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + + if (full_validation && required_offsets > 0) { + // Validate all offset values + const offset_type* offsets = data.GetValues(1); + + auto prev_offset = offsets[0]; + if (prev_offset < 0) { + return Status::Invalid( + "Offset invariant failure: array starts at negative offset ", prev_offset); } - - // Check offsets are in bounds - std::vector last_child_offsets(256, 0); - const int32_t* offsets = data.GetValues(2); - for (int64_t i = 0; i < data.length; ++i) { - const int32_t code = type_codes[i]; - const int32_t offset = offsets[i]; - if (offset < 0) { - return Status::Invalid("Union value at position ", i, " has negative offset ", - offset); - } - if (offset >= child_lengths[code]) { - return Status::Invalid("Union value at position ", i, - " has offset larger " - "than child length (", - offset, " >= ", child_lengths[code], ")"); + for (int64_t i = 1; i <= data.length; ++i) { + const auto current_offset = offsets[i]; + if (current_offset < prev_offset) { + return Status::Invalid( + "Offset invariant failure: non-monotonic offset at slot ", i, ": ", + current_offset, " < ", prev_offset); } - if (offset < last_child_offsets[code]) { - return Status::Invalid("Union value at position ", i, - " has non-monotonic offset ", offset); + if (current_offset > offset_limit) { + return Status::Invalid("Offset invariant failure: offset for slot ", i, + " out of bounds: ", current_offset, " > ", offset_limit); } - last_child_offsets[code] = offset; - } - } - - // Validate children - for (int64_t i = 0; i < type.num_fields(); ++i) { - const ArrayData& field = *data.child_data[i]; - const Status field_valid = ValidateArrayFull(field); - if (!field_valid.ok()) { - return Status::Invalid("Union child array #", i, - " invalid: ", field_valid.ToString()); + prev_offset = current_offset; } } return Status::OK(); } - Status Visit(const DictionaryType& type) { - const Status indices_status = - CheckBounds(*type.index_type(), 0, data.dictionary->length - 1); - if (!indices_status.ok()) { - return Status::Invalid("Dictionary indices invalid: ", indices_status.ToString()); - } - return ValidateArrayFull(*data.dictionary); - } - - Status Visit(const ExtensionType& type) { - return ValidateWithType(*type.storage_type()); - } - - protected: - template - Status ValidateBinaryLike(const BinaryType& type) { - const auto& data_buffer = data.buffers[2]; - if (data_buffer == nullptr) { - return Status::Invalid("Binary data buffer is null"); - } - return ValidateOffsets(type, data_buffer->size()); - } - - template - Status ValidateListLike(const ListType& type) { - const ArrayData& child = *data.child_data[0]; - const Status child_valid = ValidateArrayFull(child); - if (!child_valid.ok()) { - return Status::Invalid("List child array invalid: ", child_valid.ToString()); - } - return ValidateOffsets(type, child.offset + child.length); - } - - template - Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) { - using offset_type = typename TypeClass::offset_type; - if (data.length == 0) { - return Status::OK(); - } - - const offset_type* offsets = data.GetValues(1); - if (offsets == nullptr) { - return Status::Invalid("Non-empty array but offsets are null"); - } - - auto 
prev_offset = offsets[0]; - if (prev_offset < 0) { - return Status::Invalid("Offset invariant failure: array starts at negative offset ", - prev_offset); - } - for (int64_t i = 1; i <= data.length; ++i) { - const auto current_offset = offsets[i]; - if (current_offset < prev_offset) { - return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ", - i, ": ", current_offset, " < ", prev_offset); - } - if (current_offset > offset_limit) { - return Status::Invalid("Offset invariant failure: offset for slot ", i, - " out of bounds: ", current_offset, " > ", offset_limit); - } - prev_offset = current_offset; + template + Status ValidateDecimals(const DecimalType& type) { + using CType = typename TypeTraits::CType; + if (full_validation) { + const int32_t precision = type.precision(); + return VisitArrayDataInline( + data, + [&](util::string_view bytes) { + DCHECK_EQ(bytes.size(), DecimalType::kByteWidth); + CType value(reinterpret_cast(bytes.data())); + if (!value.FitsInPrecision(precision)) { + return Status::Invalid("Decimal value ", value.ToIntegerString(), + " does not fit in precision of ", type); + } + return Status::OK(); + }, + []() { return Status::OK(); }); } return Status::OK(); } @@ -640,26 +619,18 @@ struct ValidateArrayFullImpl { } // namespace +ARROW_EXPORT +Status ValidateArray(const ArrayData& data) { + ValidateArrayImpl validator{data, /*full_validation=*/false}; + return validator.Validate(); +} + +ARROW_EXPORT +Status ValidateArray(const Array& array) { return ValidateArray(*array.data()); } + ARROW_EXPORT Status ValidateArrayFull(const ArrayData& data) { - if (data.null_count != kUnknownNullCount) { - int64_t actual_null_count; - if (HasValidityBitmap(data.type->id()) && data.buffers[0]) { - // Do not call GetNullCount() as it would also set the `null_count` member - actual_null_count = - data.length - CountSetBits(data.buffers[0]->data(), data.offset, data.length); - } else if (data.type->id() == Type::NA) { - actual_null_count = data.length; - } else { - actual_null_count = 0; - } - if (actual_null_count != data.null_count) { - return Status::Invalid("null_count value (", data.null_count, - ") doesn't match actual number of nulls in array (", - actual_null_count, ")"); - } - } - return ValidateArrayFullImpl{data}.Validate(); + return ValidateArrayImpl{data, /*full_validation=*/true}.Validate(); } ARROW_EXPORT diff --git a/cpp/src/arrow/array/validate.h b/cpp/src/arrow/array/validate.h index cae3e16b3c577..3ebfa0a51edce 100644 --- a/cpp/src/arrow/array/validate.h +++ b/cpp/src/arrow/array/validate.h @@ -35,8 +35,9 @@ ARROW_EXPORT Status ValidateArray(const ArrayData& data); // O(N) array data validation. -// Note the "full" routines don't validate metadata. It should be done -// beforehand using ValidateArray(), otherwise invalid memory accesses +// Note that, starting from 7.0.0, "full" routines also validate metadata. +// Before, ValidateArray() needed to be called before ValidateArrayFull() +// to ensure metadata correctness, otherwise invalid memory accesses // may occur. 
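[Editor's note] Because the decimal visitors now run under full_validation, out-of-precision values are caught by ValidateFull(); the per-value primitive is FitsInPrecision. A small sketch:

    #include "arrow/util/decimal.h"

    void PrecisionSketch() {
      arrow::Decimal128 v(12345);              // needs 5 significant digits
      bool ok = v.FitsInPrecision(5);          // true
      bool too_narrow = v.FitsInPrecision(4);  // false: ValidateFull() reports Invalid
      (void)ok;
      (void)too_narrow;
    }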
ARROW_EXPORT diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 0c954e72e5047..eb483d1b8587f 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -227,24 +227,27 @@ std::string ChunkedArray::ToString() const { return ss.str(); } -Status ChunkedArray::Validate() const { - if (chunks_.size() == 0) { +namespace { + +Status ValidateChunks(const ArrayVector& chunks, bool full_validation) { + if (chunks.size() == 0) { return Status::OK(); } - const auto& type = *chunks_[0]->type(); + const auto& type = *chunks[0]->type(); // Make sure chunks all have the same type - for (size_t i = 1; i < chunks_.size(); ++i) { - const Array& chunk = *chunks_[i]; + for (size_t i = 1; i < chunks.size(); ++i) { + const Array& chunk = *chunks[i]; if (!chunk.type()->Equals(type)) { return Status::Invalid("In chunk ", i, " expected type ", type.ToString(), " but saw ", chunk.type()->ToString()); } } // Validate the chunks themselves - for (size_t i = 0; i < chunks_.size(); ++i) { - const Array& chunk = *chunks_[i]; - const Status st = internal::ValidateArray(chunk); + for (size_t i = 0; i < chunks.size(); ++i) { + const Array& chunk = *chunks[i]; + const Status st = full_validation ? internal::ValidateArrayFull(chunk) + : internal::ValidateArray(chunk); if (!st.ok()) { return Status::Invalid("In chunk ", i, ": ", st.ToString()); } @@ -252,16 +255,14 @@ Status ChunkedArray::Validate() const { return Status::OK(); } +} // namespace + +Status ChunkedArray::Validate() const { + return ValidateChunks(chunks_, /*full_validation=*/false); +} + Status ChunkedArray::ValidateFull() const { - RETURN_NOT_OK(Validate()); - for (size_t i = 0; i < chunks_.size(); ++i) { - const Array& chunk = *chunks_[i]; - const Status st = internal::ValidateArrayFull(chunk); - if (!st.ok()) { - return Status::Invalid("In chunk ", i, ": ", st.ToString()); - } - } - return Status::OK(); + return ValidateChunks(chunks_, /*full_validation=*/true); } namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index e3fe1bdf73daf..2fd1698ee1d3e 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -223,6 +223,8 @@ static auto kExtractRegexOptionsType = GetFunctionOptionsType( DataMember("value_set", &SetLookupOptions::value_set), DataMember("skip_nulls", &SetLookupOptions::skip_nulls)); +static auto kStructFieldOptionsType = GetFunctionOptionsType( + DataMember("indices", &StructFieldOptions::indices)); static auto kStrptimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrptimeOptions::format), DataMember("unit", &StrptimeOptions::unit)); @@ -351,6 +353,11 @@ SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {} constexpr char SetLookupOptions::kTypeName[]; +StructFieldOptions::StructFieldOptions(std::vector indices) + : FunctionOptions(internal::kStructFieldOptionsType), indices(std::move(indices)) {} +StructFieldOptions::StructFieldOptions() : StructFieldOptions(std::vector()) {} +constexpr char StructFieldOptions::kTypeName[]; + StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit) : FunctionOptions(internal::kStrptimeOptionsType), format(std::move(format)), @@ -444,6 +451,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); 
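[Editor's note] The chunked_array.cc change in this hunk collapses Validate/ValidateFull into one flag-driven helper; caller-side behavior is unchanged. Sketch:

    arrow::Status CheckChunked(const arrow::ChunkedArray& chunked) {
      ARROW_RETURN_NOT_OK(chunked.Validate());  // cheap: chunk types match, layout sane
      return chunked.ValidateFull();            // adds the O(N) per-chunk data checks
    }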
DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kStructFieldOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kStrftimeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kAssumeTimezoneOptionsType)); @@ -532,44 +540,22 @@ Result MinElementWise(const std::vector& args, // ---------------------------------------------------------------------- // Set-related operations -static Result ExecSetLookup(const std::string& func_name, const Datum& data, - const SetLookupOptions& options, ExecContext* ctx) { - if (!options.value_set.is_arraylike()) { - return Status::Invalid("Set lookup value set must be Array or ChunkedArray"); - } - std::shared_ptr data_type; - if (data.type()->id() == Type::DICTIONARY) { - data_type = - arrow::internal::checked_pointer_cast(data.type())->value_type(); - } else { - data_type = data.type(); - } - - if (options.value_set.length() > 0 && !data_type->Equals(options.value_set.type())) { - std::stringstream ss; - ss << "Array type didn't match type of values set: " << data_type->ToString() - << " vs " << options.value_set.type()->ToString(); - return Status::Invalid(ss.str()); - } - return CallFunction(func_name, {data}, &options, ctx); -} - Result IsIn(const Datum& values, const SetLookupOptions& options, ExecContext* ctx) { - return ExecSetLookup("is_in", values, options, ctx); + return CallFunction("is_in", {values}, &options, ctx); } Result IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx) { - return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx); + return IsIn(values, SetLookupOptions{value_set}, ctx); } Result IndexIn(const Datum& values, const SetLookupOptions& options, ExecContext* ctx) { - return ExecSetLookup("index_in", values, options, ctx); + return CallFunction("index_in", {values}, &options, ctx); } Result IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx) { - return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx); + return IndexIn(values, SetLookupOptions{value_set}, ctx); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 4bb18b37527a9..d2234a6182d10 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -223,6 +223,18 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { bool skip_nulls; }; +/// Options for struct_field function +class ARROW_EXPORT StructFieldOptions : public FunctionOptions { + public: + explicit StructFieldOptions(std::vector indices); + StructFieldOptions(); + constexpr static char const kTypeName[] = "StructFieldOptions"; + + /// The child indices to extract. For instance, to get the 2nd child + /// of the 1st child of a struct or union, this would be {0, 1}. 
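[Editor's note] A usage sketch for the options class declared here (input values assumed): selecting child 1 of child 0, i.e. ".a.b", through the struct_field scalar function this options type documents.

    arrow::Result<arrow::Datum> SelectNested(const arrow::Datum& input) {
      arrow::compute::StructFieldOptions options({0, 1});  // 1st child, then its 2nd
      return arrow::compute::CallFunction("struct_field", {input}, &options);
    }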
+  std::vector<int> indices;
+};
+
 class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
  public:
   explicit StrptimeOptions(std::string format, TimeUnit::type unit);
diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc
index 1fc6b78745833..95114d8d8a5cb 100644
--- a/cpp/src/arrow/compute/api_vector.cc
+++ b/cpp/src/arrow/compute/api_vector.cc
@@ -97,11 +97,11 @@ namespace compute {
 // Function options
 
 bool SortKey::Equals(const SortKey& other) const {
-  return name == other.name && order == other.order;
+  return target == other.target && order == other.order;
 }
 std::string SortKey::ToString() const {
   std::stringstream ss;
-  ss << name << ' ';
+  ss << target.ToString() << ' ';
   switch (order) {
     case SortOrder::Ascending:
       ss << "ASC";
diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h
index a91cf91df069a..8788d5d160e76 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -98,8 +98,8 @@ enum class NullPlacement {
 /// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
 class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
  public:
-  explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
-      : name(std::move(name)), order(order) {}
+  explicit SortKey(FieldRef target, SortOrder order = SortOrder::Ascending)
+      : target(std::move(target)), order(order) {}
 
   using util::EqualityComparable<SortKey>::Equals;
   using util::EqualityComparable<SortKey>::operator==;
@@ -107,8 +107,8 @@ class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
   bool Equals(const SortKey& other) const;
   std::string ToString() const;
 
-  /// The name of the sort column.
-  std::string name;
+  /// A FieldRef targeting the sort column.
+  FieldRef target;
 
   /// How to order by this sort key.
   SortOrder order;
 };
diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc
index 50f1ad4fd0b7c..3696477b8575d 100644
--- a/cpp/src/arrow/compute/exec.cc
+++ b/cpp/src/arrow/compute/exec.cc
@@ -102,6 +102,14 @@ void PrintTo(const ExecBatch& batch, std::ostream* os) {
   }
 }
 
+int64_t ExecBatch::TotalBufferSize() const {
+  int64_t sum = 0;
+  for (const auto& value : values) {
+    sum += value.TotalBufferSize();
+  }
+  return sum;
+}
+
 std::string ExecBatch::ToString() const {
   std::stringstream ss;
   PrintTo(*this, &ss);
@@ -676,7 +684,9 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
     if (output_descr_.shape == ValueDescr::ARRAY) {
       ArrayData* out_arr = out.mutable_array();
-      if (kernel_->null_handling == NullHandling::INTERSECTION) {
+      if (output_descr_.type->id() == Type::NA) {
+        out_arr->null_count = out_arr->length;
+      } else if (kernel_->null_handling == NullHandling::INTERSECTION) {
         RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr));
       } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
         out_arr->null_count = 0;
diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h
index 7707622bc5312..faebddb73342d 100644
--- a/cpp/src/arrow/compute/exec.h
+++ b/cpp/src/arrow/compute/exec.h
@@ -212,6 +212,15 @@ struct ARROW_EXPORT ExecBatch {
   /// by ExecBatchIterator which by design does not yield length-0 batches.
   int64_t length;
 
+  /// \brief The sum of bytes in each buffer referenced by the batch
+  ///
+  /// Note: Scalars are not counted
+  /// Note: Some values may reference only part of a buffer, for
+  ///       example, an array with an offset.  The actual data
+  ///       visible to this batch will be smaller than the total
+  ///       buffer size in this case.
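[Editor's note] To make the accounting concrete (names assumed): a batch wrapping one int32 array of length 4 with a validity bitmap reports the full size of both buffers, even if the array only views part of them.

    int64_t BatchBytes(const std::shared_ptr<arrow::Array>& int32_array) {
      arrow::compute::ExecBatch batch({arrow::Datum(int32_array)}, /*length=*/4);
      return batch.TotalBufferSize();  // validity bitmap + values buffer sizes
    }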
+ int64_t TotalBufferSize() const; + /// \brief Return the value at the i-th index template inline const Datum& operator[](index_type i) const { diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index ccc36c093e820..79ffd67b8fd0b 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -31,3 +31,11 @@ add_arrow_compute_test(union_node_test PREFIX "arrow-compute") add_arrow_compute_test(util_test PREFIX "arrow-compute") add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute") + +add_arrow_compute_test(ir_test + PREFIX + "arrow-compute" + EXTRA_LINK_LIBS + ${GFLAGS_LIBRARIES} + TEST_ARGUMENTS + "--computeir_dir=${CMAKE_SOURCE_DIR}/../experimental/computeir") diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc index 295979062d03f..ddf6f7934a719 100644 --- a/cpp/src/arrow/compute/exec/aggregate_node.cc +++ b/cpp/src/arrow/compute/exec/aggregate_node.cc @@ -372,7 +372,7 @@ class GroupByNode : public ExecNode { for (size_t i = 0; i < key_field_ids_.size(); ++i) { keys[i] = batch.values[key_field_ids_[i]]; } - ARROW_ASSIGN_OR_RAISE(ExecBatch key_batch, ExecBatch::Make(keys)); + ExecBatch key_batch(std::move(keys), batch.length); // Create a batch with group ids ARROW_ASSIGN_OR_RAISE(Datum id_batch, state->grouper->Consume(key_batch)); @@ -421,6 +421,8 @@ class GroupByNode : public ExecNode { Result Finalize() { ThreadLocalState* state = &local_states_[0]; + // If we never got any batches, then state won't have been initialized + RETURN_NOT_OK(InitLocalStateIfNeeded(state)); ExecBatch out_data{{}, state->grouper->num_groups()}; out_data.values.resize(agg_kernels_.size() + key_field_ids_.size()); @@ -525,9 +527,8 @@ class GroupByNode : public ExecNode { void StopProducing(ExecNode* output) override { DCHECK_EQ(output, outputs_[0]); - if (input_counter_.Cancel()) { - finished_.MarkFinished(); - } else if (output_counter_.Cancel()) { + ARROW_UNUSED(input_counter_.Cancel()); + if (output_counter_.Cancel()) { finished_.MarkFinished(); } inputs_[0]->StopProducing(this); diff --git a/cpp/src/arrow/compute/exec/exec_plan.cc b/cpp/src/arrow/compute/exec/exec_plan.cc index 7e7824d8524b0..7cd3011b8ab83 100644 --- a/cpp/src/arrow/compute/exec/exec_plan.cc +++ b/cpp/src/arrow/compute/exec/exec_plan.cc @@ -23,6 +23,7 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec/options.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" @@ -250,27 +251,41 @@ Status ExecNode::Validate() const { std::string ExecNode::ToString() const { std::stringstream ss; - ss << kind_name() << "{\"" << label_ << '"'; + + auto PrintLabelAndKind = [&](const ExecNode* node) { + ss << node->label() << ":" << node->kind_name(); + }; + + PrintLabelAndKind(this); + ss << "{"; + if (!inputs_.empty()) { - ss << ", inputs=["; + ss << "inputs=["; for (size_t i = 0; i < inputs_.size(); i++) { if (i > 0) ss << ", "; - ss << input_labels_[i] << ": \"" << inputs_[i]->label() << '"'; + ss << input_labels_[i] << "="; + PrintLabelAndKind(inputs_[i]); } ss << ']'; } if (!outputs_.empty()) { - ss << ", outputs=["; + if (!inputs_.empty()) { + ss << ", "; + } + + ss << "outputs=["; for (size_t i = 0; i < outputs_.size(); i++) { if (i > 0) ss << ", "; - ss << "\"" << outputs_[i]->label() << "\""; + PrintLabelAndKind(outputs_[i]); } ss << ']'; } const std::string extra = ToStringExtra(); - if 
(!extra.empty()) ss << ", " << extra; + if (!extra.empty()) { + ss << ", " << extra; + } ss << '}'; return ss.str(); @@ -338,7 +353,9 @@ Future<> MapNode::finished() { return finished_; } void MapNode::SubmitTask(std::function(ExecBatch)> map_fn, ExecBatch batch) { Status status; - if (finished_.is_finished()) { + // This will be true if the node is stopped early due to an error or manual + // cancellation + if (input_counter_.Completed()) { return; } auto task = [this, map_fn, batch]() { @@ -368,7 +385,9 @@ void MapNode::SubmitTask(std::function(ExecBatch)> map_fn, this->Finish(status); } } - if (!status.ok()) { + // If we get a cancelled status from AddTask it means this node was stopped + // or errored out already so we can just drop the task. + if (!status.ok() && !status.IsCancelled()) { if (input_counter_.Cancel()) { this->Finish(status); } @@ -518,6 +537,6 @@ Result>()>> MakeReaderGenerator( return MakeBackgroundGenerator(std::move(batch_it), io_executor, max_q, q_restart); } -} // namespace compute +} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/exec_plan.h b/cpp/src/arrow/compute/exec/exec_plan.h index b5e59fe8d30c5..4cb7fad009fcd 100644 --- a/cpp/src/arrow/compute/exec/exec_plan.h +++ b/cpp/src/arrow/compute/exec/exec_plan.h @@ -344,19 +344,26 @@ struct ARROW_EXPORT Declaration { options{std::move(options)}, label{std::move(label)} {} + template + Declaration(std::string factory_name, std::vector inputs, Options options, + std::string label) + : Declaration{std::move(factory_name), std::move(inputs), + std::shared_ptr( + std::make_shared(std::move(options))), + std::move(label)} {} + template Declaration(std::string factory_name, std::vector inputs, Options options) - : factory_name{std::move(factory_name)}, - inputs{std::move(inputs)}, - options{std::make_shared(std::move(options))}, - label{this->factory_name} {} + : Declaration{std::move(factory_name), std::move(inputs), std::move(options), + /*label=*/""} {} template Declaration(std::string factory_name, Options options) - : factory_name{std::move(factory_name)}, - inputs{}, - options{std::make_shared(std::move(options))}, - label{this->factory_name} {} + : Declaration{std::move(factory_name), {}, std::move(options), /*label=*/""} {} + + template + Declaration(std::string factory_name, Options options, std::string label) + : Declaration{std::move(factory_name), {}, std::move(options), std::move(label)} {} /// \brief Convenience factory for the common case of a simple sequence of nodes. 
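[Editor's note] The new overloads let callers attach a debugging label without spelling out the shared_ptr form; FilterNodeOptions and filter_expr below are assumed example values, not part of the patch. The label surfaces in the revised ExecNode::ToString() as "label:kind_name".

    using arrow::compute::Declaration;
    Declaration decl{"filter", {}, arrow::compute::FilterNodeOptions{filter_expr},
                     /*label=*/"keep_positive"};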
/// diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 64e3305825d10..4249179e1bf8c 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -63,7 +63,7 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return Expression(std::move(lit)); } Expression field_ref(FieldRef ref) { - return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, -1}); + return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, {-1}}); } Expression call(std::string function, std::vector arguments, @@ -394,14 +394,11 @@ Result BindImpl(Expression expr, const TypeOrSchema& in, if (expr.literal()) return expr; if (auto ref = expr.field_ref()) { - if (ref->IsNested()) { - return Status::NotImplemented("nested field references"); - } - ARROW_ASSIGN_OR_RAISE(auto path, ref->FindOne(in)); auto bound = *expr.parameter(); - bound.index = path[0]; + bound.indices.resize(path.indices().size()); + std::copy(path.indices().begin(), path.indices().end(), bound.indices.begin()); ARROW_ASSIGN_OR_RAISE(auto field, path.Get(in)); bound.descr.type = field->type(); bound.descr.shape = shape; @@ -512,7 +509,13 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i return MakeNullScalar(null()); } - const Datum& field = input[param->index]; + Datum field = input[param->indices[0]]; + if (param->indices.size() > 1) { + std::vector indices(param->indices.begin() + 1, param->indices.end()); + compute::StructFieldOptions options(std::move(indices)); + ARROW_ASSIGN_OR_RAISE( + field, compute::CallFunction("struct_field", {std::move(field)}, &options)); + } if (!field.type()->Equals(param->descr.type)) { return Status::Invalid("Referenced field ", expr.ToString(), " was ", field.type()->ToString(), " but should have been ", diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index dac5728ab46d6..7c567cc8fc694 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -27,6 +27,7 @@ #include "arrow/compute/type_fwd.h" #include "arrow/datum.h" #include "arrow/type_fwd.h" +#include "arrow/util/small_vector.h" #include "arrow/util/variant.h" namespace arrow { @@ -112,7 +113,7 @@ class ARROW_EXPORT Expression { // post-bind properties ValueDescr descr; - int index; + internal::SmallVector indices; }; const Parameter* parameter() const; diff --git a/cpp/src/arrow/compute/exec/expression_benchmark.cc b/cpp/src/arrow/compute/exec/expression_benchmark.cc index 1899b7caab6df..d1738c9c23cbb 100644 --- a/cpp/src/arrow/compute/exec/expression_benchmark.cc +++ b/cpp/src/arrow/compute/exec/expression_benchmark.cc @@ -19,6 +19,7 @@ #include "arrow/compute/cast.h" #include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec/test_util.h" #include "arrow/dataset/partition.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -29,6 +30,34 @@ namespace compute { std::shared_ptr ninety_nine_dict = DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(int64(), "[99]")); +static void BindAndEvaluate(benchmark::State& state, Expression expr) { + ExecContext ctx; + auto struct_type = struct_({ + field("int", int64()), + field("float", float64()), + }); + auto dataset_schema = schema({ + field("int_arr", int64()), + field("struct_arr", struct_type), + field("int_scalar", int64()), + field("struct_scalar", struct_type), + }); + ExecBatch input( + { + Datum(ArrayFromJSON(int64(), "[0, 2, 4, 
8]")), + Datum(ArrayFromJSON(struct_type, + "[[0, 2.0], [4, 8.0], [16, 32.0], [64, 128.0]]")), + Datum(ScalarFromJSON(int64(), "16")), + Datum(ScalarFromJSON(struct_type, "[32, 64.0]")), + }, + /*length=*/4); + + for (auto _ : state) { + ASSIGN_OR_ABORT(auto bound, expr.Bind(*dataset_schema)); + ABORT_NOT_OK(ExecuteScalarExpression(bound, input, &ctx).status()); + } +} + // A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. static void SimplifyFilterWithGuarantee(benchmark::State& state, Expression filter, Expression guarantee) { @@ -84,5 +113,12 @@ BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee, BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee, positive_filter_cast_guarantee_dictionary, filter_cast_positive, guarantee_dictionary); +BENCHMARK_CAPTURE(BindAndEvaluate, simple_array, field_ref("int_arr")); +BENCHMARK_CAPTURE(BindAndEvaluate, simple_scalar, field_ref("int_scalar")); +BENCHMARK_CAPTURE(BindAndEvaluate, nested_array, + field_ref(FieldRef("struct_arr", "float"))); +BENCHMARK_CAPTURE(BindAndEvaluate, nested_scalar, + field_ref(FieldRef("struct_scalar", "float"))); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc index 88b94e80434b6..94ca20748351c 100644 --- a/cpp/src/arrow/compute/exec/expression_test.cc +++ b/cpp/src/arrow/compute/exec/expression_test.cc @@ -476,15 +476,16 @@ TEST(Expression, BindLiteral) { } void ExpectBindsTo(Expression expr, util::optional expected, - Expression* bound_out = nullptr) { + Expression* bound_out = nullptr, + const Schema& schema = *kBoringSchema) { if (!expected) { expected = expr; } - ASSERT_OK_AND_ASSIGN(auto bound, expr.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto bound, expr.Bind(schema)); EXPECT_TRUE(bound.IsBound()); - ASSERT_OK_AND_ASSIGN(expected, expected->Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(expected, expected->Bind(schema)); EXPECT_EQ(bound, *expected) << " unbound: " << expr.ToString(); if (bound_out) { @@ -508,11 +509,24 @@ TEST(Expression, BindFieldRef) { // in the input schema ASSERT_RAISES(Invalid, field_ref("alpha").Bind(Schema( {field("alpha", int32()), field("alpha", float32())}))); +} + +TEST(Expression, BindNestedFieldRef) { + Expression expr; + auto schema = Schema({field("a", struct_({field("b", int32())}))}); + + ExpectBindsTo(field_ref(FieldRef("a", "b")), no_change, &expr, schema); + EXPECT_TRUE(expr.IsBound()); + EXPECT_EQ(expr.descr(), ValueDescr::Array(int32())); - // referencing nested fields is not supported - ASSERT_RAISES(NotImplemented, - field_ref(FieldRef("a", "b")) - .Bind(Schema({field("a", struct_({field("b", int32())}))}))); + ExpectBindsTo(field_ref(FieldRef(FieldPath({0, 0}))), no_change, &expr, schema); + EXPECT_TRUE(expr.IsBound()); + EXPECT_EQ(expr.descr(), ValueDescr::Array(int32())); + + ASSERT_RAISES(Invalid, field_ref(FieldPath({0, 1})).Bind(schema)); + ASSERT_RAISES(Invalid, field_ref(FieldRef("a", "b")) + .Bind(Schema({field("a", struct_({field("b", int32()), + field("b", int64())}))}))); } TEST(Expression, BindCall) { @@ -614,6 +628,45 @@ TEST(Expression, ExecuteFieldRef) { {"a": -1, "b": 4.0} ])"), ArrayFromJSON(float64(), R"([7.5, 2.125, 4.0])")); + + ExpectRefIs(FieldRef(FieldPath({0, 0})), + ArrayFromJSON(struct_({field("a", struct_({field("b", float64())}))}), R"([ + {"a": {"b": 6.125}}, + {"a": {"b": 0.0}}, + {"a": {"b": -1}} + ])"), + ArrayFromJSON(float64(), R"([6.125, 0.0, -1])")); + + ExpectRefIs(FieldRef("a", "b"), + 
ArrayFromJSON(struct_({field("a", struct_({field("b", float64())}))}), R"([ + {"a": {"b": 6.125}}, + {"a": {"b": 0.0}}, + {"a": {"b": -1}} + ])"), + ArrayFromJSON(float64(), R"([6.125, 0.0, -1])")); + + ExpectRefIs(FieldRef("a", "b"), + ArrayFromJSON(struct_({field("a", struct_({field("b", float64())}))}), R"([ + {"a": {"b": 6.125}}, + {"a": null}, + {"a": {"b": null}} + ])"), + ArrayFromJSON(float64(), R"([6.125, null, null])")); + + ExpectRefIs( + FieldRef("a", "b"), + ScalarFromJSON(struct_({field("a", struct_({field("b", float64())}))}), "[[64.0]]"), + ScalarFromJSON(float64(), "64.0")); + + ExpectRefIs( + FieldRef("a", "b"), + ScalarFromJSON(struct_({field("a", struct_({field("b", float64())}))}), "[[null]]"), + ScalarFromJSON(float64(), "null")); + + ExpectRefIs( + FieldRef("a", "b"), + ScalarFromJSON(struct_({field("a", struct_({field("b", float64())}))}), "[null]"), + ScalarFromJSON(float64(), "null")); } Result NaiveExecuteScalarExpression(const Expression& expr, const Datum& input) { @@ -697,6 +750,18 @@ TEST(Expression, ExecuteCall) { {"a": 0.0}, {"a": -1} ])")); + + ExpectExecute( + call("add", {field_ref(FieldRef("a", "a")), field_ref(FieldRef("a", "b"))}), + ArrayFromJSON(struct_({field("a", struct_({ + field("a", float64()), + field("b", float64()), + }))}), + R"([ + {"a": {"a": 6.125, "b": 3.375}}, + {"a": {"a": 0.0, "b": 1}}, + {"a": {"a": -1, "b": 4.75}} + ])")); } TEST(Expression, ExecuteDictionaryTransparent) { diff --git a/cpp/src/arrow/compute/exec/hash_join.cc b/cpp/src/arrow/compute/exec/hash_join.cc index 8bbd81824510e..a89e23796d4b9 100644 --- a/cpp/src/arrow/compute/exec/hash_join.cc +++ b/cpp/src/arrow/compute/exec/hash_join.cc @@ -24,6 +24,7 @@ #include #include +#include "arrow/compute/exec/hash_join_dict.h" #include "arrow/compute/exec/task_util.h" #include "arrow/compute/kernels/row_encoder.h" @@ -96,6 +97,7 @@ class HashJoinBasicImpl : public HashJoinImpl { local_states_[i].is_initialized = false; local_states_[i].is_has_match_initialized = false; } + dict_probe_.Init(num_threads); has_hash_table_ = false; num_batches_produced_.store(0); @@ -144,12 +146,13 @@ class HashJoinBasicImpl : public HashJoinImpl { if (has_payload) { InitEncoder(0, HashJoinProjection::PAYLOAD, &local_state.exec_batch_payloads); } + local_state.is_initialized = true; } } Status EncodeBatch(int side, HashJoinProjection projection_handle, RowEncoder* encoder, - const ExecBatch& batch) { + const ExecBatch& batch, ExecBatch* opt_projected_batch = nullptr) { ExecBatch projected({}, batch.length); int num_cols = schema_mgr_->proj_maps[side].num_cols(projection_handle); projected.values.resize(num_cols); @@ -160,6 +163,10 @@ class HashJoinBasicImpl : public HashJoinImpl { projected.values[icol] = batch.values[to_input.get(icol)]; } + if (opt_projected_batch) { + *opt_projected_batch = projected; + } + return encoder->EncodeAndAppend(projected); } @@ -170,6 +177,8 @@ class HashJoinBasicImpl : public HashJoinImpl { std::vector* output_no_match, std::vector* output_match_left, std::vector* output_match_right) { + InitHasMatchIfNeeded(local_state); + ARROW_DCHECK(has_hash_table_); InitHasMatchIfNeeded(local_state); @@ -311,6 +320,8 @@ class HashJoinBasicImpl : public HashJoinImpl { ARROW_DCHECK(opt_right_ids); ARROW_ASSIGN_OR_RAISE(right_key, hash_table_keys_.Decode(batch_size_next, opt_right_ids)); + // Post process build side keys that use dictionary + RETURN_NOT_OK(dict_build_.PostDecode(schema_mgr_->proj_maps[1], &right_key, ctx_)); } if (has_right_payload) { 
ARROW_ASSIGN_OR_RAISE(right_payload, @@ -368,13 +379,48 @@ class HashJoinBasicImpl : public HashJoinImpl { return Status::OK(); } + void NullInfoFromBatch(const ExecBatch& batch, + std::vector* nn_bit_vectors, + std::vector* nn_offsets, + std::vector* nn_bit_vector_all_nulls) { + int num_cols = static_cast(batch.values.size()); + nn_bit_vectors->resize(num_cols); + nn_offsets->resize(num_cols); + nn_bit_vector_all_nulls->clear(); + for (int64_t i = 0; i < num_cols; ++i) { + const uint8_t* nn = nullptr; + int64_t offset = 0; + if (batch[i].is_array()) { + if (batch[i].array()->buffers[0] != NULLPTR) { + nn = batch[i].array()->buffers[0]->data(); + offset = batch[i].array()->offset; + } + } else { + ARROW_DCHECK(batch[i].is_scalar()); + if (!batch[i].scalar_as().is_valid) { + if (nn_bit_vector_all_nulls->empty()) { + nn_bit_vector_all_nulls->resize(BitUtil::BytesForBits(batch.length)); + memset(nn_bit_vector_all_nulls->data(), 0, + BitUtil::BytesForBits(batch.length)); + } + nn = nn_bit_vector_all_nulls->data(); + } + } + (*nn_bit_vectors)[i] = nn; + (*nn_offsets)[i] = offset; + } + } + Status ProbeBatch(size_t thread_index, const ExecBatch& batch) { ThreadLocalState& local_state = local_states_[thread_index]; InitLocalStateIfNeeded(thread_index); local_state.exec_batch_keys.Clear(); - RETURN_NOT_OK( - EncodeBatch(0, HashJoinProjection::KEY, &local_state.exec_batch_keys, batch)); + + ExecBatch batch_key_for_lookups; + + RETURN_NOT_OK(EncodeBatch(0, HashJoinProjection::KEY, &local_state.exec_batch_keys, + batch, &batch_key_for_lookups)); bool has_left_payload = (schema_mgr_->proj_maps[0].num_cols(HashJoinProjection::PAYLOAD) > 0); if (has_left_payload) { @@ -388,26 +434,24 @@ class HashJoinBasicImpl : public HashJoinImpl { local_state.match_left.clear(); local_state.match_right.clear(); + bool use_key_batch_for_dicts = dict_probe_.BatchRemapNeeded( + thread_index, schema_mgr_->proj_maps[0], schema_mgr_->proj_maps[1], ctx_); + RowEncoder* row_encoder_for_lookups = &local_state.exec_batch_keys; + if (use_key_batch_for_dicts) { + RETURN_NOT_OK(dict_probe_.EncodeBatch( + thread_index, schema_mgr_->proj_maps[0], schema_mgr_->proj_maps[1], dict_build_, + batch, &row_encoder_for_lookups, &batch_key_for_lookups, ctx_)); + } + + // Collect information about all nulls in key columns. 
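[Editor's note] NullInfoFromBatch normalizes validity across Datum kinds before probing: an array contributes its own bitmap (nullptr meaning all-valid), while a null scalar key is expanded once into a shared all-zero bitmap. The decision rule, restated with placeholders (value, all_zero_bitmap):

    const uint8_t* validity = nullptr;  // nullptr => every row valid
    int64_t bit_offset = 0;
    if (value.is_array()) {
      if (value.array()->buffers[0] != NULLPTR) {
        validity = value.array()->buffers[0]->data();  // per-row bitmap
        bit_offset = value.array()->offset;
      }
    } else if (!value.scalar()->is_valid) {
      validity = all_zero_bitmap.data();  // one zeroed bitmap marks all rows null
    }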
+ // std::vector non_null_bit_vectors; std::vector non_null_bit_vector_offsets; - int num_key_cols = schema_mgr_->proj_maps[0].num_cols(HashJoinProjection::KEY); - non_null_bit_vectors.resize(num_key_cols); - non_null_bit_vector_offsets.resize(num_key_cols); - auto from_batch = - schema_mgr_->proj_maps[0].map(HashJoinProjection::KEY, HashJoinProjection::INPUT); - for (int i = 0; i < num_key_cols; ++i) { - int input_col_id = from_batch.get(i); - const uint8_t* non_nulls = nullptr; - int64_t offset = 0; - if (batch[input_col_id].array()->buffers[0] != NULLPTR) { - non_nulls = batch[input_col_id].array()->buffers[0]->data(); - offset = batch[input_col_id].array()->offset; - } - non_null_bit_vectors[i] = non_nulls; - non_null_bit_vector_offsets[i] = offset; - } + std::vector all_nulls; + NullInfoFromBatch(batch_key_for_lookups, &non_null_bit_vectors, + &non_null_bit_vector_offsets, &all_nulls); - ProbeBatch_Lookup(&local_state, local_state.exec_batch_keys, non_null_bit_vectors, + ProbeBatch_Lookup(&local_state, *row_encoder_for_lookups, non_null_bit_vectors, non_null_bit_vector_offsets, &local_state.match, &local_state.no_match, &local_state.match_left, &local_state.match_right); @@ -427,7 +471,7 @@ class HashJoinBasicImpl : public HashJoinImpl { if (batches.empty()) { hash_table_empty_ = true; } else { - InitEncoder(1, HashJoinProjection::KEY, &hash_table_keys_); + dict_build_.InitEncoder(schema_mgr_->proj_maps[1], &hash_table_keys_, ctx_); bool has_payload = (schema_mgr_->proj_maps[1].num_cols(HashJoinProjection::PAYLOAD) > 0); if (has_payload) { @@ -441,11 +485,14 @@ class HashJoinBasicImpl : public HashJoinImpl { const ExecBatch& batch = batches[ibatch]; if (batch.length == 0) { continue; - } else { + } else if (hash_table_empty_) { hash_table_empty_ = false; + + RETURN_NOT_OK(dict_build_.Init(schema_mgr_->proj_maps[1], &batch, ctx_)); } int32_t num_rows_before = hash_table_keys_.num_rows(); - RETURN_NOT_OK(EncodeBatch(1, HashJoinProjection::KEY, &hash_table_keys_, batch)); + RETURN_NOT_OK(dict_build_.EncodeBatch(thread_index, schema_mgr_->proj_maps[1], + batch, &hash_table_keys_, ctx_)); if (has_payload) { RETURN_NOT_OK( EncodeBatch(1, HashJoinProjection::PAYLOAD, &hash_table_payloads_, batch)); @@ -456,6 +503,11 @@ class HashJoinBasicImpl : public HashJoinImpl { } } } + + if (hash_table_empty_) { + RETURN_NOT_OK(dict_build_.Init(schema_mgr_->proj_maps[1], nullptr, ctx_)); + } + return Status::OK(); } @@ -713,6 +765,11 @@ class HashJoinBasicImpl : public HashJoinImpl { std::vector has_match_; bool hash_table_empty_; + // Dictionary handling + // + HashJoinDictBuildMulti dict_build_; + HashJoinDictProbeMulti dict_probe_; + std::vector left_batches_; bool has_hash_table_; std::mutex left_batches_mutex_; diff --git a/cpp/src/arrow/compute/exec/hash_join.h b/cpp/src/arrow/compute/exec/hash_join.h index a2312e09653c7..6520e4ae4a3f3 100644 --- a/cpp/src/arrow/compute/exec/hash_join.h +++ b/cpp/src/arrow/compute/exec/hash_join.h @@ -31,10 +31,6 @@ namespace arrow { namespace compute { -// Identifiers for all different row schemas that are used in a join -// -enum class HashJoinProjection : int { INPUT = 0, KEY = 1, PAYLOAD = 2, OUTPUT = 3 }; - class ARROW_EXPORT HashJoinSchema { public: Status Init(JoinType join_type, const Schema& left_schema, @@ -70,6 +66,7 @@ class ARROW_EXPORT HashJoinSchema { SchemaProjectionMaps proj_maps[2]; private: + static bool IsTypeSupported(const DataType& type); static Result> VectorDiff(const Schema& schema, const std::vector& a, const std::vector& b); diff --git 
diff --git a/cpp/src/arrow/compute/exec/hash_join_dict.cc b/cpp/src/arrow/compute/exec/hash_join_dict.cc
new file mode 100644
index 0000000000000..195331a597604
--- /dev/null
+++ b/cpp/src/arrow/compute/exec/hash_join_dict.cc
@@ -0,0 +1,665 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/hash_join_dict.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+
+bool HashJoinDictUtil::KeyDataTypesValid(
+    const std::shared_ptr<DataType>& probe_data_type,
+    const std::shared_ptr<DataType>& build_data_type) {
+  bool l_is_dict = (probe_data_type->id() == Type::DICTIONARY);
+  bool r_is_dict = (build_data_type->id() == Type::DICTIONARY);
+  DataType* l_type;
+  if (l_is_dict) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*probe_data_type);
+    l_type = dict_type.value_type().get();
+  } else {
+    l_type = probe_data_type.get();
+  }
+  DataType* r_type;
+  if (r_is_dict) {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*build_data_type);
+    r_type = dict_type.value_type().get();
+  } else {
+    r_type = build_data_type.get();
+  }
+  return l_type->Equals(*r_type);
+}
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictUtil::IndexRemapUsingLUT(
+    ExecContext* ctx, const Datum& indices, int64_t batch_length,
+    const std::shared_ptr<ArrayData>& map_array,
+    const std::shared_ptr<DataType>& data_type) {
+  ARROW_DCHECK(indices.is_array() || indices.is_scalar());
+
+  const uint8_t* map_non_nulls = map_array->buffers[0]->data();
+  const int32_t* map = reinterpret_cast<const int32_t*>(map_array->buffers[1]->data());
+
+  ARROW_DCHECK(data_type->id() == Type::DICTIONARY);
+  const auto& dict_type = checked_cast<const DictionaryType&>(*data_type);
+
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<ArrayData> result,
+      ConvertToInt32(dict_type.index_type(), indices, batch_length, ctx));
+
+  uint8_t* nns = result->buffers[0]->mutable_data();
+  int32_t* ids = reinterpret_cast<int32_t*>(result->buffers[1]->mutable_data());
+  for (int64_t i = 0; i < batch_length; ++i) {
+    bool is_null = !BitUtil::GetBit(nns, i);
+    if (is_null) {
+      ids[i] = kNullId;
+    } else {
+      ARROW_DCHECK(ids[i] >= 0 && ids[i] < map_array->length);
+      if (!BitUtil::GetBit(map_non_nulls, ids[i])) {
+        BitUtil::ClearBit(nns, i);
+        ids[i] = kNullId;
+      } else {
+        ids[i] = map[ids[i]];
+      }
+    }
+  }
+
+  return result;
+}
+
+namespace {
+template <typename FROM, typename TO>
+static Result<std::shared_ptr<ArrayData>> ConvertImp(
+    const std::shared_ptr<DataType>& to_type, const Datum& input, int64_t batch_length,
+    ExecContext* ctx) {
+  ARROW_DCHECK(input.is_array() || input.is_scalar());
+  bool is_scalar = input.is_scalar();
+
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> to_buf,
+                        AllocateBuffer(batch_length * sizeof(TO), ctx->memory_pool()));
+  TO* to = reinterpret_cast<TO*>(to_buf->mutable_data());
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> to_nn_buf,
+                        AllocateBitmap(batch_length, ctx->memory_pool()));
+  uint8_t* to_nn = to_nn_buf->mutable_data();
+  memset(to_nn, 0xff, BitUtil::BytesForBits(batch_length));
+
+  if (!is_scalar) {
+    const ArrayData& arr = *input.array();
+    const FROM* from = arr.GetValues<FROM>(1);
+    DCHECK_EQ(arr.length, batch_length);
+
+    for (int64_t i = 0; i < arr.length; ++i) {
+      to[i] = static_cast<TO>(from[i]);
+      // Make sure we did not lose information during the cast
+      ARROW_DCHECK(static_cast<FROM>(to[i]) == from[i]);
+
+      bool is_null = (arr.buffers[0] != NULLPTR) &&
+                     !BitUtil::GetBit(arr.buffers[0]->data(), arr.offset + i);
+      if (is_null) {
+        BitUtil::ClearBit(to_nn, i);
+      }
+    }
+
+    // Pass null buffer unchanged
+    return ArrayData::Make(to_type, arr.length,
+                           {std::move(to_nn_buf), std::move(to_buf)});
+  } else {
+    const auto& scalar = input.scalar_as<arrow::internal::PrimitiveScalarBase>();
+    if (scalar.is_valid) {
+      const util::string_view data = scalar.view();
+      DCHECK_EQ(data.size(), sizeof(FROM));
+      const FROM from = *reinterpret_cast<const FROM*>(data.data());
+      const TO to_value = static_cast<TO>(from);
+      // Make sure we did not lose information during the cast
+      ARROW_DCHECK(static_cast<FROM>(to_value) == from);
+
+      for (int64_t i = 0; i < batch_length; ++i) {
+        to[i] = to_value;
+      }
+
+      memset(to_nn, 0xff, BitUtil::BytesForBits(batch_length));
+      return ArrayData::Make(to_type, batch_length,
+                             {std::move(to_nn_buf), std::move(to_buf)});
+    } else {
+      memset(to_nn, 0, BitUtil::BytesForBits(batch_length));
+      return ArrayData::Make(to_type, batch_length,
+                             {std::move(to_nn_buf), std::move(to_buf)});
+    }
+  }
+}
+}  // namespace
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictUtil::ConvertToInt32(
+    const std::shared_ptr<DataType>& from_type, const Datum& input, int64_t batch_length,
+    ExecContext* ctx) {
+  switch (from_type->id()) {
+    case Type::UINT8:
+      return ConvertImp<uint8_t, int32_t>(int32(), input, batch_length, ctx);
+    case Type::INT8:
+      return ConvertImp<int8_t, int32_t>(int32(), input, batch_length, ctx);
+    case Type::UINT16:
+      return ConvertImp<uint16_t, int32_t>(int32(), input, batch_length, ctx);
+    case Type::INT16:
+      return ConvertImp<int16_t, int32_t>(int32(), input, batch_length, ctx);
+    case Type::UINT32:
+      return ConvertImp<uint32_t, int32_t>(int32(), input, batch_length, ctx);
+    case Type::INT32:
+      return ConvertImp<int32_t, int32_t>(int32(), input, batch_length, ctx);
+    case Type::UINT64:
+      return ConvertImp<uint64_t, int32_t>(int32(), input, batch_length, ctx);
+    case Type::INT64:
+      return ConvertImp<int64_t, int32_t>(int32(), input, batch_length, ctx);
+    default:
+      ARROW_DCHECK(false);
+      return nullptr;
+  }
+}
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictUtil::ConvertFromInt32(
+    const std::shared_ptr<DataType>& to_type, const Datum& input, int64_t batch_length,
+    ExecContext* ctx) {
+  switch (to_type->id()) {
+    case Type::UINT8:
+      return ConvertImp<int32_t, uint8_t>(to_type, input, batch_length, ctx);
+    case Type::INT8:
+      return ConvertImp<int32_t, int8_t>(to_type, input, batch_length, ctx);
+    case Type::UINT16:
+      return ConvertImp<int32_t, uint16_t>(to_type, input, batch_length, ctx);
+    case Type::INT16:
+      return ConvertImp<int32_t, int16_t>(to_type, input, batch_length, ctx);
+    case Type::UINT32:
+      return ConvertImp<int32_t, uint32_t>(to_type, input, batch_length, ctx);
+    case Type::INT32:
+      return ConvertImp<int32_t, int32_t>(to_type, input, batch_length, ctx);
+    case Type::UINT64:
+      return ConvertImp<int32_t, uint64_t>(to_type, input, batch_length, ctx);
+    case Type::INT64:
+      return ConvertImp<int32_t, int64_t>(to_type, input, batch_length, ctx);
+    default:
+      ARROW_DCHECK(false);
+      return nullptr;
+  }
+}
+
+std::shared_ptr<Array> HashJoinDictUtil::ExtractDictionary(const Datum& data) {
+  return data.is_array() ?
+      MakeArray(data.array()->dictionary)
+                         : data.scalar_as<DictionaryScalar>().value.dictionary;
+}
+
+Status HashJoinDictBuild::Init(ExecContext* ctx, std::shared_ptr<Array> dictionary,
+                               std::shared_ptr<DataType> index_type,
+                               std::shared_ptr<DataType> value_type) {
+  index_type_ = std::move(index_type);
+  value_type_ = std::move(value_type);
+  hash_table_.clear();
+
+  if (!dictionary) {
+    ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(value_type_, 0));
+    unified_dictionary_ = dict->data();
+    return Status::OK();
+  }
+
+  dictionary_ = dictionary;
+
+  // Initialize encoder
+  internal::RowEncoder encoder;
+  std::vector<ValueDescr> encoder_types;
+  encoder_types.emplace_back(value_type_, ValueDescr::ARRAY);
+  encoder.Init(encoder_types, ctx);
+
+  // Encode all dictionary values
+  int64_t length = dictionary->data()->length;
+  if (length >= std::numeric_limits<int32_t>::max()) {
+    return Status::Invalid(
+        "Dictionary length in hash join must fit into a signed 32-bit integer.");
+  }
+  ExecBatch batch({dictionary->data()}, length);
+  RETURN_NOT_OK(encoder.EncodeAndAppend(batch));
+
+  std::vector<int32_t> entries_to_take;
+
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> non_nulls_buf,
+                        AllocateBitmap(length, ctx->memory_pool()));
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> ids_buf,
+                        AllocateBuffer(length * sizeof(int32_t), ctx->memory_pool()));
+  uint8_t* non_nulls = non_nulls_buf->mutable_data();
+  int32_t* ids = reinterpret_cast<int32_t*>(ids_buf->mutable_data());
+  memset(non_nulls, 0xff, BitUtil::BytesForBits(length));
+
+  int32_t num_entries = 0;
+  for (int64_t i = 0; i < length; ++i) {
+    std::string str = encoder.encoded_row(static_cast<int32_t>(i));
+
+    // Do not insert null values into the resulting dictionary.
+    // Null values will always be represented as null, not as an id pointing to a
+    // dictionary entry for null.
+    //
+    if (internal::KeyEncoder::IsNull(reinterpret_cast<const uint8_t*>(str.data()))) {
+      ids[i] = HashJoinDictUtil::kNullId;
+      BitUtil::ClearBit(non_nulls, i);
+      continue;
+    }
+
+    auto iter = hash_table_.find(str);
+    if (iter == hash_table_.end()) {
+      hash_table_.insert(std::make_pair(str, num_entries));
+      ids[i] = num_entries;
+      entries_to_take.push_back(static_cast<int32_t>(i));
+      ++num_entries;
+    } else {
+      ids[i] = iter->second;
+    }
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto out, encoder.Decode(num_entries, entries_to_take.data()));
+
+  unified_dictionary_ = out[0].array();
+  remapped_ids_ = ArrayData::Make(DataTypeAfterRemapping(), length,
+                                  {std::move(non_nulls_buf), std::move(ids_buf)});
+
+  return Status::OK();
+}
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictBuild::RemapInputValues(
+    ExecContext* ctx, const Datum& values, int64_t batch_length) const {
+  // Initialize encoder
+  //
+  internal::RowEncoder encoder;
+  std::vector<ValueDescr> encoder_types;
+  encoder_types.emplace_back(value_type_, ValueDescr::ARRAY);
+  encoder.Init(encoder_types, ctx);
+
+  // Encode all input values
+  //
+  ARROW_DCHECK(values.is_array() || values.is_scalar());
+  bool is_scalar = values.is_scalar();
+  int64_t encoded_length = is_scalar ? 1 : batch_length;
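+  // For scalar input only this single row is encoded; the remapped id of that row is
+  // broadcast to all batch_length output slots further below.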
+  ExecBatch batch({values}, encoded_length);
+  RETURN_NOT_OK(encoder.EncodeAndAppend(batch));
+
+  // Allocate output buffers
+  //
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> non_nulls_buf,
+                        AllocateBitmap(batch_length, ctx->memory_pool()));
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Buffer> ids_buf,
+      AllocateBuffer(batch_length * sizeof(int32_t), ctx->memory_pool()));
+  uint8_t* non_nulls = non_nulls_buf->mutable_data();
+  int32_t* ids = reinterpret_cast<int32_t*>(ids_buf->mutable_data());
+  memset(non_nulls, 0xff, BitUtil::BytesForBits(batch_length));
+
+  // Populate output buffers (for scalar input only the first entry is populated)
+  //
+  for (int64_t i = 0; i < encoded_length; ++i) {
+    std::string str = encoder.encoded_row(static_cast<int32_t>(i));
+    if (internal::KeyEncoder::IsNull(reinterpret_cast<const uint8_t*>(str.data()))) {
+      // Map nulls to nulls
+      BitUtil::ClearBit(non_nulls, i);
+      ids[i] = HashJoinDictUtil::kNullId;
+    } else {
+      auto iter = hash_table_.find(str);
+      if (iter == hash_table_.end()) {
+        ids[i] = HashJoinDictUtil::kMissingValueId;
+      } else {
+        ids[i] = iter->second;
+      }
+    }
+  }
+
+  // Generate an array of repeated values for scalar input
+  //
+  if (is_scalar) {
+    if (!BitUtil::GetBit(non_nulls, 0)) {
+      memset(non_nulls, 0, BitUtil::BytesForBits(batch_length));
+    }
+    for (int64_t i = 1; i < batch_length; ++i) {
+      ids[i] = ids[0];
+    }
+  }
+
+  return ArrayData::Make(DataTypeAfterRemapping(), batch_length,
+                         {std::move(non_nulls_buf), std::move(ids_buf)});
+}
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictBuild::RemapInput(
+    ExecContext* ctx, const Datum& indices, int64_t batch_length,
+    const std::shared_ptr<DataType>& data_type) const {
+  auto dict = HashJoinDictUtil::ExtractDictionary(indices);
+
+  if (!dictionary_->Equals(dict)) {
+    return Status::NotImplemented("Unifying differing dictionaries");
+  }
+
+  return HashJoinDictUtil::IndexRemapUsingLUT(ctx, indices, batch_length, remapped_ids_,
+                                              data_type);
+}
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictBuild::RemapOutput(
+    const ArrayData& indices32Bit, ExecContext* ctx) const {
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> indices,
+                        HashJoinDictUtil::ConvertFromInt32(
+                            index_type_, Datum(indices32Bit), indices32Bit.length, ctx));
+
+  auto type = std::make_shared<DictionaryType>(index_type_, value_type_);
+  return ArrayData::Make(type, indices->length, indices->buffers, {},
+                         unified_dictionary_);
+}
+
+void HashJoinDictBuild::CleanUp() {
+  index_type_.reset();
+  value_type_.reset();
+  hash_table_.clear();
+  remapped_ids_.reset();
+  unified_dictionary_.reset();
+}
+
+bool HashJoinDictProbe::KeyNeedsProcessing(
+    const std::shared_ptr<DataType>& probe_data_type,
+    const std::shared_ptr<DataType>& build_data_type) {
+  bool l_is_dict = (probe_data_type->id() == Type::DICTIONARY);
+  bool r_is_dict = (build_data_type->id() == Type::DICTIONARY);
+  return l_is_dict || r_is_dict;
+}
+
+std::shared_ptr<DataType> HashJoinDictProbe::DataTypeAfterRemapping(
+    const std::shared_ptr<DataType>& build_data_type) {
+  bool r_is_dict = (build_data_type->id() == Type::DICTIONARY);
+  if (r_is_dict) {
+    return HashJoinDictBuild::DataTypeAfterRemapping();
+  } else {
+    return build_data_type;
+  }
+}
+
+Result<std::shared_ptr<ArrayData>> HashJoinDictProbe::RemapInput(
+    const HashJoinDictBuild* opt_build_side, const Datum& data, int64_t batch_length,
+    const std::shared_ptr<DataType>& probe_data_type,
+    const std::shared_ptr<DataType>& build_data_type, ExecContext* ctx) {
+  // Cases:
+  // 1. Dictionary(probe)-Dictionary(build)
+  // 2. Dictionary(probe)-Value(build)
+  // 3. Value(probe)-Dictionary(build)
+  //
+  bool l_is_dict = (probe_data_type->id() == Type::DICTIONARY);
+  bool r_is_dict = (build_data_type->id() == Type::DICTIONARY);
+  if (l_is_dict) {
+    auto dict = HashJoinDictUtil::ExtractDictionary(data);
+    const auto& dict_type = checked_cast<const DictionaryType&>(*probe_data_type);
+
+    // Verify that the dictionary is always the same.
+    if (dictionary_) {
+      if (!dictionary_->Equals(dict)) {
+        return Status::NotImplemented(
+            "Unifying differing dictionaries for probe key of hash join");
+      }
+    } else {
+      dictionary_ = dict;
+
+      // Precompute helper data for the given dictionary if this is the first call.
+      if (r_is_dict) {
+        ARROW_DCHECK(opt_build_side);
+        ARROW_ASSIGN_OR_RAISE(
+            remapped_ids_,
+            opt_build_side->RemapInputValues(ctx, Datum(dict->data()), dict->length()));
+      } else {
+        std::vector<ValueDescr> encoder_types;
+        encoder_types.emplace_back(dict_type.value_type(), ValueDescr::ARRAY);
+        encoder_.Init(encoder_types, ctx);
+        ExecBatch batch({dict->data()}, dict->length());
+        RETURN_NOT_OK(encoder_.EncodeAndAppend(batch));
+      }
+    }
+
+    if (r_is_dict) {
+      // CASE 1:
+      // Remap dictionary ids
+      return HashJoinDictUtil::IndexRemapUsingLUT(ctx, data, batch_length, remapped_ids_,
+                                                  probe_data_type);
+    } else {
+      // CASE 2:
+      // Decode selected rows from the encoder.
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> row_ids_arr,
+                            HashJoinDictUtil::ConvertToInt32(dict_type.index_type(), data,
+                                                             batch_length, ctx));
+      // Change nulls to internal::RowEncoder::kRowIdForNulls() in the index.
+      int32_t* row_ids =
+          reinterpret_cast<int32_t*>(row_ids_arr->buffers[1]->mutable_data());
+      const uint8_t* non_nulls = row_ids_arr->buffers[0]->data();
+      for (int64_t i = 0; i < batch_length; ++i) {
+        if (!BitUtil::GetBit(non_nulls, i)) {
+          row_ids[i] = internal::RowEncoder::kRowIdForNulls();
+        }
+      }
+
+      ARROW_ASSIGN_OR_RAISE(ExecBatch batch, encoder_.Decode(batch_length, row_ids));
+      return batch.values[0].array();
+    }
+  } else {
+    // CASE 3:
+    // Map values to dictionary ids from the build side.
+    // Values missing in the dictionary will get assigned a special constant
+    // HashJoinDictUtil::kMissingValueId (different from any valid id).
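+    // For example (illustrative values only): with a unified build-side dictionary
+    // ["a", "b"], probe values ["b", "x", null] are remapped to the int32 ids
+    // [1, kMissingValueId, null].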
+    //
+    ARROW_DCHECK(r_is_dict);
+    ARROW_DCHECK(opt_build_side);
+    return opt_build_side->RemapInputValues(ctx, data, batch_length);
+  }
+}
+
+void HashJoinDictProbe::CleanUp() {
+  dictionary_.reset();
+  remapped_ids_.reset();
+  encoder_.Clear();
+}
+
+Status HashJoinDictBuildMulti::Init(
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+    const ExecBatch* opt_non_empty_batch, ExecContext* ctx) {
+  int num_keys = proj_map.num_cols(HashJoinProjection::KEY);
+  needs_remap_.resize(num_keys);
+  remap_imp_.resize(num_keys);
+  for (int i = 0; i < num_keys; ++i) {
+    needs_remap_[i] = HashJoinDictBuild::KeyNeedsProcessing(
+        proj_map.data_type(HashJoinProjection::KEY, i));
+  }
+
+  bool build_side_empty = (opt_non_empty_batch == nullptr);
+
+  if (!build_side_empty) {
+    auto key_to_input = proj_map.map(HashJoinProjection::KEY, HashJoinProjection::INPUT);
+    for (int i = 0; i < num_keys; ++i) {
+      const std::shared_ptr<DataType>& data_type =
+          proj_map.data_type(HashJoinProjection::KEY, i);
+      if (data_type->id() == Type::DICTIONARY) {
+        const auto& dict_type = checked_cast<const DictionaryType&>(*data_type);
+        const auto& dict = HashJoinDictUtil::ExtractDictionary(
+            opt_non_empty_batch->values[key_to_input.get(i)]);
+        RETURN_NOT_OK(remap_imp_[i].Init(ctx, dict, dict_type.index_type(),
+                                         dict_type.value_type()));
+      }
+    }
+  } else {
+    for (int i = 0; i < num_keys; ++i) {
+      const std::shared_ptr<DataType>& data_type =
+          proj_map.data_type(HashJoinProjection::KEY, i);
+      if (data_type->id() == Type::DICTIONARY) {
+        const auto& dict_type = checked_cast<const DictionaryType&>(*data_type);
+        RETURN_NOT_OK(remap_imp_[i].Init(ctx, nullptr, dict_type.index_type(),
+                                         dict_type.value_type()));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void HashJoinDictBuildMulti::InitEncoder(
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map, RowEncoder* encoder,
+    ExecContext* ctx) {
+  int num_cols = proj_map.num_cols(HashJoinProjection::KEY);
+  std::vector<ValueDescr> data_types(num_cols);
+  for (int icol = 0; icol < num_cols; ++icol) {
+    std::shared_ptr<DataType> data_type =
+        proj_map.data_type(HashJoinProjection::KEY, icol);
+    if (HashJoinDictBuild::KeyNeedsProcessing(data_type)) {
+      data_type = HashJoinDictBuild::DataTypeAfterRemapping();
+    }
+    data_types[icol] = ValueDescr(data_type, ValueDescr::ARRAY);
+  }
+  encoder->Init(data_types, ctx);
+}
+
+Status HashJoinDictBuildMulti::EncodeBatch(
+    size_t thread_index, const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+    const ExecBatch& batch, RowEncoder* encoder, ExecContext* ctx) const {
+  ExecBatch projected({}, batch.length);
+  int num_cols = proj_map.num_cols(HashJoinProjection::KEY);
+  projected.values.resize(num_cols);
+
+  auto to_input = proj_map.map(HashJoinProjection::KEY, HashJoinProjection::INPUT);
+  for (int icol = 0; icol < num_cols; ++icol) {
+    projected.values[icol] = batch.values[to_input.get(icol)];
+
+    if (needs_remap_[icol]) {
+      ARROW_ASSIGN_OR_RAISE(
+          projected.values[icol],
+          remap_imp_[icol].RemapInput(ctx, projected.values[icol], batch.length,
+                                      proj_map.data_type(HashJoinProjection::KEY, icol)));
+    }
+  }
+  return encoder->EncodeAndAppend(projected);
+}
+
+Status HashJoinDictBuildMulti::PostDecode(
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+    ExecBatch* decoded_key_batch, ExecContext* ctx) {
+  // Post-process build side keys that use dictionaries
+  int num_keys = proj_map.num_cols(HashJoinProjection::KEY);
+  for (int i = 0; i < num_keys; ++i) {
+    if (needs_remap_[i]) {
+      ARROW_ASSIGN_OR_RAISE(
+          decoded_key_batch->values[i],
+          remap_imp_[i].RemapOutput(*decoded_key_batch->values[i].array(), ctx));
+    }
+  }
+  return Status::OK();
+}
+
+void HashJoinDictProbeMulti::Init(size_t num_threads) {
+  local_states_.resize(num_threads);
+  for (size_t i = 0; i < local_states_.size(); ++i) {
+    local_states_[i].is_initialized = false;
+  }
+}
+
+bool HashJoinDictProbeMulti::BatchRemapNeeded(
+    size_t thread_index, const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map_build, ExecContext* ctx) {
+  InitLocalStateIfNeeded(thread_index, proj_map_probe, proj_map_build, ctx);
+  return local_states_[thread_index].any_needs_remap;
+}
+
+void HashJoinDictProbeMulti::InitLocalStateIfNeeded(
+    size_t thread_index, const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map_build, ExecContext* ctx) {
+  ThreadLocalState& local_state = local_states_[thread_index];
+
+  // Check if we need to remap any of the input keys because of dictionary encoding
+  // on either side of the join
+  //
+  int num_cols = proj_map_probe.num_cols(HashJoinProjection::KEY);
+  local_state.any_needs_remap = false;
+  local_state.needs_remap.resize(num_cols);
+  local_state.remap_imp.resize(num_cols);
+  for (int i = 0; i < num_cols; ++i) {
+    local_state.needs_remap[i] = HashJoinDictProbe::KeyNeedsProcessing(
+        proj_map_probe.data_type(HashJoinProjection::KEY, i),
+        proj_map_build.data_type(HashJoinProjection::KEY, i));
+    if (local_state.needs_remap[i]) {
+      local_state.any_needs_remap = true;
+    }
+  }
+
+  if (local_state.any_needs_remap) {
+    InitEncoder(proj_map_probe, proj_map_build, &local_state.post_remap_encoder, ctx);
+  }
+}
+
+void HashJoinDictProbeMulti::InitEncoder(
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map_build, RowEncoder* encoder,
+    ExecContext* ctx) {
+  int num_cols = proj_map_probe.num_cols(HashJoinProjection::KEY);
+  std::vector<ValueDescr> data_types(num_cols);
+  for (int icol = 0; icol < num_cols; ++icol) {
+    std::shared_ptr<DataType> data_type =
+        proj_map_probe.data_type(HashJoinProjection::KEY, icol);
+    std::shared_ptr<DataType> build_data_type =
+        proj_map_build.data_type(HashJoinProjection::KEY, icol);
+    if (HashJoinDictProbe::KeyNeedsProcessing(data_type, build_data_type)) {
+      data_type = HashJoinDictProbe::DataTypeAfterRemapping(build_data_type);
+    }
+    data_types[icol] = ValueDescr(data_type, ValueDescr::ARRAY);
+  }
+  encoder->Init(data_types, ctx);
+}
+
+Status HashJoinDictProbeMulti::EncodeBatch(
+    size_t thread_index, const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+    const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
+    const HashJoinDictBuildMulti& dict_build, const ExecBatch& batch,
+    RowEncoder** out_encoder, ExecBatch* opt_out_key_batch, ExecContext* ctx) {
+  ThreadLocalState& local_state = local_states_[thread_index];
+  InitLocalStateIfNeeded(thread_index, proj_map_probe, proj_map_build, ctx);
+
+  ExecBatch projected({}, batch.length);
+  int num_cols = proj_map_probe.num_cols(HashJoinProjection::KEY);
+  projected.values.resize(num_cols);
+
+  auto to_input = proj_map_probe.map(HashJoinProjection::KEY, HashJoinProjection::INPUT);
+  for (int icol = 0; icol < num_cols; ++icol) {
+    projected.values[icol] = batch.values[to_input.get(icol)];
+
+    if (local_state.needs_remap[icol]) {
+      ARROW_ASSIGN_OR_RAISE(
+          projected.values[icol],
+          local_state.remap_imp[icol].RemapInput(
+              &(dict_build.get_dict_build(icol)), projected.values[icol], batch.length,
+              proj_map_probe.data_type(HashJoinProjection::KEY, icol),
+              proj_map_build.data_type(HashJoinProjection::KEY, icol), ctx));
+    }
+  }
+
+  if (opt_out_key_batch) {
+    *opt_out_key_batch = projected;
+  }
+
+  local_state.post_remap_encoder.Clear();
+  RETURN_NOT_OK(local_state.post_remap_encoder.EncodeAndAppend(projected));
+  *out_encoder = &local_state.post_remap_encoder;
+
+  return Status::OK();
+}
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/hash_join_dict.h b/cpp/src/arrow/compute/exec/hash_join_dict.h
new file mode 100644
index 0000000000000..26605cc449a0d
--- /dev/null
+++ b/cpp/src/arrow/compute/exec/hash_join_dict.h
@@ -0,0 +1,315 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include "arrow/compute/exec.h"
+#include "arrow/compute/exec/schema_util.h"
+#include "arrow/compute/kernels/row_encoder.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+
+// This file contains hash join logic related to the handling of dictionary-encoded key
+// columns.
+//
+// A key column from the probe side of the join can be matched against a key column from
+// the build side of the join, as long as the underlying value types are equal. That
+// means that:
+// - both scalars and arrays can be used and even mixed in the same column
+// - a dictionary column can be matched against a non-dictionary column if the underlying
+// value types are equal
+// - a dictionary column can be matched against a dictionary column with a different
+// index type, and potentially using a different dictionary, if the underlying value
+// types are equal
+//
+// We currently require in hash join that for all dictionary-encoded columns, the same
+// dictionary is used in all input exec batches.
+//
+// In order to allow matching columns with different dictionaries, different dictionary
+// index types, and dictionary keys against non-dictionary keys, internally comparisons
+// are evaluated after remapping values on both sides of the join to a common
+// representation (which we call the "unified representation"). This common
+// representation is a column of int32() type (not a dictionary column). It represents an
+// index into the unified dictionary computed for the (only) dictionary present on the
+// build side (an empty dictionary is still created for an empty build side). A null
+// value is always represented in this common representation as a null int32 value; the
+// unified dictionary never contains a null value (so there is no ambiguity between
+// representing nulls as an index to a null entry in the dictionary and as a null index).
+//
+// The unified dictionary represents the values present on the build side. There may be
+// values on the probe side that are not present in it. All such values, if they are not
+// null, are mapped in the common representation to a special constant kMissingValueId.
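+//
+// For example (illustrative values only): if the build side dictionary unifies to
+// ["x", "y"], then probe side values ["y", "z", null] are remapped to the int32 ids
+// [1, kMissingValueId, null] before key comparison.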
+//
+
+namespace arrow {
+namespace compute {
+
+using internal::RowEncoder;
+
+/// Helper class with operations that are stateless and common to the processing of
+/// dictionary keys on both the build and probe side.
+class HashJoinDictUtil {
+ public:
+  // Null values in unified representation are always represented as nulls that have
+  // the corresponding integer set to this constant
+  static constexpr int32_t kNullId = 0;
+  // Constant representing a non-null value that is missing on the build side, in
+  // unified representation.
+  static constexpr int32_t kMissingValueId = -1;
+
+  // Check whether the data types of a corresponding pair of key columns on the build
+  // and probe side are compatible
+  static bool KeyDataTypesValid(const std::shared_ptr<DataType>& probe_data_type,
+                                const std::shared_ptr<DataType>& build_data_type);
+
+  // Input must be a dictionary array or a dictionary scalar.
+  // A precomputed lookup table, provided here in the form of an int32() array, is used
+  // to remap input indices to the unified representation.
+  //
+  static Result<std::shared_ptr<ArrayData>> IndexRemapUsingLUT(
+      ExecContext* ctx, const Datum& indices, int64_t batch_length,
+      const std::shared_ptr<ArrayData>& map_array,
+      const std::shared_ptr<DataType>& data_type);
+
+  // Return an int32() array that contains the indices of the input dictionary array or
+  // scalar after type casting.
+  static Result<std::shared_ptr<ArrayData>> ConvertToInt32(
+      const std::shared_ptr<DataType>& from_type, const Datum& input,
+      int64_t batch_length, ExecContext* ctx);
+
+  // Return an array that contains the elements of the input int32() array after casting
+  // to a given integer type. This is used for mapping the unified representation stored
+  // in the hash table on the build side back to the original input data type of the
+  // hash join, when outputting hash join results to the parent exec node.
+  //
+  static Result<std::shared_ptr<ArrayData>> ConvertFromInt32(
+      const std::shared_ptr<DataType>& to_type, const Datum& input, int64_t batch_length,
+      ExecContext* ctx);
+
+  // Return the dictionary referenced in either a dictionary array or a dictionary scalar
+  static std::shared_ptr<Array> ExtractDictionary(const Datum& data);
+};
+
+/// Implements processing of dictionary arrays/scalars in key columns on the build side
+/// of a hash join.
+/// Each instance of this class corresponds to a single column and stores and
+/// processes only the information related to that column.
+/// Const methods are thread-safe, non-const methods are not (the caller must make sure
+/// that only one thread at any time will access them).
+///
+class HashJoinDictBuild {
+ public:
+  // Returns true if the key column (described in input by its data type) requires any
+  // pre- or post-processing related to handling dictionaries.
+  //
+  static bool KeyNeedsProcessing(const std::shared_ptr<DataType>& build_data_type) {
+    return (build_data_type->id() == Type::DICTIONARY);
+  }
+
+  // Data type of the unified representation
+  static std::shared_ptr<DataType> DataTypeAfterRemapping() { return int32(); }
+
+  // Should be called only once in hash join, before processing any build or probe
+  // batches.
+  //
+  // Takes a pointer to the dictionary for a corresponding key column on the build side
+  // as an input. If the build side is empty, it still needs to be called, but with the
+  // dictionary pointer set to null.
+  //
+  // Currently it is required that all input batches on the build side share the same
+  // dictionary. For each input batch during its pre-processing, the dictionary will be
+  // checked and an error will be returned if it is different from the one provided in
+  // the call to this method.
+  //
+  // Unifies the dictionary. The order of the values is still preserved.
+  // Null and duplicate entries are removed. If the dictionary is already unified, its
+  // copy will be produced and stored within this class.
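+  // For example (illustrative values only): an input dictionary ["b", null, "a", "b"]
+  // unifies to ["b", "a"], with the original ids remapped as [0, null, 1, 0].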
+  //
+  // Prepares the mapping from ids within the original dictionary to the ids in the
+  // resulting dictionary. This is used later on to pre-process (map to unified
+  // representation) the key column on the build side.
+  //
+  // Prepares the reverse mapping (in the form of a hash table) from values to the ids
+  // in the resulting dictionary. This will be used later on to pre-process (map to
+  // unified representation) the key column on the probe side. Values on the probe side
+  // that are not present in the original dictionary will be mapped to a special
+  // constant kMissingValueId. The exception is made for nulls, which are always mapped
+  // to nulls (both when null is represented as a dictionary id pointing to a null and
+  // as a null dictionary id).
+  //
+  Status Init(ExecContext* ctx, std::shared_ptr<Array> dictionary,
+              std::shared_ptr<DataType> index_type,
+              std::shared_ptr<DataType> value_type);
+
+  // Remap array or scalar values into unified representation (array of int32()).
+  // Outputs kMissingValueId if an input value is not found in the unified dictionary.
+  // Outputs null for a null input value (with the corresponding data set to kNullId).
+  //
+  Result<std::shared_ptr<ArrayData>> RemapInputValues(ExecContext* ctx,
+                                                      const Datum& values,
+                                                      int64_t batch_length) const;
+
+  // Remap a dictionary array or dictionary scalar on the build side to unified
+  // representation. The dictionary referenced in the input must match the dictionary
+  // that was given during initialization.
+  // The output is a dictionary array that references the unified dictionary.
+  //
+  Result<std::shared_ptr<ArrayData>> RemapInput(
+      ExecContext* ctx, const Datum& indices, int64_t batch_length,
+      const std::shared_ptr<DataType>& data_type) const;
+
+  // Outputs a dictionary array referencing the unified dictionary, given an array with
+  // 32-bit ids.
+  // Used to post-process values looked up in the hash table on the build side of the
+  // hash join before outputting to the parent exec node.
+  //
+  Result<std::shared_ptr<ArrayData>> RemapOutput(const ArrayData& indices32Bit,
+                                                 ExecContext* ctx) const;
+
+  // Release shared pointers and memory
+  void CleanUp();
+
+ private:
+  // Data type of dictionary ids for the input dictionary on the build side
+  std::shared_ptr<DataType> index_type_;
+  // Data type of values for the input dictionary on the build side
+  std::shared_ptr<DataType> value_type_;
+  // Mapping from values (encoded as strings) to the ids in the unified dictionary
+  std::unordered_map<std::string, int32_t> hash_table_;
+  // Mapping from input dictionary ids to unified dictionary ids
+  std::shared_ptr<ArrayData> remapped_ids_;
+  // Input dictionary
+  std::shared_ptr<Array> dictionary_;
+  // Unified dictionary
+  std::shared_ptr<ArrayData> unified_dictionary_;
+};
+
+/// Implements processing of dictionary arrays/scalars in key columns on the probe side
+/// of a hash join.
+/// Each instance of this class corresponds to a single column and stores and
+/// processes only the information related to that column.
+/// It is not thread-safe - every participating thread should use its own instance of
+/// this class.
+///
+class HashJoinDictProbe {
+ public:
+  static bool KeyNeedsProcessing(const std::shared_ptr<DataType>& probe_data_type,
+                                 const std::shared_ptr<DataType>& build_data_type);
+
+  // Data type of the result of remapping the input key column.
+  //
+  // The result of remapping is what is used in hash join for matching keys on the build
+  // and probe side. The exact data types may be different, as described below, and
+  // therefore a common representation is needed for simplifying comparisons of pairs of
+  // keys on both sides.
+  //
+  // We support matching a key of non-dictionary type with a key of dictionary type, as
+  // long as the underlying value types are equal. We support matching when both keys
+  // are of dictionary type, regardless of whether the underlying dictionary index types
+  // are the same or not.
+  //
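+  // For example, a probe key of type dictionary(int8(), utf8()) can be matched against
+  // a build key of type dictionary(int16(), utf8()) or against a plain utf8() build
+  // key; in every case both sides are compared in the int32() unified representation.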
+  static std::shared_ptr<DataType> DataTypeAfterRemapping(
+      const std::shared_ptr<DataType>& build_data_type);
+
+  // Should only be called if the KeyNeedsProcessing method returns true for a pair of
+  // corresponding key columns from the build and probe side.
+  // Converts values in order to match the common representation for
+  // both the build and probe side used in hash table comparison.
+  // Supports arrays and scalars as input.
+  // The argument opt_build_side should be null if a dictionary key on the probe side is
+  // matched with a non-dictionary key on the build side.
+  //
+  Result<std::shared_ptr<ArrayData>> RemapInput(
+      const HashJoinDictBuild* opt_build_side, const Datum& data, int64_t batch_length,
+      const std::shared_ptr<DataType>& probe_data_type,
+      const std::shared_ptr<DataType>& build_data_type, ExecContext* ctx);
+
+  void CleanUp();
+
+ private:
+  // May be null if the probe side key is non-dictionary. Otherwise it is used to verify
+  // that only a single dictionary is referenced in exec batches on the probe side of
+  // the hash join.
+  std::shared_ptr<Array> dictionary_;
+  // Mapping from the dictionary on the probe side of the hash join (if it is used) to
+  // unified representation.
+  std::shared_ptr<ArrayData> remapped_ids_;
+  // Encoder of key columns that uses unified representation instead of the original
+  // data type for key columns that need to use it (have dictionaries on either side of
+  // the join).
+  internal::RowEncoder encoder_;
+};
+
+// Encapsulates dictionary handling logic for the build side of a hash join.
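+// It holds one HashJoinDictBuild per key column (used only for the columns that need
+// remapping), so multi-column keys are handled column by column.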
+//
+class HashJoinDictBuildMulti {
+ public:
+  Status Init(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+              const ExecBatch* opt_non_empty_batch, ExecContext* ctx);
+  static void InitEncoder(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+                          RowEncoder* encoder, ExecContext* ctx);
+  Status EncodeBatch(size_t thread_index,
+                     const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+                     const ExecBatch& batch, RowEncoder* encoder,
+                     ExecContext* ctx) const;
+  Status PostDecode(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+                    ExecBatch* decoded_key_batch, ExecContext* ctx);
+  const HashJoinDictBuild& get_dict_build(int icol) const { return remap_imp_[icol]; }
+
+ private:
+  std::vector<bool> needs_remap_;
+  std::vector<HashJoinDictBuild> remap_imp_;
+};
+
+// Encapsulates dictionary handling logic for the probe side of a hash join
+//
+class HashJoinDictProbeMulti {
+ public:
+  void Init(size_t num_threads);
+  bool BatchRemapNeeded(size_t thread_index,
+                        const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+                        const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
+                        ExecContext* ctx);
+  Status EncodeBatch(size_t thread_index,
+                     const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+                     const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
+                     const HashJoinDictBuildMulti& dict_build, const ExecBatch& batch,
+                     RowEncoder** out_encoder, ExecBatch* opt_out_key_batch,
+                     ExecContext* ctx);
+
+ private:
+  void InitLocalStateIfNeeded(
+      size_t thread_index,
+      const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+      const SchemaProjectionMaps<HashJoinProjection>& proj_map_build, ExecContext* ctx);
+  static void InitEncoder(const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+                          const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
+                          RowEncoder* encoder, ExecContext* ctx);
+  struct ThreadLocalState {
+    bool is_initialized;
+    // Whether any key column needs remapping (because of dictionaries used) before
+    // doing join hash table lookups
+    bool any_needs_remap;
+    // Whether each key column needs remapping before doing join hash table lookups
+    std::vector<bool> needs_remap;
+    std::vector<HashJoinDictProbe> remap_imp;
+    // Encoder of key columns that uses unified representation instead of the original
+    // data type for key columns that need to use it (have dictionaries on either side
+    // of the join).
+    RowEncoder post_remap_encoder;
+  };
+  std::vector<ThreadLocalState> local_states_;
+};
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/hash_join_node.cc b/cpp/src/arrow/compute/exec/hash_join_node.cc
index 3e02054fbedf1..4bccb761070f4 100644
--- a/cpp/src/arrow/compute/exec/hash_join_node.cc
+++ b/cpp/src/arrow/compute/exec/hash_join_node.cc
@@ -19,6 +19,7 @@
 #include "arrow/compute/exec/exec_plan.h"
 #include "arrow/compute/exec/hash_join.h"
+#include "arrow/compute/exec/hash_join_dict.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/schema_util.h"
 #include "arrow/compute/exec/util.h"
@@ -33,6 +34,15 @@ using internal::checked_cast;

 namespace compute {

+// Check if a type is supported in a join (as either a key or non-key column)
+bool HashJoinSchema::IsTypeSupported(const DataType& type) {
+  const Type::type id = type.id();
+  if (id == Type::DICTIONARY) {
+    return IsTypeSupported(*checked_cast<const DictionaryType&>(type).value_type());
+  }
+  return is_fixed_width(id) || is_binary_like(id) || is_large_binary_like(id);
+}
+
 Result<std::vector<FieldRef>> HashJoinSchema::VectorDiff(const Schema& schema,
                                                          const std::vector<FieldRef>& a,
                                                          const std::vector<FieldRef>& b) {
@@ -140,8 +150,7 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc
   // 2. Same number of key fields on left and right
   // 3. At least one key field
   // 4. Equal data types for corresponding key fields
-  // 5. Dictionary type is not supported in a key field
-  // 6. Some other data types may not be allowed in a key field
+  // 5. Some data types may not be allowed in a key field or a non-key field
   //
   if (left_keys.size() != right_keys.size()) {
     return Status::Invalid("Different number of key fields on left (", left_keys.size(),
@@ -163,18 +172,8 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc
     const FieldPath& match = result.ValueUnsafe();
     const std::shared_ptr<DataType>& type =
         (left_side ? left_schema.fields() : right_schema.fields())[match[0]]->type();
-    if (type->id() == Type::DICTIONARY) {
-      return Status::Invalid(
-          "Dictionary type support for join key is not yet implemented, key field "
-          "reference: ",
-          field_ref.ToString(), left_side ? " on left " : " on right ",
-          "side of the join");
-    }
-    if ((type->id() != Type::BOOL && !is_fixed_width(type->id()) &&
-         !is_binary_like(type->id())) ||
-        is_large_binary_like(type->id())) {
-      return Status::Invalid("Data type ", type->ToString(),
-                             " is not supported in join key field");
+    if (!IsTypeSupported(*type)) {
+      return Status::Invalid("Data type ", *type, " is not supported in join key field");
     }
   }
   for (size_t i = 0; i < left_keys.size(); ++i) {
@@ -184,11 +183,25 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc
     int right_id = right_ref.FindOne(right_schema).ValueUnsafe()[0];
     const std::shared_ptr<DataType>& left_type = left_schema.fields()[left_id]->type();
     const std::shared_ptr<DataType>& right_type = right_schema.fields()[right_id]->type();
-    if (!left_type->Equals(right_type)) {
-      return Status::Invalid("Mismatched data types for corresponding join field keys: ",
-                             left_ref.ToString(), " of type ", left_type->ToString(),
-                             " and ", right_ref.ToString(), " of type ",
-                             right_type->ToString());
+    if (!HashJoinDictUtil::KeyDataTypesValid(left_type, right_type)) {
+      return Status::Invalid(
+          "Incompatible data types for corresponding join field keys: ",
+          left_ref.ToString(), " of type ", left_type->ToString(), " and ",
+          right_ref.ToString(), " of type ", right_type->ToString());
+    }
+  }
+  for (const auto& field : left_schema.fields()) {
+    const auto& type = *field->type();
+    if (!IsTypeSupported(type)) {
+      return Status::Invalid("Data type ", type,
+                             " is not supported in join non-key field");
+    }
+  }
+  for (const auto& field : right_schema.fields()) {
+    const auto& type = *field->type();
+    if (!IsTypeSupported(type)) {
+      return Status::Invalid("Data type ", type,
+                             " is not supported in join non-key field");
     }
   }

@@ -228,16 +241,6 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc
           field_ref.ToString(), left_side ? " on left " : " on right ",
           "side of the join");
     }
-    const FieldPath& match = result.ValueUnsafe();
-    const std::shared_ptr<DataType>& type =
-        (left_side ? left_schema.fields() : right_schema.fields())[match[0]]->type();
-    if (type->id() == Type::DICTIONARY) {
-      return Status::Invalid(
-          "Dictionary type support for join output field is not yet implemented, output "
-          "field reference: ",
-          field_ref.ToString(), left_side ? " on left " : " on right ",
-          "side of the join");
-    }
   }
   return Status::OK();
 }
diff --git a/cpp/src/arrow/compute/exec/hash_join_node_test.cc b/cpp/src/arrow/compute/exec/hash_join_node_test.cc
index a5410b0d37a22..40738d1e229be 100644
--- a/cpp/src/arrow/compute/exec/hash_join_node_test.cc
+++ b/cpp/src/arrow/compute/exec/hash_join_node_test.cc
@@ -1059,18 +1059,8 @@ TEST(HashJoin, Random) {
                      &(key_fields[i]), &(output_fields[i]), &(output_field_ids[i]));
   }

-  // Print test case parameters
-  // print num_rows, batch_size, join_type, join_cmp
-  std::cout << join_type_name << " " << key_cmp_str << " ";
-  key_types.Print();
-  std::cout << " payload_l: ";
-  payload_types[0].Print();
-  std::cout << " payload_r: ";
-  payload_types[1].Print();
-  std::cout << " num_rows_l = " << num_rows_l << " num_rows_r = " << num_rows_r
-            << " batch size = " << batch_size
-            << " parallel = " << (parallel ? "true" : "false");
-  std::cout << std::endl;
+  ARROW_SCOPED_TRACE(join_type_name, " ", key_cmp_str,
+                     " parallel = ", (parallel ? "true" : "false"));

   // Run reference join implementation
   std::vector<bool> null_in_key_vectors[2];
@@ -1113,5 +1103,581 @@ TEST(HashJoin, Random) {
   }
 }

+void DecodeScalarsAndDictionariesInBatch(ExecBatch* batch, MemoryPool* pool) {
+  for (size_t i = 0; i < batch->values.size(); ++i) {
+    if (batch->values[i].is_scalar()) {
+      ASSERT_OK_AND_ASSIGN(
+          std::shared_ptr<Array> col,
+          MakeArrayFromScalar(*(batch->values[i].scalar()), batch->length, pool));
+      batch->values[i] = Datum(col);
+    }
+    if (batch->values[i].type()->id() == Type::DICTIONARY) {
+      const auto& dict_type =
+          checked_cast<const DictionaryType&>(*batch->values[i].type());
+      std::shared_ptr<ArrayData> indices =
+          ArrayData::Make(dict_type.index_type(), batch->values[i].array()->length,
+                          batch->values[i].array()->buffers);
+      const std::shared_ptr<ArrayData>& dictionary = batch->values[i].array()->dictionary;
+      ASSERT_OK_AND_ASSIGN(Datum col, Take(*dictionary, *indices));
+      batch->values[i] = col;
+    }
+  }
+}
+
+std::shared_ptr<Schema> UpdateSchemaAfterDecodingDictionaries(
+    const std::shared_ptr<Schema>& schema) {
+  std::vector<std::shared_ptr<Field>> output_fields(schema->num_fields());
+  for (int i = 0; i < schema->num_fields(); ++i) {
+    const std::shared_ptr<Field>& field = schema->field(i);
+    if (field->type()->id() == Type::DICTIONARY) {
+      const auto& dict_type = checked_cast<const DictionaryType&>(*field->type());
+      output_fields[i] = std::make_shared<Field>(field->name(), dict_type.value_type(),
+                                                 true /* nullable */);
+    } else {
+      output_fields[i] = field->Copy();
+    }
+  }
+  return std::make_shared<Schema>(std::move(output_fields));
+}
+
+void TestHashJoinDictionaryHelper(
+    JoinType join_type, JoinKeyCmp cmp,
+    // Whether to run a parallel hash join.
+    // This requires generating multiple copies of each input batch on one side of the
+    // join. Expected results will be automatically adjusted to reflect the
+    // multiplication of input batches.
+    bool parallel, Datum l_key, Datum l_payload, Datum r_key, Datum r_payload,
+    Datum l_out_key, Datum l_out_payload, Datum r_out_key, Datum r_out_payload,
+    // Number of rows at the end of the expected output that represent rows from the
+    // right side that do not have a match on the left side. This number is needed to
+    // automatically adjust the expected result when multiplying input batches on the
+    // left side.
+    int expected_num_r_no_match,
+    // Whether to swap the two inputs to the hash join
+    bool swap_sides) {
+  int64_t l_length = l_key.is_array()
+                         ? l_key.array()->length
+                         : l_payload.is_array() ? l_payload.array()->length : -1;
+  int64_t r_length = r_key.is_array()
+                         ? r_key.array()->length
+                         : r_payload.is_array() ? r_payload.array()->length : -1;
+  ARROW_DCHECK(l_length >= 0 && r_length >= 0);
+
+  constexpr int batch_multiplicity_for_parallel = 2;
+
+  // Split both sides into exactly two batches
+  int64_t l_first_length = l_length / 2;
+  int64_t r_first_length = r_length / 2;
+  BatchesWithSchema l_batches, r_batches;
+  l_batches.batches.resize(2);
+  r_batches.batches.resize(2);
+  ASSERT_OK_AND_ASSIGN(
+      l_batches.batches[0],
+      ExecBatch::Make({l_key.is_array() ? l_key.array()->Slice(0, l_first_length) : l_key,
+                       l_payload.is_array() ? l_payload.array()->Slice(0, l_first_length)
+                                            : l_payload}));
+  ASSERT_OK_AND_ASSIGN(
+      l_batches.batches[1],
+      ExecBatch::Make(
+          {l_key.is_array()
+               ? l_key.array()->Slice(l_first_length, l_length - l_first_length)
+               : l_key,
+           l_payload.is_array()
+               ? l_payload.array()->Slice(l_first_length, l_length - l_first_length)
+               : l_payload}));
+  ASSERT_OK_AND_ASSIGN(
+      r_batches.batches[0],
+      ExecBatch::Make({r_key.is_array() ? r_key.array()->Slice(0, r_first_length) : r_key,
+                       r_payload.is_array() ? r_payload.array()->Slice(0, r_first_length)
+                                            : r_payload}));
+  ASSERT_OK_AND_ASSIGN(
+      r_batches.batches[1],
+      ExecBatch::Make(
+          {r_key.is_array()
+               ? r_key.array()->Slice(r_first_length, r_length - r_first_length)
+               : r_key,
+           r_payload.is_array()
+               ? r_payload.array()->Slice(r_first_length, r_length - r_first_length)
+               : r_payload}));
+  l_batches.schema =
+      schema({field("l_key", l_key.type()), field("l_payload", l_payload.type())});
+  r_batches.schema =
+      schema({field("r_key", r_key.type()), field("r_payload", r_payload.type())});
+
+  // Add copies of input batches on the originally-left side of the hash join
+  if (parallel) {
+    for (int i = 0; i < batch_multiplicity_for_parallel - 1; ++i) {
+      l_batches.batches.push_back(l_batches.batches[0]);
+      l_batches.batches.push_back(l_batches.batches[1]);
+    }
+  }
+
+  auto exec_ctx = arrow::internal::make_unique<ExecContext>(
+      default_memory_pool(), parallel ? arrow::internal::GetCpuThreadPool() : nullptr);
+  ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get()));
+  ASSERT_OK_AND_ASSIGN(
+      ExecNode * l_source,
+      MakeExecNode("source", plan.get(), {},
+                   SourceNodeOptions{l_batches.schema, l_batches.gen(parallel,
+                                                                     /*slow=*/false)}));
+  ASSERT_OK_AND_ASSIGN(
+      ExecNode * r_source,
+      MakeExecNode("source", plan.get(), {},
+                   SourceNodeOptions{r_batches.schema, r_batches.gen(parallel,
+                                                                     /*slow=*/false)}));
+  HashJoinNodeOptions join_options{join_type,
+                                   {FieldRef(swap_sides ? "r_key" : "l_key")},
+                                   {FieldRef(swap_sides ? "l_key" : "r_key")},
+                                   {FieldRef(swap_sides ? "r_key" : "l_key"),
+                                    FieldRef(swap_sides ? "r_payload" : "l_payload")},
+                                   {FieldRef(swap_sides ? "l_key" : "r_key"),
+                                    FieldRef(swap_sides ? "l_payload" : "r_payload")},
+                                   {cmp}};
+  ASSERT_OK_AND_ASSIGN(ExecNode * join,
+                       MakeExecNode("hashjoin", plan.get(),
+                                    {(swap_sides ? r_source : l_source),
+                                     (swap_sides ? l_source : r_source)},
+                                    join_options));
+  AsyncGenerator<util::optional<ExecBatch>> sink_gen;
+  ASSERT_OK_AND_ASSIGN(
+      std::ignore, MakeExecNode("sink", plan.get(), {join}, SinkNodeOptions{&sink_gen}));
+  ASSERT_FINISHES_OK_AND_ASSIGN(auto res, StartAndCollect(plan.get(), sink_gen));
+
+  for (auto& batch : res) {
+    DecodeScalarsAndDictionariesInBatch(&batch, exec_ctx->memory_pool());
+  }
+  std::shared_ptr<Schema> output_schema =
+      UpdateSchemaAfterDecodingDictionaries(join->output_schema());
+
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> output,
+                       TableFromExecBatches(output_schema, res));
+
+  ExecBatch expected_batch;
+  if (swap_sides) {
+    ASSERT_OK_AND_ASSIGN(expected_batch, ExecBatch::Make({r_out_key, r_out_payload,
+                                                          l_out_key, l_out_payload}));
+  } else {
+    ASSERT_OK_AND_ASSIGN(expected_batch, ExecBatch::Make({l_out_key, l_out_payload,
+                                                          r_out_key, r_out_payload}));
+  }
+
+  DecodeScalarsAndDictionariesInBatch(&expected_batch, exec_ctx->memory_pool());
+
+  // Slice the expected batch into two, to separate rows on the right side with no
+  // matches from everything else.
+  //
+  std::vector<ExecBatch> expected_batches;
+  ASSERT_OK_AND_ASSIGN(
+      auto prefix_batch,
+      ExecBatch::Make({expected_batch.values[0].array()->Slice(
+                           0, expected_batch.length - expected_num_r_no_match),
+                       expected_batch.values[1].array()->Slice(
+                           0, expected_batch.length - expected_num_r_no_match),
+                       expected_batch.values[2].array()->Slice(
+                           0, expected_batch.length - expected_num_r_no_match),
+                       expected_batch.values[3].array()->Slice(
+                           0, expected_batch.length - expected_num_r_no_match)}));
+  for (int i = 0; i < (parallel ? batch_multiplicity_for_parallel : 1); ++i) {
+    expected_batches.push_back(prefix_batch);
+  }
+  if (expected_num_r_no_match > 0) {
+    ASSERT_OK_AND_ASSIGN(
+        auto suffix_batch,
+        ExecBatch::Make({expected_batch.values[0].array()->Slice(
+                             expected_batch.length - expected_num_r_no_match,
+                             expected_num_r_no_match),
+                         expected_batch.values[1].array()->Slice(
+                             expected_batch.length - expected_num_r_no_match,
+                             expected_num_r_no_match),
+                         expected_batch.values[2].array()->Slice(
+                             expected_batch.length - expected_num_r_no_match,
+                             expected_num_r_no_match),
+                         expected_batch.values[3].array()->Slice(
+                             expected_batch.length - expected_num_r_no_match,
+                             expected_num_r_no_match)}));
+    expected_batches.push_back(suffix_batch);
+  }
+
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> expected,
+                       TableFromExecBatches(output_schema, expected_batches));
+
+  // Compare results
+  AssertTablesEqual(expected, output);
+}
+
+TEST(HashJoin, Dictionary) {
+  auto int8_utf8 = dictionary(int8(), utf8());
+  auto uint8_utf8 = arrow::dictionary(uint8(), utf8());
+  auto int16_utf8 = arrow::dictionary(int16(), utf8());
+  auto uint16_utf8 = arrow::dictionary(uint16(), utf8());
+  auto int32_utf8 = arrow::dictionary(int32(), utf8());
+  auto uint32_utf8 = arrow::dictionary(uint32(), utf8());
+  auto int64_utf8 = arrow::dictionary(int64(), utf8());
+  auto uint64_utf8 = arrow::dictionary(uint64(), utf8());
+  std::shared_ptr<DataType> dict_types[] = {int8_utf8,   uint8_utf8,  int16_utf8,
+                                            uint16_utf8, int32_utf8,  uint32_utf8,
+                                            int64_utf8,  uint64_utf8};
+
+  Random64Bit rng(43);
+
+  // Dictionaries in payload columns
+  for (auto parallel : {false, true}) {
+    for (auto swap_sides : {false, true}) {
+      TestHashJoinDictionaryHelper(
+          JoinType::FULL_OUTER, JoinKeyCmp::EQ, parallel,
+          // Input
+          ArrayFromJSON(utf8(), R"(["a", "c", "c", "d"])"),
+          DictArrayFromJSON(int8_utf8, R"([4, 2, 3, 0])",
+                            R"(["p", "q", "r", null, "r"])"),
+          ArrayFromJSON(utf8(), R"(["a", "a", "b", "c"])"),
+          DictArrayFromJSON(int16_utf8, R"([0, 1, 0, 2])", R"(["r", null, "r", "q"])"),
+          // Expected output
+          ArrayFromJSON(utf8(), R"(["a", "a", "c", "c", "d", null])"),
+          DictArrayFromJSON(int8_utf8, R"([4, 4, 2, 3, 0, null])",
+                            R"(["p", "q", "r", null, "r"])"),
+          ArrayFromJSON(utf8(), R"(["a", "a", "c", "c", null, "b"])"),
+          DictArrayFromJSON(int16_utf8, R"([0, 1, 2, 2, null, 0])",
+                            R"(["r", null, "r", "q"])"),
+          1, swap_sides);
+    }
+  }
+
+  // Dictionaries in key columns
+  for (auto parallel : {false, true}) {
+    for (auto swap_sides : {false, true}) {
+      for (auto l_key_dict : {true, false}) {
+        for (auto r_key_dict : {true, false}) {
+          auto l_key_dict_type = dict_types[rng.from_range(0, 7)];
+          auto r_key_dict_type = dict_types[rng.from_range(0, 7)];
+
+          auto l_key = l_key_dict ? DictArrayFromJSON(l_key_dict_type, R"([2, 2, 0, 1])",
+                                                      R"(["b", null, "a"])")
+                                  : ArrayFromJSON(utf8(), R"(["a", "a", "b", null])");
+          auto l_payload = ArrayFromJSON(utf8(), R"(["x", "y", "z", "y"])");
+          auto r_key = r_key_dict
+                           ? DictArrayFromJSON(int16_utf8, R"([1, 0, null, 1, 2])",
+                                               R"([null, "b", "c"])")
+                           : ArrayFromJSON(utf8(), R"(["b", null, null, "b", "c"])");
+          auto r_payload = ArrayFromJSON(utf8(), R"(["p", "r", "p", "q", "s"])");
+
+          // IS comparison function (null is equal to null when matching keys)
+          TestHashJoinDictionaryHelper(
+              JoinType::FULL_OUTER, JoinKeyCmp::IS, parallel,
+              // Input
+              l_key, l_payload, r_key, r_payload,
+              // Expected
+              l_key_dict ? DictArrayFromJSON(l_key_dict_type,
+                                             R"([2, 2, 0, 0, 1, 1, null])",
+                                             R"(["b", null, "a"])")
+                         : ArrayFromJSON(utf8(),
+                                         R"(["a", "a", "b", "b", null, null, null])"),
+              ArrayFromJSON(utf8(), R"(["x", "y", "z", "z", "y", "y", null])"),
+              r_key_dict
+                  ? DictArrayFromJSON(r_key_dict_type,
+                                      R"([null, null, 0, 0, null, null, 1])",
+                                      R"(["b", "c"])")
+                  : ArrayFromJSON(utf8(), R"([null, null, "b", "b", null, null, "c"])"),
+              ArrayFromJSON(utf8(), R"([null, null, "p", "q", "r", "p", "s"])"), 1,
+              swap_sides);
+
+          // EQ comparison function (null is not matching null)
+          TestHashJoinDictionaryHelper(
+              JoinType::FULL_OUTER, JoinKeyCmp::EQ, parallel,
+              // Input
+              l_key, l_payload, r_key, r_payload,
+              // Expected
+              l_key_dict ? DictArrayFromJSON(l_key_dict_type,
+                                             R"([2, 2, 0, 0, 1, null, null, null])",
+                                             R"(["b", null, "a"])")
+                         : ArrayFromJSON(
+                               utf8(), R"(["a", "a", "b", "b", null, null, null, null])"),
+              ArrayFromJSON(utf8(), R"(["x", "y", "z", "z", "y", null, null, null])"),
+              r_key_dict
+                  ? DictArrayFromJSON(r_key_dict_type,
+                                      R"([null, null, 0, 0, null, null, null, 1])",
+                                      R"(["b", "c"])")
+                  : ArrayFromJSON(utf8(),
+                                  R"([null, null, "b", "b", null, null, null, "c"])"),
+              ArrayFromJSON(utf8(), R"([null, null, "p", "q", null, "r", "p", "s"])"), 3,
+              swap_sides);
+        }
+      }
+    }
+  }
+
+  // Empty build side
+  {
+    auto l_key_dict_type = dict_types[rng.from_range(0, 7)];
+    auto l_payload_dict_type = dict_types[rng.from_range(0, 7)];
+    auto r_key_dict_type = dict_types[rng.from_range(0, 7)];
+    auto r_payload_dict_type = dict_types[rng.from_range(0, 7)];
+
+    for (auto parallel : {false, true}) {
+      for (auto swap_sides : {false, true}) {
+        for (auto cmp : {JoinKeyCmp::IS, JoinKeyCmp::EQ}) {
+          TestHashJoinDictionaryHelper(
+              JoinType::FULL_OUTER, cmp, parallel,
+              // Input
+              DictArrayFromJSON(l_key_dict_type, R"([2, 0, 1])", R"(["b", null, "a"])"),
+              DictArrayFromJSON(l_payload_dict_type, R"([2, 2, 0])",
+                                R"(["x", "y", "z"])"),
+              DictArrayFromJSON(r_key_dict_type, R"([])", R"([null, "b", "c"])"),
+              DictArrayFromJSON(r_payload_dict_type, R"([])", R"(["p", "r", "s"])"),
+              // Expected
+              DictArrayFromJSON(l_key_dict_type, R"([2, 0, 1])", R"(["b", null, "a"])"),
+              DictArrayFromJSON(l_payload_dict_type, R"([2, 2, 0])",
+                                R"(["x", "y", "z"])"),
+              DictArrayFromJSON(r_key_dict_type, R"([null, null, null])",
+                                R"(["b", "c"])"),
+              DictArrayFromJSON(r_payload_dict_type, R"([null, null, null])",
+                                R"(["p", "r", "s"])"),
+              0, swap_sides);
+        }
+      }
+    }
+  }
+
+  // Empty probe side
+  {
+    auto l_key_dict_type = dict_types[rng.from_range(0, 7)];
+    auto l_payload_dict_type = dict_types[rng.from_range(0, 7)];
+    auto r_key_dict_type = dict_types[rng.from_range(0, 7)];
+    auto r_payload_dict_type = dict_types[rng.from_range(0, 7)];
+
+    for (auto parallel : {false, true}) {
+      for (auto swap_sides : {false, true}) {
+        for (auto cmp : {JoinKeyCmp::IS, JoinKeyCmp::EQ}) {
+          TestHashJoinDictionaryHelper(
+              JoinType::FULL_OUTER, cmp, parallel,
+              // Input
+              DictArrayFromJSON(l_key_dict_type, R"([])", R"(["b", null, "a"])"),
+              DictArrayFromJSON(l_payload_dict_type, R"([])", R"(["x", "y", "z"])"),
+              DictArrayFromJSON(r_key_dict_type, R"([2, 0, 1, null])",
+                                R"([null, "b", "c"])"),
+              DictArrayFromJSON(r_payload_dict_type, R"([1, 1, null, 0])",
+                                R"(["p", "r", "s"])"),
+              // Expected
+              DictArrayFromJSON(l_key_dict_type, R"([null, null, null, null])",
+                                R"(["b", null, "a"])"),
+              DictArrayFromJSON(l_payload_dict_type, R"([null, null, null, null])",
+                                R"(["x", "y", "z"])"),
+              DictArrayFromJSON(r_key_dict_type, R"([1, null, 0, null])",
+                                R"(["b", "c"])"),
+              DictArrayFromJSON(r_payload_dict_type, R"([1, 1, null, 0])",
+                                R"(["p", "r", "s"])"),
+              4, swap_sides);
+        }
+      }
+    }
+  }
+}
+
+TEST(HashJoin, Scalars) {
+  auto int8_utf8 = std::make_shared<DictionaryType>(int8(), utf8());
+  auto int16_utf8 = std::make_shared<DictionaryType>(int16(), utf8());
+  auto int32_utf8 = std::make_shared<DictionaryType>(int32(), utf8());
+
+  // Scalars in payload columns
+  for (auto use_scalar_dict : {false, true}) {
+    TestHashJoinDictionaryHelper(
+        JoinType::FULL_OUTER, JoinKeyCmp::EQ, false /*parallel*/,
+        // Input
+        ArrayFromJSON(utf8(), R"(["a", "c", "c", "d"])"),
+        use_scalar_dict ? DictScalarFromJSON(int16_utf8, "1", R"(["z", "x", "y"])")
+                        : ScalarFromJSON(utf8(), "\"x\""),
+        ArrayFromJSON(utf8(), R"(["a", "a", "b", "c"])"),
+        use_scalar_dict ? DictScalarFromJSON(int32_utf8, "0", R"(["z", "x", "y"])")
+                        : ScalarFromJSON(utf8(), "\"z\""),
+        // Expected output
+        ArrayFromJSON(utf8(), R"(["a", "a", "c", "c", "d", null])"),
+        ArrayFromJSON(utf8(), R"(["x", "x", "x", "x", "x", null])"),
+        ArrayFromJSON(utf8(), R"(["a", "a", "c", "c", null, "b"])"),
+        ArrayFromJSON(utf8(), R"(["z", "z", "z", "z", null, "z"])"), 1,
+        false /*swap sides*/);
+  }
+
+  // Scalars in key columns
+  for (auto use_scalar_dict : {false, true}) {
+    for (auto swap_sides : {false, true}) {
+      TestHashJoinDictionaryHelper(
+          JoinType::FULL_OUTER, JoinKeyCmp::EQ, false /*parallel*/,
+          // Input
+          use_scalar_dict ? DictScalarFromJSON(int8_utf8, "1", R"(["b", "a", "c"])")
+                          : ScalarFromJSON(utf8(), "\"a\""),
+          ArrayFromJSON(utf8(), R"(["x", "y"])"),
+          ArrayFromJSON(utf8(), R"(["a", null, "b"])"),
+          ArrayFromJSON(utf8(), R"(["p", "q", "r"])"),
+          // Expected output
+          ArrayFromJSON(utf8(), R"(["a", "a", null, null])"),
+          ArrayFromJSON(utf8(), R"(["x", "y", null, null])"),
+          ArrayFromJSON(utf8(), R"(["a", "a", null, "b"])"),
+          ArrayFromJSON(utf8(), R"(["p", "p", "q", "r"])"), 2, swap_sides);
+    }
+  }
+
+  // Null scalars in key columns
+  for (auto use_scalar_dict : {false, true}) {
+    for (auto swap_sides : {false, true}) {
+      TestHashJoinDictionaryHelper(
+          JoinType::FULL_OUTER, JoinKeyCmp::EQ, false /*parallel*/,
+          // Input
+          use_scalar_dict ? DictScalarFromJSON(int16_utf8, "2", R"(["a", "b", null])")
+                          : ScalarFromJSON(utf8(), "null"),
+          ArrayFromJSON(utf8(), R"(["x", "y"])"),
+          ArrayFromJSON(utf8(), R"(["a", null, "b"])"),
+          ArrayFromJSON(utf8(), R"(["p", "q", "r"])"),
+          // Expected output
+          ArrayFromJSON(utf8(), R"([null, null, null, null, null])"),
+          ArrayFromJSON(utf8(), R"(["x", "y", null, null, null])"),
+          ArrayFromJSON(utf8(), R"([null, null, "a", null, "b"])"),
+          ArrayFromJSON(utf8(), R"([null, null, "p", "q", "r"])"), 3, swap_sides);
+      TestHashJoinDictionaryHelper(
+          JoinType::FULL_OUTER, JoinKeyCmp::IS, false /*parallel*/,
+          // Input
+          use_scalar_dict ? DictScalarFromJSON(int16_utf8, "null", R"(["a", "b", null])")
+                          : ScalarFromJSON(utf8(), "null"),
+          ArrayFromJSON(utf8(), R"(["x", "y"])"),
+          ArrayFromJSON(utf8(), R"(["a", null, "b"])"),
+          ArrayFromJSON(utf8(), R"(["p", "q", "r"])"),
+          // Expected output
+          ArrayFromJSON(utf8(), R"([null, null, null, null])"),
+          ArrayFromJSON(utf8(), R"(["x", "y", null, null])"),
+          ArrayFromJSON(utf8(), R"([null, null, "a", "b"])"),
+          ArrayFromJSON(utf8(), R"(["q", "q", "p", "r"])"), 2, swap_sides);
+    }
+  }
+
+  // Scalars with an empty build/probe side
+  for (auto use_scalar_dict : {false, true}) {
+    for (auto swap_sides : {false, true}) {
+      TestHashJoinDictionaryHelper(
+          JoinType::FULL_OUTER, JoinKeyCmp::EQ, false /*parallel*/,
+          // Input
DictScalarFromJSON(int8_utf8, "1", R"(["b", "a", "c"])") + : ScalarFromJSON(utf8(), "\"a\""), + ArrayFromJSON(utf8(), R"(["x", "y"])"), ArrayFromJSON(utf8(), R"([])"), + ArrayFromJSON(utf8(), R"([])"), + // Expected output + ArrayFromJSON(utf8(), R"(["a", "a"])"), ArrayFromJSON(utf8(), R"(["x", "y"])"), + ArrayFromJSON(utf8(), R"([null, null])"), + ArrayFromJSON(utf8(), R"([null, null])"), 0, swap_sides); + } + } + + // Scalars vs dictionaries in key columns + for (auto use_scalar_dict : {false, true}) { + for (auto swap_sides : {false, true}) { + TestHashJoinDictionaryHelper( + JoinType::FULL_OUTER, JoinKeyCmp::EQ, false /*parallel*/, + // Input + use_scalar_dict ? DictScalarFromJSON(int32_utf8, "1", R"(["b", "a", "c"])") + : ScalarFromJSON(utf8(), "\"a\""), + ArrayFromJSON(utf8(), R"(["x", "y"])"), + DictArrayFromJSON(int32_utf8, R"([2, 2, 1])", R"(["b", null, "a"])"), + ArrayFromJSON(utf8(), R"(["p", "q", "r"])"), + // Expected output + ArrayFromJSON(utf8(), R"(["a", "a", "a", "a", null])"), + ArrayFromJSON(utf8(), R"(["x", "x", "y", "y", null])"), + ArrayFromJSON(utf8(), R"(["a", "a", "a", "a", null])"), + ArrayFromJSON(utf8(), R"(["p", "q", "p", "q", "r"])"), 1, swap_sides); + } + } +} + +TEST(HashJoin, DictNegative) { + // For dictionary keys, all batches must share a single dictionary. + // Eventually, differing dictionaries will be unified and indices transposed + // during encoding to relieve this restriction. + const auto dictA = ArrayFromJSON(utf8(), R"(["ex", "why", "zee", null])"); + const auto dictB = ArrayFromJSON(utf8(), R"(["different", "dictionary"])"); + + Datum datumFirst = Datum( + *DictionaryArray::FromArrays(ArrayFromJSON(int32(), R"([0, 1, 2, 3])"), dictA)); + Datum datumSecondA = Datum( + *DictionaryArray::FromArrays(ArrayFromJSON(int32(), R"([3, 2, 2, 3])"), dictA)); + Datum datumSecondB = Datum( + *DictionaryArray::FromArrays(ArrayFromJSON(int32(), R"([0, 1, 1, 0])"), dictB)); + + for (int i = 0; i < 4; ++i) { + BatchesWithSchema l, r; + l.schema = schema({field("l_key", dictionary(int32(), utf8())), + field("l_payload", dictionary(int32(), utf8()))}); + r.schema = schema({field("r_key", dictionary(int32(), utf8())), + field("r_payload", dictionary(int32(), utf8()))}); + l.batches.resize(2); + r.batches.resize(2); + ASSERT_OK_AND_ASSIGN(l.batches[0], ExecBatch::Make({datumFirst, datumFirst})); + ASSERT_OK_AND_ASSIGN(r.batches[0], ExecBatch::Make({datumFirst, datumFirst})); + ASSERT_OK_AND_ASSIGN(l.batches[1], + ExecBatch::Make({i == 0 ? datumSecondB : datumSecondA, + i == 1 ? datumSecondB : datumSecondA})); + ASSERT_OK_AND_ASSIGN(r.batches[1], + ExecBatch::Make({i == 2 ? datumSecondB : datumSecondA, + i == 3 ? 
datumSecondB : datumSecondA})); + + auto exec_ctx = + arrow::internal::make_unique(default_memory_pool(), nullptr); + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(exec_ctx.get())); + ASSERT_OK_AND_ASSIGN( + ExecNode * l_source, + MakeExecNode("source", plan.get(), {}, + SourceNodeOptions{l.schema, l.gen(/*parallel=*/false, + /*slow=*/false)})); + ASSERT_OK_AND_ASSIGN( + ExecNode * r_source, + MakeExecNode("source", plan.get(), {}, + SourceNodeOptions{r.schema, r.gen(/*parallel=*/false, + /*slow=*/false)})); + HashJoinNodeOptions join_options{JoinType::INNER, + {FieldRef("l_key")}, + {FieldRef("r_key")}, + {FieldRef("l_key"), FieldRef("l_payload")}, + {FieldRef("r_key"), FieldRef("r_payload")}, + {JoinKeyCmp::EQ}}; + ASSERT_OK_AND_ASSIGN( + ExecNode * join, + MakeExecNode("hashjoin", plan.get(), {l_source, r_source}, join_options)); + AsyncGenerator> sink_gen; + ASSERT_OK_AND_ASSIGN(std::ignore, MakeExecNode("sink", plan.get(), {join}, + SinkNodeOptions{&sink_gen})); + + EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT( + NotImplemented, ::testing::HasSubstr("Unifying differing dictionaries"), + StartAndCollect(plan.get(), sink_gen)); + } +} + +TEST(HashJoin, UnsupportedTypes) { + // ARROW-14519 + const bool parallel = false; + const bool slow = false; + + auto l_schema = schema({field("l_i32", int32()), field("l_list", list(int32()))}); + auto l_schema_nolist = schema({field("l_i32", int32())}); + auto r_schema = schema({field("r_i32", int32()), field("r_list", list(int32()))}); + auto r_schema_nolist = schema({field("r_i32", int32())}); + + std::vector, std::shared_ptr>> cases{ + {l_schema, r_schema}, {l_schema_nolist, r_schema}, {l_schema, r_schema_nolist}}; + std::vector l_keys{{"l_i32"}}; + std::vector r_keys{{"r_i32"}}; + + for (const auto& schemas : cases) { + BatchesWithSchema l_batches = GenerateBatchesFromString(schemas.first, {R"([])"}); + BatchesWithSchema r_batches = GenerateBatchesFromString(schemas.second, {R"([])"}); + + ExecContext exec_ctx; + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make(&exec_ctx)); + + HashJoinNodeOptions join_options{JoinType::LEFT_SEMI, l_keys, r_keys}; + Declaration join{"hashjoin", join_options}; + join.inputs.emplace_back(Declaration{ + "source", SourceNodeOptions{l_batches.schema, l_batches.gen(parallel, slow)}}); + join.inputs.emplace_back(Declaration{ + "source", SourceNodeOptions{r_batches.schema, r_batches.gen(parallel, slow)}}); + + ASSERT_RAISES(Invalid, join.AddToPlan(plan.get())); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/ir_consumer.cc b/cpp/src/arrow/compute/exec/ir_consumer.cc new file mode 100644 index 0000000000000..b0e47d7108777 --- /dev/null +++ b/cpp/src/arrow/compute/exec/ir_consumer.cc @@ -0,0 +1,661 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/ir_consumer.h"
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/exec/exec_plan.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/compute/exec/options.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/ipc/dictionary.h"
+#include "arrow/ipc/metadata_internal.h"
+#include "arrow/util/unreachable.h"
+#include "arrow/visitor_inline.h"
+
+#include "generated/Plan_generated.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+
+static inline Status UnexpectedNullField(const char* name) {
+  return Status::IOError("Unexpected null field ", name, " in flatbuffer-encoded IR");
+}
+
+Result<std::shared_ptr<Field>> Convert(const flatbuf::Field& f) {
+  std::string name = ipc::internal::StringFromFlatbuffers(f.name());
+
+  FieldVector fields;
+  if (auto children = f.children()) {
+    fields.resize(children->size());
+    int i = 0;
+    for (const flatbuf::Field* child : *children) {
+      // A null child pointer is malformed IR; non-null children are converted.
+      if (!child) return UnexpectedNullField("Field.children[i]");
+      ARROW_ASSIGN_OR_RAISE(fields[i++], Convert(*child));
+    }
+  }
+
+  if (!f.type()) return UnexpectedNullField("Field.type");
+
+  std::shared_ptr<DataType> type;
+  RETURN_NOT_OK(ipc::internal::ConcreteTypeFromFlatbuffer(f.type_type(), f.type(),
+                                                          std::move(fields), &type));
+
+  std::shared_ptr<const KeyValueMetadata> metadata;
+  RETURN_NOT_OK(ipc::internal::GetKeyValueMetadata(f.custom_metadata(), &metadata));
+
+  return field(std::move(name), std::move(type), f.nullable(), std::move(metadata));
+}
+
+std::string LabelFromRelId(const ir::RelId* id) {
+  return id ? std::to_string(id->id()) : "";
+}
+
+Result<std::shared_ptr<Buffer>> BufferFromFlatbufferByteVector(
+    const flatbuffers::Vector<int8_t>* vec) {
+  if (!vec) return nullptr;
+
+  ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(vec->size()));
+
+  if (!vec->data()) return UnexpectedNullField("Vector.data");
+  std::memcpy(buf->mutable_data(), vec->data(), vec->size());
+
+  return std::move(buf);
+}
+
+Result<Datum> Convert(const ir::Literal& lit);
+
+struct ConvertLiteralImpl {
+  Result<Datum> Convert(const BooleanType& t) { return ValueOf<ir::BooleanLiteral>(t); }
+
+  Result<Datum> Convert(const Int8Type& t) { return ValueOf<ir::Int8Literal>(t); }
+  Result<Datum> Convert(const Int16Type& t) { return ValueOf<ir::Int16Literal>(t); }
+  Result<Datum> Convert(const Int32Type& t) { return ValueOf<ir::Int32Literal>(t); }
+  Result<Datum> Convert(const Int64Type& t) { return ValueOf<ir::Int64Literal>(t); }
+
+  Result<Datum> Convert(const UInt8Type& t) { return ValueOf<ir::UInt8Literal>(t); }
+  Result<Datum> Convert(const UInt16Type& t) { return ValueOf<ir::UInt16Literal>(t); }
+  Result<Datum> Convert(const UInt32Type& t) { return ValueOf<ir::UInt32Literal>(t); }
+  Result<Datum> Convert(const UInt64Type& t) { return ValueOf<ir::UInt64Literal>(t); }
+
+  Result<Datum> Convert(const HalfFloatType& t) { return ValueOf<ir::Float16Literal>(t); }
+  Result<Datum> Convert(const FloatType& t) { return ValueOf<ir::Float32Literal>(t); }
+  Result<Datum> Convert(const DoubleType& t) { return ValueOf<ir::Float64Literal>(t); }
+
+  Result<Datum> Convert(const Date32Type& t) { return ValueOf<ir::DateLiteral>(t); }
+  Result<Datum> Convert(const Date64Type& t) { return ValueOf<ir::DateLiteral>(t); }
+  Result<Datum> Convert(const Time32Type& t) { return ValueOf<ir::TimeLiteral>(t); }
+  Result<Datum> Convert(const Time64Type& t) { return ValueOf<ir::TimeLiteral>(t); }
+  Result<Datum> Convert(const DurationType& t) { return ValueOf<ir::DurationLiteral>(t); }
+  Result<Datum> Convert(const TimestampType& t) {
+    return ValueOf<ir::TimestampLiteral>(t);
+  }
+
+  Result<Datum> Convert(const IntervalType& t) {
+    ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral<ir::IntervalLiteral>());
+
+    if (!lit->value()) return UnexpectedNullField("IntervalLiteral.value");
+    switch (t.interval_type()) {
+      case IntervalType::MONTHS:
+        if (auto value = lit->value_as<ir::IntervalLiteralMonths>()) {
+          return Datum(std::make_shared<MonthIntervalScalar>(value->months()));
+        }
+        break;
+
+      case IntervalType::DAY_TIME:
+        if (auto value = lit->value_as<ir::IntervalLiteralDaysMilliseconds>()) {
+          DayTimeIntervalType::DayMilliseconds day_ms{value->days(),
+                                                      value->milliseconds()};
+          return Datum(std::make_shared<DayTimeIntervalScalar>(day_ms));
+        }
+        break;
+
+      case IntervalType::MONTH_DAY_NANO:
+        return Status::NotImplemented(
+            "IntervalLiteral with interval_type=MONTH_DAY_NANO");
+    }
+
+    return Status::IOError("IntervalLiteral.type was ", t.ToString(),
+                           " but IntervalLiteral.value had value_type ",
+                           ir::EnumNameIntervalLiteralImpl(lit->value_type()));
+  }
+
+  Result<Datum> Convert(const DecimalType& t) {
+    ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral<ir::DecimalLiteral>());
+
+    if (!lit->value()) return UnexpectedNullField("DecimalLiteral.value");
+    if (static_cast<int>(lit->value()->size()) != t.byte_width()) {
+      return Status::IOError("DecimalLiteral.type was ", t.ToString(),
+                             " (expected byte width ", t.byte_width(), ")",
+                             " but DecimalLiteral.value had size ", lit->value()->size());
+    }
+
+    switch (t.id()) {
+      case Type::DECIMAL128: {
+        std::array<uint8_t, 16> little_endian;
+        std::memcpy(little_endian.data(), lit->value()->data(), lit->value()->size());
+        Decimal128 value{BasicDecimal128::LittleEndianArray, little_endian};
+        return Datum(std::make_shared<Decimal128Scalar>(value, type_));
+      }
+
+      case Type::DECIMAL256: {
+        std::array<uint8_t, 32> little_endian;
+        std::memcpy(little_endian.data(), lit->value()->data(), lit->value()->size());
+        Decimal256 value{BasicDecimal256::LittleEndianArray, little_endian};
+        return Datum(std::make_shared<Decimal256Scalar>(value, type_));
+      }
+
+      default:
+        break;
+    }
+
+    Unreachable();
+  }
+
+  Result<Datum> Convert(const ListType&) {
+    ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral<ir::ListLiteral>());
+
+    if (!lit->values()) return UnexpectedNullField("ListLiteral.values");
+    ScalarVector values{lit->values()->size()};
+
+    int i = 0;
+    for (const ir::Literal* v : *lit->values()) {
+      if (!v) return UnexpectedNullField("ListLiteral.values[i]");
+      ARROW_ASSIGN_OR_RAISE(Datum value, arrow::compute::Convert(*v));
+      values[i++] = value.scalar();
+    }
+
+    std::unique_ptr<ArrayBuilder> builder;
+    RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type_, &builder));
+    RETURN_NOT_OK(builder->AppendScalars(std::move(values)));
+    ARROW_ASSIGN_OR_RAISE(auto arr, builder->Finish());
+    return Datum(std::make_shared<ListScalar>(std::move(arr), type_));
+  }
+
+  Result<Datum> Convert(const MapType& t) {
+    ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral<ir::MapLiteral>());
+
+    if (!lit->values()) return UnexpectedNullField("MapLiteral.values");
+    ScalarVector keys{lit->values()->size()}, values{lit->values()->size()};
+
+    int i = 0;
+    for (const ir::KeyValue* kv : *lit->values()) {
+      if (!kv) return UnexpectedNullField("MapLiteral.values[i]");
+      // Each KeyValue contributes one key scalar and one value scalar.
+      ARROW_ASSIGN_OR_RAISE(Datum key, arrow::compute::Convert(*kv->key()));
+      ARROW_ASSIGN_OR_RAISE(Datum value, arrow::compute::Convert(*kv->value()));
+      keys[i] = key.scalar();
+      values[i] = value.scalar();
+      ++i;
+    }
+
+    ArrayVector kv_arrays(2);
+    std::unique_ptr<ArrayBuilder> builder;
+    RETURN_NOT_OK(MakeBuilder(default_memory_pool(), t.key_type(), &builder));
+    RETURN_NOT_OK(builder->AppendScalars(std::move(keys)));
+    ARROW_ASSIGN_OR_RAISE(kv_arrays[0], builder->Finish());
+
+    RETURN_NOT_OK(MakeBuilder(default_memory_pool(), t.item_type(), &builder));
+    RETURN_NOT_OK(builder->AppendScalars(std::move(values)));
+    ARROW_ASSIGN_OR_RAISE(kv_arrays[1], builder->Finish());
+
+    ARROW_ASSIGN_OR_RAISE(auto item_arr,
+                          StructArray::Make(kv_arrays, t.value_type()->fields()));
+    return Datum(std::make_shared<MapScalar>(std::move(item_arr), type_));
+  }
+
+  Result<Datum> Convert(const StructType& t) {
+    ARROW_ASSIGN_OR_RAISE(auto lit,
GetLiteral()); + if (!lit->values()) return UnexpectedNullField("StructLiteral.values"); + if (static_cast(lit->values()->size()) != t.num_fields()) { + return Status::IOError( + "StructLiteral.type was ", t.ToString(), "(expected ", t.num_fields(), + " fields)", " but StructLiteral.values has size ", lit->values()->size()); + } + + ScalarVector values{lit->values()->size()}; + int i = 0; + for (const ir::Literal* v : *lit->values()) { + if (!v) return UnexpectedNullField("StructLiteral.values[i]"); + ARROW_ASSIGN_OR_RAISE(Datum value, arrow::compute::Convert(*v)); + if (!value.type()->Equals(*t.field(i)->type())) { + return Status::IOError("StructLiteral.type was ", t.ToString(), " but value ", i, + " had type ", value.type()->ToString(), "(expected ", + t.field(i)->type()->ToString(), ")"); + } + values[i++] = value.scalar(); + } + + return Datum(std::make_shared(std::move(values), type_)); + } + + Result Convert(const StringType&) { + ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral()); + if (!lit->value()) return UnexpectedNullField("StringLiteral.value"); + + return Datum(ipc::internal::StringFromFlatbuffers(lit->value())); + } + + Result Convert(const BinaryType&) { + ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral()); + if (!lit->value()) return UnexpectedNullField("BinaryLiteral.value"); + + ARROW_ASSIGN_OR_RAISE(auto buf, BufferFromFlatbufferByteVector(lit->value())); + return Datum(std::make_shared(std::move(buf))); + } + + Result Convert(const FixedSizeBinaryType& t) { + ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral()); + if (!lit->value()) return UnexpectedNullField("FixedSizeBinaryLiteral.value"); + + if (static_cast(lit->value()->size()) != t.byte_width()) { + return Status::IOError("FixedSizeBinaryLiteral.type was ", t.ToString(), + " but FixedSizeBinaryLiteral.value had size ", + lit->value()->size()); + } + + ARROW_ASSIGN_OR_RAISE(auto buf, BufferFromFlatbufferByteVector(lit->value())); + return Datum(std::make_shared(std::move(buf), type_)); + } + + Status Visit(const NullType&) { Unreachable(); } + + Status NotImplemented() { + return Status::NotImplemented("Literals of type ", type_->ToString()); + } + Status Visit(const ExtensionType& t) { return NotImplemented(); } + Status Visit(const SparseUnionType& t) { return NotImplemented(); } + Status Visit(const DenseUnionType& t) { return NotImplemented(); } + Status Visit(const FixedSizeListType& t) { return NotImplemented(); } + Status Visit(const DictionaryType& t) { return NotImplemented(); } + Status Visit(const LargeStringType& t) { return NotImplemented(); } + Status Visit(const LargeBinaryType& t) { return NotImplemented(); } + Status Visit(const LargeListType& t) { return NotImplemented(); } + + template + Status Visit(const T& t) { + ARROW_ASSIGN_OR_RAISE(out_, Convert(t)); + return Status::OK(); + } + + template + Result GetLiteral() { + if (const Lit* l = lit_.impl_as()) return l; + + return Status::IOError( + "Literal.type was ", type_->ToString(), " but got ", + ir::EnumNameLiteralImpl(ir::LiteralImplTraits::enum_value), " Literal.impl"); + } + + template ::ScalarType, + typename ValueType = typename ScalarType::ValueType> + Result ValueOf(const T&) { + ARROW_ASSIGN_OR_RAISE(auto lit, GetLiteral()); + auto scalar = + std::make_shared(static_cast(lit->value()), type_); + return Datum(std::move(scalar)); + } + + Datum out_; + const std::shared_ptr& type_; + const ir::Literal& lit_; +}; + +Result Convert(const ir::Literal& lit) { + if (!lit.type()) return UnexpectedNullField("Literal.type"); + if (lit.type()->name()) { + return 
Status::IOError("Literal.type should have null Field.name"); + } + + ARROW_ASSIGN_OR_RAISE(auto field, Convert(*lit.type())); + if (!lit.impl()) return MakeNullScalar(field->type()); + + if (field->type()->id() == Type::NA) { + return Status::IOError("Literal of type null had non-null Literal.impl"); + } + + ConvertLiteralImpl visitor{{}, field->type(), lit}; + RETURN_NOT_OK(VisitTypeInline(*field->type(), &visitor)); + return std::move(visitor.out_); +} + +Result Convert(const ir::FieldRef& ref) { + switch (ref.ref_type()) { + case ir::Deref::StructField: + return FieldRef(ref.ref_as()->position()); + + case ir::Deref::FieldIndex: + return FieldRef(ref.ref_as()->position()); + + case ir::Deref::MapKey: + case ir::Deref::ArraySubscript: + case ir::Deref::ArraySlice: + default: + break; + } + return Status::NotImplemented("Deref::", EnumNameDeref(ref.ref_type())); +} + +Result Convert(const ir::Expression& expr); + +Result, std::vector>> Convert( + const flatbuffers::Vector>& cases) { + std::vector conditions(cases.size()), arguments(cases.size()); + + int i = 0; + for (const ir::CaseFragment* c : cases) { + if (!c) return UnexpectedNullField("Vector[i]"); + ARROW_ASSIGN_OR_RAISE(conditions[i], Convert(*c->match())); + ARROW_ASSIGN_OR_RAISE(arguments[i], Convert(*c->result())); + ++i; + } + + return std::make_pair(std::move(conditions), std::move(arguments)); +} + +Expression CaseWhen(std::vector conditions, std::vector arguments, + Expression default_value) { + arguments.insert(arguments.begin(), call("make_struct", std::move(conditions))); + arguments.push_back(std::move(default_value)); + return call("case_when", std::move(arguments)); +} + +Result Convert(const ir::Expression& expr) { + switch (expr.impl_type()) { + case ir::ExpressionImpl::Literal: { + ARROW_ASSIGN_OR_RAISE(Datum value, Convert(*expr.impl_as())); + return literal(std::move(value)); + } + + case ir::ExpressionImpl::FieldRef: { + ARROW_ASSIGN_OR_RAISE(FieldRef ref, Convert(*expr.impl_as())); + return field_ref(std::move(ref)); + } + + case ir::ExpressionImpl::Call: { + auto call = expr.impl_as(); + + if (!call->name()) return UnexpectedNullField("Call.name"); + auto name = ipc::internal::StringFromFlatbuffers(call->name()); + + if (!call->arguments()) return UnexpectedNullField("Call.arguments"); + std::vector arguments(call->arguments()->size()); + + int i = 0; + for (const ir::Expression* a : *call->arguments()) { + if (!a) return UnexpectedNullField("Call.arguments[i]"); + ARROW_ASSIGN_OR_RAISE(arguments[i++], Convert(*a)); + } + + // What about options...? 
+ return arrow::compute::call(std::move(name), std::move(arguments)); + } + + case ir::ExpressionImpl::Cast: { + auto cast = expr.impl_as(); + + if (!cast->operand()) return UnexpectedNullField("Cast.operand"); + ARROW_ASSIGN_OR_RAISE(Expression arg, Convert(*cast->operand())); + + if (!cast->to()) return UnexpectedNullField("Cast.to"); + ARROW_ASSIGN_OR_RAISE(auto to, Convert(*cast->to())); + + return call("cast", {std::move(arg)}, CastOptions::Safe(to->type())); + } + + case ir::ExpressionImpl::ConditionalCase: { + auto conditional_case = expr.impl_as(); + + if (!conditional_case->conditions()) { + return UnexpectedNullField("ConditionalCase.conditions"); + } + ARROW_ASSIGN_OR_RAISE(auto cases, Convert(*conditional_case->conditions())); + + if (!conditional_case->else_()) return UnexpectedNullField("ConditionalCase.else"); + ARROW_ASSIGN_OR_RAISE(auto default_value, Convert(*conditional_case->else_())); + + return CaseWhen(std::move(cases.first), std::move(cases.second), + std::move(default_value)); + } + + case ir::ExpressionImpl::SimpleCase: { + auto simple_case = expr.impl_as(); + auto expression = simple_case->expression(); + auto matches = simple_case->matches(); + auto else_ = simple_case->else_(); + + if (!expression) return UnexpectedNullField("SimpleCase.expression"); + ARROW_ASSIGN_OR_RAISE(auto rhs, Convert(*expression)); + + if (!matches) return UnexpectedNullField("SimpleCase.matches"); + ARROW_ASSIGN_OR_RAISE(auto cases, Convert(*simple_case->matches())); + + // replace each condition with an equality expression with the rhs + for (auto& condition : cases.first) { + condition = equal(std::move(condition), rhs); + } + + if (!else_) return UnexpectedNullField("SimpleCase.else"); + ARROW_ASSIGN_OR_RAISE(auto default_value, Convert(*simple_case->else_())); + + return CaseWhen(std::move(cases.first), std::move(cases.second), + std::move(default_value)); + } + + case ir::ExpressionImpl::WindowCall: + default: + break; + } + + return Status::NotImplemented("ExpressionImpl::", + EnumNameExpressionImpl(expr.impl_type())); +} + +Result Convert(const ir::Relation& rel) { + switch (rel.impl_type()) { + case ir::RelationImpl::Source: { + auto source = rel.impl_as(); + + if (!source->name()) return UnexpectedNullField("Source.name"); + auto name = ipc::internal::StringFromFlatbuffers(source->name()); + + std::shared_ptr schema; + if (source->schema()) { + ipc::DictionaryMemo ignore; + RETURN_NOT_OK(ipc::internal::GetSchema(source->schema(), &ignore, &schema)); + } + + return Declaration{"catalog_source", + {}, + CatalogSourceNodeOptions{std::move(name), std::move(schema)}, + LabelFromRelId(source->id())}; + } + + case ir::RelationImpl::Filter: { + auto filter = rel.impl_as(); + + if (!filter->predicate()) return UnexpectedNullField("Filter.predicate"); + ARROW_ASSIGN_OR_RAISE(auto predicate, Convert(*filter->predicate())); + + if (!filter->rel()) return UnexpectedNullField("Filter.rel"); + ARROW_ASSIGN_OR_RAISE(auto arg, Convert(*filter->rel()).As()); + + return Declaration{"filter", + {std::move(arg)}, + FilterNodeOptions{std::move(predicate)}, + LabelFromRelId(filter->id())}; + } + + case ir::RelationImpl::Project: { + auto project = rel.impl_as(); + + if (!project->rel()) return UnexpectedNullField("Project.rel"); + ARROW_ASSIGN_OR_RAISE(auto arg, Convert(*project->rel()).As()); + + ProjectNodeOptions opts{{}}; + + if (!project->expressions()) return UnexpectedNullField("Project.expressions"); + for (const ir::Expression* expression : *project->expressions()) { + if (!expression) 
return UnexpectedNullField("Project.expressions[i]"); + ARROW_ASSIGN_OR_RAISE(auto expr, Convert(*expression)); + opts.expressions.push_back(std::move(expr)); + } + + return Declaration{ + "project", {std::move(arg)}, std::move(opts), LabelFromRelId(project->id())}; + } + + case ir::RelationImpl::Aggregate: { + auto aggregate = rel.impl_as(); + + if (!aggregate->rel()) return UnexpectedNullField("Aggregate.rel"); + ARROW_ASSIGN_OR_RAISE(auto arg, + Convert(*aggregate->rel()).As()); + + AggregateNodeOptions opts{{}, {}, {}}; + + if (!aggregate->measures()) return UnexpectedNullField("Aggregate.measures"); + for (const ir::Expression* m : *aggregate->measures()) { + if (!m) return UnexpectedNullField("Aggregate.measures[i]"); + ARROW_ASSIGN_OR_RAISE(auto measure, Convert(*m)); + + auto call = measure.call(); + if (!call || call->arguments.size() != 1) { + return Status::IOError("One of Aggregate.measures was ", measure.ToString(), + " (expected Expression::Call with one argument)"); + } + + auto target = call->arguments.front().field_ref(); + if (!target) { + return Status::NotImplemented( + "Support for non-FieldRef arguments to Aggregate.measures"); + } + + opts.aggregates.push_back({call->function_name, nullptr}); + opts.targets.push_back(*target); + opts.names.push_back(call->function_name + " " + target->ToString()); + } + + if (!aggregate->groupings()) return UnexpectedNullField("Aggregate.groupings"); + if (aggregate->groupings()->size() > 1) { + return Status::NotImplemented("Support for multiple grouping sets"); + } + + if (aggregate->groupings()->size() == 1) { + if (!aggregate->groupings()->Get(0)) { + return UnexpectedNullField("Aggregate.groupings[0]"); + } + + if (!aggregate->groupings()->Get(0)->keys()) { + return UnexpectedNullField("Grouping.keys"); + } + + for (const ir::Expression* key : *aggregate->groupings()->Get(0)->keys()) { + if (!key) return UnexpectedNullField("Grouping.keys[i]"); + ARROW_ASSIGN_OR_RAISE(auto key_expr, Convert(*key)); + + auto key_ref = key_expr.field_ref(); + if (!key_ref) { + return Status::NotImplemented("Support for non-FieldRef grouping keys"); + } + opts.keys.push_back(*key_ref); + } + } + + return Declaration{"aggregate", + {std::move(arg)}, + std::move(opts), + LabelFromRelId(aggregate->id())}; + } + + case ir::RelationImpl::OrderBy: { + auto order_by = rel.impl_as(); + + if (!order_by->rel()) return UnexpectedNullField("OrderBy.rel"); + ARROW_ASSIGN_OR_RAISE(auto arg, Convert(*order_by->rel()).As()); + + if (!order_by->keys()) return UnexpectedNullField("OrderBy.keys"); + if (order_by->keys()->size() == 0) { + return Status::NotImplemented("Empty sort key list"); + } + + util::optional null_placement; + std::vector sort_keys; + + for (const ir::SortKey* key : *order_by->keys()) { + if (!key) return UnexpectedNullField("OrderBy.keys[i]"); + ARROW_ASSIGN_OR_RAISE(auto expr, Convert(*key->expression())); + + auto target = expr.field_ref(); + if (!target) { + return Status::NotImplemented( + "Support for non-FieldRef expressions in SortKey"); + } + if (target->IsNested()) { + return Status::NotImplemented( + "Support for nested FieldRef expressions in SortKey"); + } + switch (key->ordering()) { + case ir::Ordering::ASCENDING_THEN_NULLS: + case ir::Ordering::NULLS_THEN_ASCENDING: + sort_keys.emplace_back(*target, SortOrder::Ascending); + break; + case ir::Ordering::DESCENDING_THEN_NULLS: + case ir::Ordering::NULLS_THEN_DESCENDING: + sort_keys.emplace_back(*target, SortOrder::Descending); + break; + } + + NullPlacement key_null_placement; + 
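+        // The IR folds sort order and null placement into a single Ordering
+        // enum, while SortOptions takes one null placement for the whole
+        // sort; keys that disagree are rejected as NotImplemented below.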
switch (key->ordering()) { + case ir::Ordering::ASCENDING_THEN_NULLS: + case ir::Ordering::DESCENDING_THEN_NULLS: + key_null_placement = NullPlacement::AtEnd; + break; + case ir::Ordering::NULLS_THEN_ASCENDING: + case ir::Ordering::NULLS_THEN_DESCENDING: + key_null_placement = NullPlacement::AtStart; + break; + } + + if (null_placement && *null_placement != key_null_placement) { + return Status::NotImplemented("Per-key null_placement"); + } + null_placement = key_null_placement; + } + + return Declaration{"order_by_sink", + {std::move(arg)}, + OrderBySinkNodeOptions{ + SortOptions{std::move(sort_keys), *null_placement}, nullptr}, + LabelFromRelId(order_by->id())}; + } + + default: + break; + } + + return Status::NotImplemented("RelationImpl::", EnumNameRelationImpl(rel.impl_type())); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/ir_consumer.h b/cpp/src/arrow/compute/exec/ir_consumer.h new file mode 100644 index 0000000000000..5af27f98f58fe --- /dev/null +++ b/cpp/src/arrow/compute/exec/ir_consumer.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec/options.h" +#include "arrow/datum.h" +#include "arrow/result.h" +#include "arrow/util/visibility.h" + +#include "generated/Plan_generated.h" + +namespace arrow { + +namespace flatbuf = org::apache::arrow::flatbuf; + +namespace compute { + +namespace ir = org::apache::arrow::computeir::flatbuf; + +class ARROW_EXPORT CatalogSourceNodeOptions : public ExecNodeOptions { + public: + CatalogSourceNodeOptions(std::string name, std::shared_ptr schema, + Expression filter = literal(true), + std::vector projection = {}) + : name(std::move(name)), + schema(std::move(schema)), + filter(std::move(filter)), + projection(std::move(projection)) {} + + std::string name; + std::shared_ptr schema; + Expression filter; + std::vector projection; +}; + +ARROW_EXPORT +Result Convert(const ir::Literal& lit); + +ARROW_EXPORT +Result Convert(const ir::Expression& lit); + +ARROW_EXPORT +Result Convert(const ir::Relation& rel); + +template +auto ConvertRoot(const Buffer& buf) -> decltype(Convert(std::declval())) { + return Convert(*flatbuffers::GetRoot(buf.data())); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/ir_test.cc b/cpp/src/arrow/compute/exec/ir_test.cc new file mode 100644 index 0000000000000..847f555c69ace --- /dev/null +++ b/cpp/src/arrow/compute/exec/ir_test.cc @@ -0,0 +1,840 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec/ir_consumer.h" + +#include + +#include + +#include "arrow/compute/exec/options.h" +#include "arrow/compute/exec/test_util.h" +#include "arrow/io/file.h" +#include "arrow/testing/matchers.h" +#include "arrow/util/io_util.h" +#include "arrow/util/string_view.h" + +#include "generated/Plan_generated.h" + +using testing::ElementsAre; +using testing::ElementsAreArray; +using testing::Eq; +using testing::HasSubstr; +using testing::Optional; +using testing::UnorderedElementsAreArray; + +namespace ir = org::apache::arrow::computeir::flatbuf; +namespace flatbuf = org::apache::arrow::flatbuf; + +DEFINE_string(computeir_dir, "", + "Directory containing Flatbuffer schemas for Arrow compute IR.\n" + "This is currently $ARROW_REPO/experimental/computeir/"); + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + if (std::system("flatc --version") != 0) { + std::cout << "flatc not available, skipping tests" << std::endl; + return 0; + } + + int ret = RUN_ALL_TESTS(); + gflags::ShutDownCommandLineFlags(); + return ret; +} + +namespace arrow { +namespace compute { + +std::shared_ptr FlatbufferFromJSON(std::string root_type, + util::string_view json) { + static std::unique_ptr dir; + + if (!dir) { + if (FLAGS_computeir_dir == "") { + std::cout << "Required argument -computeir_dir was not provided!" << std::endl; + std::abort(); + } + + dir = *arrow::internal::TemporaryDir::Make("ir_json_"); + } + + auto json_path = dir->path().ToString() + "ir.json"; + std::ofstream{json_path} << json; + + std::string cmd = "flatc --binary " + FLAGS_computeir_dir + "/Plan.fbs" + + " --root-type org.apache.arrow.computeir.flatbuf." 
+ root_type + " " + + json_path; + + if (int err = std::system(cmd.c_str())) { + std::cerr << cmd << " failed with error code: " << err; + std::abort(); + } + + auto bin = *io::MemoryMappedFile::Open("ir.bin", io::FileMode::READ); + return *bin->Read(*bin->GetSize()); +} + +template +auto ConvertJSON(util::string_view json) -> decltype(Convert(std::declval())) { + std::string root_type; + if (std::is_same::value) { + root_type = "Literal"; + } else if (std::is_same::value) { + root_type = "Expression"; + } else if (std::is_same::value) { + root_type = "Relation"; + } else if (std::is_same::value) { + root_type = "Plan"; + } else { + std::cout << "Unknown Ir class in."; + std::abort(); + } + + auto buf = FlatbufferFromJSON(root_type, json); + return ConvertRoot(*buf); +} + +TEST(Literal, Int64) { + ASSERT_THAT(ConvertJSON(R"({ + type: { + type_type: "Int", + type: { bitWidth: 64, is_signed: true } + } + })"), + ResultWith(DataEq(std::make_shared()))); + + ASSERT_THAT(ConvertJSON(R"({ + type: { + type_type: "Int", + type: { bitWidth: 64, is_signed: true } + }, + impl_type: "Int64Literal", + impl: { value: 42 } + })"), + ResultWith(DataEq(42))); +} + +TEST(Expression, Comparison) { + ASSERT_THAT(ConvertJSON(R"({ + impl_type: "Call", + impl: { + name: "equal", + arguments: [ + { + impl_type: "FieldRef", + impl: { + ref_type: "FieldIndex", + ref: { + position: 2 + } + } + }, + { + impl_type: "Literal", + impl: { + type: { + type_type: "Int", + type: { bitWidth: 64, is_signed: true } + }, + impl_type: "Int64Literal", + impl: { value: 42 } + } + } + ] + } + })"), + ResultWith(Eq(equal(field_ref(2), literal(42))))); +} + +TEST(Relation, Filter) { + ASSERT_THAT( + ConvertJSON(R"({ + impl_type: "Filter", + impl: { + id: { id: 1 }, + rel: { + impl_type: "Source", + impl: { + id: { id: 0 }, + name: "test source", + schema: { + endianness: "Little", + fields: [ + { + name: "i32", + type_type: "Int", + type: { + bitWidth: 32, + is_signed: true + }, + nullable: true + }, + { + name: "f64", + type_type: "FloatingPoint", + type: { + precision: "DOUBLE" + }, + nullable: true + }, + { + name: "i64", + type_type: "Int", + type: { + bitWidth: 64, + is_signed: true + }, + nullable: true + } + ] + } + } + }, + predicate: { + impl_type: "Call", + impl: { + name: "equal", + arguments: [ + { + impl_type: "FieldRef", + impl: { + ref_type: "FieldIndex", + ref: { + position: 2 + } + } + }, + { + impl_type: "Literal", + impl: { + type: { + type_type: "Int", + type: { bitWidth: 64, is_signed: true } + }, + impl_type: "Int64Literal", + impl: { value: 42 } + } + } + ] + } + } + } + })"), + ResultWith(Eq(Declaration::Sequence({ + {"catalog_source", + CatalogSourceNodeOptions{"test source", schema({ + field("i32", int32()), + field("f64", float64()), + field("i64", int64()), + })}, + "0"}, + {"filter", FilterNodeOptions{equal(field_ref(2), literal(42))}, "1"}, + })))); +} + +TEST(Relation, AggregateSimple) { + ASSERT_THAT(ConvertJSON(R"({ + "impl": { + id: {id: 1}, + "groupings": [ + { + "keys": [ + { + "impl": { + "ref": { + "position": 0 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + } + ] + } + ], + "measures": [ + { + "impl": { + "arguments": [ + { + "impl": { + "ref": { + "position": 1 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + } + ], + "name": "sum" + }, + "impl_type": "Call" + }, + { + "impl": { + "arguments": [ + { + "impl": { + "ref": { + "position": 2 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": 
"FieldRef" + } + ], + "name": "mean" + }, + "impl_type": "Call" + } + ], + "rel": { + "impl": { + id: {id: 0}, + "name": "tbl", + "schema": { + "endianness": "Little", + "fields": [ + { + "name": "foo", + "nullable": true, + "type": { + "bitWidth": 32, + "is_signed": true + }, + "type_type": "Int" + }, + { + "name": "bar", + "nullable": true, + "type": { + "bitWidth": 64, + "is_signed": true + }, + "type_type": "Int" + }, + { + "name": "baz", + "nullable": true, + "type": { + "precision": "DOUBLE" + }, + "type_type": "FloatingPoint" + } + ] + } + }, + "impl_type": "Source" + } + }, + "impl_type": "Aggregate" +})"), + ResultWith(Eq(Declaration::Sequence({ + {"catalog_source", + CatalogSourceNodeOptions{"tbl", schema({ + field("foo", int32()), + field("bar", int64()), + field("baz", float64()), + })}, + "0"}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{ + {"sum", nullptr}, + {"mean", nullptr}, + }, + /*targets=*/{1, 2}, + /*names=*/ + { + "sum FieldRef.FieldPath(1)", + "mean FieldRef.FieldPath(2)", + }, + /*keys=*/{0}}, + "1"}, + })))); +} + +TEST(Relation, AggregateWithHaving) { + ASSERT_THAT( + ConvertJSON(R"({ + "impl": { + id: {id: 3}, + "predicate": { + "impl": { + "arguments": [ + { + "impl": { + "ref": { + "position": 0 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + { + "impl": { + "impl": { + "value": 10 + }, + "impl_type": "Int8Literal", + "type": { + "nullable": true, + "type": { + "bitWidth": 8, + "is_signed": true + }, + "type_type": "Int" + } + }, + "impl_type": "Literal" + } + ], + "name": "greater" + }, + "impl_type": "Call" + }, + "rel": { + "impl": { + id: {id: 2}, + "groupings": [ + { + "keys": [ + { + "impl": { + "ref": { + "position": 0 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + } + ] + } + ], + "measures": [ + { + "impl": { + "arguments": [ + { + "impl": { + "ref": { + "position": 1 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + } + ], + "name": "sum" + }, + "impl_type": "Call" + }, + { + "impl": { + "arguments": [ + { + "impl": { + "ref": { + "position": 2 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + } + ], + "name": "mean" + }, + "impl_type": "Call" + } + ], + "rel": { + "impl": { + id: {id: 1}, + "predicate": { + "impl": { + "arguments": [ + { + "impl": { + "ref": { + "position": 0 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + { + "impl": { + "impl": { + "value": 3 + }, + "impl_type": "Int8Literal", + "type": { + "nullable": true, + "type": { + "bitWidth": 8, + "is_signed": true + }, + "type_type": "Int" + } + }, + "impl_type": "Literal" + } + ], + "name": "less" + }, + "impl_type": "Call" + }, + "rel": { + "impl": { + id: {id: 0}, + "name": "tbl", + "schema": { + "endianness": "Little", + "fields": [ + { + "name": "foo", + "nullable": true, + "type": { + "bitWidth": 32, + "is_signed": true + }, + "type_type": "Int" + }, + { + "name": "bar", + "nullable": true, + "type": { + "bitWidth": 64, + "is_signed": true + }, + "type_type": "Int" + }, + { + "name": "baz", + "nullable": true, + "type": { + "precision": "DOUBLE" + }, + "type_type": "FloatingPoint" + } + ] + } + }, + "impl_type": "Source" + } + }, + "impl_type": "Filter" + } + }, + "impl_type": "Aggregate" + } + }, + "impl_type": "Filter" +})"), + ResultWith(Eq(Declaration::Sequence({ + {"catalog_source", + CatalogSourceNodeOptions{"tbl", schema({ + field("foo", int32()), + field("bar", 
int64()), + field("baz", float64()), + })}, + "0"}, + {"filter", FilterNodeOptions{less(field_ref(0), literal(3))}, "1"}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{ + {"sum", nullptr}, + {"mean", nullptr}, + }, + /*targets=*/{1, 2}, + /*names=*/ + { + "sum FieldRef.FieldPath(1)", + "mean FieldRef.FieldPath(2)", + }, + /*keys=*/{0}}, + "2"}, + {"filter", FilterNodeOptions{greater(field_ref(0), literal(10))}, "3"}, + })))); +} + +TEST(Relation, ProjectionWithFilter) { + ASSERT_THAT( + ConvertJSON(R"({ + "impl": { + id: {id:2}, + "predicate": { + "impl": { + "arguments": [ + { + "impl": { + "ref": { + "position": 0 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + { + "impl": { + "impl": { + "value": 3 + }, + "impl_type": "Int8Literal", + "type": { + "nullable": true, + "type": { + "bitWidth": 8, + "is_signed": true + }, + "type_type": "Int" + } + }, + "impl_type": "Literal" + } + ], + "name": "less" + }, + "impl_type": "Call" + }, + "rel": { + "impl": { + id: {id:1}, + "expressions": [ + { + "impl": { + "ref": { + "position": 1 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + { + "impl": { + "ref": { + "position": 2 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + } + ], + "rel": { + "impl": { + id: {id:0}, + "name": "tbl", + "schema": { + "endianness": "Little", + "fields": [ + { + "name": "foo", + "nullable": true, + "type": { + "bitWidth": 32, + "is_signed": true + }, + "type_type": "Int" + }, + { + "name": "bar", + "nullable": true, + "type": { + "bitWidth": 64, + "is_signed": true + }, + "type_type": "Int" + }, + { + "name": "baz", + "nullable": true, + "type": { + "precision": "DOUBLE" + }, + "type_type": "FloatingPoint" + } + ] + } + }, + "impl_type": "Source" + } + }, + "impl_type": "Project" + } + }, + "impl_type": "Filter" +})"), + ResultWith(Eq(Declaration::Sequence({ + {"catalog_source", + CatalogSourceNodeOptions{"tbl", schema({ + field("foo", int32()), + field("bar", int64()), + field("baz", float64()), + })}, + "0"}, + {"project", ProjectNodeOptions{/*expressions=*/{field_ref(1), field_ref(2)}}, + "1"}, + {"filter", FilterNodeOptions{less(field_ref(0), literal(3))}, "2"}, + })))); +} + +TEST(Relation, ProjectionWithSort) { + ASSERT_THAT( + ConvertJSON(R"({ + "impl": { + id: {id:2}, + "keys": [ + { + "expression": { + "impl": { + "ref": { + "position": 0 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + "ordering": "NULLS_THEN_ASCENDING" + }, + { + "expression": { + "impl": { + "ref": { + "position": 1 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + "ordering": "NULLS_THEN_DESCENDING" + } + ], + "rel": { + "impl": { + id: {id:1}, + "expressions": [ + { + "impl": { + "ref": { + "position": 0 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + { + "impl": { + "ref": { + "position": 1 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + }, + { + "impl": { + "ref": { + "position": 2 + }, + "ref_type": "FieldIndex", + "relation_index": 0 + }, + "impl_type": "FieldRef" + } + ], + "rel": { + "impl": { + id: {id: 0}, + "name": "tbl", + "schema": { + "endianness": "Little", + "fields": [ + { + "name": "foo", + "nullable": true, + "type": { + "bitWidth": 32, + "is_signed": true + }, + "type_type": "Int" + }, + { + "name": "bar", + "nullable": true, + "type": { + "bitWidth": 64, + 
"is_signed": true + }, + "type_type": "Int" + }, + { + "name": "baz", + "nullable": true, + "type": { + "precision": "DOUBLE" + }, + "type_type": "FloatingPoint" + } + ] + } + }, + "impl_type": "Source" + } + }, + "impl_type": "Project" + } + }, + "impl_type": "OrderBy" +})"), + ResultWith(Eq(Declaration::Sequence({ + {"catalog_source", + CatalogSourceNodeOptions{"tbl", schema({ + field("foo", int32()), + field("bar", int64()), + field("baz", float64()), + })}, + "0"}, + {"project", + ProjectNodeOptions{/*expressions=*/{field_ref(0), field_ref(1), field_ref(2)}}, + "1"}, + {"order_by_sink", + OrderBySinkNodeOptions{SortOptions{{ + SortKey{0, SortOrder::Ascending}, + SortKey{1, SortOrder::Descending}, + }, + NullPlacement::AtStart}, + nullptr}, + "2"}, + })))); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc index 437a93f9e1c46..7d5bfe7d959a1 100644 --- a/cpp/src/arrow/compute/exec/plan_test.cc +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -301,12 +301,11 @@ TEST(ExecPlan, ToString) { {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); - EXPECT_EQ(plan->sources()[0]->ToString(), R"(SourceNode{"source", outputs=["sink"]})"); - EXPECT_EQ(plan->sinks()[0]->ToString(), - R"(SinkNode{"sink", inputs=[collected: "source"]})"); + EXPECT_EQ(plan->sources()[0]->ToString(), R"(:SourceNode{outputs=[:SinkNode]})"); + EXPECT_EQ(plan->sinks()[0]->ToString(), R"(:SinkNode{inputs=[collected=:SourceNode]})"); EXPECT_EQ(plan->ToString(), R"(ExecPlan with 2 nodes: -SourceNode{"source", outputs=["sink"]} -SinkNode{"sink", inputs=[collected: "source"]} +:SourceNode{outputs=[:SinkNode]} +:SinkNode{inputs=[collected=:SourceNode]} )"); ASSERT_OK_AND_ASSIGN(plan, ExecPlan::Make()); @@ -316,7 +315,8 @@ SinkNode{"sink", inputs=[collected: "source"]} { {"source", SourceNodeOptions{basic_data.schema, - basic_data.gen(/*parallel=*/false, /*slow=*/false)}}, + basic_data.gen(/*parallel=*/false, /*slow=*/false)}, + "custom_source_label"}, {"filter", FilterNodeOptions{greater_equal(field_ref("i32"), literal(0))}}, {"project", ProjectNodeOptions{{ field_ref("bool"), @@ -333,22 +333,24 @@ SinkNode{"sink", inputs=[collected: "source"]} {"order_by_sink", OrderBySinkNodeOptions{ SortOptions({SortKey{"sum(multiply(i32, 2))", SortOrder::Ascending}}), - &sink_gen}}, + &sink_gen}, + "custom_sink_label"}, }) .AddToPlan(plan.get())); EXPECT_EQ(plan->ToString(), R"a(ExecPlan with 6 nodes: -SourceNode{"source", outputs=["filter"]} -FilterNode{"filter", inputs=[target: "source"], outputs=["project"], filter=(i32 >= 0)} -ProjectNode{"project", inputs=[target: "filter"], outputs=["aggregate"], projection=[bool, multiply(i32, 2)]} -GroupByNode{"aggregate", inputs=[groupby: "project"], outputs=["filter"], keys=["bool"], aggregates=[ +custom_source_label:SourceNode{outputs=[:FilterNode]} +:FilterNode{inputs=[target=custom_source_label:SourceNode], outputs=[:ProjectNode], filter=(i32 >= 0)} +:ProjectNode{inputs=[target=:FilterNode], outputs=[:GroupByNode], projection=[bool, multiply(i32, 2)]} +:GroupByNode{inputs=[groupby=:ProjectNode], outputs=[:FilterNode], keys=["bool"], aggregates=[ hash_sum(multiply(i32, 2)), hash_count(multiply(i32, 2), {mode=NON_NULL}), ]} -FilterNode{"filter", inputs=[target: "aggregate"], outputs=["order_by_sink"], filter=(sum(multiply(i32, 2)) > 10)} -OrderBySinkNode{"order_by_sink", inputs=[collected: "filter"], by={sort_keys=[sum(multiply(i32, 2)) ASC], null_placement=AtEnd}} 
+:FilterNode{inputs=[target=:GroupByNode], outputs=[custom_sink_label:OrderBySinkNode], filter=(sum(multiply(i32, 2)) > 10)} +custom_sink_label:OrderBySinkNode{inputs=[collected=:FilterNode], by={sort_keys=[FieldRef.Name(sum(multiply(i32, 2))) ASC], null_placement=AtEnd}} )a"); ASSERT_OK_AND_ASSIGN(plan, ExecPlan::Make()); + Declaration union_node{"union", ExecNodeOptions{}}; Declaration lhs{"source", SourceNodeOptions{basic_data.schema, @@ -372,13 +374,13 @@ OrderBySinkNode{"order_by_sink", inputs=[collected: "filter"], by={sort_keys=[su }) .AddToPlan(plan.get())); EXPECT_EQ(plan->ToString(), R"a(ExecPlan with 5 nodes: -SourceNode{"lhs", outputs=["union"]} -SourceNode{"rhs", outputs=["union"]} -UnionNode{"union", inputs=[input_0_label: "lhs", input_1_label: "rhs"], outputs=["aggregate"]} -ScalarAggregateNode{"aggregate", inputs=[target: "union"], outputs=["sink"], aggregates=[ +lhs:SourceNode{outputs=[:UnionNode]} +rhs:SourceNode{outputs=[:UnionNode]} +:UnionNode{inputs=[input_0_label=lhs:SourceNode, input_1_label=rhs:SourceNode], outputs=[:ScalarAggregateNode]} +:ScalarAggregateNode{inputs=[target=:UnionNode], outputs=[:SinkNode], aggregates=[ count(i32, {mode=NON_NULL}), ]} -SinkNode{"sink", inputs=[collected: "aggregate"]} +:SinkNode{inputs=[collected=:ScalarAggregateNode]} )a"); } @@ -546,7 +548,7 @@ TEST(ExecPlanExecution, StressSourceSink) { for (bool parallel : {false, true}) { SCOPED_TRACE(parallel ? "parallel" : "single threaded"); - int num_batches = slow && !parallel ? 30 : 300; + int num_batches = (slow && !parallel) ? 30 : 300; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); AsyncGenerator> sink_gen; @@ -576,7 +578,7 @@ TEST(ExecPlanExecution, StressSourceOrderBy) { for (bool parallel : {false, true}) { SCOPED_TRACE(parallel ? "parallel" : "single threaded"); - int num_batches = slow && !parallel ? 30 : 300; + int num_batches = (slow && !parallel) ? 30 : 300; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); AsyncGenerator> sink_gen; @@ -605,6 +607,42 @@ TEST(ExecPlanExecution, StressSourceOrderBy) { } } +TEST(ExecPlanExecution, StressSourceGroupedSumStop) { + auto input_schema = schema({field("a", int32()), field("b", boolean())}); + for (bool slow : {false, true}) { + SCOPED_TRACE(slow ? "slowed" : "unslowed"); + + for (bool parallel : {false, true}) { + SCOPED_TRACE(parallel ? "parallel" : "single threaded"); + + int num_batches = (slow && !parallel) ? 30 : 300; + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + AsyncGenerator> sink_gen; + + auto random_data = MakeRandomBatches(input_schema, num_batches); + + SortOptions options({SortKey("a", SortOrder::Ascending)}); + ASSERT_OK(Declaration::Sequence( + { + {"source", SourceNodeOptions{random_data.schema, + random_data.gen(parallel, slow)}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, + /*targets=*/{"a"}, /*names=*/{"sum(a)"}, + /*keys=*/{"b"}}}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); + + ASSERT_OK(plan->Validate()); + ASSERT_OK(plan->StartProducing()); + plan->StopProducing(); + ASSERT_FINISHES_OK(plan->finished()); + } + } +} + TEST(ExecPlanExecution, StressSourceSinkStopped) { for (bool slow : {false, true}) { SCOPED_TRACE(slow ? "slowed" : "unslowed"); @@ -612,7 +650,7 @@ TEST(ExecPlanExecution, StressSourceSinkStopped) { for (bool parallel : {false, true}) { SCOPED_TRACE(parallel ? "parallel" : "single threaded"); - int num_batches = slow && !parallel ? 30 : 300; + int num_batches = (slow && !parallel) ? 
30 : 300; ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); AsyncGenerator> sink_gen; @@ -1005,6 +1043,40 @@ TEST(ExecPlanExecution, ScalarSourceScalarAggSink) { })))); } +TEST(ExecPlanExecution, ScalarSourceGroupedSum) { + // ARROW-14630: ensure grouped aggregation with a scalar key/array input doesn't error + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + AsyncGenerator> sink_gen; + + BatchesWithSchema scalar_data; + scalar_data.batches = { + ExecBatchFromJSON({int32(), ValueDescr::Scalar(boolean())}, + "[[5, false], [6, false], [7, false]]"), + ExecBatchFromJSON({int32(), ValueDescr::Scalar(boolean())}, + "[[1, true], [2, true], [3, true]]"), + }; + scalar_data.schema = schema({field("a", int32()), field("b", boolean())}); + + SortOptions options({SortKey("b", SortOrder::Descending)}); + ASSERT_OK(Declaration::Sequence( + { + {"source", SourceNodeOptions{scalar_data.schema, + scalar_data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, + /*targets=*/{"a"}, /*names=*/{"hash_sum(a)"}, + /*keys=*/{"b"}}}, + {"order_by_sink", OrderBySinkNodeOptions{options, &sink_gen}}, + }) + .AddToPlan(plan.get())); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray({ + ExecBatchFromJSON({int64(), boolean()}, R"([[6, true], [18, false]])"), + })))); +} + TEST(ExecPlanExecution, SelfInnerHashJoinSink) { for (bool parallel : {false, true}) { SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); diff --git a/cpp/src/arrow/compute/exec/schema_util.h b/cpp/src/arrow/compute/exec/schema_util.h index ba14d577dc9e8..279cbb806db32 100644 --- a/cpp/src/arrow/compute/exec/schema_util.h +++ b/cpp/src/arrow/compute/exec/schema_util.h @@ -32,6 +32,10 @@ using internal::checked_cast; namespace compute { +// Identifiers for all different row schemas that are used in a join +// +enum class HashJoinProjection : int { INPUT = 0, KEY = 1, PAYLOAD = 2, OUTPUT = 3 }; + struct SchemaProjectionMap { static constexpr int kMissingField = -1; int num_cols; @@ -58,7 +62,7 @@ class SchemaProjectionMaps { const std::vector& projection_handles, const std::vector*>& projections) { ARROW_DCHECK(projection_handles.size() == projections.size()); - RegisterSchema(full_schema_handle, schema); + ARROW_RETURN_NOT_OK(RegisterSchema(full_schema_handle, schema)); for (size_t i = 0; i < projections.size(); ++i) { ARROW_RETURN_NOT_OK( RegisterProjectedSchema(projection_handles[i], *(projections[i]), schema)); @@ -72,11 +76,6 @@ class SchemaProjectionMaps { return static_cast(schemas_[id].second.size()); } - const KeyEncoder::KeyColumnMetadata& column_metadata(ProjectionIdEnum schema_handle, - int field_id) const { - return field(schema_handle, field_id).column_metadata; - } - const std::string& field_name(ProjectionIdEnum schema_handle, int field_id) const { return field(schema_handle, field_id).field_name; } @@ -86,7 +85,7 @@ class SchemaProjectionMaps { return field(schema_handle, field_id).data_type; } - SchemaProjectionMap map(ProjectionIdEnum from, ProjectionIdEnum to) { + SchemaProjectionMap map(ProjectionIdEnum from, ProjectionIdEnum to) const { int id_from = schema_id(from); int id_to = schema_id(to); SchemaProjectionMap result; @@ -101,10 +100,9 @@ class SchemaProjectionMaps { int field_path; std::string field_name; std::shared_ptr data_type; - KeyEncoder::KeyColumnMetadata column_metadata; }; - void RegisterSchema(ProjectionIdEnum handle, const Schema& schema) { + Status RegisterSchema(ProjectionIdEnum 
handle, const Schema& schema) { std::vector out_fields; const FieldVector& in_fields = schema.fields(); out_fields.resize(in_fields.size()); @@ -114,9 +112,9 @@ class SchemaProjectionMaps { out_fields[i].field_path = static_cast(i); out_fields[i].field_name = name; out_fields[i].data_type = type; - out_fields[i].column_metadata = ColumnMetadataFromDataType(type); } schemas_.push_back(std::make_pair(handle, out_fields)); + return Status::OK(); } Status RegisterProjectedSchema(ProjectionIdEnum handle, @@ -133,7 +131,6 @@ class SchemaProjectionMaps { out_fields[i].field_path = match[0]; out_fields[i].field_name = name; out_fields[i].data_type = type; - out_fields[i].column_metadata = ColumnMetadataFromDataType(type); } schemas_.push_back(std::make_pair(handle, out_fields)); return Status::OK(); @@ -149,25 +146,6 @@ class SchemaProjectionMaps { } } - KeyEncoder::KeyColumnMetadata ColumnMetadataFromDataType( - const std::shared_ptr& type) { - if (type->id() == Type::DICTIONARY) { - auto bit_width = checked_cast(*type).bit_width(); - ARROW_DCHECK(bit_width % 8 == 0); - return KeyEncoder::KeyColumnMetadata(true, bit_width / 8); - } else if (type->id() == Type::BOOL) { - return KeyEncoder::KeyColumnMetadata(true, 0); - } else if (is_fixed_width(type->id())) { - return KeyEncoder::KeyColumnMetadata( - true, checked_cast(*type).bit_width() / 8); - } else if (is_binary_like(type->id())) { - return KeyEncoder::KeyColumnMetadata(false, sizeof(uint32_t)); - } else { - ARROW_DCHECK(false); - return KeyEncoder::KeyColumnMetadata(true, 0); - } - } - int schema_id(ProjectionIdEnum schema_handle) const { for (size_t i = 0; i < schemas_.size(); ++i) { if (schemas_[i].first == schema_handle) { diff --git a/cpp/src/arrow/compute/exec/source_node.cc b/cpp/src/arrow/compute/exec/source_node.cc index 127a1b4f9b333..46bba5609d426 100644 --- a/cpp/src/arrow/compute/exec/source_node.cc +++ b/cpp/src/arrow/compute/exec/source_node.cc @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/compute/exec/exec_plan.h" - #include #include "arrow/compute/exec.h" +#include "arrow/compute/exec/exec_plan.h" #include "arrow/compute/exec/expression.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/util.h" @@ -67,7 +66,16 @@ struct SourceNode : ExecNode { [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); } Status StartProducing() override { - DCHECK(!stop_requested_) << "Restarted SourceNode"; + { + // If another exec node encountered an error during its StartProducing call + // it might have already called StopProducing on all of its inputs (including this + // node). 
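+      // In that case stop_requested_ will already be set, and StartProducing
+      // should be a no-op rather than a DCHECK failure.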
+ // + std::unique_lock lock(mutex_); + if (stop_requested_) { + return Status::OK(); + } + } CallbackOptions options; auto executor = plan()->exec_context()->executor(); diff --git a/cpp/src/arrow/compute/exec/test_util.cc b/cpp/src/arrow/compute/exec/test_util.cc index 964c09398bd0a..64f3ec997c934 100644 --- a/cpp/src/arrow/compute/exec/test_util.cc +++ b/cpp/src/arrow/compute/exec/test_util.cc @@ -33,7 +33,10 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/exec.h" #include "arrow/compute/exec/exec_plan.h" +#include "arrow/compute/exec/ir_consumer.h" +#include "arrow/compute/exec/options.h" #include "arrow/compute/exec/util.h" +#include "arrow/compute/function_internal.h" #include "arrow/datum.h" #include "arrow/record_batch.h" #include "arrow/table.h" @@ -44,6 +47,7 @@ #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include "arrow/util/optional.h" +#include "arrow/util/unreachable.h" #include "arrow/util/vector.h" namespace arrow { @@ -235,5 +239,181 @@ void AssertExecBatchesEqual(const std::shared_ptr& schema, AssertTablesEqual(exp_tab, act_tab); } +template +static const T& OptionsAs(const ExecNodeOptions& opts) { + const auto& ptr = checked_cast(opts); + return ptr; +} + +template +static const T& OptionsAs(const Declaration& decl) { + return OptionsAs(*decl.options); +} + +bool operator==(const Declaration& l, const Declaration& r) { + if (l.factory_name != r.factory_name) return false; + if (l.inputs != r.inputs) return false; + if (l.label != r.label) return false; + + if (l.factory_name == "catalog_source") { + auto l_opts = &OptionsAs(l); + auto r_opts = &OptionsAs(r); + + bool schemas_equal = l_opts->schema == nullptr + ? r_opts->schema == nullptr + : l_opts->schema->Equals(r_opts->schema); + + return l_opts->name == r_opts->name && schemas_equal && + l_opts->filter == r_opts->filter && l_opts->projection == r_opts->projection; + } + + if (l.factory_name == "filter") { + return OptionsAs(l).filter_expression == + OptionsAs(r).filter_expression; + } + + if (l.factory_name == "project") { + auto l_opts = &OptionsAs(l); + auto r_opts = &OptionsAs(r); + return l_opts->expressions == r_opts->expressions && l_opts->names == r_opts->names; + } + + if (l.factory_name == "aggregate") { + auto l_opts = &OptionsAs(l); + auto r_opts = &OptionsAs(r); + + if (l_opts->aggregates.size() != r_opts->aggregates.size()) return false; + for (size_t i = 0; i < l_opts->aggregates.size(); ++i) { + auto l_agg = &l_opts->aggregates[i]; + auto r_agg = &r_opts->aggregates[i]; + + if (l_agg->function != r_agg->function) return false; + + if (l_agg->options == r_agg->options) continue; + if (l_agg->options == nullptr || r_agg->options == nullptr) return false; + + if (!l_agg->options->Equals(*r_agg->options)) return false; + } + + return l_opts->targets == r_opts->targets && l_opts->names == r_opts->names && + l_opts->keys == r_opts->keys; + } + + if (l.factory_name == "order_by_sink") { + auto l_opts = &OptionsAs(l); + auto r_opts = &OptionsAs(r); + + return l_opts->generator == r_opts->generator && + l_opts->sort_options == r_opts->sort_options; + } + + Unreachable("equality comparison is not supported for all ExecNodeOptions"); +} + +static inline void PrintToImpl(const std::string& factory_name, + const ExecNodeOptions& opts, std::ostream* os) { + if (factory_name == "catalog_source") { + auto o = &OptionsAs(opts); + *os << o->name << ", schema=" << o->schema->ToString(); + if (o->filter != literal(true)) { + *os << ", filter=" << o->filter.ToString(); + } + if 
(!o->projection.empty()) { + *os << ", projection=["; + for (const auto& ref : o->projection) { + *os << ref.ToString() << ","; + } + *os << "]"; + } + return; + } + + if (factory_name == "filter") { + return PrintTo(OptionsAs(opts).filter_expression, os); + } + + if (factory_name == "project") { + auto o = &OptionsAs(opts); + *os << "expressions={"; + for (const auto& expr : o->expressions) { + PrintTo(expr, os); + *os << ","; + } + *os << "},"; + + if (!o->names.empty()) { + *os << "names={"; + for (const auto& name : o->names) { + *os << name << ","; + } + *os << "}"; + } + return; + } + + if (factory_name == "aggregate") { + auto o = &OptionsAs(opts); + + *os << "aggregates={"; + for (const auto& agg : o->aggregates) { + *os << agg.function << "<"; + if (agg.options) PrintTo(*agg.options, os); + *os << ">,"; + } + *os << "},"; + + *os << "targets={"; + for (const auto& target : o->targets) { + *os << target.ToString() << ","; + } + *os << "},"; + + *os << "names={"; + for (const auto& name : o->names) { + *os << name << ","; + } + *os << "}"; + + if (!o->keys.empty()) { + *os << ",keys={"; + for (const auto& key : o->keys) { + *os << key.ToString() << ","; + } + *os << "}"; + } + return; + } + + if (factory_name == "order_by_sink") { + auto o = &OptionsAs(opts); + if (o->generator) { + *os << "NON_NULL_GENERATOR,"; + } + return PrintTo(o->sort_options, os); + } + + Unreachable("debug printing is not supported for all ExecNodeOptions"); +} + +void PrintTo(const Declaration& decl, std::ostream* os) { + *os << decl.factory_name; + + if (decl.label != decl.factory_name) { + *os << ":" << decl.label; + } + + *os << "<"; + PrintToImpl(decl.factory_name, *decl.options, os); + *os << ">"; + + *os << "{"; + for (const auto& input : decl.inputs) { + if (auto decl = util::get_if(&input)) { + PrintTo(*decl, os); + } + } + *os << "}"; +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/test_util.h b/cpp/src/arrow/compute/exec/test_util.h index 2ee140a5348a3..a05d3b664ee76 100644 --- a/cpp/src/arrow/compute/exec/test_util.h +++ b/cpp/src/arrow/compute/exec/test_util.h @@ -50,8 +50,6 @@ struct BatchesWithSchema { std::shared_ptr schema; AsyncGenerator> gen(bool parallel, bool slow) const { - DCHECK_GT(batches.size(), 0); - auto opt_batches = ::arrow::internal::MapVector( [](ExecBatch batch) { return util::make_optional(std::move(batch)); }, batches); @@ -105,5 +103,11 @@ void AssertExecBatchesEqual(const std::shared_ptr& schema, const std::vector& exp, const std::vector& act); +ARROW_TESTING_EXPORT +bool operator==(const Declaration&, const Declaration&); + +ARROW_TESTING_EXPORT +void PrintTo(const Declaration& decl, std::ostream* os); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index dda5788c54cee..32ffe3bbbdeb1 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -244,19 +244,56 @@ Result Function::Execute(const std::vector& args, return out; } +namespace { + +Status ValidateFunctionSummary(const std::string& s) { + if (s.find('\n') != s.npos) { + return Status::Invalid("summary contains a newline"); + } + if (s.back() == '.') { + return Status::Invalid("summary ends with a point"); + } + return Status::OK(); +} + +Status ValidateFunctionDescription(const std::string& s) { + if (!s.empty() && s.back() == '\n') { + return Status::Invalid("description ends with a newline"); + } + constexpr int kMaxLineSize = 78; + int cur_line_size = 0; + for 
(const auto c : s) {
+    cur_line_size = (c == '\n') ? 0 : cur_line_size + 1;
+    if (cur_line_size > kMaxLineSize) {
+      return Status::Invalid("description line length exceeds ", kMaxLineSize,
+                             " characters");
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 Status Function::Validate() const {
   if (!doc_->summary.empty()) {
     // Documentation given, check its contents
     int arg_count = static_cast<int>(doc_->arg_names.size());
-    if (arg_count == arity_.num_args) {
-      return Status::OK();
+    // Some varargs functions allow 0 vararg, others expect at least 1,
+    // hence the two possible values below.
+    bool arg_count_match = (arg_count == arity_.num_args) ||
+                           (arity_.is_varargs && arg_count == arity_.num_args + 1);
+    if (!arg_count_match) {
+      return Status::Invalid(
+          "In function '", name_,
+          "': ", "number of argument names for function documentation != function arity");
+    }
+    Status st = ValidateFunctionSummary(doc_->summary);
+    if (st.ok()) {
+      st &= ValidateFunctionDescription(doc_->description);
     }
-    if (arity_.is_varargs && arg_count == arity_.num_args + 1) {
-      return Status::OK();
+    if (!st.ok()) {
+      return st.WithMessage("In function '", name_, "': ", st.message());
     }
-    return Status::Invalid(
-        "In function '", name_,
-        "': ", "number of argument names for function documentation != function arity");
   }
   return Status::OK();
 }
diff --git a/cpp/src/arrow/compute/function_internal.h b/cpp/src/arrow/compute/function_internal.h
index 587b9396b5f8d..0395ab3e9fb10 100644
--- a/cpp/src/arrow/compute/function_internal.h
+++ b/cpp/src/arrow/compute/function_internal.h
@@ -265,7 +265,7 @@ template <typename T>
 static inline enable_if_same<T, SortKey, std::shared_ptr<DataType>> GenericTypeSingleton() {
   std::vector<std::shared_ptr<Field>> fields;
-  fields.emplace_back(new Field("name", GenericTypeSingleton<std::string>()));
+  fields.emplace_back(new Field("target", GenericTypeSingleton<FieldRef>()));
   fields.emplace_back(new Field("order", GenericTypeSingleton<SortOrder>()));
   return std::make_shared<StructType>(std::move(fields));
 }
@@ -283,16 +283,20 @@ static inline Result<std::shared_ptr<Scalar>> GenericToScalar(bool value) {
   return MakeScalar(value);
 }
 
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const FieldRef& ref) {
+  return MakeScalar(ref.ToDotPath());
+}
+
 template <typename T, typename Enable = enable_if_t<std::is_enum<T>::value>>
 static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const T value) {
   using CType = typename EnumTraits<T>::CType;
   return GenericToScalar(static_cast<CType>(value));
 }
 
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const SortKey& value) {
-  ARROW_ASSIGN_OR_RAISE(auto name, GenericToScalar(value.name));
-  ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(value.order));
-  return StructScalar::Make({name, order}, {"name", "order"});
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const SortKey& key) {
+  ARROW_ASSIGN_OR_RAISE(auto target, GenericToScalar(key.target));
+  ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(key.order));
+  return StructScalar::Make({target, order}, {"target", "order"});
 }
 
 static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
@@ -398,6 +402,13 @@ static inline enable_if_same_result<T, std::string> GenericFromScalar(
   return holder.value->ToString();
 }
 
+template <typename T>
+static inline enable_if_same_result<T, FieldRef> GenericFromScalar(
+    const std::shared_ptr<Scalar>& value) {
+  ARROW_ASSIGN_OR_RAISE(auto path, GenericFromScalar<std::string>(value));
+  return FieldRef::FromDotPath(path);
+}
+
 template <typename T>
 static inline enable_if_same_result<T, SortKey> GenericFromScalar(
     const std::shared_ptr<Scalar>& value) {
@@ -406,11 +417,11 @@ static inline enable_if_same_result<T, SortKey> GenericFromScalar(
   }
   if (!value->is_valid) return Status::Invalid("Got null scalar");
   const auto& holder = checked_cast<const StructScalar&>(*value);
-  ARROW_ASSIGN_OR_RAISE(auto name_holder, holder.field("name"));
+  ARROW_ASSIGN_OR_RAISE(auto
target_holder, holder.field("target")); ARROW_ASSIGN_OR_RAISE(auto order_holder, holder.field("order")); - ARROW_ASSIGN_OR_RAISE(auto name, GenericFromScalar(name_holder)); + ARROW_ASSIGN_OR_RAISE(auto target, GenericFromScalar(target_holder)); ARROW_ASSIGN_OR_RAISE(auto order, GenericFromScalar(order_holder)); - return SortKey{std::move(name), order}; + return SortKey{std::move(target), order}; } template diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 25697f7d33b11..38575553b3e69 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -422,9 +422,8 @@ void AddMinOrMaxAggKernel(ScalarAggregateFunction* func, auto init = [min_max_func]( KernelContext* ctx, const KernelInitArgs& args) -> Result> { - std::vector inputs = args.inputs; - ARROW_ASSIGN_OR_RAISE(auto kernel, min_max_func->DispatchBest(&inputs)); - KernelInitArgs new_args{kernel, inputs, args.options}; + ARROW_ASSIGN_OR_RAISE(auto kernel, min_max_func->DispatchExact(args.inputs)); + KernelInitArgs new_args{kernel, args.inputs, args.options}; return kernel->init(ctx, new_args); }; @@ -654,6 +653,20 @@ struct IndexImpl : public ScalarAggregator { int64_t index = -1; }; +template <> +struct IndexImpl : public ScalarAggregator { + explicit IndexImpl(IndexOptions, KernelState*) {} + + Status Consume(KernelContext*, const ExecBatch&) override { return Status::OK(); } + + Status MergeFrom(KernelContext*, KernelState&&) override { return Status::OK(); } + + Status Finalize(KernelContext*, Datum* out) override { + out->value = std::make_shared(-1); + return Status::OK(); + } +}; + struct IndexInit { std::unique_ptr state; KernelContext* ctx; @@ -667,6 +680,11 @@ struct IndexInit { return Status::NotImplemented("Index kernel not implemented for ", type.ToString()); } + Status Visit(const NullType&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + Status Visit(const BooleanType&) { state.reset(new IndexImpl(options, ctx->state())); return Status::OK(); @@ -684,6 +702,17 @@ struct IndexInit { return Status::OK(); } + Status Visit(const FixedSizeBinaryType&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + + template + enable_if_decimal Visit(const Type&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + template enable_if_date Visit(const Type&) { state.reset(new IndexImpl(options, ctx->state())); @@ -712,8 +741,14 @@ struct IndexInit { if (!args.options) { return Status::Invalid("Must provide IndexOptions for index kernel"); } - IndexInit visitor(ctx, static_cast(*args.options), - *args.inputs[0].type); + const auto& options = static_cast(*args.options); + if (!options.value) { + return Status::Invalid("Must provide IndexOptions.value for index kernel"); + } else if (!options.value->type->Equals(*args.inputs[0].type)) { + return Status::TypeError("Expected IndexOptions.value to be of type ", + *args.inputs[0].type, ", but got ", *options.value->type); + } + IndexInit visitor(ctx, options, *args.inputs[0].type); return visitor.Create(); } }; @@ -839,27 +874,25 @@ const FunctionDoc min_or_max_doc{ {"array"}, "ScalarAggregateOptions"}; -const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", - ("Null values are ignored by default.\n" - "If null values are taken into account by setting " - "ScalarAggregateOptions parameter skip_nulls = false then " - "Kleene logic is used.\n" 
- "See KleeneOr for more details on Kleene logic."), - {"array"}, - "ScalarAggregateOptions"}; - -const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true", - ("Null values are ignored by default.\n" - "If null values are taken into account by setting " - "ScalarAggregateOptions parameter skip_nulls = false then " - "Kleene logic is used.\n" - "See KleeneAnd for more details on Kleene logic."), - {"array"}, - "ScalarAggregateOptions"}; +const FunctionDoc any_doc{ + "Test whether any element in a boolean array evaluates to true", + ("Null values are ignored by default.\n" + "If the `skip_nulls` option is set to false, then Kleene logic is used.\n" + "See \"kleene_or\" for more details on Kleene logic."), + {"array"}, + "ScalarAggregateOptions"}; + +const FunctionDoc all_doc{ + "Test whether all elements in a boolean array evaluate to true", + ("Null values are ignored by default.\n" + "If the `skip_nulls` option is set to false, then Kleene logic is used.\n" + "See \"kleene_and\" for more details on Kleene logic."), + {"array"}, + "ScalarAggregateOptions"}; const FunctionDoc index_doc{"Find the index of the first occurrence of a given value", - ("The result is always computed as an int64_t, regardless\n" - "of the offset type of the input array."), + ("-1 is returned if the value is not found in the array.\n" + "The search value is specified in IndexOptions."), {"array"}, "IndexOptions"}; @@ -1003,6 +1036,9 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { AddBasicAggKernels(IndexInit::Init, BaseBinaryTypes(), int64(), func.get()); AddBasicAggKernels(IndexInit::Init, PrimitiveTypes(), int64(), func.get()); AddBasicAggKernels(IndexInit::Init, TemporalTypes(), int64(), func.get()); + AddBasicAggKernels(IndexInit::Init, + {fixed_size_binary(1), decimal128(1, 0), decimal256(1, 0), null()}, + int64(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index 39cfeb039a84b..5aef6d5f12ec1 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -322,6 +322,8 @@ static void BenchmarkGroupBy(benchmark::State& state, BenchmarkSetArgsWithSizes(bench, {1 * 1024 * 1024}); \ }) +// Grouped Sum + GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyStringSet, [&] { auto summand = rng.Float64(args.size, /*min=*/0.0, @@ -463,6 +465,39 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntStringPairSet, [&] { BenchmarkGroupBy(state, {{"hash_sum", NULLPTR}}, {summand}, {int_key, str_key}); }); +// Grouped MinMax + +GROUP_BY_BENCHMARK(MinMaxDoublesGroupedByMediumInt, [&] { + auto input = rng.Float64(args.size, + /*min=*/0.0, + /*max=*/1.0e14, + /*null_probability=*/args.null_proportion, + /*nan_probability=*/args.null_proportion / 10); + auto int_key = rng.Int64(args.size, /*min=*/0, /*max=*/63); + + BenchmarkGroupBy(state, {{"hash_min_max", NULLPTR}}, {input}, {int_key}); +}); + +GROUP_BY_BENCHMARK(MinMaxShortStringsGroupedByMediumInt, [&] { + auto input = rng.String(args.size, + /*min_length=*/0, + /*max_length=*/64, + /*null_probability=*/args.null_proportion); + auto int_key = rng.Int64(args.size, /*min=*/0, /*max=*/63); + + BenchmarkGroupBy(state, {{"hash_min_max", NULLPTR}}, {input}, {int_key}); +}); + +GROUP_BY_BENCHMARK(MinMaxLongStringsGroupedByMediumInt, [&] { + auto input = rng.String(args.size, + /*min_length=*/0, + /*max_length=*/512, + 
/*null_probability=*/args.null_proportion); + auto int_key = rng.Int64(args.size, /*min=*/0, /*max=*/63); + + BenchmarkGroupBy(state, {{"hash_min_max", NULLPTR}}, {input}, {int_key}); +}); + // // Sum // diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index 22a54558f4e8a..946ec01900c5b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -21,6 +21,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_run_reader.h" +#include "arrow/util/int128_internal.h" #include "arrow/util/logging.h" namespace arrow { @@ -111,6 +112,26 @@ void AddAggKernel(std::shared_ptr sig, KernelInit init, ScalarAggregateFinalize finalize, ScalarAggregateFunction* func, SimdLevel::type simd_level = SimdLevel::NONE); +using arrow::internal::VisitSetBitRunsVoid; + +template +struct GetSumType; + +template +struct GetSumType> { + using SumType = double; +}; + +template +struct GetSumType> { + using SumType = arrow::internal::int128_t; +}; + +template +struct GetSumType> { + using SumType = typename TypeTraits::CType; +}; + // SumArray must be parameterized with the SIMD level since it's called both from // translation units with and without vectorization. Normally it gets inlined but // if not, without the parameter, we'll have multiple definitions of the same diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index f225f6bf569c3..f35d025f02ed2 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -40,10 +40,13 @@ constexpr char kCountFieldName[] = "count"; constexpr uint64_t kCountEOF = ~0ULL; -template +template ::CType> Result> PrepareOutput(int64_t n, KernelContext* ctx, Datum* out) { - const auto& mode_type = TypeTraits::type_singleton(); + DCHECK_EQ(Type::STRUCT, out->type()->id()); + const auto& out_type = checked_cast(*out->type()); + DCHECK_EQ(2, out_type.num_fields()); + const auto& mode_type = out_type.field(0)->type(); const auto& count_type = int64(); auto mode_data = ArrayData::Make(mode_type, /*length=*/n, /*null_count=*/0); @@ -61,10 +64,7 @@ Result> PrepareOutput(int64_t n, KernelContext* ctx, count_buffer = count_data->template GetMutableValues(1); } - const auto& out_type = - struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)}); - *out = Datum(ArrayData::Make(out_type, n, {nullptr}, {mode_data, count_data}, 0)); - + *out = Datum(ArrayData::Make(out->type(), n, {nullptr}, {mode_data, count_data}, 0)); return std::make_pair(mode_buffer, count_buffer); } @@ -72,7 +72,7 @@ Result> PrepareOutput(int64_t n, KernelContext* ctx, // suboptimal for tiny or large n, possibly okay as we're not in hot path template Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) { - using CType = typename InType::c_type; + using CType = typename TypeTraits::CType; using ValueCountPair = std::pair; auto gt = [](const ValueCountPair& lhs, const ValueCountPair& rhs) { @@ -203,13 +203,25 @@ struct CountModer { } }; -// copy and sort approach for floating points or integers with wide value range +// copy and sort approach for floating points, decimals, or integers with wide +// value range // O(n) space, O(nlogn) time template struct SortModer { - using CType = typename T::c_type; + using CType = typename TypeTraits::CType; using Allocator = arrow::stl::allocator; + template + static 
enable_if_floating_point GetNan() { + return static_cast(NAN); + } + + template + static enable_if_t::value, CType> GetNan() { + DCHECK(false); + return static_cast(0); + } + Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const Datum& datum = batch[0]; const int64_t in_length = datum.length() - datum.null_count(); @@ -246,7 +258,7 @@ struct SortModer { if (ARROW_PREDICT_FALSE(it == in_buffer.cend())) { // handle NAN at last if (nan_count > 0) { - auto value_count = std::make_pair(static_cast(NAN), nan_count); + auto value_count = std::make_pair(GetNan(), nan_count); nan_count = 0; return value_count; } @@ -318,13 +330,18 @@ struct Moder::value && }; template -struct Moder::value>> { +struct Moder> { + SortModer impl; +}; + +template +struct Moder> { SortModer impl; }; template Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) { - using CType = typename T::c_type; + using CType = typename TypeTraits::CType; const ModeOptions& options = ModeState::Get(ctx); if ((!options.skip_nulls && !scalar.is_valid) || @@ -366,40 +383,44 @@ struct ModeExecutor { } }; -VectorKernel NewModeKernel(const std::shared_ptr& in_type) { +Result ModeType(KernelContext*, const std::vector& descrs) { + return ValueDescr::Array( + struct_({field(kModeFieldName, descrs[0].type), field(kCountFieldName, int64())})); +} + +VectorKernel NewModeKernel(const std::shared_ptr& in_type, + ArrayKernelExec exec) { VectorKernel kernel; kernel.init = ModeState::Init; kernel.can_execute_chunkwise = false; kernel.output_chunked = false; - auto out_type = - struct_({field(kModeFieldName, in_type), field(kCountFieldName, int64())}); - kernel.signature = - KernelSignature::Make({InputType(in_type)}, ValueDescr::Array(out_type)); - return kernel; -} - -void AddBooleanModeKernel(VectorFunction* func) { - VectorKernel kernel = NewModeKernel(boolean()); - kernel.exec = ModeExecutor::Exec; - DCHECK_OK(func->AddKernel(kernel)); -} - -void AddNumericModeKernels(VectorFunction* func) { - for (const auto& type : NumericTypes()) { - VectorKernel kernel = NewModeKernel(type); - kernel.exec = GenerateNumeric(*type); - DCHECK_OK(func->AddKernel(kernel)); + switch (in_type->id()) { + case Type::DECIMAL128: + case Type::DECIMAL256: + kernel.signature = + KernelSignature::Make({InputType(in_type->id())}, OutputType(ModeType)); + break; + default: { + auto out_type = + struct_({field(kModeFieldName, in_type), field(kCountFieldName, int64())}); + kernel.signature = KernelSignature::Make({InputType(in_type->id())}, + ValueDescr::Array(std::move(out_type))); + break; + } } + kernel.exec = std::move(exec); + return kernel; } const FunctionDoc mode_doc{ - "Calculate the modal (most common) values of a numeric array", - ("Returns top-n most common values and number of times they occur in an array.\n" - "Result is an array of `struct`, where T is the input type.\n" - "Values with larger counts are returned before smaller counts.\n" - "If there are more than one values with same count, smaller one is returned first.\n" + "Compute the modal (most common) values of a numeric array", + ("Compute the n most common values and their respective occurrence counts.\n" + "The output has type `struct`, where T is the\n" + "input type.\n" + "The results are ordered by descending `count` first, and ascending `mode`\n" + "when breaking ties.\n" "Nulls are ignored. 
If there are no non-null values in the array,\n" - "empty array is returned."), + "an empty array is returned."), {"array"}, "ModeOptions"}; @@ -409,8 +430,17 @@ void RegisterScalarAggregateMode(FunctionRegistry* registry) { static auto default_options = ModeOptions::Defaults(); auto func = std::make_shared("mode", Arity::Unary(), &mode_doc, &default_options); - AddBooleanModeKernel(func.get()); - AddNumericModeKernels(func.get()); + DCHECK_OK(func->AddKernel( + NewModeKernel(boolean(), ModeExecutor::Exec))); + for (const auto& type : NumericTypes()) { + DCHECK_OK(func->AddKernel( + NewModeKernel(type, GenerateNumeric(*type)))); + } + // Type parameters are ignored + DCHECK_OK(func->AddKernel( + NewModeKernel(decimal128(1, 0), ModeExecutor::Exec))); + DCHECK_OK(func->AddKernel( + NewModeKernel(decimal256(1, 0), ModeExecutor::Exec))); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index 62e375e695087..cd2410cc9eb75 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -71,10 +71,21 @@ uint64_t QuantileToDataPoint(size_t length, double q, return datapoint_index; } +template +double DataPointToDouble(T value, const DataType&) { + return static_cast(value); +} +double DataPointToDouble(const Decimal128& value, const DataType& ty) { + return value.ToDouble(checked_cast(ty).scale()); +} +double DataPointToDouble(const Decimal256& value, const DataType& ty) { + return value.ToDouble(checked_cast(ty).scale()); +} + // copy and nth_element approach, large memory footprint template struct SortQuantiler { - using CType = typename InType::c_type; + using CType = typename TypeTraits::CType; using Allocator = arrow::stl::allocator; Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { @@ -106,8 +117,7 @@ struct SortQuantiler { // prepare out array // out type depends on options const bool is_datapoint = IsDataPoint(options); - const std::shared_ptr out_type = - is_datapoint ? TypeTraits::type_singleton() : float64(); + const std::shared_ptr out_type = is_datapoint ? 
datum.type() : float64(); int64_t out_length = options.q.size(); if (in_buffer.empty()) { return MakeArrayOfNull(out_type, out_length, ctx->memory_pool()).Value(out); @@ -142,8 +152,9 @@ struct SortQuantiler { double* out_buffer = out_data->template GetMutableValues(1); for (int64_t i = 0; i < out_length; ++i) { const int64_t q_index = q_indices[i]; - out_buffer[q_index] = GetQuantileByInterp( - in_buffer, &last_index, options.q[q_index], options.interpolation); + out_buffer[q_index] = + GetQuantileByInterp(in_buffer, &last_index, options.q[q_index], + options.interpolation, *datum.type()); } } } @@ -170,8 +181,8 @@ struct SortQuantiler { // return quantile interpolated from adjacent input data points double GetQuantileByInterp(std::vector& in, uint64_t* last_index, - double q, - enum QuantileOptions::Interpolation interpolation) { + double q, enum QuantileOptions::Interpolation interpolation, + const DataType& in_type) { const double index = (in.size() - 1) * q; const uint64_t lower_index = static_cast(index); const double fraction = index - lower_index; @@ -181,7 +192,7 @@ struct SortQuantiler { std::nth_element(in.begin(), in.begin() + lower_index, in.begin() + *last_index); } - const double lower_value = static_cast(in[lower_index]); + const double lower_value = DataPointToDouble(in[lower_index], in_type); if (fraction == 0) { *last_index = lower_index; return lower_value; @@ -197,7 +208,7 @@ struct SortQuantiler { } *last_index = lower_index; - const double higher_value = static_cast(in[higher_index]); + const double higher_value = DataPointToDouble(in[higher_index], in_type); if (interpolation == QuantileOptions::LINEAR) { // more stable than naive linear interpolation @@ -399,10 +410,15 @@ struct ExactQuantiler::value>> { SortQuantiler impl; }; +template +struct ExactQuantiler::value>> { + SortQuantiler impl; +}; + template Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options, const Scalar& scalar, Datum* out) { - using CType = typename T::c_type; + using CType = typename TypeTraits::CType; ArrayData* output = out->mutable_array(); output->length = options.q.size(); auto out_type = IsDataPoint(options) ? 
scalar.type : float64(); @@ -433,7 +449,7 @@ Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options, } else { double* out_buffer = output->template GetMutableValues(1); for (int64_t i = 0; i < output->length; i++) { - out_buffer[i] = static_cast(UnboxScalar::Unbox(scalar)); + out_buffer[i] = DataPointToDouble(UnboxScalar::Unbox(scalar), *scalar.type); } } return Status::OK(); @@ -486,6 +502,18 @@ void AddQuantileKernels(VectorFunction* func) { base.exec = GenerateNumeric(*ty); DCHECK_OK(func->AddKernel(base)); } + { + base.signature = + KernelSignature::Make({InputType(Type::DECIMAL128)}, OutputType(ResolveOutput)); + base.exec = QuantileExecutor::Exec; + DCHECK_OK(func->AddKernel(base)); + } + { + base.signature = + KernelSignature::Make({InputType(Type::DECIMAL256)}, OutputType(ResolveOutput)); + base.exec = QuantileExecutor::Exec; + DCHECK_OK(func->AddKernel(base)); + } } const FunctionDoc quantile_doc{ diff --git a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc index 0fddf38f575c9..7c86267d94006 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc @@ -34,13 +34,25 @@ template struct TDigestImpl : public ScalarAggregator { using ThisType = TDigestImpl; using ArrayType = typename TypeTraits::ArrayType; - using CType = typename ArrowType::c_type; + using CType = typename TypeTraits::CType; - explicit TDigestImpl(const TDigestOptions& options) + TDigestImpl(const TDigestOptions& options, const DataType& in_type) : options{options}, tdigest{options.delta, options.buffer_size}, count{0}, - all_valid{true} {} + decimal_scale{0}, + all_valid{true} { + if (is_decimal_type::value) { + decimal_scale = checked_cast(in_type).scale(); + } + } + + template + double ToDouble(T value) const { + return static_cast(value); + } + double ToDouble(const Decimal128& value) const { return value.ToDouble(decimal_scale); } + double ToDouble(const Decimal256& value) const { return value.ToDouble(decimal_scale); } Status Consume(KernelContext*, const ExecBatch& batch) override { if (!this->all_valid) return Status::OK(); @@ -57,7 +69,7 @@ struct TDigestImpl : public ScalarAggregator { VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) { for (int64_t i = 0; i < len; ++i) { - this->tdigest.NanAdd(values[pos + i]); + this->tdigest.NanAdd(ToDouble(values[pos + i])); } }); } @@ -66,7 +78,7 @@ struct TDigestImpl : public ScalarAggregator { if (batch[0].scalar()->is_valid) { this->count += 1; for (int64_t i = 0; i < batch.length; i++) { - this->tdigest.NanAdd(value); + this->tdigest.NanAdd(ToDouble(value)); } } } @@ -110,6 +122,7 @@ struct TDigestImpl : public ScalarAggregator { const TDigestOptions options; TDigest tdigest; int64_t count; + int32_t decimal_scale; bool all_valid; }; @@ -132,8 +145,14 @@ struct TDigestInitState { } template - enable_if_t::value, Status> Visit(const Type&) { - state.reset(new TDigestImpl(options)); + enable_if_number Visit(const Type&) { + state.reset(new TDigestImpl(options, in_type)); + return Status::OK(); + } + + template + enable_if_decimal Visit(const Type&) { + state.reset(new TDigestImpl(options, in_type)); return Status::OK(); } @@ -154,7 +173,7 @@ void AddTDigestKernels(KernelInit init, const std::vector>& types, ScalarAggregateFunction* func) { for (const auto& ty : types) { - auto sig = KernelSignature::Make({InputType(ty)}, float64()); + auto sig = KernelSignature::Make({InputType(ty->id())}, 
float64());
     AddAggKernel(std::move(sig), init, func);
   }
 }
@@ -179,6 +198,7 @@ std::shared_ptr<ScalarAggregateFunction> AddTDigestAggKernels() {
   auto func = std::make_shared<ScalarAggregateFunction>(
       "tdigest", Arity::Unary(), &tdigest_doc, &default_tdigest_options);
   AddTDigestKernels(TDigestInit, NumericTypes(), func.get());
+  AddTDigestKernels(TDigestInit, {decimal128(1, 1), decimal256(1, 1)}, func.get());
   return func;
 }
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index 992f73698648d..c7cdf3fd91e54 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -1384,8 +1384,6 @@ template <typename ArrowType>
 class TestFloatingMinMaxKernel : public TestPrimitiveMinMaxKernel<ArrowType> {};
 
 class TestBooleanMinMaxKernel : public TestPrimitiveMinMaxKernel<BooleanType> {};
-class TestDayTimeIntervalMinMaxKernel
-    : public TestPrimitiveMinMaxKernel<DayTimeIntervalType> {};
 
 TEST_F(TestBooleanMinMaxKernel, Basics) {
   ScalarAggregateOptions options;
@@ -1647,7 +1645,7 @@ TEST(TestNullMinMaxKernel, Basics) {
 template <typename ArrowType>
 class TestBaseBinaryMinMaxKernel : public ::testing::Test {};
-TYPED_TEST_SUITE(TestBaseBinaryMinMaxKernel, BinaryArrowTypes);
+TYPED_TEST_SUITE(TestBaseBinaryMinMaxKernel, BaseBinaryArrowTypes);
 TYPED_TEST(TestBaseBinaryMinMaxKernel, Basics) {
   std::vector<std::string> chunked_input1 = {R"(["cc", "", "aa", "b", "c"])",
                                              R"(["d", "", null, "b", "c"])"};
@@ -2109,24 +2107,23 @@ TEST_F(TestAllKernel, Basics) {
 // Index
 //
 
+void CheckIndex(Datum array, const std::shared_ptr<Scalar>& value, int64_t expected) {
+  IndexOptions options(value);
+  ASSERT_OK_AND_ASSIGN(Datum out, Index(array, options));
+  const Int64Scalar& out_index = out.scalar_as<Int64Scalar>();
+  ASSERT_EQ(out_index.value, expected);
+}
+
 template <typename ArrowType>
 class TestIndexKernel : public ::testing::Test {
  public:
   using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
-  void AssertIndexIs(const Datum& array, const std::shared_ptr<ScalarType>& value,
-                     int64_t expected) {
-    IndexOptions options(value);
-    ASSERT_OK_AND_ASSIGN(Datum out, Index(array, options));
-    const Int64Scalar& out_index = out.scalar_as<Int64Scalar>();
-    ASSERT_EQ(out_index.value, expected);
-  }
-
   void AssertIndexIs(const std::string& json, const std::shared_ptr<ScalarType>& value,
                      int64_t expected) {
     SCOPED_TRACE("Value: " + value->ToString());
     SCOPED_TRACE("Input: " + json);
     auto array = ArrayFromJSON(type_singleton(), json);
-    AssertIndexIs(array, value, expected);
+    CheckIndex(array, value, expected);
   }
 
   void AssertIndexIs(const std::vector<std::string>& json,
@@ -2134,7 +2131,7 @@ class TestIndexKernel : public ::testing::Test {
     SCOPED_TRACE("Value: " + value->ToString());
     auto array = ChunkedArrayFromJSON(type_singleton(), json);
     SCOPED_TRACE("Input: " + array->ToString());
-    AssertIndexIs(array, value, expected);
+    CheckIndex(array, value, expected);
   }
 
   std::shared_ptr<DataType> type_singleton() { return std::make_shared<ArrowType>(); }
@@ -2208,7 +2205,7 @@ TYPED_TEST(TestNumericIndexKernel, Random) {
       if (expected >= 0) break;
     }
 
-    this->AssertIndexIs(Datum(chunked_array), value, expected);
+    CheckIndex(Datum(chunked_array), value, expected);
   }
 }
 
@@ -2249,7 +2246,7 @@ TYPED_TEST(TestBooleanIndexKernel, Basics) {
 template <typename ArrowType>
 class TestStringIndexKernel : public TestIndexKernel<ArrowType> {};
-TYPED_TEST_SUITE(TestStringIndexKernel, BinaryArrowTypes);
+TYPED_TEST_SUITE(TestStringIndexKernel, BaseBinaryArrowTypes);
 TYPED_TEST(TestStringIndexKernel, Basics) {
   auto buffer = Buffer::FromString("foo");
   auto value = std::make_shared<typename TestFixture::ScalarType>(buffer);
   auto null_value = std::make_shared<typename TestFixture::ScalarType>(buffer);
   null_value->is_valid = false;
@@ -2265,10 +2262,96 @@ TYPED_TEST(TestStringIndexKernel, Basics) {
   this->AssertIndexIs(R"(["foo", null, null])", null_value, -1);
 }
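[Editor's note: a minimal usage sketch, not part of the patch. It exercises the new CheckIndex helper and the "first match wins / -1 when absent" semantics documented in index_doc above; the test name and data are illustrative only.]

TEST(TestIndexKernel, ExampleUsage) {
  // "6" first occurs at position 1; a value that never occurs yields -1.
  auto haystack = ArrayFromJSON(int32(), "[5, 6, 7, 6]");
  CheckIndex(haystack, ScalarFromJSON(int32(), "6"), /*expected=*/1);
  CheckIndex(haystack, ScalarFromJSON(int32(), "9"), /*expected=*/-1);
}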
+TEST(TestIndexKernel, FixedSizeBinary) { + auto ty = fixed_size_binary(3); + auto buffer = Buffer::FromString("foo"); + auto value = std::make_shared(buffer, ty); + auto null_value = std::make_shared(buffer, ty); + null_value->is_valid = false; + + CheckIndex(ArrayFromJSON(ty, R"([])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"(["foo"])"), value, 0); + CheckIndex(ArrayFromJSON(ty, R"(["bar", "bar", "bar", "bar"])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"(["bar", "bar", "bar", "bar", "foo"])"), value, 4); + CheckIndex(ArrayFromJSON(ty, R"([null, null, null])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"([null, null, null])"), null_value, -1); + CheckIndex(ArrayFromJSON(ty, R"(["foo", null, null])"), null_value, -1); +} + +TEST(TestIndexKernel, Decimal) { + for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { + std::shared_ptr value, null_value; + if (ty->id() == Type::DECIMAL128) { + value = std::make_shared(Decimal128(123), ty); + null_value = std::make_shared(ty); + } else { + value = std::make_shared(Decimal256(123), ty); + null_value = std::make_shared(ty); + } + + CheckIndex(ArrayFromJSON(ty, R"([])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"(["1.23"])"), value, 0); + CheckIndex(ArrayFromJSON(ty, R"(["9.99", "9.99", "9.99", "9.99"])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"(["9.99", "9.99", "9.99", "9.99", "1.23"])"), value, + 4); + CheckIndex(ArrayFromJSON(ty, R"([null, null, null])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"([null, null, null])"), null_value, -1); + CheckIndex(ArrayFromJSON(ty, R"(["1.23", null, null])"), null_value, -1); + } +} + +TEST(TestIndexKernel, Null) { + auto ty = null(); + auto value = std::make_shared(); + + CheckIndex(ArrayFromJSON(ty, R"([])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"([null])"), value, -1); + CheckIndex(ArrayFromJSON(ty, R"([null, null, null, null])"), value, -1); +} + +TEST(TestIndexKernel, Errors) { + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr( + "Expected IndexOptions.value to be of type string, but got int32"), + Index(ArrayFromJSON(utf8(), R"(["a"])"), + IndexOptions(ScalarFromJSON(int32(), "1")))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("Expected IndexOptions.value to be of type timestamp[ns], " + "but got timestamp[ms]"), + Index(ArrayFromJSON(timestamp(TimeUnit::NANO), R"(["2020-01-01"])"), + IndexOptions(ScalarFromJSON(timestamp(TimeUnit::MILLI), R"("2020-01-01")")))); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Must provide IndexOptions.value"), + Index(ArrayFromJSON(utf8(), R"(["a"])"), IndexOptions(nullptr))); +} + // // Mode // +template +void CheckModes(const Datum& array, const ModeOptions options, + const std::vector& expected_modes, + const std::vector& expected_counts) { + ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, options)); + ValidateOutput(out); + const StructArray out_array(out.array()); + ASSERT_EQ(out_array.length(), expected_modes.size()); + ASSERT_EQ(out_array.num_fields(), 2); + + const CType* out_modes = out_array.field(0)->data()->GetValues(1); + const int64_t* out_counts = out_array.field(1)->data()->GetValues(1); + for (int i = 0; i < out_array.length(); ++i) { + // equal or nan equal + ASSERT_TRUE((expected_modes[i] == out_modes[i]) || + (expected_modes[i] != expected_modes[i] && out_modes[i] != out_modes[i])); + ASSERT_EQ(expected_counts[i], out_counts[i]); + } +} + template class TestPrimitiveModeKernel : public ::testing::Test { public: @@ -2279,21 +2362,7 @@ class 
TestPrimitiveModeKernel : public ::testing::Test { void AssertModesAre(const Datum& array, const ModeOptions options, const std::vector& expected_modes, const std::vector& expected_counts) { - ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, options)); - ValidateOutput(out); - const StructArray out_array(out.array()); - ASSERT_EQ(out_array.length(), expected_modes.size()); - ASSERT_EQ(out_array.num_fields(), 2); - - const CType* out_modes = out_array.field(0)->data()->GetValues(1); - const int64_t* out_counts = out_array.field(1)->data()->GetValues(1); - for (int i = 0; i < out_array.length(); ++i) { - // equal or nan equal - ASSERT_TRUE( - (expected_modes[i] == out_modes[i]) || - (expected_modes[i] != expected_modes[i] && out_modes[i] != out_modes[i])); - ASSERT_EQ(expected_counts[i], out_counts[i]); - } + CheckModes(array, options, expected_modes, expected_counts); } void AssertModesAre(const std::string& json, const int n, @@ -2522,6 +2591,89 @@ TYPED_TEST(TestFloatingModeKernel, Floats) { this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), ModeOptions(/*n=*/1)); } +template +class TestDecimalModeKernel : public ::testing::Test { + public: + using CType = typename TypeTraits::CType; + + void AssertModesAre(const Datum& array, const ModeOptions options, + const std::vector& expected_modes, + const std::vector& expected_counts) { + CheckModes(array, options, values(expected_modes), expected_counts); + } + + CType value(const std::string& s) const { + EXPECT_OK_AND_ASSIGN(auto out, CType::FromString(s)); + return out; + } + + std::vector values(const std::vector& strings) const { + std::vector values; + for (const auto& s : strings) { + values.push_back(value(s)); + } + return values; + } + + std::shared_ptr type_instance() { return std::make_shared(4, 2); } +}; + +TYPED_TEST_SUITE(TestDecimalModeKernel, DecimalArrowTypes); + +TYPED_TEST(TestDecimalModeKernel, Decimals) { + auto ty = this->type_instance(); + this->AssertModesAre(ArrayFromJSON(ty, R"(["5.01", "-1.42", "-1.42", "5.01", "5.01"])"), + ModeOptions(1), {"5.01"}, {3}); + this->AssertModesAre( + ArrayFromJSON(ty, R"(["5.01", "-1.42", "-1.42", "5.01", "5.01", "-1.42"])"), + ModeOptions(1), {"-1.42"}, {3}); + this->AssertModesAre( + ArrayFromJSON(ty, R"(["5.01", "-1.42", "-1.42", "5.01", "5.01", "-1.42"])"), + ModeOptions(2), {"-1.42", "5.01"}, {3, 3}); + + this->AssertModesAre(ArrayFromJSON(ty, "[]"), ModeOptions(1), {}, {}); + + this->AssertModesAre(ArrayFromJSON(ty, R"(["1.86", "-2.00", "-2.00", null])"), + ModeOptions(/*n=*/1), {"-2.00"}, {2}); + this->AssertModesAre(ArrayFromJSON(ty, R"(["1.86", "-2.00", "-2.00", null])"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false), {}, {}); + this->AssertModesAre(ArrayFromJSON(ty, R"(["1.86", "-2.00", "-2.00", null])"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3), + {"-2.00"}, {2}); + this->AssertModesAre(ArrayFromJSON(ty, R"(["-2.00", "-2.00", null])"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3), {}, + {}); + this->AssertModesAre(ArrayFromJSON(ty, R"(["1.86", "-2.00", "-2.00"])"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3), + {"-2.00"}, {2}); + this->AssertModesAre(ArrayFromJSON(ty, R"(["1.86", "-2.00", "-2.00", null])"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3), {}, + {}); + this->AssertModesAre(ArrayFromJSON(ty, R"(["1.86", "-2.00"])"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3), {}, + {}); + + this->AssertModesAre(ScalarFromJSON(ty, R"("0.00")"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false), 
{"0.00"}, {1}); + this->AssertModesAre(ScalarFromJSON(ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false), {}, {}); + this->AssertModesAre(ScalarFromJSON(ty, R"("0.00")"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2), {}, + {}); + this->AssertModesAre(ScalarFromJSON(ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2), {}, + {}); + this->AssertModesAre(ScalarFromJSON(ty, R"("0.00")"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2), {}, + {}); + this->AssertModesAre(ScalarFromJSON(ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2), {}, + {}); + this->AssertModesAre(ScalarFromJSON(ty, R"("5.00")"), ModeOptions(/*n=*/1), {"5.00"}, + {1}); + this->AssertModesAre(ScalarFromJSON(ty, "null"), ModeOptions(/*n=*/1), {}, {}); +} + TEST_F(TestInt8ModeKernelValueRange, Basics) { this->AssertModeIs("[0, 127, -128, -128]", -128, 2); this->AssertModeIs("[127, 127, 127]", 127, 3); @@ -2624,6 +2776,24 @@ TEST_F(TestInt32ModeKernel, Sliced) { // Variance/Stddev // +void CheckVarStd(const Datum& array, const VarianceOptions& options, + double expected_var) { + ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); + ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); + auto var = checked_cast(out_var.scalar().get()); + auto std = checked_cast(out_std.scalar().get()); + ASSERT_TRUE(var->is_valid && std->is_valid); + // Near zero these macros don't work as well + // (and MinGW can give results slightly off from zero) + if (std::abs(expected_var) < 1e-20) { + ASSERT_NEAR(std->value * std->value, var->value, 1e-20); + ASSERT_NEAR(var->value, expected_var, 1e-20); + } else { + ASSERT_DOUBLE_EQ(std->value * std->value, var->value); + ASSERT_DOUBLE_EQ(var->value, expected_var); // < 4ULP + } +} + template class TestPrimitiveVarStdKernel : public ::testing::Test { public: @@ -2632,12 +2802,12 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { void AssertVarStdIs(const Array& array, const VarianceOptions& options, double expected_var) { - AssertVarStdIsInternal(array, options, expected_var); + CheckVarStd(array, options, expected_var); } void AssertVarStdIs(const std::shared_ptr& array, const VarianceOptions& options, double expected_var) { - AssertVarStdIsInternal(array, options, expected_var); + CheckVarStd(array, options, expected_var); } void AssertVarStdIs(const std::string& json, const VarianceOptions& options, @@ -2675,17 +2845,6 @@ class TestPrimitiveVarStdKernel : public ::testing::Test { std::shared_ptr type_singleton() { return Traits::type_singleton(); } private: - void AssertVarStdIsInternal(const Datum& array, const VarianceOptions& options, - double expected_var) { - ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); - ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); - auto var = checked_cast(out_var.scalar().get()); - auto std = checked_cast(out_std.scalar().get()); - ASSERT_TRUE(var->is_valid && std->is_valid); - ASSERT_DOUBLE_EQ(std->value * std->value, var->value); - ASSERT_DOUBLE_EQ(var->value, expected_var); // < 4ULP - } - void AssertVarStdIsInvalidInternal(const Datum& array, const VarianceOptions& options) { ASSERT_OK_AND_ASSIGN(Datum out_var, Variance(array, options)); ASSERT_OK_AND_ASSIGN(Datum out_std, Stddev(array, options)); @@ -2935,6 +3094,18 @@ TEST_F(TestVarStdKernelIntegerLength, Basics) { } #endif +TEST(TestVarStdKernel, Decimal) { + // Effectively treated as double, sanity check results here + for (const auto& ty : {decimal128(3, 2), decimal256(3, 
2)}) { + CheckVarStd(ArrayFromJSON(ty, R"(["1.00"])"), VarianceOptions(), 0); + CheckVarStd(ArrayFromJSON(ty, R"([null, "1.00", "2.00", "3.00"])"), VarianceOptions(), + 0.6666666666666666); + CheckVarStd(ScalarFromJSON(ty, R"("1.00")"), VarianceOptions(), 0); + CheckVarStd(ArrayFromJSON(ty, R"([null, "1.00", "2.00"])"), + VarianceOptions(/*ddof=*/1), 0.5); + } +} + // // Quantile // @@ -3476,6 +3647,24 @@ TEST(TestQuantileKernel, AllNullsOrNaNs) { } } +TEST(TestQuantileKernel, Decimal) { + auto check = [](const std::shared_ptr& input, QuantileOptions options, + const std::shared_ptr& expected) { + ASSERT_OK_AND_ASSIGN(Datum out, Quantile(input, options)); + auto out_array = out.make_array(); + ValidateOutput(*out_array); + AssertArraysEqual(*expected, *out_array, /*verbose=*/true); + }; + for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { + check(ArrayFromJSON(ty, R"(["1.00", "5.00", null])"), + QuantileOptions(0.5, QuantileOptions::LINEAR), + ArrayFromJSON(float64(), R"([3.00])")); + check(ArrayFromJSON(ty, R"(["1.00", "2.00", "5.00"])"), + QuantileOptions(0.5, QuantileOptions::NEAREST), + ArrayFromJSON(ty, R"(["2.00"])")); + } +} + TEST(TestQuantileKernel, Scalar) { for (const auto& ty : {float64(), int64(), uint64()}) { QuantileOptions options(std::vector{0.0, 0.5, 1.0}); @@ -3543,6 +3732,17 @@ TEST(TestTDigestKernel, AllNullsOrNaNs) { } } +TEST(TestTDigestKernel, Decimal) { + for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { + ASSERT_OK_AND_ASSIGN(auto decimal_array, + TDigest(ArrayFromJSON(ty, R"(["1.00", "2.00", "3.25"])"))); + ASSERT_OK_AND_ASSIGN(auto float_array, + TDigest(ArrayFromJSON(float64(), "[1, 2, 3.25]"))); + AssertArraysApproxEqual(*float_array.make_array(), *decimal_array.make_array(), + /*verbose=*/true); + } +} + TEST(TestTDigestKernel, Scalar) { for (const auto& ty : {float64(), int64(), uint64()}) { TDigestOptions options(std::vector{0.0, 0.5, 1.0}); diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc index d0d3c514fae2e..feb98718aee3c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -36,12 +36,21 @@ using arrow::internal::VisitSetBitRunsVoid; template struct VarStdState { using ArrayType = typename TypeTraits::ArrayType; - using CType = typename ArrowType::c_type; + using CType = typename TypeTraits::CType; using ThisType = VarStdState; - explicit VarStdState(VarianceOptions options) : options(options) {} + explicit VarStdState(int32_t decimal_scale, VarianceOptions options) + : decimal_scale(decimal_scale), options(options) {} - // float/double/int64: calculate `m2` (sum((X-mean)^2)) with `two pass algorithm` + template + double ToDouble(T value) const { + return static_cast(value); + } + double ToDouble(const Decimal128& value) const { return value.ToDouble(decimal_scale); } + double ToDouble(const Decimal256& value) const { return value.ToDouble(decimal_scale); } + + // float/double/int64/decimal: calculate `m2` (sum((X-mean)^2)) with `two pass + // algorithm` // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm template enable_if_t::value || (sizeof(CType) > 4)> Consume( @@ -52,14 +61,13 @@ struct VarStdState { return; } - using SumType = - typename std::conditional::value, double, int128_t>::type; - SumType sum = SumArray(*array.data()); + using SumType = typename internal::GetSumType::SumType; + SumType sum = internal::SumArray(*array.data()); - const double mean = 
static_cast(sum) / count; - const double m2 = - SumArray(*array.data(), [mean](CType value) { - const double v = static_cast(value); + const double mean = ToDouble(sum) / count; + const double m2 = internal::SumArray( + *array.data(), [this, mean](CType value) { + const double v = ToDouble(value); return (v - mean) * (v - mean); }); @@ -102,7 +110,7 @@ struct VarStdState { }); // merge variance - ThisType state(options); + ThisType state(decimal_scale, options); state.count = var_std.count; state.mean = var_std.mean(); state.m2 = var_std.m2(); @@ -116,7 +124,7 @@ struct VarStdState { this->m2 = 0; if (scalar.is_valid) { this->count = count; - this->mean = static_cast(UnboxScalar::Unbox(scalar)); + this->mean = ToDouble(UnboxScalar::Unbox(scalar)); } else { this->count = 0; this->mean = 0; @@ -141,6 +149,7 @@ struct VarStdState { &this->mean, &this->m2); } + const int32_t decimal_scale; const VarianceOptions options; int64_t count = 0; double mean = 0; @@ -153,9 +162,9 @@ struct VarStdImpl : public ScalarAggregator { using ThisType = VarStdImpl; using ArrayType = typename TypeTraits::ArrayType; - explicit VarStdImpl(const std::shared_ptr& out_type, + explicit VarStdImpl(int32_t decimal_scale, const std::shared_ptr& out_type, const VarianceOptions& options, VarOrStd return_type) - : out_type(out_type), state(options), return_type(return_type) {} + : out_type(out_type), state(decimal_scale, options), return_type(return_type) {} Status Consume(KernelContext*, const ExecBatch& batch) override { if (batch[0].is_array()) { @@ -216,8 +225,16 @@ struct VarStdInitState { } template - enable_if_t::value, Status> Visit(const Type&) { - state.reset(new VarStdImpl(out_type, options, return_type)); + enable_if_number Visit(const Type&) { + state.reset( + new VarStdImpl(/*decimal_scale=*/0, out_type, options, return_type)); + return Status::OK(); + } + + template + enable_if_decimal Visit(const Type&) { + state.reset(new VarStdImpl(checked_cast(in_type).scale(), + out_type, options, return_type)); return Status::OK(); } @@ -247,7 +264,7 @@ void AddVarStdKernels(KernelInit init, const std::vector>& types, ScalarAggregateFunction* func) { for (const auto& ty : types) { - auto sig = KernelSignature::Make({InputType(ty)}, float64()); + auto sig = KernelSignature::Make({InputType(ty->id())}, float64()); AddAggKernel(std::move(sig), init, func); } } @@ -275,6 +292,7 @@ std::shared_ptr AddStddevAggKernels() { auto func = std::make_shared( "stddev", Arity::Unary(), &stddev_doc, &default_std_options); AddVarStdKernels(StddevInit, NumericTypes(), func.get()); + AddVarStdKernels(StddevInit, {decimal128(1, 1), decimal256(1, 1)}, func.get()); return func; } @@ -283,6 +301,7 @@ std::shared_ptr AddVarianceAggKernels() { auto func = std::make_shared( "variance", Arity::Unary(), &variance_doc, &default_var_options); AddVarStdKernels(VarianceInit, NumericTypes(), func.get()); + AddVarStdKernels(VarianceInit, {decimal128(1, 1), decimal256(1, 1)}, func.get()); return func; } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 438362585b5ed..a4914edbc8be7 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -188,6 +188,38 @@ struct GetOutputType { using T = Decimal256; }; +// ---------------------------------------------------------------------- +// enable_if helpers for C types + +template +using is_unsigned_integer_value = + std::integral_constant::value && std::is_unsigned::value>; + 
+template <typename T>
+using is_signed_integer_value =
+    std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
+
+template <typename T, typename R = T>
+using enable_if_signed_integer_value = enable_if_t<is_signed_integer_value<T>::value, R>;
+
+template <typename T, typename R = T>
+using enable_if_unsigned_integer_value =
+    enable_if_t<is_unsigned_integer_value<T>::value, R>;
+
+template <typename T, typename R = T>
+using enable_if_integer_value =
+    enable_if_t<is_signed_integer_value<T>::value || is_unsigned_integer_value<T>::value,
+                R>;
+
+template <typename T, typename R = T>
+using enable_if_floating_value = enable_if_t<std::is_floating_point<T>::value, R>;
+
+template <typename T, typename R = T>
+using enable_if_decimal_value =
+    enable_if_t<std::is_same<Decimal128, T>::value || std::is_same<Decimal256, T>::value,
+                R>;
+
 // ----------------------------------------------------------------------
 // Iteration / value access utilities
 
@@ -1193,13 +1225,15 @@ ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
     case Type::DURATION:
     case Type::INTERVAL_DAY_TIME:
       return Generator::Exec;
+    case Type::INTERVAL_MONTH_DAY_NANO:
+      return Generator::Exec;
     default:
       DCHECK(false);
       return ExecFail;
   }
 }
 
-// similar to GenerateTypeAgnosticPrimitive, but for variable types
+// similar to GenerateTypeAgnosticPrimitive, but for base variable binary types
 template
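[Editor's note: a minimal sketch, not part of the patch, of how the value-level enable_if helpers added above are used: overload resolution dispatches on properties of the physical C type rather than the Arrow type wrapper. The AbsoluteValue name is illustrative only.]

#include <cmath>

template <typename CType>
enable_if_unsigned_integer_value<CType> AbsoluteValue(CType v) {
  return v;  // unsigned values are already non-negative
}

template <typename CType>
enable_if_signed_integer_value<CType> AbsoluteValue(CType v) {
  // Ignoring the INT_MIN overflow corner case for brevity.
  return v < 0 ? static_cast<CType>(-v) : v;
}

template <typename CType>
enable_if_floating_value<CType> AbsoluteValue(CType v) {
  return std::fabs(v);  // NaN-preserving absolute value
}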