From c2ca9bcedeb004f9d7f5d3e1aafc7b83ce6c1e3f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 29 Jan 2024 15:39:10 -0500 Subject: [PATCH 01/74] GH-39837: [Go][Flight] Allow cloning existing cookies in middleware (#39838) ### Rationale for this change This is needed for https://github.com/apache/arrow-adbc/issues/1194 to facilitate better connection handling for flight clients in ADBC by copying the existing cookies over when creating a sub-client. ### What changes are included in this PR? Creating a `Clone` method on the `CookieMiddleware` so that a user can create and hold a reference to a specific cookie middleware instance and then create new ones on the fly that copy over the existing cookies at that moment. ### Are these changes tested? Yes. ### Are there any user-facing changes? No * Closes: #39837 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/flight/cookie_middleware.go | 24 +++++++++ go/arrow/flight/cookie_middleware_test.go | 60 +++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/go/arrow/flight/cookie_middleware.go b/go/arrow/flight/cookie_middleware.go index 27754a13b829a..39c86d8303434 100644 --- a/go/arrow/flight/cookie_middleware.go +++ b/go/arrow/flight/cookie_middleware.go @@ -23,6 +23,7 @@ import ( "sync" "time" + "golang.org/x/exp/maps" "google.golang.org/grpc/metadata" ) @@ -40,11 +41,34 @@ func NewClientCookieMiddleware() ClientMiddleware { return CreateClientMiddleware(&clientCookieMiddleware{jar: make(map[string]http.Cookie)}) } +func NewCookieMiddleware() CookieMiddleware { + return &clientCookieMiddleware{jar: make(map[string]http.Cookie)} +} + +// CookieMiddleware is a go-routine safe middleware for flight clients +// which properly handles Set-Cookie headers for storing cookies. +// This can be passed into `CreateClientMiddleware` to create a new +// middleware object. You can also clone it to create middleware for a +// new client which starts with the same cookies. +type CookieMiddleware interface { + CustomClientMiddleware + // Clone creates a new CookieMiddleware that starts out with the same + // cookies that this one already has. This is useful when creating a + // new client connection for the same server. 
+ Clone() CookieMiddleware +} + type clientCookieMiddleware struct { jar map[string]http.Cookie mx sync.Mutex } +func (cc *clientCookieMiddleware) Clone() CookieMiddleware { + cc.mx.Lock() + defer cc.mx.Unlock() + return &clientCookieMiddleware{jar: maps.Clone(cc.jar)} +} + func (cc *clientCookieMiddleware) StartCall(ctx context.Context) context.Context { cc.mx.Lock() defer cc.mx.Unlock() diff --git a/go/arrow/flight/cookie_middleware_test.go b/go/arrow/flight/cookie_middleware_test.go index 0adf4927652d4..4007d056b2c99 100644 --- a/go/arrow/flight/cookie_middleware_test.go +++ b/go/arrow/flight/cookie_middleware_test.go @@ -239,3 +239,63 @@ func TestCookieExpiration(t *testing.T) { cookieMiddleware.expectedCookies = map[string]string{} makeReq(client, t) } + +func TestCookiesClone(t *testing.T) { + cookieMiddleware := &serverAddCookieMiddleware{} + + s := flight.NewServerWithMiddleware([]flight.ServerMiddleware{ + flight.CreateServerMiddleware(cookieMiddleware), + }) + s.Init("localhost:0") + f := &flightServer{} + s.RegisterFlightService(f) + + go s.Serve() + defer s.Shutdown() + + makeReq := func(c flight.Client, t *testing.T) { + flightStream, err := c.ListFlights(context.Background(), &flight.Criteria{}) + assert.NoError(t, err) + + for { + _, err := flightStream.Recv() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + assert.NoError(t, err) + } + } + } + + credsOpt := grpc.WithTransportCredentials(insecure.NewCredentials()) + cookies := flight.NewCookieMiddleware() + client1, err := flight.NewClientWithMiddleware(s.Addr().String(), nil, + []flight.ClientMiddleware{flight.CreateClientMiddleware(cookies)}, credsOpt) + require.NoError(t, err) + defer client1.Close() + + // set cookies + cookieMiddleware.cookies = []*http.Cookie{ + {Name: "foo", Value: "bar"}, + {Name: "foo2", Value: "bar2", MaxAge: 1}, + } + makeReq(client1, t) + + // validate set + cookieMiddleware.expectedCookies = map[string]string{ + "foo": "bar", "foo2": "bar2", + } + makeReq(client1, t) + + client2, err := flight.NewClientWithMiddleware(s.Addr().String(), nil, + []flight.ClientMiddleware{flight.CreateClientMiddleware(cookies.Clone())}, credsOpt) + require.NoError(t, err) + defer client2.Close() + + // validate clone worked + cookieMiddleware.expectedCookies = map[string]string{ + "foo": "bar", "foo2": "bar2", + } + makeReq(client2, t) +} From fc3278ffb78e6f4f79cd20160bf911efa5a09ba1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jan 2024 06:01:22 +0900 Subject: [PATCH 02/74] MINOR: [Java] Bump org.immutables:value from 2.8.2 to 2.10.0 in /java (#39831) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.immutables:value](https://github.com/immutables/immutables) from 2.8.2 to 2.10.0.
Release notes

Sourced from org.immutables:value's releases.

2.10.0

JakartaEE support

Style flag jakarta = true. Mainly a package change for annotations and types like Validator

Miscellaneous

  • JDK9 unmodifiable collections for List, Set, Map, style flag jdk9Collections = true
  • Suppress from method, style flag from = ""
  • Non-strict modifiables allows reading unset attributes, style flag strictModifiables = false
  • Fixes in nested type_use annotations.
  • Performance: better initial capacity for collections
  • Refinements and fixes to Criteria modules
  • Plus many other refinements and maintenance; see below

Workarounds for Gradle

  • imports for not-yet-generated types: add options.sourcepath
  • disable incremental compilation (options.incremental); may also help in some complex cases

What's Changed (since last year's release)

New Contributors

Full Changelog: https://github.com/immutables/immutables/compare/2.9.3...2.10.0

2.9.3

Maintenance release

What's Changed

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.immutables:value&package-manager=maven&previous-version=2.8.2&new-version=2.10.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
  • `@ dependabot rebase` will rebase this PR
  • `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
  • `@ dependabot merge` will merge this PR after your CI passes on it
  • `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
  • `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
  • `@ dependabot reopen` will reopen this PR if it is closed
  • `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
  • `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
  • `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 3951f1c1bc8ed..2423e2d495d11 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -438,7 +438,7 @@ org.immutables value - 2.8.2 + 2.10.0 @@ -653,7 +653,7 @@ org.immutables value - 2.8.2 + 2.10.0 provided From 7fd59739fddf4b614c68d57e24068542b4cf2884 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jan 2024 06:01:42 +0900 Subject: [PATCH 03/74] MINOR: [Java] Bump org.apache.maven.plugins:maven-gpg-plugin from 1.5 to 3.1.0 in /java (#39832) Bumps [org.apache.maven.plugins:maven-gpg-plugin](https://github.com/apache/maven-gpg-plugin) from 1.5 to 3.1.0.
Commits
  • 699e2ad [maven-release-plugin] prepare release maven-gpg-plugin-3.1.0
  • f314f8e [MGPG-97] use gpgverify plugin to check dependencies signatures
  • bad6b57 [MGPG-96] add INFO message
  • 0498a82 [MGPG-95] don't GPG-sign .sigstore signatures
  • 09b5be9 Auto-link MGPG Jira
  • 1e0472f extract FilesCollector
  • af9ccfd [MGPG-94] Ignore reformatting
  • 5e51734 [MGPG-94] Integration tests - convert and reformat bsh to groovy
  • 955ea0e [MGPG-94] Reformat code
  • e160f43 [MGPG-94] Bump maven-plugins from 36 to 39
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-gpg-plugin&package-manager=maven&previous-version=1.5&new-version=3.1.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
  • `@ dependabot rebase` will rebase this PR
  • `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
  • `@ dependabot merge` will merge this PR after your CI passes on it
  • `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
  • `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
  • `@ dependabot reopen` will reopen this PR if it is closed
  • `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
  • `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
  • `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/gandiva/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index d0290b6814ed5..6337efcf7e348 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -96,7 +96,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.5 + 3.1.0 sign-artifacts From 3b8b700348f5d73fa4cfdb2780b0bde5d83a7f22 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jan 2024 06:02:06 +0900 Subject: [PATCH 04/74] MINOR: [Java] Bump org.apache.hadoop:hadoop-common from 2.7.1 to 3.3.6 in /java (#39833) Bumps org.apache.hadoop:hadoop-common from 2.7.1 to 3.3.6. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.hadoop:hadoop-common&package-manager=maven&previous-version=2.7.1&new-version=3.3.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
  • `@ dependabot rebase` will rebase this PR
  • `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
  • `@ dependabot merge` will merge this PR after your CI passes on it
  • `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
  • `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
  • `@ dependabot reopen` will reopen this PR if it is closed
  • `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
  • `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
  • `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/adapter/orc/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index 265a9a71b80e2..79e51470a426e 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -75,7 +75,7 @@ org.apache.hadoop hadoop-common - 3.3.3 + 3.3.6 test diff --git a/java/pom.xml b/java/pom.xml index 2423e2d495d11..3947f76cae849 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -37,7 +37,7 @@ 1.60.0 3.23.1 2.16.0 - 2.7.1 + 3.3.6 23.5.26 1.11.3 From 91d65b79f71a1be6a0bf7426e0ee91dd2e65a852 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jan 2024 06:02:31 +0900 Subject: [PATCH 05/74] MINOR: [Java] Bump io.netty:netty-bom from 4.1.105.Final to 4.1.106.Final in /java (#39834) Bumps [io.netty:netty-bom](https://github.com/netty/netty) from 4.1.105.Final to 4.1.106.Final.
Commits
  • 9d0ec7b [maven-release-plugin] prepare release netty-4.1.106.Final
  • e2859f4 Short-circuit ByteBuf::release (#13782)
  • d9ca50d Prevent sharing the index of the continuation frame header ByteBuf. (#13786)
  • 0e7c27c DnsNameResolver: Fail query if id space is exhausted (#13784)
  • b194741 [maven-release-plugin] prepare for next development iteration
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.netty:netty-bom&package-manager=maven&previous-version=4.1.105.Final&new-version=4.1.106.Final)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
  • `@ dependabot rebase` will rebase this PR
  • `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
  • `@ dependabot merge` will merge this PR after your CI passes on it
  • `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
  • `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
  • `@ dependabot reopen` will reopen this PR if it is closed
  • `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
  • `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
  • `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 3947f76cae849..4888f833df096 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 5.10.1 2.0.11 33.0.0-jre - 4.1.105.Final + 4.1.106.Final 1.60.0 3.23.1 2.16.0 From 63498c2891c757aca016305c61e4a0ba82faed2b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jan 2024 06:02:55 +0900 Subject: [PATCH 06/74] MINOR: [Java] Bump org.apache.maven.plugins:maven-enforcer-plugin from 3.0.0-M2 to 3.4.1 in /java (#39835) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-enforcer-plugin](https://github.com/apache/maven-enforcer) from 3.0.0-M2 to 3.4.1.
Release notes

Sourced from org.apache.maven.plugins:maven-enforcer-plugin's releases.

3.4.1

🐛 Bug Fixes

👻 Maintenance

3.4.0

🚀 New features and improvements

🐛 Bug Fixes

📦 Dependency updates

📝 Documentation updates

  • Clarify availability of AbstractEnforcerRule (#278) @​kwin

👻 Maintenance

  • Bump org.junit:junit-bom from 5.9.3 to 5.10.0 (#280) @​dependabot
  • Bump snappy-java from 1.1.8.3 to 1.1.10.1 in /maven-enforcer-plugin/src/it/projects/dependency-convergence_transitive_provided/module1 (#273) @​dependabot
  • [MNG-6829] - Replace StringUtils#isEmpty(String) and #isNotEmpty(String) (#272) @​timtebeek

3.3.0

... (truncated)

Commits
  • d8a21ee [maven-release-plugin] prepare release enforcer-3.4.1
  • 66250c0 [MENFORCER-491] Fix plugin documentation generation
  • 5d32e6c [MENFORCER-490] Declare maven-enforcer-plugin dependencies (#285)
  • d258109 [MENFORCER-490] Declare org.eclipse.sisu.plexus dependencies (#283)
  • 2aa71e7 [MENFORCER-490] Declare maven-enforcer-extension dependencies (#284)
  • d4ec8e1 [MENFORCER-490] Declare maven-enforcer-extension dependencies (#282)
  • b35e4a0 [maven-release-plugin] prepare for next development iteration
  • 3d365f7 [maven-release-plugin] prepare release enforcer-3.4.0
  • 5feb93a [MENFORCER-489] Bump commons-lang3 from 3.12.0 to 3.13.0
  • 8f2de47 Bump org.junit:junit-bom from 5.9.3 to 5.10.0
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-enforcer-plugin&package-manager=maven&previous-version=3.0.0-M2&new-version=3.4.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
  • `@ dependabot rebase` will rebase this PR
  • `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
  • `@ dependabot merge` will merge this PR after your CI passes on it
  • `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
  • `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
  • `@ dependabot reopen` will reopen this PR if it is closed
  • `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
  • `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
  • `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
  • `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 4888f833df096..3e595648ed085 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -445,7 +445,7 @@
maven-enforcer-plugin - 3.0.0-M2 + 3.4.1 org.apache.maven.plugins From b778ace6622614035acc1bbe17b06bdc8141d9fe Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 30 Jan 2024 11:54:57 +0900 Subject: [PATCH 07/74] GH-39841: [GLib] Add support for GLib 2.56 again (#39842) ### Rationale for this change It's still used in CentOS 7 and AlmaLinux 8. ### What changes are included in this PR? Don't use `g_time_zone_get_identifier()` with GLib < 2.58. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #39841 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/basic-data-type.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 0de9466eee456..98b2c92104507 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1212,7 +1212,8 @@ garrow_timestamp_data_type_class_init(GArrowTimestampDataTypeClass *klass) /** * garrow_timestamp_data_type_new: * @unit: The unit of the timestamp data. - * @time_zone: (nullable): The time zone of the timestamp data. + * @time_zone: (nullable): The time zone of the timestamp data. If based GLib + * is less than 2.58, this is ignored. * * Returns: A newly created the number of * seconds/milliseconds/microseconds/nanoseconds since UNIX epoch in @@ -1226,9 +1227,11 @@ garrow_timestamp_data_type_new(GArrowTimeUnit unit, { auto arrow_unit = garrow_time_unit_to_raw(unit); std::string arrow_timezone; +#if GLIB_CHECK_VERSION(2, 58, 0) if (time_zone) { arrow_timezone = g_time_zone_get_identifier(time_zone); } +#endif auto arrow_data_type = arrow::timestamp(arrow_unit, arrow_timezone); auto data_type = GARROW_TIMESTAMP_DATA_TYPE(g_object_new(GARROW_TYPE_TIMESTAMP_DATA_TYPE, From c6ab28677ddf22799f3db277137708ac5b070acd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jan 2024 09:16:53 +0100 Subject: [PATCH 08/74] GH-39640: [Docs] Pin pydata-sphinx-theme to 0.14.* (#39758) ### Rationale for this change Fixing the pinning syntax so we get the latest 0.14.x version (which is currently 0.14.4) * Closes: #39640 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_sphinx.txt | 2 +- docs/requirements.txt | 2 +- docs/source/python/api/compute.rst | 2 +- docs/source/python/compute.rst | 4 ++-- docs/source/python/pandas.rst | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index d0f494d2e085d..0e50875fc1ef8 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme=0.14.1 +pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/requirements.txt b/docs/requirements.txt index aee2eb662c06b..5d6fec7ddf72e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,7 +5,7 @@ breathe ipython numpydoc -pydata-sphinx-theme==0.14.1 +pydata-sphinx-theme~=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b879643017a90..928c607d139ce 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -590,4 +590,4 @@ User-Defined Functions :toctree: ../generated/ register_scalar_function - ScalarUdfContext + UdfContext diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index e8a5b613c6099..c02059a4f8faa 
100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -445,9 +445,9 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of -:class:`pyarrow.compute.ScalarUdfContext`. +:class:`pyarrow.compute.UdfContext`. This context exposes several useful attributes, particularly a -:attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for +:attr:`~pyarrow.compute.UdfContext.memory_pool` to be used for allocations in the context of the user-defined function. You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst index fda90c4f2a58c..23a4b73bd0965 100644 --- a/docs/source/python/pandas.rst +++ b/docs/source/python/pandas.rst @@ -197,7 +197,7 @@ use the ``datetime64[ns]`` type in Pandas and are converted to an Arrow .. ipython:: python - df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=3)}) + df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=3)}) df.dtypes df From 787afa1594586d2d556d21471647f9cd2c55b18f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jan 2024 12:54:19 +0100 Subject: [PATCH 09/74] GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652) ### Rationale for this change First step for https://github.com/apache/arrow/issues/39633: exposing the Array, DataType and Scalar classes for BinaryView and StringView, such that those can already be represented in pyarrow. (I exposed a variant of StringBuilder as well, just for now to be able to create test data) * Closes: #39651 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/api/arrays.rst | 4 ++ docs/source/python/api/datatypes.rst | 4 ++ python/pyarrow/__init__.py | 7 ++- python/pyarrow/array.pxi | 14 +++++ python/pyarrow/builder.pxi | 66 ++++++++++++++++++++++ python/pyarrow/includes/libarrow.pxd | 9 +++ python/pyarrow/lib.pxd | 8 +++ python/pyarrow/lib.pyx | 2 + python/pyarrow/scalar.pxi | 10 ++++ python/pyarrow/src/arrow/python/helpers.cc | 2 + python/pyarrow/tests/test_builder.py | 21 ++++++- python/pyarrow/tests/test_misc.py | 4 ++ python/pyarrow/tests/test_scalars.py | 28 ++++++++- python/pyarrow/tests/test_types.py | 8 +++ python/pyarrow/types.pxi | 32 +++++++++++ python/pyarrow/types.py | 10 ++++ 16 files changed, 223 insertions(+), 6 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 73b5e063ff1a0..b858862dcff01 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -63,6 +63,8 @@ may expose data type-specific methods or properties. FixedSizeBinaryArray LargeBinaryArray LargeStringArray + BinaryViewArray, + StringViewArray, Time32Array Time64Array Date32Array @@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties. FixedSizeBinaryScalar LargeBinaryScalar LargeStringScalar + BinaryViewScalar + StringViewScalar Time32Scalar Time64Scalar Date32Scalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 4066ef314234d..642c243b21af0 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas. 
large_binary large_string large_utf8 + binary_view + string_view decimal128 list_ large_list @@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category is_large_binary is_large_unicode is_large_string + is_binary_view + is_string_view is_fixed_size_binary is_map is_dictionary diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9da94885ec6b2..4dbd1258d3cea 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -163,7 +163,7 @@ def print_entry(label, value): time32, time64, timestamp, date32, date64, duration, month_day_nano_interval, float16, float32, float64, - binary, string, utf8, + binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, list_, large_list, map_, struct, @@ -205,6 +205,7 @@ def print_entry(label, value): FixedSizeListArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, + BinaryViewArray, StringViewArray, FixedSizeBinaryArray, DictionaryArray, Date32Array, Date64Array, TimestampArray, @@ -223,8 +224,8 @@ def print_entry(label, value): Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, MonthDayNanoIntervalScalar, - BinaryScalar, LargeBinaryScalar, - StringScalar, LargeStringScalar, + BinaryScalar, LargeBinaryScalar, BinaryViewScalar, + StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, ExtensionScalar) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1416f5f4346d9..1029f3a629817 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2942,6 +2942,12 @@ cdef class LargeStringArray(Array): null_count, offset) +cdef class StringViewArray(Array): + """ + Concrete class for Arrow arrays of string (or utf8) view data type. + """ + + cdef class BinaryArray(Array): """ Concrete class for Arrow arrays of variable-sized binary data type. @@ -2968,6 +2974,12 @@ cdef class LargeBinaryArray(Array): return ( self.ap).total_values_length() +cdef class BinaryViewArray(Array): + """ + Concrete class for Arrow arrays of variable-sized binary view data type. + """ + + cdef class DictionaryArray(Array): """ Concrete class for dictionary-encoded Arrow arrays. @@ -3669,6 +3681,8 @@ cdef dict _array_classes = { _Type_STRING: StringArray, _Type_LARGE_BINARY: LargeBinaryArray, _Type_LARGE_STRING: LargeStringArray, + _Type_BINARY_VIEW: BinaryViewArray, + _Type_STRING_VIEW: StringViewArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, _Type_DECIMAL128: Decimal128Array, diff --git a/python/pyarrow/builder.pxi b/python/pyarrow/builder.pxi index a34ea5412e14a..2af39e2c589e6 100644 --- a/python/pyarrow/builder.pxi +++ b/python/pyarrow/builder.pxi @@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable): def __len__(self): return self.builder.get().length() + + +cdef class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ + cdef: + unique_ptr[CStringViewBuilder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CStringViewBuilder(pool)) + + def append(self, value): + """ + Append a single value to the builder. 
+ + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + if value is None or value is np.nan: + self.builder.get().AppendNull() + elif isinstance(value, (bytes, str)): + self.builder.get().Append(tobytes(value)) + else: + raise TypeError('StringViewBuilder only accepts string objects') + + def append_values(self, values): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + for value in values: + self.append(value) + + def finish(self): + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 74e92594b04e5..d92f09da779b6 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LARGE_BINARY" arrow::Type::LARGE_BINARY" _Type_LARGE_STRING" arrow::Type::LARGE_STRING" _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY" + _Type_BINARY_VIEW" arrow::Type::BINARY_VIEW" + _Type_STRING_VIEW" arrow::Type::STRING_VIEW" _Type_LIST" arrow::Type::LIST" _Type_LARGE_LIST" arrow::Type::LARGE_LIST" @@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil: cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder): CStringBuilder(CMemoryPool* pool) + CStatus Append(const c_string& value) + + cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder): + CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool) + CStatus Append(const char* value, int32_t length) + cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder): + CStringViewBuilder(CMemoryPool* pool) CStatus Append(const c_string& value) cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 58ec34addbc0a..c1104864066e9 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -445,6 +445,14 @@ cdef class BinaryArray(Array): pass +cdef class StringViewArray(Array): + pass + + +cdef class BinaryViewArray(Array): + pass + + cdef class DictionaryArray(Array): cdef: object _indices, _dictionary diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 29a0bed55949c..b0368b67f790e 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -106,6 +106,8 @@ Type_STRING = _Type_STRING Type_LARGE_BINARY = _Type_LARGE_BINARY Type_LARGE_STRING = _Type_LARGE_STRING Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY +Type_BINARY_VIEW = _Type_BINARY_VIEW +Type_STRING_VIEW = _Type_STRING_VIEW Type_LIST = _Type_LIST Type_LARGE_LIST = _Type_LARGE_LIST Type_MAP = _Type_MAP diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 9a66dc81226d4..2772acf81861c 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar): pass +cdef class BinaryViewScalar(BinaryScalar): + pass + + 
+cdef class StringViewScalar(StringScalar): + pass + + cdef class ListScalar(Scalar): """ Concrete class for list-like scalars. @@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = { _Type_BINARY: BinaryScalar, _Type_LARGE_BINARY: LargeBinaryScalar, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar, + _Type_BINARY_VIEW: BinaryViewScalar, _Type_STRING: StringScalar, _Type_LARGE_STRING: LargeStringScalar, + _Type_STRING_VIEW: StringViewScalar, _Type_LIST: ListScalar, _Type_LARGE_LIST: LargeListScalar, _Type_FIXED_SIZE_LIST: FixedSizeListScalar, diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc index c266abc169d49..2c86c86a919be 100644 --- a/python/pyarrow/src/arrow/python/helpers.cc +++ b/python/pyarrow/src/arrow/python/helpers.cc @@ -63,6 +63,8 @@ std::shared_ptr GetPrimitiveType(Type::type type) { GET_PRIMITIVE_TYPE(STRING, utf8); GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary); GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8); + GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view); + GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view); GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval); default: return nullptr; diff --git a/python/pyarrow/tests/test_builder.py b/python/pyarrow/tests/test_builder.py index 50d801026b7d8..abc8a0013df37 100644 --- a/python/pyarrow/tests/test_builder.py +++ b/python/pyarrow/tests/test_builder.py @@ -20,7 +20,7 @@ import numpy as np import pyarrow as pa -from pyarrow.lib import StringBuilder +from pyarrow.lib import StringBuilder, StringViewBuilder def test_weakref(): @@ -65,3 +65,22 @@ def test_string_builder_append_after_finish(): sbuilder.append("No effect") expected = [None, None, "text", None, "other text"] assert arr.to_pylist() == expected + + +def test_string_view_builder(): + builder = StringViewBuilder() + builder.append(b"a byte string") + builder.append("a string") + builder.append("a longer not-inlined string") + builder.append(np.nan) + builder.append_values([None, "text"]) + assert len(builder) == 6 + assert builder.null_count == 2 + arr = builder.finish() + assert isinstance(arr, pa.Array) + assert arr.null_count == 2 + assert arr.type == 'string_view' + expected = [ + "a byte string", "a string", "a longer not-inlined string", None, None, "text" + ] + assert arr.to_pylist() == expected diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 8b8c50882b749..8cec8783280dd 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -185,6 +185,8 @@ def test_set_timezone_db_path_non_windows(): pa.UnionArray, pa.BinaryArray, pa.StringArray, + pa.BinaryViewArray, + pa.StringViewArray, pa.FixedSizeBinaryArray, pa.DictionaryArray, pa.Date32Array, @@ -221,6 +223,8 @@ def test_set_timezone_db_path_non_windows(): pa.StringScalar, pa.BinaryScalar, pa.FixedSizeBinaryScalar, + pa.BinaryViewScalar, + pa.StringViewScalar, pa.ListScalar, pa.LargeListScalar, pa.MapScalar, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 74dee59558239..4a239b23d5676 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -51,6 +51,9 @@ (b"bytes", None, pa.BinaryScalar), ("largestring", pa.large_string(), pa.LargeStringScalar), (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar), + # TODO(GH-39633) pa.scalar(..) 
requires python->arrow conversion to be implemented + # ("string_view", pa.string_view(), pa.StringViewScalar), + # (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar), (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar), ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), @@ -488,7 +491,8 @@ def test_month_day_nano_interval(): @pytest.mark.parametrize('value', ['foo', 'mañana']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.string(), pa.StringScalar), - (pa.large_string(), pa.LargeStringScalar) + (pa.large_string(), pa.LargeStringScalar), + # (pa.string_view(), pa.StringViewScalar), ]) def test_string(value, ty, scalar_typ): s = pa.scalar(value, type=ty) @@ -503,10 +507,30 @@ def test_string(value, ty, scalar_typ): assert buf.to_pybytes() == value.encode() +@pytest.mark.parametrize('value', ['foo', 'mañana']) +def test_string_view(value): + # TODO: replace with normal scalar construction + builder = pa.lib.StringViewBuilder() + builder.append(value) + arr = builder.finish() + + s = arr[0] + assert isinstance(s, pa.StringViewScalar) + assert s.as_py() == value + assert s.as_py() != 'something' + assert repr(value) in repr(s) + assert str(s) == str(value) + + buf = s.as_buffer() + assert isinstance(buf, pa.Buffer) + assert buf.to_pybytes() == value.encode() + + @pytest.mark.parametrize('value', [b'foo', b'bar']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.binary(), pa.BinaryScalar), - (pa.large_binary(), pa.LargeBinaryScalar) + (pa.large_binary(), pa.LargeBinaryScalar), + # (pa.binary_view(), pa.BinaryViewScalar), ]) def test_binary(value, ty, scalar_typ): s = pa.scalar(value, type=ty) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index c8a52c6b626c2..a5ab3128dc874 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -61,6 +61,8 @@ def get_many_types(): pa.binary(10), pa.large_string(), pa.large_binary(), + pa.string_view(), + pa.binary_view(), pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), @@ -244,6 +246,12 @@ def test_is_binary_string(): assert types.is_fixed_size_binary(pa.binary(5)) assert not types.is_fixed_size_binary(pa.binary()) + assert types.is_string_view(pa.string_view()) + assert not types.is_string_view(pa.string()) + assert types.is_binary_view(pa.binary_view()) + assert not types.is_binary_view(pa.binary()) + assert not types.is_binary_view(pa.string_view()) + def test_is_temporal_date_time_timestamp(): date_types = [pa.date32(), pa.date64()] diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index b6dc53d633543..ce3736b5af847 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4375,6 +4375,36 @@ def large_utf8(): return large_string() +def binary_view(): + """ + Create a variable-length binary view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.binary_view() + DataType(binary_view) + """ + return primitive_type(_Type_BINARY_VIEW) + + +def string_view(): + """ + Create UTF8 variable-length string view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string_view() + DataType(string_view) + """ + return primitive_type(_Type_STRING_VIEW) + + def list_(value_type, int list_size=-1): """ Create ListType instance from child data type or field. 
@@ -4991,6 +5021,8 @@ cdef dict _type_aliases = { 'large_str': large_string, 'large_utf8': large_string, 'large_binary': large_binary, + 'binary_view': binary_view, + 'string_view': string_view, 'date32': date32, 'date64': date64, 'date32[day]': date32, diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 5d7dbe4b451b9..32398dac9c5f5 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -243,6 +243,16 @@ def is_fixed_size_binary(t): return t.id == lib.Type_FIXED_SIZE_BINARY +@doc(is_null, datatype="variable-length binary view") +def is_binary_view(t): + return t.id == lib.Type_BINARY_VIEW + + +@doc(is_null, datatype="variable-length string (utf-8) view") +def is_string_view(t): + return t.id == lib.Type_STRING_VIEW + + @doc(is_null, datatype="date") def is_date(t): return t.id in _DATE_TYPES From 749f936fc77b83d3c0ec5642c16561b3afa5dfa7 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 31 Jan 2024 08:24:11 -0800 Subject: [PATCH 10/74] MINOR: [CI] update weston codeowners (#39867) ### Rationale for this change Currently I am unable to keep up with my Github inbox and thus respond to very little. I am trying to balance this. ### What changes are included in this PR? Reduce the scope of files that will trigger automated review. ### Are these changes tested? N/A ### Are there any user-facing changes? No Authored-by: Weston Pace Signed-off-by: Weston Pace --- .github/CODEOWNERS | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 41a075b1c0bcb..e7e544c2b0e62 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -30,15 +30,10 @@ # /cpp/ /cpp/src/arrow/acero @westonpace /cpp/src/arrow/adapters/orc @wgtmac -/cpp/src/arrow/dataset @westonpace /cpp/src/arrow/engine @westonpace /cpp/src/arrow/flight/ @lidavidm -/cpp/src/arrow/util/async* @westonpace -/cpp/src/arrow/util/future* @westonpace -/cpp/src/arrow/util/thread* @westonpace /cpp/src/parquet @wgtmac -/cpp/src/skyhook @westonpace -/csharp/ @westonpace +/csharp/ @curthagenlocher /go/ @zeroshade /java/ @lidavidm /js/ @domoritz @trxcllnt From 2a87693134135a8af2ae2b6df41980176431b1c0 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Wed, 31 Jan 2024 13:38:54 -0500 Subject: [PATCH 11/74] GH-39680: [Java] enable half float support on Java module (#39681) ### Rationale for this change - To enable half float support on Java module. ### What changes are included in this PR? - [x] Add initial Float16 type support - [x] Unit test - [x] Integration test - [x] Documentation ### Are these changes tested? Yes. ### Are there any user-facing changes? 
No * Closes: #39680 Authored-by: david dali susanibar arce Signed-off-by: David Li --- docs/source/status.rst | 9 +- .../apache/arrow/dataset/TestAllTypes.java | 6 +- .../org/apache/arrow/memory/util/Float16.java | 271 +++++++++++ .../org/apache/arrow/memory/TestArrowBuf.java | 11 + .../main/codegen/data/ValueVectorTypes.tdd | 10 + .../main/codegen/templates/UnionReader.java | 6 +- .../org/apache/arrow/vector/Float2Vector.java | 434 ++++++++++++++++++ .../org/apache/arrow/vector/types/Types.java | 16 +- .../apache/arrow/vector/TestValueVector.java | 198 ++++++++ 9 files changed, 953 insertions(+), 8 deletions(-) create mode 100644 java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/Float16.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/Float2Vector.java diff --git a/docs/source/status.rst b/docs/source/status.rst index 03a87012342c2..11dd9c2c2965c 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -40,7 +40,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | UInt8/16/32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Float16 | ✓ (1) | | ✓ | ✓ | ✓ (2)| ✓ | ✓ | | +| Float16 | ✓ (1) | ✓ (2) | ✓ | ✓ | ✓ (3)| ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Float32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -104,7 +104,7 @@ Data Types | Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | | (special) | | | | | | | | | +===================+=======+=======+=======+============+=======+=======+=======+=======+ -| Dictionary | ✓ | ✓ (3) | ✓ | ✓ | ✓ | ✓ (3) | ✓ | | +| Dictionary | ✓ | ✓ (4) | ✓ | ✓ | ✓ | ✓ (3) | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Extension | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -114,8 +114,9 @@ Data Types Notes: * \(1) Casting to/from Float16 in C++ is not supported. -* \(2) Float16 support in C# is only available when targeting .NET 6+. -* \(3) Nested dictionaries not supported +* \(2) Casting to/from Float16 in Java is not supported. +* \(3) Float16 support in C# is only available when targeting .NET 6+. +* \(4) Nested dictionaries not supported .. seealso:: The :ref:`format_columnar` specification. 
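To make the new half-float support concrete, here is a minimal usage sketch. `Float2Vector`, `Float16.toFloat16` and `Float16.toFloat` come from the diffs below; the `(name, allocator)` constructor and the `allocateNew`/`get`/`setValueCount` calls are assumed to follow the same pattern as the existing fixed-width vectors such as `Float4Vector`.

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.memory.util.Float16;
import org.apache.arrow.vector.Float2Vector;

public class HalfFloatExample {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator();
         // Assumed constructor, mirroring Float4Vector(String name, BufferAllocator allocator).
         Float2Vector vector = new Float2Vector("float16", allocator)) {
      vector.allocateNew(2);
      // toFloat16 narrows a Java float to the IEEE 754 binary16 bit pattern held in a short,
      // exactly as in the TestAllTypes change in this patch.
      vector.set(0, Float16.toFloat16(+32.875f));
      vector.setNull(1);
      vector.setValueCount(2);
      // Reading back widens the stored short to a regular float.
      float value = Float16.toFloat(vector.get(0));
      System.out.println(value); // 32.875
    }
  }
}

Half-float values cross the vector API as raw shorts, so widening and narrowing are explicit calls to `Float16` rather than something the vector does implicitly.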
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java index 13b247452348d..6d33cf057ed3a 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java @@ -32,6 +32,7 @@ import org.apache.arrow.dataset.file.DatasetFileWriter; import org.apache.arrow.dataset.file.FileFormat; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.Float16; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateMilliVector; @@ -39,6 +40,7 @@ import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.DurationVector; import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -89,7 +91,6 @@ public class TestAllTypes extends TestDataset { private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { // Notes: - // - Float16 is not supported by Java. // - IntervalMonthDayNano is not supported by Parquet. // - Map (GH-38250) and SparseUnion are resulting in serialization errors when writing with the Dataset API. // "Unhandled type for Arrow to Parquet schema conversion" errors: IntervalDay, IntervalYear, DenseUnion @@ -109,6 +110,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { Field.nullablePrimitive("uint16", new ArrowType.Int(16, false)), Field.nullablePrimitive("uint32", new ArrowType.Int(32, false)), Field.nullablePrimitive("uint64", new ArrowType.Int(64, false)), + Field.nullablePrimitive("float16", new ArrowType.FloatingPoint(FloatingPointPrecision.HALF)), Field.nullablePrimitive("float32", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), Field.nullablePrimitive("float64", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), Field.nullablePrimitive("utf8", ArrowType.Utf8.INSTANCE), @@ -148,6 +150,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { root.getVector("uint16").setNull(0); root.getVector("uint32").setNull(0); root.getVector("uint64").setNull(0); + root.getVector("float16").setNull(0); root.getVector("float32").setNull(0); root.getVector("float64").setNull(0); root.getVector("utf8").setNull(0); @@ -180,6 +183,7 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { ((UInt2Vector) root.getVector("uint16")).set(1, 1); ((UInt4Vector) root.getVector("uint32")).set(1, 1); ((UInt8Vector) root.getVector("uint64")).set(1, 1); + ((Float2Vector) root.getVector("float16")).set(1, Float16.toFloat16(+32.875f)); ((Float4Vector) root.getVector("float32")).set(1, 1.0f); ((Float8Vector) root.getVector("float64")).set(1, 1.0); ((VarCharVector) root.getVector("utf8")).set(1, new Text("a")); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/Float16.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/Float16.java new file mode 100644 index 0000000000000..8040158fd090e --- /dev/null +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/Float16.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.memory.util; + + +import org.apache.arrow.util.VisibleForTesting; + +/** + * Lifted from Apache Parquet MR project: + * https://github.com/apache/parquet-mr/blob/e87b80308869b77f914fcfd04364686e11158950/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java + *
    + * Changes made: + *
  • Modify the data type input from Parquet-MR Binary (toFloat(Binary b)) to Arrow Java short (toFloat(short b))
  • + *
  • Expose NAN and POSITIVE_INFINITY variables
  • + *
+ * + * + * The class is a utility class to manipulate half-precision 16-bit + * IEEE 754 + * floating point data types (also called fp16 or binary16). A half-precision float can be + * created from or converted to single-precision floats, and is stored in a short data type. + * The IEEE 754 standard specifies an float16 as having the following format: + *
    + *
  • Sign bit: 1 bit
  • + *
  • Exponent width: 5 bits
  • + *
  • Significand: 10 bits
  • + *
+ * + *

The format is laid out as follows:

+ *
+ * 1   11111   1111111111
+ * ^   --^--   -----^----
+ * sign  |          |_______ significand
+ *       |
+ *      -- exponent
+ * 
+ * Half-precision floating points can be useful to save memory and/or + * bandwidth at the expense of range and precision when compared to single-precision + * floating points (float32). + * Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java + */ +public class Float16 { + // Positive infinity of type half-precision float. + public static final short POSITIVE_INFINITY = (short) 0x7c00; + // A Not-a-Number representation of a half-precision float. + public static final short NaN = (short) 0x7e00; + // The bitmask to and a number with to obtain the sign bit. + private static final int SIGN_MASK = 0x8000; + // The offset to shift by to obtain the exponent bits. + private static final int EXPONENT_SHIFT = 10; + // The bitmask to and a number shifted by EXPONENT_SHIFT right, to obtain exponent bits. + private static final int SHIFTED_EXPONENT_MASK = 0x1f; + // The bitmask to and a number with to obtain significand bits. + private static final int SIGNIFICAND_MASK = 0x3ff; + // The offset of the exponent from the actual value. + private static final int EXPONENT_BIAS = 15; + // The offset to shift by to obtain the sign bit. + private static final int SIGN_SHIFT = 15; + // The bitmask to AND with to obtain exponent and significand bits. + private static final int EXPONENT_SIGNIFICAND_MASK = 0x7fff; + + private static final int FP32_SIGN_SHIFT = 31; + private static final int FP32_EXPONENT_SHIFT = 23; + private static final int FP32_SHIFTED_EXPONENT_MASK = 0xff; + private static final int FP32_SIGNIFICAND_MASK = 0x7fffff; + private static final int FP32_EXPONENT_BIAS = 127; + private static final int FP32_QNAN_MASK = 0x400000; + private static final int FP32_DENORMAL_MAGIC = 126 << 23; + private static final float FP32_DENORMAL_FLOAT = Float.intBitsToFloat(FP32_DENORMAL_MAGIC); + + /** + * Returns true if the specified half-precision float value represents + * a Not-a-Number, false otherwise. + * + * @param h A half-precision float value + * @return True if the value is a NaN, false otherwise + * + */ + @VisibleForTesting + public static boolean isNaN(short h) { + return (h & EXPONENT_SIGNIFICAND_MASK) > POSITIVE_INFINITY; + } + + /** + *

Compares the two specified half-precision float values. The following + * conditions apply during the comparison:

+ * + *
    + *
  • NaN is considered by this method to be equal to itself and greater + * than all other half-precision float values (including {@code #POSITIVE_INFINITY})
  • + *
  • POSITIVE_ZERO is considered by this method to be greater than NEGATIVE_ZERO.
  • + *
+ * + * @param x The first half-precision float value to compare. + * @param y The second half-precision float value to compare + * + * @return The value {@code 0} if {@code x} is numerically equal to {@code y}, a + * value less than {@code 0} if {@code x} is numerically less than {@code y}, + * and a value greater than {@code 0} if {@code x} is numerically greater + * than {@code y} + * + */ + @VisibleForTesting + public static int compare(short x, short y) { + boolean xIsNaN = isNaN(x); + boolean yIsNaN = isNaN(y); + + if (!xIsNaN && !yIsNaN) { + int first = ((x & SIGN_MASK) != 0 ? 0x8000 - (x & 0xffff) : x & 0xffff); + int second = ((y & SIGN_MASK) != 0 ? 0x8000 - (y & 0xffff) : y & 0xffff); + // Returns true if the first half-precision float value is less + // (smaller toward negative infinity) than the second half-precision float value. + if (first < second) { + return -1; + } + + // Returns true if the first half-precision float value is greater + // (larger toward positive infinity) than the second half-precision float value. + if (first > second) { + return 1; + } + } + + // Collapse NaNs, akin to halfToIntBits(), but we want to keep + // (signed) short value types to preserve the ordering of -0.0 + // and +0.0 + short xBits = xIsNaN ? NaN : x; + short yBits = yIsNaN ? NaN : y; + return (xBits == yBits ? 0 : (xBits < yBits ? -1 : 1)); + } + + /** + * Converts the specified half-precision float value into a + * single-precision float value. The following special cases are handled: + * If the input is NaN, the returned value is Float NaN. + * If the input is POSITIVE_INFINITY or NEGATIVE_INFINITY, the returned value is respectively + * Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is +/-0.0f. + * Otherwise, the returned value is a normalized single-precision float value. + * + * @param b The half-precision float value to convert to single-precision + * @return A normalized single-precision float value + */ + @VisibleForTesting + public static float toFloat(short b) { + int bits = b & 0xffff; + int s = bits & SIGN_MASK; + int e = (bits >>> EXPONENT_SHIFT) & SHIFTED_EXPONENT_MASK; + int m = (bits) & SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0) { // Denormal or 0 + if (m != 0) { + // Convert denorm fp16 into normalized fp32 + float o = Float.intBitsToFloat(FP32_DENORMAL_MAGIC + m); + o -= FP32_DENORMAL_FLOAT; + return s == 0 ? o : -o; + } + } else { + outM = m << 13; + if (e == 0x1f) { // Infinite or NaN + outE = 0xff; + if (outM != 0) { // SNaNs are quieted + outM |= FP32_QNAN_MASK; + } + } else { + outE = e - EXPONENT_BIAS + FP32_EXPONENT_BIAS; + } + } + int out = (s << 16) | (outE << FP32_EXPONENT_SHIFT) | outM; + return Float.intBitsToFloat(out); + } + + /** + * Converts the specified single-precision float value into a + * half-precision float value. The following special cases are handled: + * + * If the input is NaN, the returned value is NaN. + * If the input is Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY, + * the returned value is respectively POSITIVE_INFINITY or NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is + * POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_VALUE, the returned value + * is flushed to POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_NORMAL, the returned value + * is a denorm half-precision float. 
+ * Otherwise, the returned value is rounded to the nearest + * representable half-precision float value. + * + * @param f The single-precision float value to convert to half-precision + * @return A half-precision float value + */ + public static short toFloat16(float f) { + int bits = Float.floatToRawIntBits(f); + int s = (bits >>> FP32_SIGN_SHIFT); + int e = (bits >>> FP32_EXPONENT_SHIFT) & FP32_SHIFTED_EXPONENT_MASK; + int m = (bits) & FP32_SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0xff) { // Infinite or NaN + outE = 0x1f; + outM = m != 0 ? 0x200 : 0; + } else { + e = e - FP32_EXPONENT_BIAS + EXPONENT_BIAS; + if (e >= 0x1f) { // Overflow + outE = 0x1f; + } else if (e <= 0) { // Underflow + if (e < -10) { + // The absolute fp32 value is less than MIN_VALUE, flush to +/-0 + } else { + // The fp32 value is a normalized float less than MIN_NORMAL, + // we convert to a denorm fp16 + m = m | 0x800000; + int shift = 14 - e; + outM = m >> shift; + int lowm = m & ((1 << shift) - 1); + int hway = 1 << (shift - 1); + // if above halfway or exactly halfway and outM is odd + if (lowm + (outM & 1) > hway) { + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } else { + outE = e; + outM = m >> 13; + // if above halfway or exactly halfway and outM is odd + if ((m & 0x1fff) + (outM & 0x1) > 0x1000) { + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } + // The outM is added here as the +1 increments for outM above can + // cause an overflow in the exponent bit which is OK. + return (short) ((s << SIGN_SHIFT) | (outE << EXPONENT_SHIFT) + outM); + } + + /** + * Returns a string representation of the specified half-precision + * float value. Calling this method is equivalent to calling + * Float.toString(toFloat(h)). See {@link Float#toString(float)} + * for more information on the format of the string representation. 
+ * + * @param h A half-precision float value in binary little-endian format + * @return A string representation of the specified value + */ + @VisibleForTesting + public static String toFloatString(short h) { + return Float.toString(Float16.toFloat(h)); + } +} diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java index 9ba42abc1ce89..b4385b72a38cf 100644 --- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java +++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java @@ -29,6 +29,7 @@ import java.nio.ByteOrder; import java.util.Arrays; +import org.apache.arrow.memory.util.Float16; import org.junit.Test; import org.slf4j.LoggerFactory; @@ -180,4 +181,14 @@ public void testEnabledHistoricalLog() { ((Logger) LoggerFactory.getLogger("org.apache.arrow")).setLevel(null); } } + + @Test + public void testArrowBufFloat16() { + try (BufferAllocator allocator = new RootAllocator(); + ArrowBuf buf = allocator.buffer(1024) + ) { + buf.setShort(0, Float16.toFloat16(+32.875f)); + assertEquals((short) 0x501c, buf.getShort(0)); + } + } } diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 2a921804202f0..6c2a967712454 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -49,6 +49,16 @@ { class: "SmallInt", valueHolder: "Int2Holder"}, ] }, + { + major: "Fixed", + width: 2, + javaType: "short", + boxedType: "Short", + fields: [{name: "value", type: "short"}], + minor: [ + { class: "Float2", valueHolder: "Int2Holder"}, + ] + }, { major: "Fixed", width: 4, diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 56a6cc90b321b..822d4822987fb 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -39,7 +39,9 @@ @SuppressWarnings("unused") public class UnionReader extends AbstractFieldReader { - private BaseReader[] readers = new BaseReader[45]; + private static final int NUM_SUPPORTED_TYPES = 46; + + private BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; public UnionVector data; public UnionReader(UnionVector data) { @@ -50,7 +52,7 @@ public MinorType getMinorType() { return TYPES[data.getTypeValue(idx())]; } - private static MinorType[] TYPES = new MinorType[45]; + private static MinorType[] TYPES = new MinorType[NUM_SUPPORTED_TYPES]; static { for (MinorType minorType : MinorType.values()) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/Float2Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/Float2Vector.java new file mode 100644 index 0000000000000..9d3f25769abff --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/Float2Vector.java @@ -0,0 +1,434 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.Float16; +import org.apache.arrow.vector.complex.impl.Float2ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.Float2Holder; +import org.apache.arrow.vector.holders.NullableFloat2Holder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Float2Vector implements a fixed width (2 bytes) vector of + * short values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public final class Float2Vector extends BaseFixedWidthVector implements FloatingPointVector { + public static final byte TYPE_WIDTH = 2; + + /** + * Instantiate a Float2Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public Float2Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.FLOAT2.getType()), allocator); + } + + /** + * Instantiate a Float2Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float2Vector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a Float2Vector. This doesn't allocate any memory for + * the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public Float2Vector(Field field, BufferAllocator allocator) { + super(field, allocator, TYPE_WIDTH); + } + + @Override + protected FieldReader getReaderImpl() { + return new Float2ReaderImpl(Float2Vector.this); + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.FLOAT2; + } + + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the element at the given index from the vector. 
+ * + * @param index position of element + * @return element at given index + */ + public short get(int index) throws IllegalStateException { + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getShort((long) index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableFloat2Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getShort((long) index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + @Override + public Short getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getShort((long) index * TYPE_WIDTH); + } + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + *
This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + static short get(final ArrowBuf buffer, final int index) { + return buffer.getShort((long) index * TYPE_WIDTH); + } + + @Override + public double getValueAsDouble(int index) { + return getValueAsFloat(index); + } + + public float getValueAsFloat(int index) { + return Float16.toFloat(this.get(index)); + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + private void setValue(int index, short value) { + valueBuffer.setShort((long) index * TYPE_WIDTH, value); + } + + private void setValue(int index, float value) { + valueBuffer.setShort((long) index * TYPE_WIDTH, Float16.toFloat16(value)); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, short value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void setWithPossibleTruncate(int index, float value) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableFloat2Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, Float2Holder holder) { + BitVectorHelper.setBit(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, short)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, short value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #setWithPossibleTruncate(int, float)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafeWithPossibleTruncate(int index, float value) { + handleSafe(index); + setWithPossibleTruncate(index, value); + } + + /** + * Same as {@link #set(int, NullableFloat2Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableFloat2Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, Float2Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, Float2Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, short value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setWithPossibleTruncate(int index, int isSet, float value) { + if (isSet > 0) { + setWithPossibleTruncate(index, value); + } else { + BitVectorHelper.unsetBit(validityBuffer, index); + } + } + + /** + * Same as {@link #set(int, int, short)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, short value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Same as {@link #set(int, int, short)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafeWithPossibleTruncate(int index, int isSet, float value) { + handleSafe(index); + setWithPossibleTruncate(index, isSet, value); + } + + @Override + public void setWithPossibleTruncate(int index, double value) { + throw new UnsupportedOperationException("The operation for double data types is not supported."); + } + + @Override + public void setSafeWithPossibleTruncate(int index, double value) { + throw new UnsupportedOperationException("The operation for double data types is not supported."); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. 
+ * + * @param field Field object used by the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator) { + return new TransferImpl(field, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((Float2Vector) to); + } + + private class TransferImpl implements TransferPair { + Float2Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new Float2Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(Field field, BufferAllocator allocator) { + to = new Float2Vector(field, allocator); + } + + public TransferImpl(Float2Vector to) { + this.to = to; + } + + @Override + public Float2Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Float2Vector.this); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index f29157524f2df..0b0e0d66a98f0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -18,6 +18,7 @@ package org.apache.arrow.vector.types; import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; +import static org.apache.arrow.vector.types.FloatingPointPrecision.HALF; import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; import static org.apache.arrow.vector.types.UnionMode.Dense; import static org.apache.arrow.vector.types.UnionMode.Sparse; @@ -33,6 +34,7 @@ import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; @@ -79,6 +81,7 @@ import org.apache.arrow.vector.complex.impl.DenseUnionWriter; import org.apache.arrow.vector.complex.impl.DurationWriterImpl; import org.apache.arrow.vector.complex.impl.FixedSizeBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.Float2WriterImpl; import org.apache.arrow.vector.complex.impl.Float4WriterImpl; import org.apache.arrow.vector.complex.impl.Float8WriterImpl; import org.apache.arrow.vector.complex.impl.IntWriterImpl; @@ -432,6 +435,17 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new IntervalYearWriterImpl((IntervalYearVector) vector); } }, + FLOAT2(new FloatingPoint(HALF)) { + @Override + public FieldVector getNewVector(Field field, BufferAllocator allocator, CallBack schemaChangeCallback) { + return new Float2Vector(field, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new Float2WriterImpl((Float2Vector) vector); + } + }, // 4 byte ieee 754 FLOAT4(new FloatingPoint(SINGLE)) { @Override @@ -894,7 +908,7 @@ public MinorType visit(Int type) { public MinorType visit(FloatingPoint type) { switch 
(type.getPrecision()) { case HALF: - throw new UnsupportedOperationException("NYI: " + type); + return MinorType.FLOAT2; case SINGLE: return MinorType.FLOAT4; case DOUBLE: diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 614aff18d4554..10091aebdd50b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -332,6 +332,204 @@ public void testSizeOfValueBuffer() { } } + @Test + public void testFixedFloat2() { + try (final Float2Vector floatVector = new Float2Vector(EMPTY_SCHEMA_PATH, allocator)) { + boolean error = false; + int initialCapacity = 16; + + /* we should not throw exception for these values of capacity */ + floatVector.setInitialCapacity(MAX_VALUE_COUNT - 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT); + + try { + floatVector.setInitialCapacity(MAX_VALUE_COUNT * 4); + } catch (OversizedAllocationException oe) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + floatVector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, floatVector.getValueCapacity()); + + /* allocate 32 bytes (16 * 2) */ + floatVector.allocateNew(); + /* underlying buffer should be able to store 16 values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); + + floatVector.zeroVector(); + + /* populate the floatVector */ + floatVector.set(0, (short) 0x101c); // Float16.toFloat16(+0.00050163269043f) + floatVector.set(2, (short) 0x901c); // Float16.toFloat16(-0.00050163269043f) + floatVector.set(4, (short) 0x101d); // Float16.toFloat16(+0.000502109527588f) + floatVector.set(6, (short) 0x901d); // Float16.toFloat16(-0.000502109527588f) + floatVector.set(8, (short) 0x121c); // Float16.toFloat16(+0.00074577331543f) + floatVector.set(10, (short) 0x921c); // Float16.toFloat16(-0.00074577331543f) + floatVector.set(12, (short) 0x501c); // Float16.toFloat16(+32.875f) + floatVector.set(14, (short) 0xd01c); // Float16.toFloat16(-32.875f) + + try { + floatVector.set(initialCapacity, (short) 0x141c); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check vector contents */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals((short) 0x901d, floatVector.get(6)); + assertEquals((short) 0x121c, floatVector.get(8)); + assertEquals((short) 0x921c, floatVector.get(10)); + assertEquals((short) 0x501c, floatVector.get(12)); + assertEquals((short) 0xd01c, floatVector.get(14)); + + try { + floatVector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + } + + /* this should trigger a realloc() */ + floatVector.setSafe(initialCapacity, (short) 0x141c); // Float16.toFloat16(+0.00100326538086f) + + /* underlying buffer should now be able to store double the number of values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); + + /* vector data should still be intact after realloc */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals((short) 
0x901d, floatVector.get(6)); + assertEquals((short) 0x121c, floatVector.get(8)); + assertEquals((short) 0x921c, floatVector.get(10)); + assertEquals((short) 0x501c, floatVector.get(12)); + assertEquals((short) 0xd01c, floatVector.get(14)); + assertEquals((short) 0x141c, floatVector.get(initialCapacity)); + + /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); + floatVector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertTrue("non-zero data not expected at index: " + i, floatVector.isNull(i)); + } + } + } + + @Test + public void testFixedFloat2WithPossibleTruncate() { + try (final Float2Vector floatVector = new Float2Vector(EMPTY_SCHEMA_PATH, allocator)) { + boolean error = false; + int initialCapacity = 16; + + /* we should not throw exception for these values of capacity */ + floatVector.setInitialCapacity(MAX_VALUE_COUNT - 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT); + + try { + floatVector.setInitialCapacity(MAX_VALUE_COUNT * 4); + } catch (OversizedAllocationException oe) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + floatVector.setInitialCapacity(initialCapacity); + /* no memory allocation has happened yet so capacity of underlying buffer should be 0 */ + assertEquals(0, floatVector.getValueCapacity()); + + /* allocate 32 bytes (16 * 2) */ + floatVector.allocateNew(); + /* underlying buffer should be able to store 16 values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); + + floatVector.zeroVector(); + + /* populate the floatVector */ + floatVector.set(0, (short) 0x101c); // Float16.toFloat16(+0.00050163269043f) + floatVector.set(2, (short) 0x901c); // Float16.toFloat16(-0.00050163269043f) + floatVector.set(4, (short) 0x101d); // Float16.toFloat16(+0.000502109527588f) + floatVector.setWithPossibleTruncate(6, 2049.0f); // in f32=2049.000000, out f16=2048 + floatVector.setWithPossibleTruncate(8, 4098.0f); // in f32=4098.000000, out f16=4096 + floatVector.setWithPossibleTruncate(10, 8196.0f); // in f32=8196.000000, out f16=8192 + floatVector.setWithPossibleTruncate(12, 16392.0f); // in f32=16392.000000, out f16=16384 + floatVector.setWithPossibleTruncate(14, 32784.0f); // in f32=32784.000000, out f16=32768 + + try { + floatVector.setWithPossibleTruncate(initialCapacity, 1.618034f); // in f32=1.618034, out f16=1.6181641 + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + error = false; + } + + /* check vector contents */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals(2048.0f, floatVector.getValueAsFloat(6), 0); + assertEquals(4096.0f, floatVector.getValueAsFloat(8), 0); + assertEquals(8192.0f, floatVector.getValueAsFloat(10), 0); + assertEquals(16384.0f, floatVector.getValueAsDouble(12), 0); + assertEquals(32768.0f, floatVector.getValueAsDouble(14), 0); + + try { + floatVector.get(initialCapacity); + } catch (IndexOutOfBoundsException ie) { + error = true; + } finally { + assertTrue(error); + } + + /* this should trigger a realloc() */ + floatVector.setSafeWithPossibleTruncate(initialCapacity, 1.618034f); // in f32=1.618034, out f16=1.6181641 + + /* underlying buffer should now be able to store double the 
number of values */ + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); + + /* vector data should still be intact after realloc */ + assertEquals((short) 0x101c, floatVector.get(0)); + assertEquals((short) 0x901c, floatVector.get(2)); + assertEquals((short) 0x101d, floatVector.get(4)); + assertEquals(2048.0f, floatVector.getValueAsFloat(6), 0); + assertEquals(4096.0f, floatVector.getValueAsFloat(8), 0); + assertEquals(8192.0f, floatVector.getValueAsFloat(10), 0); + assertEquals(16384.0f, floatVector.getValueAsDouble(12), 0); + assertEquals(32768.0f, floatVector.getValueAsDouble(14), 0); + assertEquals(1.6181641f, floatVector.getValueAsDouble(initialCapacity), 0); + + /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); + floatVector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); + + /* vector data should be zeroed out */ + for (int i = 0; i < capacityBeforeReset; i++) { + assertTrue("non-zero data not expected at index: " + i, floatVector.isNull(i)); + } + } + } + @Test /* Float4Vector */ public void testFixedType3() { try (final Float4Vector floatVector = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { From 6ccfeeec3b864671556e50c1ac01e65f47bd06d9 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 1 Feb 2024 21:14:47 +0800 Subject: [PATCH 12/74] GH-39876: [C++] Thirdparty: Bump zlib to 1.3.1 (#39877) ### Rationale for this change zlib 1.3.1 is the latest release. ### What changes are included in this PR? Bump zlib to 1.3.1 ### Are these changes tested? Already has testing ### Are there any user-facing changes? no * Closes: #39876 Authored-by: mwish Signed-off-by: Sutou Kouhei --- cpp/thirdparty/versions.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 2664775c0fbf4..dd3f5da84f777 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -115,8 +115,8 @@ ARROW_UTF8PROC_BUILD_VERSION=v2.7.0 ARROW_UTF8PROC_BUILD_SHA256_CHECKSUM=4bb121e297293c0fd55f08f83afab6d35d48f0af4ecc07523ad8ec99aa2b12a1 ARROW_XSIMD_BUILD_VERSION=9.0.1 ARROW_XSIMD_BUILD_SHA256_CHECKSUM=b1bb5f92167fd3a4f25749db0be7e61ed37e0a5d943490f3accdcd2cd2918cc0 -ARROW_ZLIB_BUILD_VERSION=1.3 -ARROW_ZLIB_BUILD_SHA256_CHECKSUM=ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e +ARROW_ZLIB_BUILD_VERSION=1.3.1 +ARROW_ZLIB_BUILD_SHA256_CHECKSUM=9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23 ARROW_ZSTD_BUILD_VERSION=1.5.5 ARROW_ZSTD_BUILD_SHA256_CHECKSUM=9c4396cc829cfae319a6e2615202e82aad41372073482fce286fac78646d3ee4 From 2721134715b7dedfa2715bcf47548728ff702d5a Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 1 Feb 2024 21:24:42 +0800 Subject: [PATCH 13/74] GH-39845: [C++][Parquet] Minor: avoid creating a new Reader object in Decoder::SetData (#39847) ### Rationale for this change avoid creating a new Reader object in Decoder::SetData ### What changes are included in this PR? avoid creating a new Reader object in Decoder::SetData ### Are these changes tested? Already ### Are there any user-facing changes? 
no * Closes: #39845 Authored-by: mwish Signed-off-by: mwish --- cpp/src/parquet/encoding.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b801b5ab11bb9..5573f5b9aed4c 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2411,7 +2411,11 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecodernum_values_ = num_values; - decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + if (decoder_ == nullptr) { + decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + } else { + decoder_->Reset(data, len); + } InitHeader(); } @@ -2769,7 +2773,11 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, void SetData(int num_values, const uint8_t* data, int len) override { DecoderImpl::SetData(num_values, data, len); - decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + if (decoder_ == nullptr) { + decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + } else { + decoder_->Reset(data, len); + } DecodeLengths(); } From 44d5597a0e8a4d635f1aec82ba885f61b5c17829 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 1 Feb 2024 14:35:32 +0100 Subject: [PATCH 14/74] GH-39849: [Python] Remove the use of pytest-lazy-fixture (#39850) ### Rationale for this change Removing the use of `pytest-lazy-fixture` in our test suite as it is unmaintained. Changes in this PR include: - Remove the use of `pytest-lazy-fixture` - Remove marks from fixtures to avoid future error, see ``` PytestRemovedIn9Warning: Marks applied to fixtures have no effect See docs: https://docs.pytest.org/en/stable/deprecations.html#applying-a-mark-to-a-fixture-function ``` - Catch two different warnings in `def test_legacy_int_type()` ### Are these changes tested? The changes affect the tests so they must pass. ### Are there any user-facing changes? No. 
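As an illustration of the replacement pattern, here is a minimal, self-contained sketch (not the actual Arrow conftest; the fixture and test names below are invented for the example). Fixtures are parametrized with fixture *names* and resolved lazily through pytest's `request.getfixturevalue`, which covers what `pytest-lazy-fixture` previously did:

```python
import pytest


@pytest.fixture
def builtin_pickle():
    import pickle
    return pickle


@pytest.fixture
def cloudpickle_module():
    # Skips the parametrization cleanly if cloudpickle isn't installed.
    return pytest.importorskip("cloudpickle")


# Parametrize with fixture *names* and resolve them lazily inside the
# consuming fixture -- this is the pattern that replaces lazy_fixture().
@pytest.fixture(params=["builtin_pickle", "cloudpickle_module"])
def pickle_module(request):
    return request.getfixturevalue(request.param)


def test_pickle_roundtrip(pickle_module):
    data = {"a": 1}
    assert pickle_module.loads(pickle_module.dumps(data)) == data
```

Marks that previously sat on fixtures now belong on the `pytest.param(...)` entries instead (e.g. `marks=pytest.mark.s3`), which also avoids the `PytestRemovedIn9Warning` quoted above.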
* Closes: #39849 Lead-authored-by: AlenkaF Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_python.txt | 3 +- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 1 - python/pyarrow/tests/conftest.py | 7 ++--- python/pyarrow/tests/test_dataset.py | 3 -- python/pyarrow/tests/test_extension_type.py | 5 +-- python/pyarrow/tests/test_fs.py | 34 ++++++++++----------- python/pyarrow/tests/test_ipc.py | 6 ++-- python/requirements-test.txt | 1 - python/requirements-wheel-test.txt | 1 - 9 files changed, 25 insertions(+), 36 deletions(-) diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 5fdd21d2bd1f9..59e2def1bf339 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -23,9 +23,8 @@ cloudpickle fsspec hypothesis numpy>=1.16.6 -pytest<8 # pytest-lazy-fixture broken on pytest 8.0.0 +pytest<8 pytest-faulthandler -pytest-lazy-fixture s3fs>=2023.10.0 setuptools setuptools_scm<8.0.0 diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index b8ffbfdb715b6..367445c595c4b 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -340,7 +340,6 @@ outputs: # test_cpp_extension_in_python requires a compiler - {{ compiler("cxx") }} # [linux] - pytest - - pytest-lazy-fixture - backports.zoneinfo # [py<39] - boto3 - cffi diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index a5941e8c8d1a8..0da757a4bc56e 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -24,7 +24,6 @@ import urllib.request import pytest -from pytest_lazyfixture import lazy_fixture import hypothesis as h from ..conftest import groups, defaults @@ -259,13 +258,13 @@ def gcs_server(): @pytest.fixture( params=[ - lazy_fixture('builtin_pickle'), - lazy_fixture('cloudpickle') + 'builtin_pickle', + 'cloudpickle' ], scope='session' ) def pickle_module(request): - return request.param + return request.getfixturevalue(request.param) @pytest.fixture(scope='session') diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a4838d63a6b0b..a9054f0b174aa 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -100,7 +100,6 @@ def assert_dataset_fragment_convenience_methods(dataset): @pytest.fixture -@pytest.mark.parquet def mockfs(): mockfs = fs._MockFileSystem() @@ -221,7 +220,6 @@ def multisourcefs(request): @pytest.fixture -@pytest.mark.parquet def dataset(mockfs): format = ds.ParquetFileFormat() selector = fs.FileSelector('subdir', recursive=True) @@ -2692,7 +2690,6 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module) @pytest.fixture -@pytest.mark.parquet def s3_example_simple(s3_server): from pyarrow.fs import FileSystem diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index a88e20eefe098..d8c792ef00c6b 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1485,10 +1485,7 @@ def test_legacy_int_type(): batch = pa.RecordBatch.from_arrays([ext_arr], names=['ext']) buf = ipc_write_batch(batch) - with pytest.warns( - RuntimeWarning, - match="pickle-based deserialization of pyarrow.PyExtensionType " - "subclasses is disabled by default"): + with pytest.warns((RuntimeWarning, FutureWarning)): batch = ipc_read_batch(buf) assert isinstance(batch.column(0).type, pa.UnknownExtensionType) diff --git 
a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index d0fa253e314e9..ab10addfc3d4c 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -362,79 +362,79 @@ def py_fsspec_s3fs(request, s3_server): @pytest.fixture(params=[ pytest.param( - pytest.lazy_fixture('localfs'), + 'localfs', id='LocalFileSystem()' ), pytest.param( - pytest.lazy_fixture('localfs_with_mmap'), + 'localfs_with_mmap', id='LocalFileSystem(use_mmap=True)' ), pytest.param( - pytest.lazy_fixture('subtree_localfs'), + 'subtree_localfs', id='SubTreeFileSystem(LocalFileSystem())' ), pytest.param( - pytest.lazy_fixture('s3fs'), + 's3fs', id='S3FileSystem', marks=pytest.mark.s3 ), pytest.param( - pytest.lazy_fixture('gcsfs'), + 'gcsfs', id='GcsFileSystem', marks=pytest.mark.gcs ), pytest.param( - pytest.lazy_fixture('hdfs'), + 'hdfs', id='HadoopFileSystem', marks=pytest.mark.hdfs ), pytest.param( - pytest.lazy_fixture('mockfs'), + 'mockfs', id='_MockFileSystem()' ), pytest.param( - pytest.lazy_fixture('py_localfs'), + 'py_localfs', id='PyFileSystem(ProxyHandler(LocalFileSystem()))' ), pytest.param( - pytest.lazy_fixture('py_mockfs'), + 'py_mockfs', id='PyFileSystem(ProxyHandler(_MockFileSystem()))' ), pytest.param( - pytest.lazy_fixture('py_fsspec_localfs'), + 'py_fsspec_localfs', id='PyFileSystem(FSSpecHandler(fsspec.LocalFileSystem()))' ), pytest.param( - pytest.lazy_fixture('py_fsspec_memoryfs'), + 'py_fsspec_memoryfs', id='PyFileSystem(FSSpecHandler(fsspec.filesystem("memory")))' ), pytest.param( - pytest.lazy_fixture('py_fsspec_s3fs'), + 'py_fsspec_s3fs', id='PyFileSystem(FSSpecHandler(s3fs.S3FileSystem()))', marks=pytest.mark.s3 ), ]) def filesystem_config(request): - return request.param + return request.getfixturevalue(request.param) @pytest.fixture -def fs(request, filesystem_config): +def fs(filesystem_config): return filesystem_config['fs'] @pytest.fixture -def pathfn(request, filesystem_config): +def pathfn(filesystem_config): return filesystem_config['pathfn'] @pytest.fixture -def allow_move_dir(request, filesystem_config): +def allow_move_dir(filesystem_config): return filesystem_config['allow_move_dir'] @pytest.fixture -def allow_append_to_file(request, filesystem_config): +def allow_append_to_file(filesystem_config): return filesystem_config['allow_append_to_file'] diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index f75ec8158a9da..407011d90b734 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -142,16 +142,16 @@ def stream_fixture(): @pytest.fixture(params=[ pytest.param( - pytest.lazy_fixture('file_fixture'), + 'file_fixture', id='File Format' ), pytest.param( - pytest.lazy_fixture('stream_fixture'), + 'stream_fixture', id='Stream Format' ) ]) def format_fixture(request): - return request.param + return request.getfixturevalue(request.param) def test_empty_file(): diff --git a/python/requirements-test.txt b/python/requirements-test.txt index b3ba5d852b968..2108d70a543f5 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -2,5 +2,4 @@ cffi hypothesis pandas pytest<8 -pytest-lazy-fixture pytz diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index c74a8ca6908a7..a1046bc18c704 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -2,7 +2,6 @@ cffi cython hypothesis pytest<8 -pytest-lazy-fixture pytz tzdata; sys_platform == 'win32' From 3d45ac96534fc76b820b488aa02182e6b93a388f Mon Sep 17 00:00:00 
2001 From: "y.yoshida5" <39612448+yo1956@users.noreply.github.com> Date: Thu, 1 Feb 2024 22:36:59 +0900 Subject: [PATCH 15/74] GH-39779: [Python] Expose force_virtual_addressing in PyArrow (#39819) ### Rationale for this change / What changes are included in this PR? To expose force_virtual_addressing in PyArrow. ### Are these changes tested? Existing unit tests are not broken, and a new test case have been added. ### Are there any user-facing changes? pyarrow.fs.S3FileSystem: it becomes possible to specify the argument 'force_virtual_addressing'. * Closes: #39779 Authored-by: yo1956 Signed-off-by: Joris Van den Bossche --- python/pyarrow/_s3fs.pyx | 11 ++++++++++- python/pyarrow/includes/libarrow_fs.pxd | 1 + python/pyarrow/tests/test_fs.py | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 13b8c748cb8ca..f5bab99a49f7a 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -245,6 +245,11 @@ cdef class S3FileSystem(FileSystem): retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3) The retry strategy to use with S3; fail after max_attempts. Available strategies are AwsStandardS3RetryStrategy, AwsDefaultS3RetryStrategy. + force_virtual_addressing : bool, default False + Whether to use virtual addressing of buckets. + If true, then virtual addressing is always enabled. + If false, then virtual addressing is only enabled if `endpoint_override` is empty. + This can be used for non-AWS backends that only support virtual hosted-style access. Examples -------- @@ -268,7 +273,9 @@ cdef class S3FileSystem(FileSystem): role_arn=None, session_name=None, external_id=None, load_frequency=900, proxy_options=None, allow_bucket_creation=False, allow_bucket_deletion=False, - retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3)): + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy( + max_attempts=3), + force_virtual_addressing=False): cdef: optional[CS3Options] options shared_ptr[CS3FileSystem] wrapped @@ -380,6 +387,7 @@ cdef class S3FileSystem(FileSystem): options.value().allow_bucket_creation = allow_bucket_creation options.value().allow_bucket_deletion = allow_bucket_deletion + options.value().force_virtual_addressing = force_virtual_addressing if isinstance(retry_strategy, AwsStandardS3RetryStrategy): options.value().retry_strategy = CS3RetryStrategy.GetAwsStandardRetryStrategy( @@ -447,6 +455,7 @@ cdef class S3FileSystem(FileSystem): opts.proxy_options.username), 'password': frombytes( opts.proxy_options.password)}, + force_virtual_addressing=opts.force_virtual_addressing, ),) ) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index cb30f4e750eff..7876fb0f96671 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -167,6 +167,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_bool background_writes c_bool allow_bucket_creation c_bool allow_bucket_deletion + c_bool force_virtual_addressing shared_ptr[const CKeyValueMetadata] default_metadata c_string role_arn c_string session_name diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index ab10addfc3d4c..6ba5137e4f63e 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1186,6 +1186,10 @@ def test_s3_options(pickle_module): assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2 assert fs2 != fs + fs = 
S3FileSystem(endpoint_override='localhost:8999', force_virtual_addressing=True) + assert isinstance(fs, S3FileSystem) + assert pickle_module.loads(pickle_module.dumps(fs)) == fs + with pytest.raises(ValueError): S3FileSystem(access_key='access') with pytest.raises(ValueError): From a1c1773b724e4d78faf9a097247c7e976cd2cbfa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Feb 2024 14:53:35 +0100 Subject: [PATCH 16/74] GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557) ### Rationale for this change Ensure we can build pyarrow against numpy 2.0 nightly (update pyproject.toml to allow this), and test this by building our nightly wheels with numpy nightly. This also ensures that other projects that use our nightly wheels to test together with numpy nightly can do that (numpy 2.0 changes the ABI, so to run with numpy 2.0, your package needs to be built with numpy 2.x; currently pyarrow installed with our nightly wheel will fail to import when also numpy nightly is installed). See the parent issue https://github.com/apache/arrow/issues/39532 for details, and https://numpy.org/devdocs/dev/depending_on_numpy.html#numpy-2-0-specific-advice for a direct link to the NumPy guidelines on updating build dependencies for NumPy 2.0. * Closes: #39555 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- ci/docker/python-wheel-manylinux.dockerfile | 5 +++-- ci/docker/python-wheel-windows-vs2017.dockerfile | 3 ++- ci/scripts/python_wheel_macos_build.sh | 5 ++++- python/pyproject.toml | 7 ++++++- python/requirements-build.txt | 3 ++- python/requirements-wheel-build.txt | 3 ++- python/setup.py | 2 +- 7 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 0a50d450c225a..a07c727ac76fa 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -28,7 +28,7 @@ ENV MANYLINUX_VERSION=${manylinux} RUN yum install -y dnf # Install basic dependencies -RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget kernel-headers +RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget # A system Python is required for ninja and vcpkg in this Dockerfile. 
# On manylinux2014 base images, system Python is 2.7.5, while @@ -97,4 +97,5 @@ SHELL ["/bin/bash", "-i", "-c"] ENTRYPOINT ["/bin/bash", "-i", "-c"] COPY python/requirements-wheel-build.txt /arrow/python/ -RUN pip install -r /arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN pip install -r /arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index faf07800c956a..067105b3a7995 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -88,7 +88,8 @@ RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ -RUN python -m pip install -r arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN python -m pip install -r arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" # ENV CLCACHE_DIR="C:\clcache" # ENV CLCACHE_COMPRESS=1 diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index fd845c512dcdb..8123a9fdf1c48 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -50,12 +50,15 @@ echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release pip install \ --upgrade \ --only-binary=:all: \ --target $PIP_SITE_PACKAGES \ --platform $PIP_TARGET_PLATFORM \ - -r ${source_dir}/python/requirements-wheel-build.txt + -r ${source_dir}/python/requirements-wheel-build.txt \ + --pre \ + --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" pip install "delocate>=0.10.3" echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" diff --git a/python/pyproject.toml b/python/pyproject.toml index 437de105ab8e7..9079618ad1c7d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,7 +18,12 @@ [build-system] requires = [ "cython >= 0.29.31", - "oldest-supported-numpy>=0.14", + # Starting with NumPy 1.25, NumPy is (by default) as far back compatible + # as oldest-support-numpy was (customizable with a NPY_TARGET_VERSION + # define). For older Python versions (where NumPy 1.25 is not yet avaiable) + # continue using oldest-support-numpy. 
+ "oldest-supported-numpy>=0.14; python_version<'3.9'", + "numpy>=1.25; python_version>='3.9'", "setuptools_scm < 8.0.0", "setuptools >= 40.1.0", "wheel" diff --git a/python/requirements-build.txt b/python/requirements-build.txt index 56e9d479ee9ba..e1372e807f88d 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,4 +1,5 @@ cython>=0.29.31 -oldest-supported-numpy>=0.14 +oldest-supported-numpy>=0.14; python_version<'3.9' +numpy>=1.25; python_version>='3.9' setuptools_scm<8.0.0 setuptools>=38.6.0 diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index f42ee4a018f3c..044f9de5f8214 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,5 +1,6 @@ cython>=0.29.31 -oldest-supported-numpy>=0.14 +oldest-supported-numpy>=0.14; python_version<'3.9' +numpy>=1.25; python_version>='3.9' setuptools_scm<8.0.0 setuptools>=58 wheel diff --git a/python/setup.py b/python/setup.py index d7a2da2077cdd..098d75a3186af 100755 --- a/python/setup.py +++ b/python/setup.py @@ -449,7 +449,7 @@ def has_ext_modules(foo): install_requires = ( - 'numpy >= 1.16.6, <2', + 'numpy >= 1.16.6', ) From 4ceb66101382d74c6ef73ff546fad10183ab58d8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Feb 2024 14:54:14 +0100 Subject: [PATCH 17/74] GH-39880: [Python][CI] Pin moto<5 for dask integration tests (#39881) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See upstream pin being added (https://github.com/dask/dask/pull/10868 / https://github.com/dask/dask/issues/10869), we are seeing the same failures * Closes: #39880 Lead-authored-by: Joris Van den Bossche Co-authored-by: Raúl Cumplido Signed-off-by: Joris Van den Bossche --- ci/scripts/install_dask.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index 8d712a88a6ab1..478c1d5997906 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -35,4 +35,5 @@ else fi # additional dependencies needed for dask's s3 tests -pip install moto[server] flask requests +# Moto 5 results in timeouts in s3 tests: https://github.com/dask/dask/issues/10869 +pip install "moto[server]<5" flask requests From b684028dfbeeed85d132a1249449a85877d796b1 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 1 Feb 2024 08:16:56 -0600 Subject: [PATCH 18/74] GH-39859: [R] Remove macOS from the allow list (#39861) Originally this was going to also bundle all of our dependencies to send to CRAN, but their webforms don't allow source tars that large (I tried down to 80MB which removed a large number of our dependencies, and that was still rejected by the macbuilder). This means that on CRAN, if there is no internet, the macOS binary will be minimal. But it means that we build on CRAN using source always. 
We should definitely submit this to macbuilder after this merges to main and confirm we get source build by default (since we look to the repo for our allowlist) * Closes: #39859 Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- r/tools/nixlibs-allowlist.txt | 1 - r/tools/nixlibs.R | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/r/tools/nixlibs-allowlist.txt b/r/tools/nixlibs-allowlist.txt index 9c368e6ed15a2..bd9f0c1b2c084 100644 --- a/r/tools/nixlibs-allowlist.txt +++ b/r/tools/nixlibs-allowlist.txt @@ -2,4 +2,3 @@ ubuntu centos redhat rhel -darwin diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 17c6ab0a8078b..0af41888b95b7 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -222,7 +222,7 @@ check_allowlist <- function(os, allowed = "https://raw.githubusercontent.com/apa # Try a remote allowlist so that we can add/remove without a release suppressWarnings(readLines(allowed)), # Fallback to default: allowed only on Ubuntu and CentOS/RHEL - error = function(e) c("ubuntu", "centos", "redhat", "rhel", "darwin") + error = function(e) c("ubuntu", "centos", "redhat", "rhel") ) # allowlist should contain valid regular expressions (plain strings ok too) any(grepl(paste(allowlist, collapse = "|"), os)) From 63c7c4a327ff5b27a1ba6838253408e965c0a348 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Feb 2024 23:43:21 +0900 Subject: [PATCH 19/74] GH-39874: [CI][C++][Windows] Use pre-installed OpenSSL (#39882) ### Rationale for this change It seems that we can't use OpenSSL via Chocolatey. ```text openssl v3.2.0 [Approved] openssl package files install completed. Performing other installation steps. Attempt to get headers for https://slproweb.com/download/Win64OpenSSL-3_2_0.exe failed. The remote file either doesn't exist, is unauthorized, or is forbidden for url 'https://slproweb.com/download/Win64OpenSSL-3_2_0.exe'. Exception calling "GetResponse" with "0" argument(s): "The remote server returned an error: (404) Not Found." Downloading openssl 64 bit from 'https://slproweb.com/download/Win64OpenSSL-3_2_0.exe' ERROR: The remote file either doesn't exist, is unauthorized, or is forbidden for url 'https://slproweb.com/download/Win64OpenSSL-3_2_0.exe'. Exception calling "GetResponse" with "0" argument(s): "The remote server returned an error: (404) Not Found." This package is likely not broken for licensed users - see https://docs.chocolatey.org/en-us/features/private-cdn. The install of openssl was NOT successful. Error while running 'C:\ProgramData\chocolatey\lib\openssl\tools\chocolateyinstall.ps1'. See log for details. ``` ### What changes are included in this PR? Use pre-installed OpenSSL on self-hosted GitHub runner instead. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* Closes: #39874 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index bd14f1b895bf6..9fbad06692bd2 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -284,10 +284,6 @@ jobs: /t REG_DWORD ` /d 1 ` /f - - name: Installed Packages - run: choco list - - name: Install Dependencies - run: choco install -y --no-progress openssl - name: Checkout Arrow uses: actions/checkout@v4 with: From c534749b3230f4ad640fe568d603c665b4bcee3d Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Thu, 1 Feb 2024 10:21:12 -0500 Subject: [PATCH 20/74] GH-39885: [CI][MATLAB] Bump matlab-actions/setup-matlab and matlab-actions/run-tests from v1 to v2 (#39886) ### Rationale for this change Upgrading our CI workflows to use the latest versions of [matlab-actions/setup-matlab](https://github.com/matlab-actions/setup-matlab/) and [matlab-actions/run-tests](https://github.com/matlab-actions/run-tests/). ### What changes are included in this PR? 1. Bumped version of `matlab-actions/setup-matlab` from `v1` to `v2` 2. Bumped version of `matlab-actions/runtests-matlab` from `v1` to `v2` ### Are these changes tested? All MATLAB workflow checks passed. ### Are there any user-facing changes? No. * Closes: #39885 Authored-by: Sarah Gilmore Signed-off-by: Sutou Kouhei --- .github/workflows/matlab.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 512ff2bb929b3..eceeb551a0653 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -52,7 +52,7 @@ jobs: - name: Install ninja-build run: sudo apt-get install ninja-build - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2023a - name: Install ccache @@ -85,7 +85,7 @@ jobs: # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. MATLABPATH: matlab/install/arrow_matlab - uses: matlab-actions/run-tests@v1 + uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test macos: @@ -100,7 +100,7 @@ jobs: - name: Install ninja-build run: brew install ninja - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2023a - name: Install ccache @@ -125,7 +125,7 @@ jobs: # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. MATLABPATH: matlab/install/arrow_matlab - uses: matlab-actions/run-tests@v1 + uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test windows: @@ -138,7 +138,7 @@ jobs: with: fetch-depth: 0 - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2023a - name: Download Timezone Database @@ -171,6 +171,6 @@ jobs: # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. 
MATLABPATH: matlab/install/arrow_matlab - uses: matlab-actions/run-tests@v1 + uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test From 87b515e9207509aa3f77e3e1c0122be314a77e6d Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 1 Feb 2024 11:48:29 -0500 Subject: [PATCH 21/74] GH-39771: [C++][Device] Generic CopyBatchTo/CopyArrayTo memory types (#39772) ### Rationale for this change Right now our MemoryManager interfaces operate solely at the buffer level and we do not provide any higher level facilities to copy an entire array or record batch between memory types. We should implement CopyArrayTo and CopyBatchTo functions which recursively utilize the buffer level copying to create a new Array/RecordBatch whose buffers have been copied to the destination memory manager. ### What changes are included in this PR? Exposing a `CopyArrayTo` and `CopyBatchTo` function for copying entire Array or RecordBatches between memory types. ### Are these changes tested? Tests are still being written but will be added. * Closes: #39771 Authored-by: Matt Topol Signed-off-by: Matt Topol --- cpp/src/arrow/array/array_base.cc | 12 +++++++++ cpp/src/arrow/array/array_base.h | 16 ++++++++++++ cpp/src/arrow/array/data.cc | 39 ++++++++++++++++++++++++++++ cpp/src/arrow/array/data.h | 19 +++++++++++--- cpp/src/arrow/buffer.h | 2 +- cpp/src/arrow/c/bridge.cc | 2 +- cpp/src/arrow/c/bridge_test.cc | 4 ++- cpp/src/arrow/device.cc | 2 ++ cpp/src/arrow/gpu/cuda_context.cc | 5 ++++ cpp/src/arrow/ipc/read_write_test.cc | 27 +++---------------- cpp/src/arrow/record_batch.cc | 24 +++++++++++++++++ cpp/src/arrow/record_batch.h | 19 ++++++++++++++ 12 files changed, 142 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index b483ec420cc3c..6927f51283eb7 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -307,6 +307,18 @@ Result> Array::View( return MakeArray(result); } +Result> Array::CopyTo( + const std::shared_ptr& to) const { + ARROW_ASSIGN_OR_RAISE(auto copied_data, data()->CopyTo(to)); + return MakeArray(copied_data); +} + +Result> Array::ViewOrCopyTo( + const std::shared_ptr& to) const { + ARROW_ASSIGN_OR_RAISE(auto new_data, data()->ViewOrCopyTo(to)); + return MakeArray(new_data); +} + // ---------------------------------------------------------------------- // NullArray diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 7e857bf20568e..6411aebf80442 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -165,6 +165,22 @@ class ARROW_EXPORT Array { /// An error is returned if the types are not layout-compatible. Result> View(const std::shared_ptr& type) const; + /// \brief Construct a copy of the array with all buffers on destination + /// Memory Manager + /// + /// This method recursively copies the array's buffers and those of its children + /// onto the destination MemoryManager device and returns the new Array. + Result> CopyTo(const std::shared_ptr& to) const; + + /// \brief Construct a new array attempting to zero-copy view if possible. + /// + /// Like CopyTo this method recursively goes through all of the array's buffers + /// and those of it's children and first attempts to create zero-copy + /// views on the destination MemoryManager device. If it can't, it falls back + /// to performing a copy. See Buffer::ViewOrCopy. 
+ Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + /// Construct a zero-copy slice of the array with the indicated offset and /// length /// diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 8454ac8f1d5fb..80c411dfa6a6d 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -27,6 +27,7 @@ #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/device.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -36,6 +37,7 @@ #include "arrow/util/dict_util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" #include "arrow/util/ree_util.h" #include "arrow/util/slice_util_internal.h" #include "arrow/util/union_util.h" @@ -140,6 +142,43 @@ std::shared_ptr ArrayData::Make(std::shared_ptr type, int64 return std::make_shared(std::move(type), length, null_count, offset); } +namespace { +template +Result> CopyToImpl(const ArrayData& data, + const std::shared_ptr& to, + Fn&& copy_fn) { + auto output = ArrayData::Make(data.type, data.length, data.null_count, data.offset); + output->buffers.resize(data.buffers.size()); + for (auto&& [buf, out_buf] : internal::Zip(data.buffers, output->buffers)) { + if (buf) { + ARROW_ASSIGN_OR_RAISE(out_buf, copy_fn(buf, to)); + } + } + + output->child_data.reserve(data.child_data.size()); + for (const auto& child : data.child_data) { + ARROW_ASSIGN_OR_RAISE(auto copied, CopyToImpl(*child, to, copy_fn)); + output->child_data.push_back(std::move(copied)); + } + + if (data.dictionary) { + ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn)); + } + + return output; +} +} // namespace + +Result> ArrayData::CopyTo( + const std::shared_ptr& to) const { + return CopyToImpl(*this, to, MemoryManager::CopyBuffer); +} + +Result> ArrayData::ViewOrCopyTo( + const std::shared_ptr& to) const { + return CopyToImpl(*this, to, Buffer::ViewOrCopy); +} + std::shared_ptr ArrayData::Slice(int64_t off, int64_t len) const { ARROW_CHECK_LE(off, length) << "Slice offset (" << off << ") greater than array length (" << length << ")"; diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index edd443adc43c4..d8a6663cec580 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -27,6 +27,7 @@ #include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_util.h" #include "arrow/util/macros.h" #include "arrow/util/span.h" @@ -34,9 +35,6 @@ namespace arrow { -class Array; -struct ArrayData; - namespace internal { // ---------------------------------------------------------------------- // Null handling for types without a validity bitmap and the dictionary type @@ -183,6 +181,21 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr Copy() const { return std::make_shared(*this); } + /// \brief Copy all buffers and children recursively to destination MemoryManager + /// + /// This utilizes MemoryManager::CopyBuffer to create a new ArrayData object + /// recursively copying the buffers and all child buffers to the destination + /// memory manager. This includes dictionaries if applicable. + Result> CopyTo( + const std::shared_ptr& to) const; + /// \brief View or Copy this ArrayData to destination memory manager. + /// + /// Tries to view the buffer contents on the given memory manager's device + /// if possible (to avoid a copy) but falls back to copying if a no-copy view + /// isn't supported. 
+ Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + bool IsNull(int64_t i) const { return !IsValid(i); } bool IsValid(int64_t i) const { diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 52fd94ec1f7d4..258a9faac7361 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -359,7 +359,7 @@ class ARROW_EXPORT Buffer { static Result> ViewOrCopy( std::shared_ptr source, const std::shared_ptr& to); - virtual std::shared_ptr device_sync_event() { return NULLPTR; } + virtual std::shared_ptr device_sync_event() const { return NULLPTR; } protected: bool is_mutable_; diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 238afb0328672..172ed8962ce77 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1466,7 +1466,7 @@ class ImportedBuffer : public Buffer { ~ImportedBuffer() override = default; - std::shared_ptr device_sync_event() override { + std::shared_ptr device_sync_event() const override { return import_->device_sync_; } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 58bbc9282c204..321ec36c38d8c 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -1282,7 +1282,9 @@ class MyBuffer final : public MutableBuffer { default_memory_pool()->Free(const_cast(data_), size_); } - std::shared_ptr device_sync_event() override { return device_sync_; } + std::shared_ptr device_sync_event() const override { + return device_sync_; + } protected: std::shared_ptr device_sync_; diff --git a/cpp/src/arrow/device.cc b/cpp/src/arrow/device.cc index de709923dc44e..616f89aae896f 100644 --- a/cpp/src/arrow/device.cc +++ b/cpp/src/arrow/device.cc @@ -20,8 +20,10 @@ #include #include +#include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/io/memory.h" +#include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index 81542d339bd70..988cc1f25b91c 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -433,6 +433,11 @@ Result> CudaMemoryManager::CopyBufferTo( Result> CudaMemoryManager::CopyNonOwnedTo( const Buffer& buf, const std::shared_ptr& to) { if (to->is_cpu()) { + auto sync_event = buf.device_sync_event(); + if (sync_event) { + RETURN_NOT_OK(sync_event->Wait()); + } + // Device-to-CPU copy std::unique_ptr dest; ARROW_ASSIGN_OR_RAISE(auto from_context, cuda_device()->GetContext()); diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index bd2c2b716d502..c5075299a3e35 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1336,30 +1336,11 @@ class CopyCollectListener : public CollectListener { Status OnRecordBatchWithMetadataDecoded( RecordBatchWithMetadata record_batch_with_metadata) override { - auto& record_batch = record_batch_with_metadata.batch; - for (auto column_data : record_batch->column_data()) { - ARROW_RETURN_NOT_OK(CopyArrayData(column_data)); - } - return CollectListener::OnRecordBatchWithMetadataDecoded(record_batch_with_metadata); - } + ARROW_ASSIGN_OR_RAISE( + record_batch_with_metadata.batch, + record_batch_with_metadata.batch->CopyTo(default_cpu_memory_manager())); - private: - Status CopyArrayData(std::shared_ptr data) { - auto& buffers = data->buffers; - for (size_t i = 0; i < buffers.size(); ++i) { - auto& buffer = buffers[i]; - if (!buffer) { - continue; - } - ARROW_ASSIGN_OR_RAISE(buffers[i], Buffer::Copy(buffer, 
buffer->memory_manager())); - } - for (auto child_data : data->child_data) { - ARROW_RETURN_NOT_OK(CopyArrayData(child_data)); - } - if (data->dictionary) { - ARROW_RETURN_NOT_OK(CopyArrayData(data->dictionary)); - } - return Status::OK(); + return CollectListener::OnRecordBatchWithMetadataDecoded(record_batch_with_metadata); } }; diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 457135fa400d5..ca6b45af3d6b4 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -357,6 +357,30 @@ Status ValidateBatch(const RecordBatch& batch, bool full_validation) { } // namespace +Result> RecordBatch::CopyTo( + const std::shared_ptr& to) const { + ArrayVector copied_columns; + copied_columns.reserve(num_columns()); + for (const auto& col : columns()) { + ARROW_ASSIGN_OR_RAISE(auto c, col->CopyTo(to)); + copied_columns.push_back(std::move(c)); + } + + return Make(schema_, num_rows(), std::move(copied_columns)); +} + +Result> RecordBatch::ViewOrCopyTo( + const std::shared_ptr& to) const { + ArrayVector copied_columns; + copied_columns.reserve(num_columns()); + for (const auto& col : columns()) { + ARROW_ASSIGN_OR_RAISE(auto c, col->ViewOrCopyTo(to)); + copied_columns.push_back(std::move(c)); + } + + return Make(schema_, num_rows(), std::move(copied_columns)); +} + Status RecordBatch::Validate() const { return ValidateBatch(*this, /*full_validation=*/false); } diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 1a66fc3fb5629..79f93a7b5997f 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -186,6 +186,25 @@ class ARROW_EXPORT RecordBatch { /// \return the number of rows (the corresponding length of each column) int64_t num_rows() const { return num_rows_; } + /// \brief Copy the entire RecordBatch to destination MemoryManager + /// + /// This uses Array::CopyTo on each column of the record batch to create + /// a new record batch where all underlying buffers for the columns have + /// been copied to the destination MemoryManager. This uses + /// MemoryManager::CopyBuffer under the hood. + Result> CopyTo( + const std::shared_ptr& to) const; + + /// \brief View or Copy the entire RecordBatch to destination MemoryManager + /// + /// This uses Array::ViewOrCopyTo on each column of the record batch to create + /// a new record batch where all underlying buffers for the columns have + /// been zero-copy viewed on the destination MemoryManager, falling back + /// to performing a copy if it can't be viewed as a zero-copy buffer. This uses + /// Buffer::ViewOrCopy under the hood. + Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + /// \brief Slice each of the arrays in the record batch /// \param[in] offset the starting offset to slice, through end of batch /// \return new record batch From f9b7ac2e922bceed8bab09b1e28d7261cbe8b41d Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Thu, 1 Feb 2024 23:08:21 +0530 Subject: [PATCH 22/74] GH-37841: [Java] Dictionary decoding not using the compression factory from the ArrowReader (#38371) ### Rationale for this change This PR addresses https://github.com/apache/arrow/issues/37841. ### What changes are included in this PR? Adding compression-based write and read for Dictionary data. ### Are these changes tested? Yes. ### Are there any user-facing changes? 
No * Closes: #37841 Lead-authored-by: Vibhatha Lakmal Abeykoon Co-authored-by: vibhatha Signed-off-by: David Li --- .../TestArrowReaderWriterWithCompression.java | 206 ++++++++++++++++-- .../apache/arrow/vector/ipc/ArrowReader.java | 2 +- .../apache/arrow/vector/ipc/ArrowWriter.java | 23 +- 3 files changed, 201 insertions(+), 30 deletions(-) diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java b/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java index 6104cb1a132e4..af28333746290 100644 --- a/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java +++ b/java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java @@ -18,7 +18,9 @@ package org.apache.arrow.compression; import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.nio.channels.Channels; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -27,63 +29,223 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.GenerateSampleData; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.compression.CompressionUtil; import org.apache.arrow.vector.compression.NoCompressionCodec; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.ipc.ArrowFileReader; import org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; import org.apache.arrow.vector.ipc.message.IpcOption; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; +import org.junit.After; import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class TestArrowReaderWriterWithCompression { - @Test - public void testArrowFileZstdRoundTrip() throws Exception { - // Prepare sample data - final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + private BufferAllocator allocator; + private ByteArrayOutputStream out; + private VectorSchemaRoot root; + + @BeforeEach + public void setup() { + if (allocator == null) { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + out = new ByteArrayOutputStream(); + root = null; + } + + @After + public void tearDown() { + if (root != null) { + root.close(); + } + if (allocator != null) { + allocator.close(); + } + if (out != null) { + out.reset(); + } + + } + + private void createAndWriteArrowFile(DictionaryProvider provider, + CompressionUtil.CodecType codecType) throws IOException { List fields = new ArrayList<>(); fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>())); - VectorSchemaRoot root = VectorSchemaRoot.create(new Schema(fields), allocator); + root = VectorSchemaRoot.create(new Schema(fields), allocator); + final int rowCount = 10; 
GenerateSampleData.generateTestData(root.getVector(0), rowCount); root.setRowCount(rowCount); - // Write an in-memory compressed arrow file - ByteArrayOutputStream out = new ByteArrayOutputStream(); - try (final ArrowFileWriter writer = - new ArrowFileWriter(root, null, Channels.newChannel(out), new HashMap<>(), - IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, CompressionUtil.CodecType.ZSTD, Optional.of(7))) { + try (final ArrowFileWriter writer = new ArrowFileWriter(root, provider, Channels.newChannel(out), + new HashMap<>(), IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) { writer.start(); writer.writeBatch(); writer.end(); } + } + + private void createAndWriteArrowStream(DictionaryProvider provider, + CompressionUtil.CodecType codecType) throws IOException { + List fields = new ArrayList<>(); + fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>())); + root = VectorSchemaRoot.create(new Schema(fields), allocator); + + final int rowCount = 10; + GenerateSampleData.generateTestData(root.getVector(0), rowCount); + root.setRowCount(rowCount); + + try (final ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, Channels.newChannel(out), + IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + } - // Read the in-memory compressed arrow file with CommonsCompressionFactory provided + private Dictionary createDictionary(VarCharVector dictionaryVector) { + setVector(dictionaryVector, + "foo".getBytes(StandardCharsets.UTF_8), + "bar".getBytes(StandardCharsets.UTF_8), + "baz".getBytes(StandardCharsets.UTF_8)); + + return new Dictionary(dictionaryVector, + new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/null)); + } + + @Test + public void testArrowFileZstdRoundTrip() throws Exception { + createAndWriteArrowFile(null, CompressionUtil.CodecType.ZSTD); + // with compression + try (ArrowFileReader reader = + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Assertions.assertTrue(reader.loadNextBatch()); + Assertions.assertTrue(root.equals(reader.getVectorSchemaRoot())); + Assertions.assertFalse(reader.loadNextBatch()); + } + // without compression try (ArrowFileReader reader = - new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), - allocator, CommonsCompressionFactory.INSTANCE)) { - Assert.assertEquals(1, reader.getRecordBlocks().size()); + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assertions.assertEquals("Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage()); + } + } + + @Test + public void testArrowStreamZstdRoundTrip() throws Exception { + createAndWriteArrowStream(null, CompressionUtil.CodecType.ZSTD); + // with compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { Assert.assertTrue(reader.loadNextBatch()); Assert.assertTrue(root.equals(reader.getVectorSchemaRoot())); 
Assert.assertFalse(reader.loadNextBatch()); } + // without compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assert.assertEquals( + "Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage() + ); + } + } - // Read the in-memory compressed arrow file without CompressionFactory provided + @Test + public void testArrowFileZstdRoundTripWithDictionary() throws Exception { + VarCharVector dictionaryVector = (VarCharVector) + FieldType.nullable(new ArrowType.Utf8()).createNewSingleVector("f1_file", allocator, null); + Dictionary dictionary = createDictionary(dictionaryVector); + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary); + + createAndWriteArrowFile(provider, CompressionUtil.CodecType.ZSTD); + + // with compression + try (ArrowFileReader reader = + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Assertions.assertTrue(reader.loadNextBatch()); + Assertions.assertTrue(root.equals(reader.getVectorSchemaRoot())); + Assertions.assertFalse(reader.loadNextBatch()); + } + // without compression try (ArrowFileReader reader = - new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), - allocator, NoCompressionCodec.Factory.INSTANCE)) { - Assert.assertEquals(1, reader.getRecordBlocks().size()); + new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Assertions.assertEquals(1, reader.getRecordBlocks().size()); + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assertions.assertEquals("Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage()); + } + dictionaryVector.close(); + } + + @Test + public void testArrowStreamZstdRoundTripWithDictionary() throws Exception { + VarCharVector dictionaryVector = (VarCharVector) + FieldType.nullable(new ArrowType.Utf8()).createNewSingleVector("f1_stream", allocator, null); + Dictionary dictionary = createDictionary(dictionaryVector); + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + provider.put(dictionary); + + createAndWriteArrowStream(provider, CompressionUtil.CodecType.ZSTD); + + // with compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + CommonsCompressionFactory.INSTANCE)) { + Assertions.assertTrue(reader.loadNextBatch()); + Assertions.assertTrue(root.equals(reader.getVectorSchemaRoot())); + Assertions.assertFalse(reader.loadNextBatch()); + } + // without compression + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator, + NoCompressionCodec.Factory.INSTANCE)) { + Exception exception = Assert.assertThrows(IllegalArgumentException.class, + reader::loadNextBatch); + Assertions.assertEquals("Please add arrow-compression module to use CommonsCompressionFactory for ZSTD", + exception.getMessage()); + } + dictionaryVector.close(); + } - 
Exception exception = Assert.assertThrows(IllegalArgumentException.class, () -> reader.loadNextBatch()); - String expectedMessage = "Please add arrow-compression module to use CommonsCompressionFactory for ZSTD"; - Assert.assertEquals(expectedMessage, exception.getMessage()); + public static void setVector(VarCharVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } } + vector.setValueCount(length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java index 04c57d7e82fef..01f4e925c69b3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java @@ -251,7 +251,7 @@ private void load(ArrowDictionaryBatch dictionaryBatch, FieldVector vector) { VectorSchemaRoot root = new VectorSchemaRoot( Collections.singletonList(vector.getField()), Collections.singletonList(vector), 0); - VectorLoader loader = new VectorLoader(root); + VectorLoader loader = new VectorLoader(root, this.compressionFactory); try { loader.load(dictionaryBatch.getDictionary()); } finally { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java index a33c55de53f23..1cc201ae56f4b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java @@ -61,9 +61,14 @@ public abstract class ArrowWriter implements AutoCloseable { private final DictionaryProvider dictionaryProvider; private final Set dictionaryIdsUsed = new HashSet<>(); + private final CompressionCodec.Factory compressionFactory; + private final CompressionUtil.CodecType codecType; + private final Optional compressionLevel; private boolean started = false; private boolean ended = false; + private final CompressionCodec codec; + protected IpcOption option; protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out) { @@ -89,16 +94,19 @@ protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, Writab protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, WritableByteChannel out, IpcOption option, CompressionCodec.Factory compressionFactory, CompressionUtil.CodecType codecType, Optional compressionLevel) { - this.unloader = new VectorUnloader( - root, /*includeNullCount*/ true, - compressionLevel.isPresent() ? - compressionFactory.createCodec(codecType, compressionLevel.get()) : - compressionFactory.createCodec(codecType), - /*alignBuffers*/ true); this.out = new WriteChannel(out); this.option = option; this.dictionaryProvider = provider; + this.compressionFactory = compressionFactory; + this.codecType = codecType; + this.compressionLevel = compressionLevel; + this.codec = this.compressionLevel.isPresent() ? 
+ this.compressionFactory.createCodec(this.codecType, this.compressionLevel.get()) : + this.compressionFactory.createCodec(this.codecType); + this.unloader = new VectorUnloader(root, /*includeNullCount*/ true, codec, + /*alignBuffers*/ true); + List fields = new ArrayList<>(root.getSchema().getFields().size()); MetadataV4UnionChecker.checkForUnion(root.getSchema().getFields().iterator(), option.metadataVersion); @@ -133,7 +141,8 @@ protected void writeDictionaryBatch(Dictionary dictionary) throws IOException { Collections.singletonList(vector.getField()), Collections.singletonList(vector), count); - VectorUnloader unloader = new VectorUnloader(dictRoot); + VectorUnloader unloader = new VectorUnloader(dictRoot, /*includeNullCount*/ true, this.codec, + /*alignBuffers*/ true); ArrowRecordBatch batch = unloader.getRecordBatch(); ArrowDictionaryBatch dictionaryBatch = new ArrowDictionaryBatch(id, batch, false); try { From a57363867a6d88d0a7f17767571ab57dbb70cbfd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 13:33:45 -0500 Subject: [PATCH 23/74] MINOR: [JS] Bump esbuild from 0.19.2 to 0.20.0 in /js (#39891) Bumps [esbuild](https://github.com/evanw/esbuild) from 0.19.2 to 0.20.0.

Release notes

Sourced from esbuild's releases.

v0.20.0

This release deliberately contains backwards-incompatible changes. To avoid automatically picking up releases like this, you should either be pinning the exact version of esbuild in your package.json file (recommended) or be using a version range syntax that only accepts patch upgrades such as ^0.19.0 or ~0.19.0. See npm's documentation about semver for more information.

This time there is only one breaking change, and it only matters for people using Deno. Deno tests that use esbuild will now fail unless you make the change described below.

  • Work around API deprecations in Deno 1.40.x (#3609, #3611)

    Deno 1.40.0 was just released and introduced run-time warnings about certain APIs that esbuild uses. With this release, esbuild will work around these run-time warnings by using newer APIs if they are present and falling back to the original APIs otherwise. This should avoid the warnings without breaking compatibility with older versions of Deno.

    Unfortunately, doing this introduces a breaking change. The newer child process APIs lack a way to synchronously terminate esbuild's child process, so calling esbuild.stop() from within a Deno test is no longer sufficient to prevent Deno from failing a test that uses esbuild's API (Deno fails tests that create a child process without killing it before the test ends). To work around this, esbuild's stop() function has been changed to return a promise, and you now have to change esbuild.stop() to await esbuild.stop() in all of your Deno tests.

  • Reorder implicit file extensions within node_modules (#3341, #3608)

    In version 0.18.0, esbuild changed the behavior of implicit file extensions within node_modules directories (i.e. in published packages) to prefer .js over .ts even when the --resolve-extensions= order prefers .ts over .js (which it does by default). However, doing that also accidentally made esbuild prefer .css over .ts, which caused problems for people that published packages containing both TypeScript and CSS in files with the same name.

    With this release, esbuild will reorder TypeScript file extensions immediately after the last JavaScript file extensions in the implicit file extension order instead of putting them at the end of the order. Specifically the default implicit file extension order is .tsx,.ts,.jsx,.js,.css,.json which used to become .jsx,.js,.css,.json,.tsx,.ts in node_modules directories. With this release it will now become .jsx,.js,.tsx,.ts,.css,.json instead.

    Why even rewrite the implicit file extension order at all? One reason is because the .js file is more likely to behave correctly than the .ts file. The behavior of the .ts file may depend on tsconfig.json and the tsconfig.json file may not even be published, or may use extends to refer to a base tsconfig.json file that wasn't published. People can get into this situation when they forget to add all .ts files to their .npmignore file before publishing to npm. Picking .js over .ts helps make it more likely that resulting bundle will behave correctly.

v0.19.12

  • The "preserve" JSX mode now preserves JSX text verbatim (#3605)

    The JSX specification deliberately doesn't specify how JSX text is supposed to be interpreted and there is no canonical way to interpret JSX text. Two most popular interpretations are Babel and TypeScript. Yes they are different (esbuild deliberately follows TypeScript by the way).

    Previously esbuild normalized text to the TypeScript interpretation when the "preserve" JSX mode is active. However, "preserve" should arguably reproduce the original JSX text verbatim so that whatever JSX transform runs after esbuild is free to interpret it however it wants. So with this release, esbuild will now pass JSX text through unmodified:

    // Original code
    let el =
      <a href={'/'} title='&apos;&quot;'> some text
        {foo}
          more text </a>
    

    // Old output (with --loader=jsx --jsx=preserve)
    let el = <a href="/" title={'&quot;}> {" some text"} {foo} {"more text "} </a>;

    // New output (with --loader=jsx --jsx=preserve)
    let el = <a href={"/"} title='&apos;&quot;'> some text {foo} more text </a>;

  • Allow JSX elements as JSX attribute values

    JSX has an obscure feature where you can use JSX elements in attribute position without surrounding them with {...}. It looks like this:

... (truncated)
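
As a rough illustration of the `await esbuild.stop()` change described above, a minimal Deno test might now look like the sketch below. The pinned import URL and the test name are illustrative assumptions, not taken from the release notes; `esbuild.transform` and `esbuild.stop` are the esbuild APIs being demonstrated.

```ts
// Minimal Deno test against esbuild 0.20.x; run with: deno test -A
// The key difference from 0.19.x is that esbuild.stop() now returns a promise.
import * as esbuild from "https://deno.land/x/esbuild@v0.20.0/mod.js";

Deno.test("transform TypeScript with esbuild", async () => {
  const result = await esbuild.transform("let x: number = 1", { loader: "ts" });
  console.log(result.code); // e.g. "let x = 1;\n"

  // Previously esbuild.stop() was synchronous; with 0.20.0 it must be awaited,
  // otherwise Deno fails the test for leaking esbuild's child process.
  await esbuild.stop();
});
```

The notes above also recommend pinning the exact esbuild version in `package.json` (or using a patch-only range such as `~0.19.0`) so that breaking releases like 0.20.0 are not picked up automatically, which matches the exact pin used in `js/package.json` further below.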

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=esbuild&package-manager=npm_and_yarn&previous-version=0.19.2&new-version=0.20.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/package.json | 2 +- js/yarn.lock | 234 +++++++++++++++++++++++++----------------------- 2 files changed, 121 insertions(+), 115 deletions(-) diff --git a/js/package.json b/js/package.json index 57f9267afa3a8..f96764d82245e 100644 --- a/js/package.json +++ b/js/package.json @@ -79,7 +79,7 @@ "cross-env": "7.0.3", "del": "7.1.0", "del-cli": "5.1.0", - "esbuild": "0.19.2", + "esbuild": "0.20.0", "esbuild-plugin-alias": "0.2.1", "eslint": "8.52.0", "eslint-plugin-jest": "27.4.2", diff --git a/js/yarn.lock b/js/yarn.lock index 10d2a256e1cac..e7dead09bf8bb 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -416,225 +416,230 @@ resolved "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.5.7.tgz#1d572bfbbe14b7704e0ba0f39b74815b84870d70" integrity sha512-dBVuXR082gk3jsFp7Rd/JI4kytwGHecnCoTtXFb7DB6CNHp4rg5k1bhg0nWdLGLnOV71lmDzGQaLMy8iPLY0pw== +"@esbuild/aix-ppc64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/aix-ppc64/-/aix-ppc64-0.20.0.tgz#509621cca4e67caf0d18561a0c56f8b70237472f" + integrity sha512-fGFDEctNh0CcSwsiRPxiaqX0P5rq+AqE0SRhYGZ4PX46Lg1FNR6oCxJghf8YgY0WQEgQuh3lErUFE4KxLeRmmw== + "@esbuild/android-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.17.19.tgz#bafb75234a5d3d1b690e7c2956a599345e84a2fd" integrity sha512-KBMWvEZooR7+kzY0BtbTQn0OAYY7CsiydT63pVEaPtVYF0hXbUaOyZog37DKxK7NF3XacBJOpYT4adIJh+avxA== -"@esbuild/android-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.19.2.tgz#bc35990f412a749e948b792825eef7df0ce0e073" - integrity sha512-lsB65vAbe90I/Qe10OjkmrdxSX4UJDjosDgb8sZUKcg3oefEuW2OT2Vozz8ef7wrJbMcmhvCC+hciF8jY/uAkw== +"@esbuild/android-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.20.0.tgz#109a6fdc4a2783fc26193d2687827045d8fef5ab" + integrity sha512-aVpnM4lURNkp0D3qPoAzSG92VXStYmoVPOgXveAUoQBWRSuQzt51yvSju29J6AHPmwY1BjH49uR29oyfH1ra8Q== "@esbuild/android-arm@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.17.19.tgz#5898f7832c2298bc7d0ab53701c57beb74d78b4d" integrity sha512-rIKddzqhmav7MSmoFCmDIb6e2W57geRsM94gV2l38fzhXMwq7hZoClug9USI2pFRGL06f4IOPHHpFNOkWieR8A== -"@esbuild/android-arm@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.19.2.tgz#edd1c8f23ba353c197f5b0337123c58ff2a56999" - integrity sha512-tM8yLeYVe7pRyAu9VMi/Q7aunpLwD139EY1S99xbQkT4/q2qa6eA4ige/WJQYdJ8GBL1K33pPFhPfPdJ/WzT8Q== +"@esbuild/android-arm@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.20.0.tgz#1397a2c54c476c4799f9b9073550ede496c94ba5" + integrity sha512-3bMAfInvByLHfJwYPJRlpTeaQA75n8C/QKpEaiS4HrFWFiJlNI0vzq/zCjBrhAYcPyVPG7Eo9dMrcQXuqmNk5g== "@esbuild/android-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.17.19.tgz#658368ef92067866d95fb268719f98f363d13ae1" integrity sha512-uUTTc4xGNDT7YSArp/zbtmbhO0uEEK9/ETW29Wk1thYUJBz3IVnvgEiEwEa9IeLyvnpKrWK64Utw2bgUmDveww== -"@esbuild/android-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.19.2.tgz#2dcdd6e6f1f2d82ea1b746abd8da5b284960f35a" - integrity 
sha512-qK/TpmHt2M/Hg82WXHRc/W/2SGo/l1thtDHZWqFq7oi24AjZ4O/CpPSu6ZuYKFkEgmZlFoa7CooAyYmuvnaG8w== +"@esbuild/android-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.20.0.tgz#2b615abefb50dc0a70ac313971102f4ce2fdb3ca" + integrity sha512-uK7wAnlRvjkCPzh8jJ+QejFyrP8ObKuR5cBIsQZ+qbMunwR8sbd8krmMbxTLSrDhiPZaJYKQAU5Y3iMDcZPhyQ== "@esbuild/darwin-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.17.19.tgz#584c34c5991b95d4d48d333300b1a4e2ff7be276" integrity sha512-80wEoCfF/hFKM6WE1FyBHc9SfUblloAWx6FJkFWTWiCoht9Mc0ARGEM47e67W9rI09YoUxJL68WHfDRYEAvOhg== -"@esbuild/darwin-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.19.2.tgz#55b36bc06d76f5c243987c1f93a11a80d8fc3b26" - integrity sha512-Ora8JokrvrzEPEpZO18ZYXkH4asCdc1DLdcVy8TGf5eWtPO1Ie4WroEJzwI52ZGtpODy3+m0a2yEX9l+KUn0tA== +"@esbuild/darwin-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.20.0.tgz#5c122ed799eb0c35b9d571097f77254964c276a2" + integrity sha512-AjEcivGAlPs3UAcJedMa9qYg9eSfU6FnGHJjT8s346HSKkrcWlYezGE8VaO2xKfvvlZkgAhyvl06OJOxiMgOYQ== "@esbuild/darwin-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.17.19.tgz#7751d236dfe6ce136cce343dce69f52d76b7f6cb" integrity sha512-IJM4JJsLhRYr9xdtLytPLSH9k/oxR3boaUIYiHkAawtwNOXKE8KoU8tMvryogdcT8AU+Bflmh81Xn6Q0vTZbQw== -"@esbuild/darwin-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.19.2.tgz#982524af33a6424a3b5cb44bbd52559623ad719c" - integrity sha512-tP+B5UuIbbFMj2hQaUr6EALlHOIOmlLM2FK7jeFBobPy2ERdohI4Ka6ZFjZ1ZYsrHE/hZimGuU90jusRE0pwDw== +"@esbuild/darwin-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.20.0.tgz#9561d277002ba8caf1524f209de2b22e93d170c1" + integrity sha512-bsgTPoyYDnPv8ER0HqnJggXK6RyFy4PH4rtsId0V7Efa90u2+EifxytE9pZnsDgExgkARy24WUQGv9irVbTvIw== "@esbuild/freebsd-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.17.19.tgz#cacd171665dd1d500f45c167d50c6b7e539d5fd2" integrity sha512-pBwbc7DufluUeGdjSU5Si+P3SoMF5DQ/F/UmTSb8HXO80ZEAJmrykPyzo1IfNbAoaqw48YRpv8shwd1NoI0jcQ== -"@esbuild/freebsd-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.19.2.tgz#8e478a0856645265fe79eac4b31b52193011ee06" - integrity sha512-YbPY2kc0acfzL1VPVK6EnAlig4f+l8xmq36OZkU0jzBVHcOTyQDhnKQaLzZudNJQyymd9OqQezeaBgkTGdTGeQ== +"@esbuild/freebsd-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.20.0.tgz#84178986a3138e8500d17cc380044868176dd821" + integrity sha512-kQ7jYdlKS335mpGbMW5tEe3IrQFIok9r84EM3PXB8qBFJPSc6dpWfrtsC/y1pyrz82xfUIn5ZrnSHQQsd6jebQ== "@esbuild/freebsd-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.17.19.tgz#0769456eee2a08b8d925d7c00b79e861cb3162e4" integrity sha512-4lu+n8Wk0XlajEhbEffdy2xy53dpR06SlzvhGByyg36qJw6Kpfk7cp45DR/62aPH9mtJRmIyrXAS5UWBrJT6TQ== -"@esbuild/freebsd-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.19.2.tgz#01b96604f2540db023c73809bb8ae6cd1692d6f3" - integrity sha512-nSO5uZT2clM6hosjWHAsS15hLrwCvIWx+b2e3lZ3MwbYSaXwvfO528OF+dLjas1g3bZonciivI8qKR/Hm7IWGw== 
+"@esbuild/freebsd-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.20.0.tgz#3f9ce53344af2f08d178551cd475629147324a83" + integrity sha512-uG8B0WSepMRsBNVXAQcHf9+Ko/Tr+XqmK7Ptel9HVmnykupXdS4J7ovSQUIi0tQGIndhbqWLaIL/qO/cWhXKyQ== "@esbuild/linux-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.17.19.tgz#38e162ecb723862c6be1c27d6389f48960b68edb" integrity sha512-ct1Tg3WGwd3P+oZYqic+YZF4snNl2bsnMKRkb3ozHmnM0dGWuxcPTTntAF6bOP0Sp4x0PjSF+4uHQ1xvxfRKqg== -"@esbuild/linux-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.19.2.tgz#7e5d2c7864c5c83ec789b59c77cd9c20d2594916" - integrity sha512-ig2P7GeG//zWlU0AggA3pV1h5gdix0MA3wgB+NsnBXViwiGgY77fuN9Wr5uoCrs2YzaYfogXgsWZbm+HGr09xg== +"@esbuild/linux-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.20.0.tgz#24efa685515689df4ecbc13031fa0a9dda910a11" + integrity sha512-uTtyYAP5veqi2z9b6Gr0NUoNv9F/rOzI8tOD5jKcCvRUn7T60Bb+42NDBCWNhMjkQzI0qqwXkQGo1SY41G52nw== "@esbuild/linux-arm@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.17.19.tgz#1a2cd399c50040184a805174a6d89097d9d1559a" integrity sha512-cdmT3KxjlOQ/gZ2cjfrQOtmhG4HJs6hhvm3mWSRDPtZ/lP5oe8FWceS10JaSJC13GBd4eH/haHnqf7hhGNLerA== -"@esbuild/linux-arm@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.19.2.tgz#c32ae97bc0246664a1cfbdb4a98e7b006d7db8ae" - integrity sha512-Odalh8hICg7SOD7XCj0YLpYCEc+6mkoq63UnExDCiRA2wXEmGlK5JVrW50vZR9Qz4qkvqnHcpH+OFEggO3PgTg== +"@esbuild/linux-arm@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.20.0.tgz#6b586a488e02e9b073a75a957f2952b3b6e87b4c" + integrity sha512-2ezuhdiZw8vuHf1HKSf4TIk80naTbP9At7sOqZmdVwvvMyuoDiZB49YZKLsLOfKIr77+I40dWpHVeY5JHpIEIg== "@esbuild/linux-ia32@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.17.19.tgz#e28c25266b036ce1cabca3c30155222841dc035a" integrity sha512-w4IRhSy1VbsNxHRQpeGCHEmibqdTUx61Vc38APcsRbuVgK0OPEnQ0YD39Brymn96mOx48Y2laBQGqgZ0j9w6SQ== -"@esbuild/linux-ia32@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.19.2.tgz#3fc4f0fa026057fe885e4a180b3956e704f1ceaa" - integrity sha512-mLfp0ziRPOLSTek0Gd9T5B8AtzKAkoZE70fneiiyPlSnUKKI4lp+mGEnQXcQEHLJAcIYDPSyBvsUbKUG2ri/XQ== +"@esbuild/linux-ia32@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.20.0.tgz#84ce7864f762708dcebc1b123898a397dea13624" + integrity sha512-c88wwtfs8tTffPaoJ+SQn3y+lKtgTzyjkD8NgsyCtCmtoIC8RDL7PrJU05an/e9VuAke6eJqGkoMhJK1RY6z4w== "@esbuild/linux-loong64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.17.19.tgz#0f887b8bb3f90658d1a0117283e55dbd4c9dcf72" integrity sha512-2iAngUbBPMq439a+z//gE+9WBldoMp1s5GWsUSgqHLzLJ9WoZLZhpwWuym0u0u/4XmZ3gpHmzV84PonE+9IIdQ== -"@esbuild/linux-loong64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.19.2.tgz#633bcaea443f3505fb0ed109ab840c99ad3451a4" - integrity sha512-hn28+JNDTxxCpnYjdDYVMNTR3SKavyLlCHHkufHV91fkewpIyQchS1d8wSbmXhs1fiYDpNww8KTFlJ1dHsxeSw== +"@esbuild/linux-loong64@0.20.0": + version "0.20.0" + resolved 
"https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.20.0.tgz#1922f571f4cae1958e3ad29439c563f7d4fd9037" + integrity sha512-lR2rr/128/6svngnVta6JN4gxSXle/yZEZL3o4XZ6esOqhyR4wsKyfu6qXAL04S4S5CgGfG+GYZnjFd4YiG3Aw== "@esbuild/linux-mips64el@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.17.19.tgz#f5d2a0b8047ea9a5d9f592a178ea054053a70289" integrity sha512-LKJltc4LVdMKHsrFe4MGNPp0hqDFA1Wpt3jE1gEyM3nKUvOiO//9PheZZHfYRfYl6AwdTH4aTcXSqBerX0ml4A== -"@esbuild/linux-mips64el@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.19.2.tgz#e0bff2898c46f52be7d4dbbcca8b887890805823" - integrity sha512-KbXaC0Sejt7vD2fEgPoIKb6nxkfYW9OmFUK9XQE4//PvGIxNIfPk1NmlHmMg6f25x57rpmEFrn1OotASYIAaTg== +"@esbuild/linux-mips64el@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.20.0.tgz#7ca1bd9df3f874d18dbf46af009aebdb881188fe" + integrity sha512-9Sycc+1uUsDnJCelDf6ZNqgZQoK1mJvFtqf2MUz4ujTxGhvCWw+4chYfDLPepMEvVL9PDwn6HrXad5yOrNzIsQ== "@esbuild/linux-ppc64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.17.19.tgz#876590e3acbd9fa7f57a2c7d86f83717dbbac8c7" integrity sha512-/c/DGybs95WXNS8y3Ti/ytqETiW7EU44MEKuCAcpPto3YjQbyK3IQVKfF6nbghD7EcLUGl0NbiL5Rt5DMhn5tg== -"@esbuild/linux-ppc64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.19.2.tgz#d75798da391f54a9674f8c143b9a52d1dbfbfdde" - integrity sha512-dJ0kE8KTqbiHtA3Fc/zn7lCd7pqVr4JcT0JqOnbj4LLzYnp+7h8Qi4yjfq42ZlHfhOCM42rBh0EwHYLL6LEzcw== +"@esbuild/linux-ppc64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.20.0.tgz#8f95baf05f9486343bceeb683703875d698708a4" + integrity sha512-CoWSaaAXOZd+CjbUTdXIJE/t7Oz+4g90A3VBCHLbfuc5yUQU/nFDLOzQsN0cdxgXd97lYW/psIIBdjzQIwTBGw== "@esbuild/linux-riscv64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.17.19.tgz#7f49373df463cd9f41dc34f9b2262d771688bf09" integrity sha512-FC3nUAWhvFoutlhAkgHf8f5HwFWUL6bYdvLc/TTuxKlvLi3+pPzdZiFKSWz/PF30TB1K19SuCxDTI5KcqASJqA== -"@esbuild/linux-riscv64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.19.2.tgz#012409bd489ed1bb9b775541d4a46c5ded8e6dd8" - integrity sha512-7Z/jKNFufZ/bbu4INqqCN6DDlrmOTmdw6D0gH+6Y7auok2r02Ur661qPuXidPOJ+FSgbEeQnnAGgsVynfLuOEw== +"@esbuild/linux-riscv64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.20.0.tgz#ca63b921d5fe315e28610deb0c195e79b1a262ca" + integrity sha512-mlb1hg/eYRJUpv8h/x+4ShgoNLL8wgZ64SUr26KwglTYnwAWjkhR2GpoKftDbPOCnodA9t4Y/b68H4J9XmmPzA== "@esbuild/linux-s390x@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.17.19.tgz#e2afd1afcaf63afe2c7d9ceacd28ec57c77f8829" integrity sha512-IbFsFbxMWLuKEbH+7sTkKzL6NJmG2vRyy6K7JJo55w+8xDk7RElYn6xvXtDW8HCfoKBFK69f3pgBJSUSQPr+4Q== -"@esbuild/linux-s390x@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.19.2.tgz#ece3ed75c5a150de8a5c110f02e97d315761626b" - integrity sha512-U+RinR6aXXABFCcAY4gSlv4CL1oOVvSSCdseQmGO66H+XyuQGZIUdhG56SZaDJQcLmrSfRmx5XZOWyCJPRqS7g== +"@esbuild/linux-s390x@0.20.0": + version "0.20.0" + resolved 
"https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.20.0.tgz#cb3d069f47dc202f785c997175f2307531371ef8" + integrity sha512-fgf9ubb53xSnOBqyvWEY6ukBNRl1mVX1srPNu06B6mNsNK20JfH6xV6jECzrQ69/VMiTLvHMicQR/PgTOgqJUQ== "@esbuild/linux-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.17.19.tgz#8a0e9738b1635f0c53389e515ae83826dec22aa4" integrity sha512-68ngA9lg2H6zkZcyp22tsVt38mlhWde8l3eJLWkyLrp4HwMUr3c1s/M2t7+kHIhvMjglIBrFpncX1SzMckomGw== -"@esbuild/linux-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.19.2.tgz#dea187019741602d57aaf189a80abba261fbd2aa" - integrity sha512-oxzHTEv6VPm3XXNaHPyUTTte+3wGv7qVQtqaZCrgstI16gCuhNOtBXLEBkBREP57YTd68P0VgDgG73jSD8bwXQ== +"@esbuild/linux-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.20.0.tgz#ac617e0dc14e9758d3d7efd70288c14122557dc7" + integrity sha512-H9Eu6MGse++204XZcYsse1yFHmRXEWgadk2N58O/xd50P9EvFMLJTQLg+lB4E1cF2xhLZU5luSWtGTb0l9UeSg== "@esbuild/netbsd-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.17.19.tgz#c29fb2453c6b7ddef9a35e2c18b37bda1ae5c462" integrity sha512-CwFq42rXCR8TYIjIfpXCbRX0rp1jo6cPIUPSaWwzbVI4aOfX96OXY8M6KNmtPcg7QjYeDmN+DD0Wp3LaBOLf4Q== -"@esbuild/netbsd-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.19.2.tgz#bbfd7cf9ab236a23ee3a41b26f0628c57623d92a" - integrity sha512-WNa5zZk1XpTTwMDompZmvQLHszDDDN7lYjEHCUmAGB83Bgs20EMs7ICD+oKeT6xt4phV4NDdSi/8OfjPbSbZfQ== +"@esbuild/netbsd-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.20.0.tgz#6cc778567f1513da6e08060e0aeb41f82eb0f53c" + integrity sha512-lCT675rTN1v8Fo+RGrE5KjSnfY0x9Og4RN7t7lVrN3vMSjy34/+3na0q7RIfWDAj0e0rCh0OL+P88lu3Rt21MQ== "@esbuild/openbsd-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.17.19.tgz#95e75a391403cb10297280d524d66ce04c920691" integrity sha512-cnq5brJYrSZ2CF6c35eCmviIN3k3RczmHz8eYaVlNasVqsNY+JKohZU5MKmaOI+KkllCdzOKKdPs762VCPC20g== -"@esbuild/openbsd-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.19.2.tgz#fa5c4c6ee52a360618f00053652e2902e1d7b4a7" - integrity sha512-S6kI1aT3S++Dedb7vxIuUOb3oAxqxk2Rh5rOXOTYnzN8JzW1VzBd+IqPiSpgitu45042SYD3HCoEyhLKQcDFDw== +"@esbuild/openbsd-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.20.0.tgz#76848bcf76b4372574fb4d06cd0ed1fb29ec0fbe" + integrity sha512-HKoUGXz/TOVXKQ+67NhxyHv+aDSZf44QpWLa3I1lLvAwGq8x1k0T+e2HHSRvxWhfJrFxaaqre1+YyzQ99KixoA== "@esbuild/sunos-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.17.19.tgz#722eaf057b83c2575937d3ffe5aeb16540da7273" integrity sha512-vCRT7yP3zX+bKWFeP/zdS6SqdWB8OIpaRq/mbXQxTGHnIxspRtigpkUcDMlSCOejlHowLqII7K2JKevwyRP2rg== -"@esbuild/sunos-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.19.2.tgz#52a2ac8ac6284c02d25df22bb4cfde26fbddd68d" - integrity sha512-VXSSMsmb+Z8LbsQGcBMiM+fYObDNRm8p7tkUDMPG/g4fhFX5DEFmjxIEa3N8Zr96SjsJ1woAhF0DUnS3MF3ARw== +"@esbuild/sunos-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.20.0.tgz#ea4cd0639bf294ad51bc08ffbb2dac297e9b4706" + integrity 
sha512-GDwAqgHQm1mVoPppGsoq4WJwT3vhnz/2N62CzhvApFD1eJyTroob30FPpOZabN+FgCjhG+AgcZyOPIkR8dfD7g== "@esbuild/win32-arm64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.17.19.tgz#9aa9dc074399288bdcdd283443e9aeb6b9552b6f" integrity sha512-yYx+8jwowUstVdorcMdNlzklLYhPxjniHWFKgRqH7IFlUEa0Umu3KuYplf1HUZZ422e3NU9F4LGb+4O0Kdcaag== -"@esbuild/win32-arm64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.19.2.tgz#719ed5870855de8537aef8149694a97d03486804" - integrity sha512-5NayUlSAyb5PQYFAU9x3bHdsqB88RC3aM9lKDAz4X1mo/EchMIT1Q+pSeBXNgkfNmRecLXA0O8xP+x8V+g/LKg== +"@esbuild/win32-arm64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.20.0.tgz#a5c171e4a7f7e4e8be0e9947a65812c1535a7cf0" + integrity sha512-0vYsP8aC4TvMlOQYozoksiaxjlvUcQrac+muDqj1Fxy6jh9l9CZJzj7zmh8JGfiV49cYLTorFLxg7593pGldwQ== "@esbuild/win32-ia32@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.17.19.tgz#95ad43c62ad62485e210f6299c7b2571e48d2b03" integrity sha512-eggDKanJszUtCdlVs0RB+h35wNlb5v4TWEkq4vZcmVt5u/HiDZrTXe2bWFQUez3RgNHwx/x4sk5++4NSSicKkw== -"@esbuild/win32-ia32@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.19.2.tgz#24832223880b0f581962c8660f8fb8797a1e046a" - integrity sha512-47gL/ek1v36iN0wL9L4Q2MFdujR0poLZMJwhO2/N3gA89jgHp4MR8DKCmwYtGNksbfJb9JoTtbkoe6sDhg2QTA== +"@esbuild/win32-ia32@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.20.0.tgz#f8ac5650c412d33ea62d7551e0caf82da52b7f85" + integrity sha512-p98u4rIgfh4gdpV00IqknBD5pC84LCub+4a3MO+zjqvU5MVXOc3hqR2UgT2jI2nh3h8s9EQxmOsVI3tyzv1iFg== "@esbuild/win32-x64@0.17.19": version "0.17.19" resolved "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.17.19.tgz#8cfaf2ff603e9aabb910e9c0558c26cf32744061" integrity sha512-lAhycmKnVOuRYNtRtatQR1LPQf2oYCkRGkSFnseDAKPl8lu5SOsK/e1sXe5a0Pc5kHIHe6P2I/ilntNv2xf3cA== -"@esbuild/win32-x64@0.19.2": - version "0.19.2" - resolved "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.19.2.tgz#1205014625790c7ff0e471644a878a65d1e34ab0" - integrity sha512-tcuhV7ncXBqbt/Ybf0IyrMcwVOAPDckMK9rXNHtF17UTK18OKLpg08glminN06pt2WCoALhXdLfSPbVvK/6fxw== +"@esbuild/win32-x64@0.20.0": + version "0.20.0" + resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.20.0.tgz#2efddf82828aac85e64cef62482af61c29561bee" + integrity sha512-NgJnesu1RtWihtTtXGFMU5YSE6JyyHPMxCwBZK7a6/8d31GuSo9l0Ss7w1Jw5QnKUawG6UEehs883kcXf5fYwg== "@eslint-community/eslint-utils@^4.2.0", "@eslint-community/eslint-utils@^4.4.0": version "4.4.0" @@ -2888,33 +2893,34 @@ esbuild-plugin-alias@0.2.1: resolved "https://registry.npmjs.org/esbuild-plugin-alias/-/esbuild-plugin-alias-0.2.1.tgz#45a86cb941e20e7c2bc68a2bea53562172494fcb" integrity sha512-jyfL/pwPqaFXyKnj8lP8iLk6Z0m099uXR45aSN8Av1XD4vhvQutxxPzgA2bTcAwQpa1zCXDcWOlhFgyP3GKqhQ== -esbuild@0.19.2: - version "0.19.2" - resolved "https://registry.npmjs.org/esbuild/-/esbuild-0.19.2.tgz#b1541828a89dfb6f840d38538767c6130dca2aac" - integrity sha512-G6hPax8UbFakEj3hWO0Vs52LQ8k3lnBhxZWomUJDxfz3rZTLqF5k/FCzuNdLx2RbpBiQQF9H9onlDDH1lZsnjg== +esbuild@0.20.0: + version "0.20.0" + resolved "https://registry.yarnpkg.com/esbuild/-/esbuild-0.20.0.tgz#a7170b63447286cd2ff1f01579f09970e6965da4" + integrity sha512-6iwE3Y2RVYCME1jLpBqq7LQWK3MW6vjV2bZy6gt/WrqkY+WE74Spyc0ThAOYpMtITvnjX09CrC6ym7A/m9mebA== 
optionalDependencies: - "@esbuild/android-arm" "0.19.2" - "@esbuild/android-arm64" "0.19.2" - "@esbuild/android-x64" "0.19.2" - "@esbuild/darwin-arm64" "0.19.2" - "@esbuild/darwin-x64" "0.19.2" - "@esbuild/freebsd-arm64" "0.19.2" - "@esbuild/freebsd-x64" "0.19.2" - "@esbuild/linux-arm" "0.19.2" - "@esbuild/linux-arm64" "0.19.2" - "@esbuild/linux-ia32" "0.19.2" - "@esbuild/linux-loong64" "0.19.2" - "@esbuild/linux-mips64el" "0.19.2" - "@esbuild/linux-ppc64" "0.19.2" - "@esbuild/linux-riscv64" "0.19.2" - "@esbuild/linux-s390x" "0.19.2" - "@esbuild/linux-x64" "0.19.2" - "@esbuild/netbsd-x64" "0.19.2" - "@esbuild/openbsd-x64" "0.19.2" - "@esbuild/sunos-x64" "0.19.2" - "@esbuild/win32-arm64" "0.19.2" - "@esbuild/win32-ia32" "0.19.2" - "@esbuild/win32-x64" "0.19.2" + "@esbuild/aix-ppc64" "0.20.0" + "@esbuild/android-arm" "0.20.0" + "@esbuild/android-arm64" "0.20.0" + "@esbuild/android-x64" "0.20.0" + "@esbuild/darwin-arm64" "0.20.0" + "@esbuild/darwin-x64" "0.20.0" + "@esbuild/freebsd-arm64" "0.20.0" + "@esbuild/freebsd-x64" "0.20.0" + "@esbuild/linux-arm" "0.20.0" + "@esbuild/linux-arm64" "0.20.0" + "@esbuild/linux-ia32" "0.20.0" + "@esbuild/linux-loong64" "0.20.0" + "@esbuild/linux-mips64el" "0.20.0" + "@esbuild/linux-ppc64" "0.20.0" + "@esbuild/linux-riscv64" "0.20.0" + "@esbuild/linux-s390x" "0.20.0" + "@esbuild/linux-x64" "0.20.0" + "@esbuild/netbsd-x64" "0.20.0" + "@esbuild/openbsd-x64" "0.20.0" + "@esbuild/sunos-x64" "0.20.0" + "@esbuild/win32-arm64" "0.20.0" + "@esbuild/win32-ia32" "0.20.0" + "@esbuild/win32-x64" "0.20.0" esbuild@^0.17.11: version "0.17.19" From a88e9f62f371e87ac34a29305dc87a82d227ff30 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 13:34:14 -0500 Subject: [PATCH 24/74] MINOR: [JS] Bump regenerator-runtime from 0.14.0 to 0.14.1 in /js (#39889) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [regenerator-runtime](https://github.com/facebook/regenerator) from 0.14.0 to 0.14.1.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/package.json | 2 +- js/yarn.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/js/package.json b/js/package.json index f96764d82245e..bb70fd0a395b0 100644 --- a/js/package.json +++ b/js/package.json @@ -102,7 +102,7 @@ "memfs": "4.5.0", "mkdirp": "3.0.1", "multistream": "4.1.0", - "regenerator-runtime": "0.14.0", + "regenerator-runtime": "0.14.1", "rollup": "4.3.0", "rxjs": "7.8.1", "ts-jest": "29.1.1", diff --git a/js/yarn.lock b/js/yarn.lock index e7dead09bf8bb..7b3180740d3da 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -5967,10 +5967,10 @@ redent@^4.0.0: indent-string "^5.0.0" strip-indent "^4.0.0" -regenerator-runtime@0.14.0: - version "0.14.0" - resolved "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.0.tgz#5e19d68eb12d486f797e15a3c6a918f7cec5eb45" - integrity sha512-srw17NI0TUWHuGa5CFGGmhfNIeja30WMBfbslPNhf6JrqQlLN5gcrvig1oqPxiVaXb0oW0XRKtH6Nngs5lKCIA== +regenerator-runtime@0.14.1: + version "0.14.1" + resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz#356ade10263f685dda125100cd862c1db895327f" + integrity sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw== regex-not@^1.0.0, regex-not@^1.0.2: version "1.0.2" From 796b0cc0ad0509502f5419d379225e6168e2bb06 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 2 Feb 2024 16:49:59 +0900 Subject: [PATCH 25/74] GH-39872: [Packaging][Ubuntu] Add support for Ubuntu 24.04 Noble Numbat (#39887) ### Rationale for this change Ubuntu 24.04 isn't released yet but it seems that Docker image is already available. ### What changes are included in this PR? Add jobs for Ubuntu 24.04. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
* Closes: #39872 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/release/binary-task.rb | 7 +- dev/release/verify-release-candidate.sh | 4 +- .../apt/ubuntu-noble/Dockerfile | 41 +++++++++ .../apache-arrow/apt/ubuntu-noble-arm64/from | 18 ++++ .../apache-arrow/apt/ubuntu-noble/Dockerfile | 85 +++++++++++++++++++ dev/tasks/linux-packages/package-task.rb | 2 + dev/tasks/tasks.yml | 3 +- 7 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-noble/Dockerfile create mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble-arm64/from create mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb index df6c0778dc805..0c1b98ab32c95 100644 --- a/dev/release/binary-task.rb +++ b/dev/release/binary-task.rb @@ -1089,6 +1089,7 @@ def available_apt_targets ["ubuntu", "focal", "main"], ["ubuntu", "jammy", "main"], ["ubuntu", "mantic", "main"], + ["ubuntu", "noble", "main"], ] end @@ -2121,8 +2122,10 @@ def apt_test_targets_default # "ubuntu-focal-arm64", "ubuntu-jammy", # "ubuntu-jammy-arm64", - "ubuntu-lunar", - # "ubuntu-lunar-arm64", + "ubuntu-mantic", + # "ubuntu-mantic-arm64", + "ubuntu-noble", + # "ubuntu-noble-arm64", ] end diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 04fc7fd563f65..a61b5ba094c8a 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -196,7 +196,9 @@ test_apt() { "ubuntu:jammy" \ "arm64v8/ubuntu:jammy" \ "ubuntu:mantic" \ - "arm64v8/ubuntu:mantic"; do \ + "arm64v8/ubuntu:mantic" \ + "ubuntu:noble" \ + "arm64v8/ubuntu:noble"; do \ case "${target}" in arm64v8/*) if [ "$(arch)" = "aarch64" -o -e /usr/bin/qemu-aarch64-static ]; then diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-noble/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-noble/Dockerfile new file mode 100644 index 0000000000000..0e37ee94bb0a3 --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-noble/Dockerfile @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +FROM ubuntu:noble + +RUN \ + echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +RUN \ + echo 'APT::Install-Recommends "false";' > \ + /etc/apt/apt.conf.d/disable-install-recommends + +ARG DEBUG + +RUN \ + quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ + apt update ${quiet} && \ + apt install -y -V ${quiet} \ + build-essential \ + debhelper \ + devscripts \ + fakeroot \ + gnupg \ + lsb-release && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble-arm64/from new file mode 100644 index 0000000000000..4414c353871c6 --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble-arm64/from @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +arm64v8/ubuntu:noble diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile new file mode 100644 index 0000000000000..33f2d9a35371b --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-noble/Dockerfile @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG FROM=ubuntu:noble +FROM ${FROM} + +RUN \ + echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +RUN \ + echo 'APT::Install-Recommends "false";' > \ + /etc/apt/apt.conf.d/disable-install-recommends + +ARG DEBUG +RUN \ + quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ + apt update ${quiet} && \ + apt install -y -V ${quiet} \ + build-essential \ + clang \ + clang-tools \ + cmake \ + debhelper \ + devscripts \ + git \ + gtk-doc-tools \ + libboost-filesystem-dev \ + libboost-system-dev \ + libbrotli-dev \ + libbz2-dev \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libgirepository1.0-dev \ + libglib2.0-doc \ + libgmock-dev \ + libgoogle-glog-dev \ + libgrpc++-dev \ + libgtest-dev \ + liblz4-dev \ + libmlir-15-dev \ + libprotobuf-dev \ + libprotoc-dev \ + libre2-dev \ + libsnappy-dev \ + libssl-dev \ + libthrift-dev \ + libutf8proc-dev \ + libzstd-dev \ + llvm-dev \ + lsb-release \ + meson \ + mlir-15-tools \ + ninja-build \ + nlohmann-json3-dev \ + pkg-config \ + protobuf-compiler-grpc \ + python3-dev \ + python3-pip \ + python3-setuptools \ + rapidjson-dev \ + tzdata \ + valac \ + zlib1g-dev && \ + if apt list | grep -q '^libcuda'; then \ + apt install -y -V ${quiet} nvidia-cuda-toolkit; \ + else \ + :; \ + fi && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb index ecd61054daeb1..51fe0b9a75b0c 100644 --- a/dev/tasks/linux-packages/package-task.rb +++ b/dev/tasks/linux-packages/package-task.rb @@ -279,6 +279,8 @@ def apt_targets_default # "ubuntu-jammy-arm64", "ubuntu-mantic", # "ubuntu-mantic-arm64", + "ubuntu-noble", + # "ubuntu-noble-arm64", ] end diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 6c59364d51a50..0f8c58391fa66 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -465,7 +465,8 @@ tasks: "debian-trixie", "ubuntu-focal", "ubuntu-jammy", - "ubuntu-mantic"] %} + "ubuntu-mantic", + "ubuntu-noble"] %} {% for architecture in ["amd64", "arm64"] %} {{ target }}-{{ architecture }}: ci: github From 129a5291a26e2baa91d98d1910cb2128854e6b60 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Feb 2024 11:26:57 +0100 Subject: [PATCH 26/74] GH-39788: [Python] Validate max_chunksize in Table.to_batches (#39796) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Validating the keyword to be strictly positive, to avoid an infinite loop. 
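As a usage illustration (not part of the patch itself), here is a minimal pyarrow sketch of the behaviour this change enforces, assuming a build that includes the fix; the error message is the one added in `table.pxi` below:

```python
import pyarrow as pa

table = pa.table({"x": [1, 2, 3, 4]})

# A strictly positive max_chunksize still splits the table as before.
batches = table.to_batches(max_chunksize=2)
assert all(b.num_rows <= 2 for b in batches)

# A non-positive value now raises instead of looping forever.
try:
    table.to_batches(max_chunksize=0)
except ValueError as exc:
    print(exc)  # 'max_chunksize' should be strictly positive
```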
* Closes: #39788 Authored-by: Joris Van den Bossche Signed-off-by: Raúl Cumplido --- python/pyarrow/table.pxi | 2 ++ python/pyarrow/tests/test_table.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 3c450d61a7659..abda784fb7c18 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -4172,6 +4172,8 @@ cdef class Table(_Tabular): reader.reset(new TableBatchReader(deref(self.table))) if max_chunksize is not None: + if not max_chunksize > 0: + raise ValueError("'max_chunksize' should be strictly positive") c_max_chunksize = max_chunksize reader.get().set_chunksize(c_max_chunksize) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index ff38c614c251f..d6def54570581 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1089,6 +1089,9 @@ def test_table_to_batches(): table_from_iter = pa.Table.from_batches(iter([batch1, batch2, batch1])) assert table.equals(table_from_iter) + with pytest.raises(ValueError): + table.to_batches(max_chunksize=0) + def test_table_basics(): data = [ From 90b30fcbfdfe12fa9ed497c3fa1cfe682b50168f Mon Sep 17 00:00:00 2001 From: Lyndon Shi <9373058+lynshi@users.noreply.github.com> Date: Fri, 2 Feb 2024 07:15:57 -0800 Subject: [PATCH 27/74] MINOR: [C++][Docs] Fix MapBuilder docstring (#39755) The [current `MapBuilder` documentation](https://arrow.apache.org/docs/cpp/api/builder.html#_CPPv4N5arrow10MapBuilderE) says: > To use this class, you must append values to the key and item array builders and use the Append function to delimit each distinct map (once the keys and items have been appended) This contradicts the [docstring for `Append`](https://arrow.apache.org/docs/cpp/api/builder.html#_CPPv4N5arrow10MapBuilder6AppendEv): > This function should be called before beginning to append elements to the key and item builders The `Append` documentation is correct; it should be called *before* keys and items have been appended. If `Append` is called after, as the `MapBuilder` docstring suggests, `Finish` results in an empty `Array`. ### What changes are included in this PR? Documentation only change. ### Are these changes tested? There are no behavior changes. ### Are there any user-facing changes? No Authored-by: Lyndon Shi <9373058+lynshi@users.noreply.github.com> Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/array/builder_nested.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 8065752f3e278..429aa5c0488cd 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -515,10 +515,9 @@ class ARROW_EXPORT LargeListViewBuilder final /// \class MapBuilder /// \brief Builder class for arrays of variable-size maps /// -/// To use this class, you must append values to the key and item array builders -/// and use the Append function to delimit each distinct map (once the keys and items -/// have been appended) or use the bulk API to append a sequence of offsets and null -/// maps. +/// To use this class, you must use the Append function to delimit each distinct +/// map before appending values to the key and item array builders, or use the +/// bulk API to append a sequence of offsets and null maps. /// /// Key uniqueness and ordering are not validated. 
class ARROW_EXPORT MapBuilder : public ArrayBuilder { From 32bd01fa64b275937ca90aa50b11f275eeefde94 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 3 Feb 2024 03:54:47 +0800 Subject: [PATCH 28/74] GH-39843: [C++][Parquet] Parquet binary length overflow exception should contain the length of binary (#39844) ### Rationale for this change See https://github.com/apache/arrow/issues/39843 It will be great to contain a string length in decoder. ### What changes are included in this PR? change the logging of encoding ### Are these changes tested? no ### Are there any user-facing changes? more specific error logging? * Closes: #39843 Authored-by: mwish Signed-off-by: mwish --- cpp/src/parquet/column_writer.cc | 3 ++- cpp/src/parquet/encoding.cc | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 23366b2daafd5..eae8fc6125499 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -442,7 +442,8 @@ class SerializedPageWriter : public PageWriter { if (offset_index_builder_ != nullptr) { const int64_t compressed_size = output_data_len + header_size; if (compressed_size > std::numeric_limits::max()) { - throw ParquetException("Compressed page size overflows INT32_MAX."); + throw ParquetException("Compressed page size ", compressed_size, + " overflows INT32_MAX."); } if (!page.first_row_index().has_value()) { throw ParquetException("First row index is not set in data page."); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 5573f5b9aed4c..a3d1746536647 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -160,7 +160,8 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { *array.data(), [&](::std::string_view view) { if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } UnsafePutByteArray(view.data(), static_cast(view.size())); return Status::OK(); @@ -571,7 +572,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { *array.data(), [&](::std::string_view view) { if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } PutByteArray(view.data(), static_cast(view.size())); return Status::OK(); @@ -585,7 +587,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { for (int64_t i = 0; i < array.length(); i++) { auto v = array.GetView(i); if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) { - throw ParquetException("Parquet cannot store strings with size 2GB or more"); + throw ParquetException( + "Parquet cannot store strings with size 2GB or more, got: ", v.size()); } dict_encoded_size_ += static_cast(v.size() + sizeof(uint32_t)); int32_t unused_memo_index; @@ -2671,7 +2674,8 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, *array.data(), [&](::std::string_view view) { if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } length_encoder_.Put({static_cast(view.length())}, 1); 
PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length())); @@ -3200,7 +3204,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { - return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + return Status::Invalid( + "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } const ByteArray src{view}; @@ -3246,7 +3251,8 @@ struct ByteArrayVisitor { std::string_view operator[](int i) const { if (ARROW_PREDICT_FALSE(src[i].len >= kMaxByteArraySize)) { - throw ParquetException("Parquet cannot store strings with size 2GB or more"); + throw ParquetException("Parquet cannot store strings with size 2GB or more, got: ", + src[i].len); } return std::string_view{src[i]}; } From 0fb00fdea7a9541ac8df8a4f784af1dfd0adb056 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Sat, 3 Feb 2024 01:45:40 +0530 Subject: [PATCH 29/74] GH-39734: [Java] Bump org.codehaus.mojo:exec-maven-plugin from 1.6.0 to 3.1.1 (#39696) ### Rationale for this change This PR was created to replace https://github.com/apache/arrow/pull/39374 and do the necessary changes for `org.codehaus.mojo` upgrade to take place. ### What changes are included in this PR? The changes to the `org.codehaus.mojo` version and an upgrade on the maven version used in the `.env`. ### Are these changes tested? Tested locally, but this requires a CI verification on Java. ### Are there any user-facing changes? No Authored-by: vibhatha Signed-off-by: David Li --- .env | 2 +- docker-compose.yml | 12 +++--------- java/performance/pom.xml | 2 +- java/pom.xml | 6 +++--- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/.env b/.env index 6746892fd4ed8..427a4ab0bf398 100644 --- a/.env +++ b/.env @@ -65,7 +65,7 @@ JDK=8 KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. LLVM=14 -MAVEN=3.5.4 +MAVEN=3.6.3 NODE=18 NUMBA=latest NUMPY=latest diff --git a/docker-compose.yml b/docker-compose.yml index a08345c198fa0..0252c4ec8a896 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1709,9 +1709,7 @@ services: arch: ${ARCH} # Use a newer JDK as it seems to improve stability jdk: 17 - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 + maven: ${MAVEN} node: ${NODE} go: ${GO} volumes: *conda-volumes @@ -1843,9 +1841,7 @@ services: arch: ${ARCH} python: ${PYTHON} jdk: ${JDK} - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 + maven: ${MAVEN} hdfs: ${HDFS} links: - impala:impala @@ -1886,9 +1882,7 @@ services: arch: ${ARCH} python: ${PYTHON} jdk: ${JDK} - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 + maven: ${MAVEN} spark: ${SPARK} numpy: ${NUMPY} shm_size: *shm-size diff --git a/java/performance/pom.xml b/java/performance/pom.xml index a1d53171f549b..ba5a6616dca77 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -139,7 +139,7 @@ org.codehaus.mojo exec-maven-plugin - 1.6.0 + 3.1.1 run-java-benchmarks diff --git a/java/pom.xml b/java/pom.xml index 3e595648ed085..7871303634976 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1038,7 +1038,7 @@ org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.1.1 cdata-cmake @@ -1099,7 +1099,7 @@ org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.1.1 jni-cpp-cmake @@ -1214,7 +1214,7 @@ org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.1.1 jni-cpp-cmake From 22f2cfd1e1ebe49016b6d97c49f494287a98d02f Mon Sep 17 00:00:00 2001 From: Divyansh200102 <146909065+Divyansh200102@users.noreply.github.com> Date: Sat, 3 Feb 2024 16:29:49 +0530 Subject: [PATCH 30/74] GH-39416: 
[GLib][Docs] Fixed Broken Link in README Content (#39896) ### Rationale for this change ### What changes are included in this PR? Fixed Broken Link in README Content ### Are these changes tested? Yes ### Are there any user-facing changes? Yes * Closes: #39416 Lead-authored-by: Divyansh200102 Co-authored-by: Divyansh200102 <146909065+Divyansh200102@users.noreply.github.com> Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_glib/README.md b/c_glib/README.md index 2a4d6b8a6628c..24e69eff65055 100644 --- a/c_glib/README.md +++ b/c_glib/README.md @@ -101,7 +101,7 @@ $ sudo meson install -C c_glib.build You need to install Arrow C++ before you install Arrow GLib. See Arrow C++ document about how to install Arrow C++. -You need [GTK-Doc](https://www.gtk.org/gtk-doc/) and +You need [GTK-Doc](https://gitlab.gnome.org/GNOME/gtk-doc) and [GObject Introspection](https://wiki.gnome.org/Projects/GObjectIntrospection) to build Arrow GLib. You can install them by the followings: From aded7bf37686a16fc4b0649ab97231427a219d7b Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Sun, 4 Feb 2024 01:37:36 -0500 Subject: [PATCH 31/74] GH-39909: [Java][CI] Update reference to Float16 testing file reference on Testing submodule (#39911) ### Rationale for this change Update reference to Float16 testing file reference on Testing submodule. ### What changes are included in this PR? Testing submodule. changes. ### Are these changes tested? Yes ### Are there any user-facing changes? No * Closes: #39909 Authored-by: david dali susanibar arce Signed-off-by: Sutou Kouhei --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index ad82a736c170e..25d16511e8d42 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit ad82a736c170e97b7c8c035ebd8a801c17eec170 +Subproject commit 25d16511e8d42c2744a1d94d90169e3a36e92631 From 585e0a252f327e7136695f586b187b2ba5a3a1e3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 5 Feb 2024 05:55:54 +0800 Subject: [PATCH 32/74] MINOR: [C++][Parquet] Remove undefined GetArrowType from schema_internal.h (#39931) ### Rationale for this change We have redundant declarations below and the 1st one should be removed: ```cpp Result> GetArrowType(Type::type physical_type, const LogicalType& logical_type, int type_length); Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); ``` ### What changes are included in this PR? Remove the redundant function declaration described above. ### Are these changes tested? Make sure build and test pass. ### Are there any user-facing changes? No. 
Authored-by: Gang Wu Signed-off-by: Sutou Kouhei --- cpp/src/parquet/arrow/schema_internal.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 55292ac35ab9c..f56ba0958ae2d 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -34,10 +34,6 @@ Result> FromFLBA(const LogicalType& logical_t Result> FromInt32(const LogicalType& logical_type); Result> FromInt64(const LogicalType& logical_type); -Result> GetArrowType(Type::type physical_type, - const LogicalType& logical_type, - int type_length); - Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); From ed78986aa6971484f40a5780922128636a47d175 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 5 Feb 2024 11:51:04 +0900 Subject: [PATCH 33/74] GH-39928: [C++][Gandiva] Accept LLVM 18 (#39934) ### Rationale for this change LLVM 18.1 will be released soon. ### What changes are included in this PR? Accept LLVM 18.1. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #39928 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/CMakeLists.txt | 1 + cpp/src/gandiva/engine.cc | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 016cd8a1b9ec8..50a85b33d5489 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -152,6 +152,7 @@ set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(ARROW_LLVM_VERSIONS + "18.1" "17.0" "16.0" "15.0" diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index fc047f2ac0763..bfce72cefc630 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -62,7 +62,11 @@ #endif #include #include +#if LLVM_VERSION_MAJOR >= 18 +#include +#else #include +#endif #include #include #if LLVM_VERSION_MAJOR >= 14 @@ -86,7 +90,9 @@ #include #include #include +#if LLVM_VERSION_MAJOR <= 17 #include +#endif // JITLink is available in LLVM 9+ // but the `InProcessMemoryManager::Create` API was added since LLVM 14 @@ -132,8 +138,13 @@ Result MakeTargetMachineBuilder( jtmb.setCPU(cpu_name.str()); jtmb.addFeatures(cpu_attrs); } +#if LLVM_VERSION_MAJOR >= 18 + using CodeGenOptLevel = llvm::CodeGenOptLevel; +#else + using CodeGenOptLevel = llvm::CodeGenOpt::Level; +#endif auto const opt_level = - conf.optimize() ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None; + conf.optimize() ? CodeGenOptLevel::Aggressive : CodeGenOptLevel::None; jtmb.setCodeGenOptLevel(opt_level); return jtmb; } From 5856421e31b163104570d0305cb79f323cf488a6 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 5 Feb 2024 23:14:48 +0800 Subject: [PATCH 34/74] GH-39921: [Go][Parquet] ColumnWriter not reset TotalCompressedBytes after Flush (#39922) ### Rationale for this change See https://github.com/apache/arrow/issues/39921 ### What changes are included in this PR? Not clearing `totalCompressedBytes` when flush called ### Are these changes tested? Yes ### Are there any user-facing changes? 
Yes, it's a bugfix * Closes: #39921 Authored-by: mwish Signed-off-by: Matt Topol --- go/parquet/file/column_writer.go | 5 +++-- go/parquet/file/column_writer_test.go | 28 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go index ac857d17e632d..36663b10b89dd 100755 --- a/go/parquet/file/column_writer.go +++ b/go/parquet/file/column_writer.go @@ -397,7 +397,6 @@ func (w *columnWriter) FlushBufferedDataPages() (err error) { } } w.pages = w.pages[:0] - w.totalCompressedBytes = 0 return } @@ -542,7 +541,9 @@ func (w *columnWriter) Close() (err error) { if !w.closed { w.closed = true if w.hasDict && !w.fallbackToNonDict { - w.WriteDictionaryPage() + if err = w.WriteDictionaryPage(); err != nil { + return err + } } if err = w.FlushBufferedDataPages(); err != nil { diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index 8011ac2487995..321e7b730d165 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -426,6 +426,26 @@ func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parque } } +func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(version parquet.Version) { + p.GenerateData(SmallSize) + props := parquet.DefaultColumnProperties() + props.DictionaryEnabled = true + + if version == parquet.V1_0 { + props.Encoding = parquet.Encodings.PlainDict + } else { + props.Encoding = parquet.Encodings.RLEDict + } + + writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version)) + p.WriteBatchValues(writer, nil, nil) + writer.FallbackToPlain() + p.NotEqual(0, writer.TotalCompressedBytes()) + writer.Close() + p.NotEqual(0, writer.TotalCompressedBytes()) + p.NotEqual(0, writer.TotalBytesWritten()) +} + func (p *PrimitiveWriterTestSuite) TestRequiredPlain() { p.testRequiredWithEncoding(parquet.Encodings.Plain) } @@ -575,6 +595,14 @@ func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV2() { p.testDictionaryFallbackEncoding(parquet.V2_LATEST) } +func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV1() { + p.testDictionaryFallbackAndCompressedSize(parquet.V1_0) +} + +func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV2() { + p.testDictionaryFallbackAndCompressedSize(parquet.V2_LATEST) +} + func (p *PrimitiveWriterTestSuite) TestOptionalNullValueChunk() { // test case for NULL values p.SetupSchema(parquet.Repetitions.Optional, 1) From 85e2a684b79b560929085c7f8e27586fa6d0b1ff Mon Sep 17 00:00:00 2001 From: Elliot Morrison-Reed Date: Mon, 5 Feb 2024 10:45:46 -0500 Subject: [PATCH 35/74] GH-39925: [Go][Parquet] Fix re-slicing in maybeReplaceValidity function (#39926) ### Rationale for this change See #39925. ### What changes are included in this PR? Fixes re-slicing logic for multiple data-types and negative length bug. ### Are these changes tested? There is a new test in the PR. ### Are there any user-facing changes? No, it just fixes a bug. 
* Closes: #39925 Authored-by: Morrison-Reed Elliot (BEG/EVS1-NA) Signed-off-by: Matt Topol --- go/parquet/file/column_writer.go | 5 +++- go/parquet/file/column_writer_test.go | 38 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go index 36663b10b89dd..4d603c547ca6a 100755 --- a/go/parquet/file/column_writer.go +++ b/go/parquet/file/column_writer.go @@ -660,7 +660,10 @@ func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int if values.Data().Offset() > 0 { data := values.Data() - buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[data.Offset()*arrow.Int32SizeBytes : data.Len()*arrow.Int32SizeBytes]) + elemSize := data.DataType().(arrow.FixedWidthDataType).Bytes() + start := data.Offset() * elemSize + end := start + data.Len()*elemSize + buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end]) } data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0) diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index 321e7b730d165..dd597e280b850 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -24,6 +24,8 @@ import ( "sync" "testing" + "github.com/apache/arrow/go/v16/arrow" + "github.com/apache/arrow/go/v16/arrow/array" "github.com/apache/arrow/go/v16/arrow/bitutil" "github.com/apache/arrow/go/v16/arrow/memory" arrutils "github.com/apache/arrow/go/v16/internal/utils" @@ -36,6 +38,7 @@ import ( "github.com/apache/arrow/go/v16/parquet/internal/testutils" "github.com/apache/arrow/go/v16/parquet/internal/utils" "github.com/apache/arrow/go/v16/parquet/metadata" + "github.com/apache/arrow/go/v16/parquet/pqarrow" "github.com/apache/arrow/go/v16/parquet/schema" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -736,3 +739,38 @@ func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() { b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i]) } } + +func TestDictionaryReslice(t *testing.T) { + pts := []arrow.DataType{ + arrow.PrimitiveTypes.Int8, + arrow.PrimitiveTypes.Int16, + arrow.PrimitiveTypes.Int32, + arrow.PrimitiveTypes.Int64, + arrow.PrimitiveTypes.Uint8, + arrow.PrimitiveTypes.Uint16, + arrow.PrimitiveTypes.Uint32, + arrow.PrimitiveTypes.Uint64, + } + for _, pt := range pts { + t.Run(pt.String(), func(t *testing.T) { + mem := memory.NewGoAllocator() + dt := &arrow.DictionaryType{ + IndexType: pt, + ValueType: &arrow.StringType{}, + } + field := arrow.Field{Name: "test_field", Type: dt, Nullable: true} + schema := arrow.NewSchema([]arrow.Field{field}, nil) + b := array.NewRecordBuilder(mem, schema) + for i := 0; i < 2000; i++ { + b.Field(0).(*array.BinaryDictionaryBuilder).AppendString("test_value") + } + rec := b.NewRecord() + out := &bytes.Buffer{} + pqw, err := pqarrow.NewFileWriter(rec.Schema(), out, nil, pqarrow.NewArrowWriterProperties()) + assert.NoError(t, err) + err = pqw.WriteBuffered(rec) + assert.NoError(t, err) + + }) + } +} From 56951fee35c920ac898c2515896ff3bd752dde97 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 5 Feb 2024 17:15:44 +0100 Subject: [PATCH 36/74] GH-39865: [C++] Strip extension metadata when importing a registered extension (#39866) ### Rationale for this change When importing an extension type from the C Data Interface and the extension type is registered, we would still leave the extension-related metadata on the storage type. 
### What changes are included in this PR? Strip extension-related metadata on the storage type if we succeed in recreating the extension type. This matches the behavior of the IPC layer and allows for more exact roundtripping. ### Are these changes tested? Yes. ### Are there any user-facing changes? No, unless people mistakingly rely on the presence of said metadata. * Closes: #39865 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/c/bridge.cc | 6 +++ cpp/src/arrow/c/bridge_test.cc | 48 ++++++++++++++++-------- cpp/src/arrow/util/key_value_metadata.cc | 18 ++++----- cpp/src/arrow/util/key_value_metadata.h | 11 +++--- 4 files changed, 52 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 172ed8962ce77..9b165a10a61e7 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -914,6 +914,8 @@ struct DecodedMetadata { std::shared_ptr metadata; std::string extension_name; std::string extension_serialized; + int extension_name_index = -1; // index of extension_name in metadata + int extension_serialized_index = -1; // index of extension_serialized in metadata }; Result DecodeMetadata(const char* metadata) { @@ -956,8 +958,10 @@ Result DecodeMetadata(const char* metadata) { RETURN_NOT_OK(read_string(&values[i])); if (keys[i] == kExtensionTypeKeyName) { decoded.extension_name = values[i]; + decoded.extension_name_index = i; } else if (keys[i] == kExtensionMetadataKeyName) { decoded.extension_serialized = values[i]; + decoded.extension_serialized_index = i; } } decoded.metadata = key_value_metadata(std::move(keys), std::move(values)); @@ -1046,6 +1050,8 @@ struct SchemaImporter { ARROW_ASSIGN_OR_RAISE( type_, registered_ext_type->Deserialize(std::move(type_), metadata_.extension_serialized)); + RETURN_NOT_OK(metadata_.metadata->DeleteMany( + {metadata_.extension_name_index, metadata_.extension_serialized_index})); } } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 321ec36c38d8c..8b67027454c55 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -1872,7 +1872,7 @@ class TestSchemaImport : public ::testing::Test, public SchemaStructBuilder { ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); Reset(); // for further tests cb.AssertCalled(); // was released - AssertTypeEqual(*expected, *type); + AssertTypeEqual(*expected, *type, /*check_metadata=*/true); } void CheckImport(const std::shared_ptr& expected) { @@ -1892,7 +1892,7 @@ class TestSchemaImport : public ::testing::Test, public SchemaStructBuilder { ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); Reset(); // for further tests cb.AssertCalled(); // was released - AssertSchemaEqual(*expected, *schema); + AssertSchemaEqual(*expected, *schema, /*check_metadata=*/true); } void CheckImportError() { @@ -3571,7 +3571,7 @@ class TestSchemaRoundtrip : public ::testing::Test { // Recreate the type ASSERT_OK_AND_ASSIGN(actual, ImportType(&c_schema)); type = factory_expected(); - AssertTypeEqual(*type, *actual); + AssertTypeEqual(*type, *actual, /*check_metadata=*/true); type.reset(); actual.reset(); @@ -3602,7 +3602,7 @@ class TestSchemaRoundtrip : public ::testing::Test { // Recreate the schema ASSERT_OK_AND_ASSIGN(actual, ImportSchema(&c_schema)); schema = factory(); - AssertSchemaEqual(*schema, *actual); + AssertSchemaEqual(*schema, *actual, /*check_metadata=*/true); schema.reset(); actual.reset(); @@ -3695,13 +3695,27 @@ TEST_F(TestSchemaRoundtrip, Dictionary) { } } +// Given an extension type, 
return a field of its storage type + the +// serialized extension metadata. +std::shared_ptr GetStorageWithMetadata(const std::string& field_name, + const std::shared_ptr& type) { + const auto& ext_type = checked_cast(*type); + auto storage_type = ext_type.storage_type(); + auto md = KeyValueMetadata::Make({kExtensionTypeKeyName, kExtensionMetadataKeyName}, + {ext_type.extension_name(), ext_type.Serialize()}); + return field(field_name, storage_type, /*nullable=*/true, md); +} + TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { TestWithTypeFactory(uuid, []() { return fixed_size_binary(16); }); TestWithTypeFactory(dict_extension_type, []() { return dictionary(int8(), utf8()); }); - // Inside nested type - TestWithTypeFactory([]() { return list(dict_extension_type()); }, - []() { return list(dictionary(int8(), utf8())); }); + // Inside nested type. + // When an extension type is not known by the importer, it is imported + // as its storage type and the extension metadata is preserved on the field. + TestWithTypeFactory( + []() { return list(dict_extension_type()); }, + []() { return list(GetStorageWithMetadata("item", dict_extension_type())); }); } TEST_F(TestSchemaRoundtrip, RegisteredExtension) { @@ -3710,7 +3724,9 @@ TEST_F(TestSchemaRoundtrip, RegisteredExtension) { TestWithTypeFactory(dict_extension_type); TestWithTypeFactory(complex128); - // Inside nested type + // Inside nested type. + // When the extension type is registered, the extension metadata is removed + // from the storage type's field to ensure roundtripping (GH-39865). TestWithTypeFactory([]() { return list(uuid()); }); TestWithTypeFactory([]() { return list(dict_extension_type()); }); TestWithTypeFactory([]() { return list(complex128()); }); @@ -3810,7 +3826,7 @@ class TestArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory_expected())); - AssertTypeEqual(*expected->type(), *array->type()); + AssertTypeEqual(*expected->type(), *array->type(), /*check_metadata=*/true); AssertArraysEqual(*expected, *array, true); } array.reset(); @@ -3850,7 +3866,7 @@ class TestArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory())); - AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertSchemaEqual(*expected->schema(), *batch->schema(), /*check_metadata=*/true); AssertBatchesEqual(*expected, *batch); } batch.reset(); @@ -4230,7 +4246,7 @@ class TestDeviceArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory_expected())); - AssertTypeEqual(*expected->type(), *array->type()); + AssertTypeEqual(*expected->type(), *array->type(), /*check_metadata=*/true); AssertArraysEqual(*expected, *array, true); } array.reset(); @@ -4276,7 +4292,7 @@ class TestDeviceArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory())); - AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertSchemaEqual(*expected->schema(), *batch->schema(), /*check_metadata=*/true); AssertBatchesEqual(*expected, *batch); } batch.reset(); @@ -4353,7 +4369,7 @@ class TestArrayStreamExport : public BaseArrayStreamTest { SchemaExportGuard schema_guard(&c_schema); ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); - AssertSchemaEqual(expected, *schema); + AssertSchemaEqual(expected, *schema, /*check_metadata=*/true); } void AssertStreamEnd(struct 
ArrowArrayStream* c_stream) { @@ -4437,7 +4453,7 @@ TEST_F(TestArrayStreamExport, ArrayLifetime) { { SchemaExportGuard schema_guard(&c_schema); ASSERT_OK_AND_ASSIGN(auto got_schema, ImportSchema(&c_schema)); - AssertSchemaEqual(*schema, *got_schema); + AssertSchemaEqual(*schema, *got_schema, /*check_metadata=*/true); } ASSERT_GT(pool_->bytes_allocated(), orig_allocated_); @@ -4462,7 +4478,7 @@ TEST_F(TestArrayStreamExport, Errors) { { SchemaExportGuard schema_guard(&c_schema); ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); - AssertSchemaEqual(schema, arrow::schema({})); + AssertSchemaEqual(schema, arrow::schema({}), /*check_metadata=*/true); } struct ArrowArray c_array; @@ -4539,7 +4555,7 @@ TEST_F(TestArrayStreamRoundtrip, Simple) { ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make(batches, orig_schema)); Roundtrip(std::move(reader), [&](const std::shared_ptr& reader) { - AssertSchemaEqual(*orig_schema, *reader->schema()); + AssertSchemaEqual(*orig_schema, *reader->schema(), /*check_metadata=*/true); AssertReaderNext(reader, *batches[0]); AssertReaderNext(reader, *batches[1]); AssertReaderEnd(reader); diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index bc48ae76c2a2f..002e8b0975094 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -90,7 +90,7 @@ void KeyValueMetadata::Append(std::string key, std::string value) { values_.push_back(std::move(value)); } -Result KeyValueMetadata::Get(const std::string& key) const { +Result KeyValueMetadata::Get(std::string_view key) const { auto index = FindKey(key); if (index < 0) { return Status::KeyError(key); @@ -129,7 +129,7 @@ Status KeyValueMetadata::DeleteMany(std::vector indices) { return Status::OK(); } -Status KeyValueMetadata::Delete(const std::string& key) { +Status KeyValueMetadata::Delete(std::string_view key) { auto index = FindKey(key); if (index < 0) { return Status::KeyError(key); @@ -138,20 +138,18 @@ Status KeyValueMetadata::Delete(const std::string& key) { } } -Status KeyValueMetadata::Set(const std::string& key, const std::string& value) { +Status KeyValueMetadata::Set(std::string key, std::string value) { auto index = FindKey(key); if (index < 0) { - Append(key, value); + Append(std::move(key), std::move(value)); } else { - keys_[index] = key; - values_[index] = value; + keys_[index] = std::move(key); + values_[index] = std::move(value); } return Status::OK(); } -bool KeyValueMetadata::Contains(const std::string& key) const { - return FindKey(key) >= 0; -} +bool KeyValueMetadata::Contains(std::string_view key) const { return FindKey(key) >= 0; } void KeyValueMetadata::reserve(int64_t n) { DCHECK_GE(n, 0); @@ -188,7 +186,7 @@ std::vector> KeyValueMetadata::sorted_pairs( return pairs; } -int KeyValueMetadata::FindKey(const std::string& key) const { +int KeyValueMetadata::FindKey(std::string_view key) const { for (size_t i = 0; i < keys_.size(); ++i) { if (keys_[i] == key) { return static_cast(i); diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h index 8702ce73a639a..57ade11e75868 100644 --- a/cpp/src/arrow/util/key_value_metadata.h +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,13 +45,13 @@ class ARROW_EXPORT KeyValueMetadata { void ToUnorderedMap(std::unordered_map* out) const; void Append(std::string key, std::string value); - Result Get(const std::string& key) const; - bool 
Contains(const std::string& key) const; + Result Get(std::string_view key) const; + bool Contains(std::string_view key) const; // Note that deleting may invalidate known indices - Status Delete(const std::string& key); + Status Delete(std::string_view key); Status Delete(int64_t index); Status DeleteMany(std::vector indices); - Status Set(const std::string& key, const std::string& value); + Status Set(std::string key, std::string value); void reserve(int64_t n); @@ -63,7 +64,7 @@ class ARROW_EXPORT KeyValueMetadata { std::vector> sorted_pairs() const; /// \brief Perform linear search for key, returning -1 if not found - int FindKey(const std::string& key) const; + int FindKey(std::string_view key) const; std::shared_ptr Copy() const; From cb5c109a5d6985264203e256ddae0b210251e820 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Mon, 5 Feb 2024 22:23:50 +0530 Subject: [PATCH 37/74] GH-39946: [Java] Bump com.puppycrawl.tools:checkstyle from 8.19 to 8.29 (#39694) ### Rationale for this change This PR was created in place of https://github.com/apache/arrow/pull/39202 to integrate the `puppycrawl.tools.checkstyle` upgrade. ### What changes are included in this PR? Style changes in Java classes and core changes to the style format itself. Some unsupported attributes have been removed. And some attributes have been reorganized upon the provided guidelines in the documentation. ### Are these changes tested? N/A Tested by existing checkstyle guideline. ### Are there any user-facing changes? No * Closes: #39946 Lead-authored-by: Vibhatha Lakmal Abeykoon Co-authored-by: vibhatha Signed-off-by: David Li --- .../apache/arrow/adapter/jdbc/Constants.java | 3 +- .../adapter/jdbc/MockPreparedStatement.java | 63 ++++++++++++------- .../arrow/adapter/jdbc/ResultSetUtility.java | 3 +- .../apache/arrow/adapter/orc/OrcJniUtils.java | 3 +- java/dev/checkstyle/checkstyle.xml | 18 +++--- java/dev/checkstyle/suppressions.xml | 2 +- .../org/apache/arrow/flight/FlightClient.java | 3 +- .../apache/arrow/flight/FlightGrpcUtils.java | 3 +- .../org/apache/arrow/flight/FlightStream.java | 3 +- .../arrow/flight/OutboundStreamListener.java | 3 +- .../arrow/flight/auth/AuthConstants.java | 3 +- .../arrow/flight/auth/ServerAuthWrapper.java | 4 +- .../arrow/flight/TestClientMiddleware.java | 9 ++- .../integration/tests/OrderedScenario.java | 3 +- .../jdbc/utils/IntervalStringUtils.java | 3 +- .../utils/ClientAuthenticationUtilsTest.java | 2 +- .../evaluator/ConfigurationBuilder.java | 3 +- .../gandiva/evaluator/DecimalTypeUtil.java | 3 +- .../gandiva/expression/ArrowTypeHelper.java | 3 +- .../arrow/gandiva/expression/TreeBuilder.java | 3 +- java/maven/pom.xml | 2 +- .../arrow/memory/AllocationListener.java | 15 +++-- .../apache/arrow/memory/BaseAllocator.java | 24 +++---- .../org/apache/arrow/memory/BufferLedger.java | 32 +++++----- .../apache/arrow/memory/ReferenceManager.java | 6 +- .../memory/util/ByteFunctionHelpers.java | 3 +- .../apache/arrow/memory/util/CommonUtil.java | 3 +- .../arrow/memory/util/LargeMemoryUtil.java | 3 +- .../org/apache/arrow/util/Collections2.java | 3 +- .../org/apache/arrow/util/Preconditions.java | 3 +- java/pom.xml | 2 +- .../org/apache/arrow/tools/FileToStream.java | 3 +- .../apache/arrow/vector/AllocationHelper.java | 3 +- .../apache/arrow/vector/BitVectorHelper.java | 3 +- .../arrow/vector/GenerateSampleData.java | 3 +- .../org/apache/arrow/vector/NullVector.java | 3 +- .../apache/arrow/vector/compare/Range.java | 3 +- .../arrow/vector/complex/StateTool.java | 3 +- 
.../apache/arrow/vector/ipc/ArrowMagic.java | 3 +- .../vector/ipc/message/FBSerializables.java | 3 +- .../apache/arrow/vector/util/DateUtility.java | 3 +- .../arrow/vector/util/DecimalUtility.java | 3 +- .../arrow/vector/util/DictionaryUtility.java | 3 +- .../vector/util/ObjectMapperFactory.java | 3 +- .../arrow/vector/util/SchemaUtility.java | 3 +- .../testing/ValueVectorDataPopulator.java | 3 +- 46 files changed, 174 insertions(+), 107 deletions(-) diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java index 5b01077b17996..f95133fc7e44c 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java @@ -21,7 +21,8 @@ * String constants used for metadata returned on Vectors. */ public class Constants { - private Constants() {} + private Constants() { + } public static final String SQL_CATALOG_NAME_KEY = "SQL_CATALOG_NAME"; public static final String SQL_SCHEMA_NAME_KEY = "SQL_SCHEMA_NAME"; diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java index 438a949b736f1..4478cdfbee6f7 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java @@ -231,7 +231,8 @@ public void setDate(int parameterIndex, Date x, Calendar cal) throws SQLExceptio } @Override - public void setTime(int parameterIndex, Time x, Calendar cal) throws SQLException {} + public void setTime(int parameterIndex, Time x, Calendar cal) throws SQLException { + } @Override public void setTimestamp(int parameterIndex, Timestamp x, Calendar cal) throws SQLException { @@ -241,7 +242,8 @@ public void setTimestamp(int parameterIndex, Timestamp x, Calendar cal) throws S } @Override - public void setNull(int parameterIndex, int sqlType, String typeName) throws SQLException {} + public void setNull(int parameterIndex, int sqlType, String typeName) throws SQLException { + } @Override public void setURL(int parameterIndex, URL x) throws SQLException { @@ -259,62 +261,80 @@ public void setRowId(int parameterIndex, RowId x) throws SQLException { } @Override - public void setNString(int parameterIndex, String value) throws SQLException {} + public void setNString(int parameterIndex, String value) throws SQLException { + } @Override public void setNCharacterStream(int parameterIndex, Reader value, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setNClob(int parameterIndex, NClob value) throws SQLException {} + public void setNClob(int parameterIndex, NClob value) throws SQLException { + } @Override - public void setClob(int parameterIndex, Reader reader, long length) throws SQLException {} + public void setClob(int parameterIndex, Reader reader, long length) throws SQLException { + } @Override public void setBlob(int parameterIndex, InputStream inputStream, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setNClob(int parameterIndex, Reader reader, long length) throws SQLException {} + public void setNClob(int parameterIndex, Reader reader, long length) throws SQLException { + } @Override - public void setSQLXML(int parameterIndex, SQLXML xmlObject) throws SQLException {} + 
public void setSQLXML(int parameterIndex, SQLXML xmlObject) throws SQLException { + } @Override public void setObject(int parameterIndex, Object x, int targetSqlType, int scaleOrLength) - throws SQLException {} + throws SQLException { + } @Override - public void setAsciiStream(int parameterIndex, InputStream x, long length) throws SQLException {} + public void setAsciiStream(int parameterIndex, InputStream x, long length) throws SQLException { + } @Override - public void setBinaryStream(int parameterIndex, InputStream x, long length) throws SQLException {} + public void setBinaryStream(int parameterIndex, InputStream x, long length) throws SQLException { + } @Override public void setCharacterStream(int parameterIndex, Reader reader, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setAsciiStream(int parameterIndex, InputStream x) throws SQLException {} + public void setAsciiStream(int parameterIndex, InputStream x) throws SQLException { + } @Override - public void setBinaryStream(int parameterIndex, InputStream x) throws SQLException {} + public void setBinaryStream(int parameterIndex, InputStream x) throws SQLException { + } @Override - public void setCharacterStream(int parameterIndex, Reader reader) throws SQLException {} + public void setCharacterStream(int parameterIndex, Reader reader) throws SQLException { + } @Override - public void setNCharacterStream(int parameterIndex, Reader value) throws SQLException {} + public void setNCharacterStream(int parameterIndex, Reader value) throws SQLException { + } @Override - public void setClob(int parameterIndex, Reader reader) throws SQLException {} + public void setClob(int parameterIndex, Reader reader) throws SQLException { + } @Override - public void setBlob(int parameterIndex, InputStream inputStream) throws SQLException {} + public void setBlob(int parameterIndex, InputStream inputStream) throws SQLException { + } @Override - public void setNClob(int parameterIndex, Reader reader) throws SQLException {} + public void setNClob(int parameterIndex, Reader reader) throws SQLException { + } @Override public ResultSet executeQuery(String sql) throws SQLException { @@ -327,7 +347,8 @@ public int executeUpdate(String sql) throws SQLException { } @Override - public void close() throws SQLException {} + public void close() throws SQLException { + } @Override public int getMaxFieldSize() throws SQLException { diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java index c712741b51f5b..ccc7681c5bc8b 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java @@ -348,7 +348,8 @@ public static class MockColumnMetaData { private int displaySize; - private MockColumnMetaData() {} + private MockColumnMetaData() { + } private String getLabel() { return label; diff --git a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java index 9b599234bdf51..d61799e990f77 100644 --- a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java +++ b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java @@ -32,7 +32,8 @@ class OrcJniUtils { private static final String LIBRARY_NAME = "arrow_orc_jni"; private static boolean isLoaded = false; 
- private OrcJniUtils() {} + private OrcJniUtils() { + } static void loadOrcAdapterLibraryFromJar() throws IOException, IllegalAccessException { diff --git a/java/dev/checkstyle/checkstyle.xml b/java/dev/checkstyle/checkstyle.xml index c27f382ddda76..b63a4a9cba1f3 100644 --- a/java/dev/checkstyle/checkstyle.xml +++ b/java/dev/checkstyle/checkstyle.xml @@ -60,6 +60,11 @@ + + + + + @@ -72,10 +77,6 @@ - - - - @@ -223,13 +224,12 @@ - - - - - + + + + diff --git a/java/dev/checkstyle/suppressions.xml b/java/dev/checkstyle/suppressions.xml index 585985bf32dbc..a3536e2ca9212 100644 --- a/java/dev/checkstyle/suppressions.xml +++ b/java/dev/checkstyle/suppressions.xml @@ -40,5 +40,5 @@ - + diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java index fc491ebe0df98..8f251a7c7ef07 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java @@ -437,7 +437,8 @@ public ClientStreamListener getWriter() { */ public void getResult() { // After exchange is complete, make sure stream is drained to propagate errors through reader - while (reader.next()) { }; + while (reader.next()) { + } } /** Shut down the streams in this call. */ diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java index eb5e492b4cd46..b711d7ef6b5d7 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java @@ -125,7 +125,8 @@ public void enterIdle() { } } - private FlightGrpcUtils() {} + private FlightGrpcUtils() { + } /** * Creates a Flight service. diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java index 7a5a941603ace..84beee7d40564 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java @@ -194,7 +194,8 @@ public void close() throws Exception { } } // Drain the stream without the lock (as next() implicitly needs the lock) - while (next()) { } + while (next()) { + } } catch (FlightRuntimeException e) { suppressor = e; } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java index e80fb41c67273..80ddad90a1d28 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java @@ -119,5 +119,6 @@ default void start(VectorSchemaRoot root, DictionaryProvider dictionaries) { *

The default value can be toggled globally by setting the JVM property arrow.flight.enable_zero_copy_write * or the environment variable ARROW_FLIGHT_ENABLE_ZERO_COPY_WRITE. */ - default void setUseZeroCopy(boolean enabled) {} + default void setUseZeroCopy(boolean enabled) { + } } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java index e3ccdc626d71b..8a37115f1f024 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java @@ -47,5 +47,6 @@ public byte[] parseBytes(byte[] serialized) { public static final Context.Key PEER_IDENTITY_KEY = Context.keyWithDefault("arrow-flight-peer-identity", ""); - private AuthConstants() {} + private AuthConstants() { + } } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java index ad1a36a935fd7..3647e113cc0f6 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java @@ -115,7 +115,9 @@ public boolean hasNext() { @Override public void onError(Throwable t) { completed = true; - while (future == null) {/* busy wait */} + while (future == null) { + /* busy wait */ + } future.cancel(true); } diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java index bcff54bd7f66f..a1fa1f1d18509 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java @@ -303,10 +303,12 @@ public void onBeforeSendingHeaders(CallHeaders outgoingHeaders) { } @Override - public void onCallCompleted(CallStatus status) {} + public void onCallCompleted(CallStatus status) { + } @Override - public void onCallErrored(Throwable err) {} + public void onCallErrored(Throwable err) { + } } static class MultiHeaderClientMiddlewareFactory implements FlightClientMiddleware.Factory { @@ -356,6 +358,7 @@ public void onHeadersReceived(CallHeaders incomingHeaders) { } @Override - public void onCallCompleted(CallStatus status) {} + public void onCallCompleted(CallStatus status) { + } } } diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java index b8aa46fb5674a..13238f318eaaa 100644 --- a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java @@ -55,7 +55,8 @@ public FlightProducer producer(BufferAllocator allocator, Location location) thr } @Override - public void buildServer(FlightServer.Builder builder) throws Exception {} + public void buildServer(FlightServer.Builder builder) throws Exception { + } @Override public void client(BufferAllocator allocator, Location location, FlightClient client) diff --git 
a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java index fdf6c508d93b0..de6dccad4a846 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java @@ -31,7 +31,8 @@ public final class IntervalStringUtils { /** * Constructor Method of class. */ - private IntervalStringUtils( ) {} + private IntervalStringUtils( ) { + } /** * Formats a period similar to Oracle INTERVAL YEAR TO MONTH data type
. diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java index b7977462e9c01..78d252f7824c3 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java @@ -84,7 +84,7 @@ public void testGetDefaultKeyStoreInstancePassword() throws IOException, keyStoreMockedStatic .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit")) - .thenReturn(keyStoreMock); + .thenReturn(keyStoreMock); KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit"); Assert.assertEquals(receiveKeyStore, keyStoreMock); } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java index e903b4e873278..fa5d285b90997 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java @@ -43,7 +43,8 @@ public static ConfigOptions getDefault() { return new ConfigOptions(); } - public ConfigOptions() {} + public ConfigOptions() { + } public ConfigOptions withOptimize(boolean optimize) { this.optimize = optimize; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java index e0c072cfbe52e..703cfaa8be88b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java @@ -23,7 +23,8 @@ * Utility methods for working with {@link Decimal} values. */ public class DecimalTypeUtil { - private DecimalTypeUtil() {} + private DecimalTypeUtil() { + } /** * Enum for supported mathematical operations. diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 90f8684b455a8..e7377cc5c9db4 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -33,7 +33,8 @@ * Utility methods to convert between Arrow and Gandiva types. */ public class ArrowTypeHelper { - private ArrowTypeHelper() {} + private ArrowTypeHelper() { + } static final int WIDTH_8 = 8; static final int WIDTH_16 = 16; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index 8656e886aae24..3d2ea27d044e7 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -29,7 +29,8 @@ * Contains helper functions for constructing expression trees. */ public class TreeBuilder { - private TreeBuilder() {} + private TreeBuilder() { + } /** * Helper functions to create literal constants. 
diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 3a88ec762e19c..7fdca7db7b8d8 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -235,7 +235,7 @@ com.puppycrawl.tools checkstyle - 8.19 + 8.29 org.slf4j diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java index ff2b25dfa30ab..b8de6d819eaf8 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java @@ -34,7 +34,8 @@ public interface AllocationListener { * * @param size the buffer size being allocated */ - default void onPreAllocation(long size) {} + default void onPreAllocation(long size) { + } /** * Called each time a new buffer has been allocated. @@ -43,7 +44,8 @@ default void onPreAllocation(long size) {} * * @param size the buffer size being allocated */ - default void onAllocation(long size) {} + default void onAllocation(long size) { + } /** * Informed each time a buffer is released from allocation. @@ -51,7 +53,8 @@ default void onAllocation(long size) {} *

An exception cannot be thrown by this method. * @param size The size of the buffer being released. */ - default void onRelease(long size) {} + default void onRelease(long size) { + } /** @@ -73,7 +76,8 @@ default boolean onFailedAllocation(long size, AllocationOutcome outcome) { * @param parentAllocator The parent allocator to which a child was added * @param childAllocator The child allocator that was just added */ - default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator childAllocator) {} + default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator childAllocator) { + } /** * Called immediately after a child allocator was removed from the parent allocator. @@ -81,5 +85,6 @@ default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator child * @param parentAllocator The parent allocator from which a child was removed * @param childAllocator The child allocator that was just removed */ - default void onChildRemoved(BufferAllocator parentAllocator, BufferAllocator childAllocator) {} + default void onChildRemoved(BufferAllocator parentAllocator, BufferAllocator childAllocator) { + } } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java index 8779c7a3434ea..189c800ba0fe5 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -702,18 +702,18 @@ private void verifyAllocator( void print(StringBuilder sb, int level, Verbosity verbosity) { CommonUtil.indent(sb, level) - .append("Allocator(") - .append(name) - .append(") ") - .append(reservation) - .append('/') - .append(getAllocatedMemory()) - .append('/') - .append(getPeakMemoryAllocation()) - .append('/') - .append(getLimit()) - .append(" (res/actual/peak/limit)") - .append('\n'); + .append("Allocator(") + .append(name) + .append(") ") + .append(reservation) + .append('/') + .append(getAllocatedMemory()) + .append('/') + .append(getPeakMemoryAllocation()) + .append('/') + .append(getLimit()) + .append(" (res/actual/peak/limit)") + .append('\n'); if (DEBUG) { CommonUtil.indent(sb, level + 1).append(String.format("child allocators: %d\n", childAllocators.size())); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java index 1ca3e08ecf046..62d268a1f4493 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java @@ -478,20 +478,20 @@ public long getAccountedSize() { */ void print(StringBuilder sb, int indent, BaseAllocator.Verbosity verbosity) { CommonUtil.indent(sb, indent) - .append("ledger[") - .append(ledgerId) - .append("] allocator: ") - .append(allocator.getName()) - .append("), isOwning: ") - .append(", size: ") - .append(", references: ") - .append(bufRefCnt.get()) - .append(", life: ") - .append(lCreationTime) - .append("..") - .append(lDestructionTime) - .append(", allocatorManager: [") - .append(", life: "); + .append("ledger[") + .append(ledgerId) + .append("] allocator: ") + .append(allocator.getName()) + .append("), isOwning: ") + .append(", size: ") + .append(", references: ") + .append(bufRefCnt.get()) + .append(", life: ") + .append(lCreationTime) + .append("..") + .append(lDestructionTime) + 
.append(", allocatorManager: [") + .append(", life: "); if (!BaseAllocator.DEBUG) { sb.append("]\n"); @@ -499,8 +499,8 @@ void print(StringBuilder sb, int indent, BaseAllocator.Verbosity verbosity) { Preconditions.checkArgument(buffers != null, "IdentityHashMap of buffers must not be null"); synchronized (buffers) { sb.append("] holds ") - .append(buffers.size()) - .append(" buffers. \n"); + .append(buffers.size()) + .append(" buffers. \n"); for (ArrowBuf buf : buffers.keySet()) { buf.print(sb, indent + 2, verbosity); sb.append('\n'); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java index 7d4de18751ba9..64a4232d8aeb7 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java @@ -141,10 +141,12 @@ public boolean release(int decrement) { } @Override - public void retain() { } + public void retain() { + } @Override - public void retain(int increment) { } + public void retain(int increment) { + } @Override public ArrowBuf retain(ArrowBuf srcBuffer, BufferAllocator targetAllocator) { diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java index 9579245ca7004..79d21fa040876 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java @@ -32,7 +32,8 @@ public class ByteFunctionHelpers { private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; - private ByteFunctionHelpers() {} + private ByteFunctionHelpers() { + } /** * Helper function to check for equality of bytes in two ArrowBufs. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java index ccca7b1e03093..707c5f1556062 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java @@ -24,7 +24,8 @@ */ public final class CommonUtil { - private CommonUtil() { } + private CommonUtil() { + } /** * Rounds up the provided value to the nearest power of two. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java index db63bbd14ba5f..94a7873664216 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java @@ -22,7 +22,8 @@ /** Contains utilities for dealing with a 64-bit address base. 
*/ public final class LargeMemoryUtil { - private LargeMemoryUtil() {} + private LargeMemoryUtil() { + } /** * Casts length to an int, but raises an exception the value is outside diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java index 6b01a61ebca39..b88372abaaee1 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java @@ -34,7 +34,8 @@ * Utility methods for manipulating {@link java.util.Collections} and their subclasses/implementations. */ public final class Collections2 { - private Collections2() {} + private Collections2() { + } /** * Creates a {@link List} from the elements remaining in iterator. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java index 8083033007d9c..5e4323cfc9c61 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java @@ -111,7 +111,8 @@ * @since 2.0 */ public final class Preconditions { - private Preconditions() {} + private Preconditions() { + } /** * Ensures the truth of an expression involving one or more parameters to the calling method. diff --git a/java/pom.xml b/java/pom.xml index 7871303634976..b2b300b2f3fed 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -304,7 +304,7 @@ com.puppycrawl.tools checkstyle - 8.19 + 8.29 org.slf4j diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java index bb7cedeb74579..3d9bca58a763c 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java @@ -34,7 +34,8 @@ * first argument and the output is written to standard out. */ public class FileToStream { - private FileToStream() {} + private FileToStream() { + } /** * Reads an Arrow file from in and writes it back to out. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java index 6824756d8aca7..abece39475016 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java @@ -22,7 +22,8 @@ /** Helper utility methods for allocating storage for Vectors. */ public class AllocationHelper { - private AllocationHelper() {} + private AllocationHelper() { + } /** * Allocates the vector. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java index 568554ba75ed6..10f343e260ccc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java @@ -33,7 +33,8 @@ */ public class BitVectorHelper { - private BitVectorHelper() {} + private BitVectorHelper() { + } /** * Get the index of byte corresponding to bit index in validity buffer. 
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java index 6cda18a8a53d3..be501ce245410 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java @@ -27,7 +27,8 @@ * with sample data. This class should be used for that purpose. */ public class GenerateSampleData { - private GenerateSampleData() {} + private GenerateSampleData() { + } /** Populates vector with valueCount random values. */ public static void generateTestData(final ValueVector vector, final int valueCount) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java index d7b147feb152f..3b734bbf6608b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java @@ -300,7 +300,8 @@ public int getNullCount() { * @param index position of element */ @Override - public void setNull(int index) {} + public void setNull(int index) { + } @Override public boolean isNull(int index) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java index 0de99ab011f66..76db0734464ed 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java @@ -41,7 +41,8 @@ public class Range { /** * Constructs a new instance. */ - public Range() {} + public Range() { + } /** * Constructs a new instance. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java index 0098f68360a1a..2cd64c4fc6766 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java @@ -23,7 +23,8 @@ * Utility methods for state machines based on enums. */ public class StateTool { - private StateTool() {} + private StateTool() { + } static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StateTool.class); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java index 9c399669affc3..b16315caa9f51 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java @@ -25,7 +25,8 @@ * Magic header/footer helpers for {@link ArrowFileWriter} and {@link ArrowFileReader} formatted files. */ class ArrowMagic { - private ArrowMagic(){} + private ArrowMagic(){ + } private static final byte[] MAGIC = "ARROW1".getBytes(StandardCharsets.UTF_8); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java index 26736ed91c5ca..59b3bb07bcf16 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java @@ -31,7 +31,8 @@ * Utility methods for {@linkplain org.apache.arrow.vector.ipc.message.FBSerializable}s. 
*/ public class FBSerializables { - private FBSerializables() {} + private FBSerializables() { + } /** * Writes every element of all to builder and calls {@link FlatBufferBuilder#endVector()} afterwards. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java index 9e8b6d26f6fd7..f7f975a0d0e7b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java @@ -26,7 +26,8 @@ /** Utility class for Date, DateTime, TimeStamp, Interval data types. */ public class DateUtility { - private DateUtility() {} + private DateUtility() { + } private static final String UTC = "UTC"; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 0dfb61dcdf269..4635822e5141b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -29,7 +29,8 @@ * Utility methods for configurable precision Decimal values (e.g. {@link BigDecimal}). */ public class DecimalUtility { - private DecimalUtility() {} + private DecimalUtility() { + } public static final byte [] zeroes = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java index 9592f3975ab99..76fb585e6bd3a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java @@ -35,7 +35,8 @@ * Utility methods for working with Dictionaries used in Dictionary encodings. */ public class DictionaryUtility { - private DictionaryUtility() {} + private DictionaryUtility() { + } /** * Convert field and child fields that have a dictionary encoding to message format, so fields diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java index 39488e96efda0..5fa4c1b2260e3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java @@ -26,7 +26,8 @@ */ public final class ObjectMapperFactory { - private ObjectMapperFactory() {} + private ObjectMapperFactory() { + } /** * Creates a new {@link ObjectMapper} instance. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java index f8167604c21ad..5b3d00f6b7362 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java @@ -33,7 +33,8 @@ * Schema utility class including serialization and deserialization. */ public class SchemaUtility { - private SchemaUtility() {} + private SchemaUtility() { + } /** * Deserialize Arrow schema from byte array. 
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index f9f0357861c15..9e96e75880522 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -75,7 +75,8 @@ */ public class ValueVectorDataPopulator { - private ValueVectorDataPopulator(){} + private ValueVectorDataPopulator() { + } /** * Populate values for BigIntVector. From 0c88d13341dfaba5109683bda25ee3ffcd808080 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 6 Feb 2024 01:34:37 +0800 Subject: [PATCH 38/74] GH-39704: [C++][Parquet] Benchmark levels decoding (#39705) ### Rationale for this change This patch add the level-decoding benchmark. It test: 1. Different max-level (for flat type, maximum level would be 1, for nested type, it would grows) 2. With different repeat ( repeated null / non-null is different from non-repeated data) 3. With different read-batch size. This part of logic is a bit tricky in original code ### What changes are included in this PR? Add Level decoding benchmark ### Are these changes tested? No need ### Are there any user-facing changes? no * Closes: #39704 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/column_reader_benchmark.cc | 98 ++++++++++++++++++++++ cpp/src/parquet/column_writer_test.cc | 4 +- 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/column_reader_benchmark.cc b/cpp/src/parquet/column_reader_benchmark.cc index 49b2317ede187..61fe397cf1c30 100644 --- a/cpp/src/parquet/column_reader_benchmark.cc +++ b/cpp/src/parquet/column_reader_benchmark.cc @@ -219,5 +219,103 @@ BENCHMARK(RecordReaderReadRecords) ->Args({2, 1000, true}) ->Args({2, 1000, false}); +void GenerateLevels(int level_repeats, int max_level, int num_levels, + std::vector* levels) { + // Generate random levels + std::default_random_engine gen(/*seed=*/1943); + std::uniform_int_distribution d(0, max_level); + for (int i = 0; i < num_levels;) { + int16_t current_level = d(gen); // level repeat `level_repeats` times + const int current_repeated = std::min(level_repeats, num_levels - i); + levels->insert(levels->end(), current_repeated, current_level); + i += current_repeated; + } +} + +void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels, + const int16_t* input_levels, std::vector* bytes) { + LevelEncoder encoder; + // encode levels + if (encoding == Encoding::RLE) { + int rle_size = LevelEncoder::MaxBufferSize(encoding, max_level, num_levels); + bytes->resize(rle_size + sizeof(int32_t)); + // leave space to write the rle length value + encoder.Init(encoding, max_level, num_levels, bytes->data() + sizeof(int32_t), + rle_size); + encoder.Encode(num_levels, input_levels); + int data_length = encoder.len(); + memcpy(bytes->data(), &data_length, sizeof(int32_t)); + } else { + int bitpack_size = + LevelEncoder::MaxBufferSize(encoding, max_level, num_levels) + sizeof(int32_t); + bytes->resize(bitpack_size); + encoder.Init(encoding, max_level, num_levels, bytes->data(), + static_cast(bytes->size())); + encoder.Encode(num_levels, input_levels); + } +} + +static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int num_levels, + int batch_size, int level_repeat_count, + ::benchmark::State& state) { + std::vector bytes; + { + std::vector input_levels; + 
GenerateLevels(/*level_repeats=*/level_repeat_count, /*max_repeat_factor=*/max_level, + num_levels, &input_levels); + EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), &bytes); + } + + LevelDecoder decoder; + std::vector output_levels(batch_size); + for (auto _ : state) { + state.PauseTiming(); + decoder.SetData(level_encoding, max_level, num_levels, bytes.data(), + static_cast(bytes.size())); + state.ResumeTiming(); + // Decode multiple times with batch_size + while (true) { + int levels_decoded = decoder.Decode(batch_size, output_levels.data()); + if (levels_decoded == 0) { + break; + } + } + } + state.SetBytesProcessed(state.iterations() * num_levels * sizeof(int16_t)); + state.SetItemsProcessed(state.iterations() * num_levels); +} + +static void ReadLevels_Rle(::benchmark::State& state) { + int16_t max_level = static_cast(state.range(0)); + int num_levels = static_cast(state.range(1)); + int batch_size = static_cast(state.range(2)); + int level_repeat_count = static_cast(state.range(3)); + DecodeLevels(Encoding::RLE, max_level, num_levels, batch_size, level_repeat_count, + state); +} + +static void ReadLevels_BitPack(::benchmark::State& state) { + int16_t max_level = static_cast(state.range(0)); + int num_levels = static_cast(state.range(1)); + int batch_size = static_cast(state.range(2)); + int level_repeat_count = static_cast(state.range(3)); + DecodeLevels(Encoding::BIT_PACKED, max_level, num_levels, batch_size, + level_repeat_count, state); +} + +static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) { + b->ArgNames({"MaxLevel", "NumLevels", "BatchSize", "LevelRepeatCount"}) + ->Args({1, 8096, 1024, 1}) + ->Args({1, 8096, 1024, 7}) + ->Args({1, 8096, 1024, 1024}) + ->Args({1, 8096, 2048, 1}) + ->Args({3, 8096, 1024, 1}) + ->Args({3, 8096, 2048, 1}) + ->Args({3, 8096, 1024, 7}); +} + +BENCHMARK(ReadLevels_Rle)->Apply(ReadLevelsArguments); +BENCHMARK(ReadLevels_BitPack)->Apply(ReadLevelsArguments); + } // namespace benchmark } // namespace parquet diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 97421629d2ca6..a40e71ce30aec 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1021,7 +1021,7 @@ void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels, } void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level, - std::vector& input_levels, + const std::vector& input_levels, std::vector& bytes) { LevelDecoder decoder; int levels_count = 0; @@ -1060,7 +1060,7 @@ void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level, } void VerifyDecodingMultipleSetData(Encoding::type encoding, int16_t max_level, - std::vector& input_levels, + const std::vector& input_levels, std::vector>& bytes) { LevelDecoder decoder; int levels_count = 0; From de53aac762fc703148f5822ed170b462a6b467d8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 10:01:02 -0800 Subject: [PATCH 39/74] MINOR: [C#] Bump Grpc.Tools from 2.60.0 to 2.61.0 in /csharp (#39945) Bumps [Grpc.Tools](https://github.com/grpc/grpc) from 2.60.0 to 2.61.0.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 68c3e47e01902..3a6ae28b390d2 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + From 26801f147a9e98bb6c5bc4e7131bdf1bc2794467 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 5 Feb 2024 15:29:06 -0500 Subject: [PATCH 40/74] GH-39769: [C++][Device] Fix Importing nested and string types for DeviceArray (#39770) ### Rationale for this change In my testing with libcudf and other GPU data, I discovered a deficiency in ImportDeviceArray and thus ImportDeviceRecordBatch where the device type and memory manager aren't propagated to child importers and it fails to import offset-based types such as strings. ### What changes are included in this PR? These are relatively easily handled by first ensuring that `ImportChild` propagates the device_type and memory manager from the parent. Then for importing offset based values we merely need to use the memory manager to copy the final offset value to the CPU to use for the buffer size computation. This will work for any device which has implemented CopyBufferTo/From ### Are these changes tested? A new test is added to test these situations. * Closes: #39769 Authored-by: Matt Topol Signed-off-by: Matt Topol --- cpp/src/arrow/c/bridge.cc | 23 ++++++++++++++++++++--- cpp/src/arrow/c/bridge_test.cc | 10 ++++++++++ cpp/src/arrow/device.cc | 14 ++++++++++++++ 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 9b165a10a61e7..119249da99a6d 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1543,6 +1543,8 @@ struct ArrayImporter { if (recursion_level_ >= kMaxImportRecursionLevel) { return Status::Invalid("Recursion level in ArrowArray struct exceeded"); } + device_type_ = parent->device_type_; + memory_mgr_ = parent->memory_mgr_; // Child buffers will keep the entire parent import alive. // Perhaps we can move the child structs to an owned area // when the parent ImportedArrayData::Release() gets called, @@ -1857,10 +1859,25 @@ struct ArrayImporter { template Status ImportStringValuesBuffer(int32_t offsets_buffer_id, int32_t buffer_id, int64_t byte_width = 1) { - auto offsets = data_->GetValues(offsets_buffer_id); + if (device_type_ == DeviceAllocationType::kCPU) { + auto offsets = data_->GetValues(offsets_buffer_id); + // Compute visible size of buffer + int64_t buffer_size = + (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0; + return ImportBuffer(buffer_id, buffer_size); + } + + // we only need the value of the last offset so let's just copy that + // one value from device to host. + auto single_value_buf = + SliceBuffer(data_->buffers[offsets_buffer_id], + c_struct_->length * sizeof(OffsetType), sizeof(OffsetType)); + ARROW_ASSIGN_OR_RAISE( + auto cpubuf, Buffer::ViewOrCopy(single_value_buf, default_cpu_memory_manager())); + auto offsets = cpubuf->data_as(); // Compute visible size of buffer - int64_t buffer_size = - (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0; + int64_t buffer_size = (c_struct_->length > 0) ? 
byte_width * offsets[0] : 0; + return ImportBuffer(buffer_id, buffer_size); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 8b67027454c55..b8d5e0fcd3845 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -4320,6 +4320,16 @@ TEST_F(TestDeviceArrayRoundtrip, Primitive) { TestWithJSON(mm, int32(), "[4, 5, null]"); } +TEST_F(TestDeviceArrayRoundtrip, Struct) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + auto type = struct_({field("ints", int16()), field("strs", utf8())}); + + TestWithJSON(mm, type, "[]"); + TestWithJSON(mm, type, R"([[4, "foo"], [5, "bar"]])"); + TestWithJSON(mm, type, R"([[4, null], null, [5, "foo"]])"); +} + //////////////////////////////////////////////////////////////////////////// // Array stream export tests diff --git a/cpp/src/arrow/device.cc b/cpp/src/arrow/device.cc index 616f89aae896f..3736a4e018c33 100644 --- a/cpp/src/arrow/device.cc +++ b/cpp/src/arrow/device.cc @@ -195,6 +195,13 @@ Result> CPUMemoryManager::ViewBufferFrom( if (!from->is_cpu()) { return nullptr; } + // in this case the memory manager we're coming from is visible on the CPU, + // but uses an allocation type other than CPU. Since we know the data is visible + // to the CPU a "View" of this should use the CPUMemoryManager as the listed memory + // manager. + if (buf->device_type() != DeviceAllocationType::kCPU) { + return std::make_shared(buf->address(), buf->size(), shared_from_this(), buf); + } return buf; } @@ -220,6 +227,13 @@ Result> CPUMemoryManager::ViewBufferTo( if (!to->is_cpu()) { return nullptr; } + // in this case the memory manager we're coming from is visible on the CPU, + // but uses an allocation type other than CPU. Since we know the data is visible + // to the CPU a "View" of this should use the CPUMemoryManager as the listed memory + // manager. + if (buf->device_type() != DeviceAllocationType::kCPU) { + return std::make_shared(buf->address(), buf->size(), to, buf); + } return buf; } From fd69d307447888101600376fa3016b727a3e0106 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Tue, 6 Feb 2024 06:11:36 +0800 Subject: [PATCH 41/74] GH-39860: [C++] Expression ExecuteScalarExpression execute empty args function with a wrong result (#39908) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Try to fix #39860. ### What changes are included in this PR? Deal with the call->arguments.size() == 0's condition in ExecuteScalarExpression when we call some functions has no arguments, like (random, hash_count ...). ### Are these changes tested? Yes ### Are there any user-facing changes? No. 
* Closes: #39860 Lead-authored-by: hugo.zhang Co-authored-by: 张回归 Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/compute/expression.cc | 13 +++++++++++-- cpp/src/arrow/compute/expression_test.cc | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index b47e0a35525c5..8c59ad1df86f2 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -761,6 +761,15 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i } } + int64_t input_length; + if (!arguments.empty() && all_scalar) { + // all inputs are scalar, so use a 1-long batch to avoid + // computing input.length equivalent outputs + input_length = 1; + } else { + input_length = input.length; + } + auto executor = compute::detail::KernelExecutor::MakeScalar(); compute::KernelContext kernel_context(exec_context, call->kernel); @@ -772,8 +781,8 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, types, options})); compute::detail::DatumAccumulator listener; - RETURN_NOT_OK(executor->Execute( - ExecBatch(std::move(arguments), all_scalar ? 1 : input.length), &listener)); + RETURN_NOT_OK( + executor->Execute(ExecBatch(std::move(arguments), input_length), &listener)); const auto out = executor->WrapResults(arguments, listener.values()); #ifndef NDEBUG DCHECK_OK(executor->CheckResultType(out, call->function_name.c_str())); diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index 44159e76600fb..d33c348cd77da 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -863,6 +863,25 @@ TEST(Expression, ExecuteCall) { ])")); } +TEST(Expression, ExecuteCallWithNoArguments) { + const int kCount = 10; + auto random_options = RandomOptions::FromSeed(/*seed=*/0); + ExecBatch input({}, kCount); + + Expression random_expr = call("random", {}, random_options); + ASSERT_OK_AND_ASSIGN(random_expr, random_expr.Bind(float64())); + + ASSERT_OK_AND_ASSIGN(Datum actual, ExecuteScalarExpression(random_expr, input)); + compute::ExecContext* exec_context = default_exec_context(); + ASSERT_OK_AND_ASSIGN(auto function, + exec_context->func_registry()->GetFunction("random")); + ASSERT_OK_AND_ASSIGN(Datum expected, + function->Execute(input, &random_options, exec_context)); + AssertDatumsEqual(actual, expected, /*verbose=*/true); + + EXPECT_EQ(actual.length(), kCount); +} + TEST(Expression, ExecuteDictionaryTransparent) { ExpectExecute( equal(field_ref("a"), field_ref("b")), From 0415a60eebdaf8130ca3028a802529ecfb738493 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 6 Feb 2024 08:44:29 +0900 Subject: [PATCH 42/74] GH-39883: [CI][R][Windows] Use ci/scripts/install_minio.sh with Git bash (#39929) ### Rationale for this change `curl` in Rtools can't be used on non Rtools' MSYS2 environment. Because `curl` in Rtools can't refer `/usr/ssl/certs/ca-bundle.crt` on non Rtools' MSYS2 environment. ### What changes are included in this PR? Use the `bash` in GitHub Actions Runner. `curl` in the environment works. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* Closes: #39883 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/r.yml | 41 ++++++++++++----------- ci/scripts/install_minio.sh | 67 +++++++++++++++++++++++++------------ 2 files changed, 67 insertions(+), 41 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 2a801b6040ec8..3d1f75ede4bb5 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -21,24 +21,26 @@ on: push: paths: - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - "ci/docker/**" + - "ci/etc/rprofile" + - "ci/scripts/PKGBUILD" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/install_minio.sh" + - "ci/scripts/r_*.sh" - "cpp/**" - - 'docker-compose.yml' + - "docker-compose.yml" - "r/**" pull_request: paths: - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - "ci/docker/**" + - "ci/etc/rprofile" + - "ci/scripts/PKGBUILD" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/install_minio.sh" + - "ci/scripts/r_*.sh" - "cpp/**" - - 'docker-compose.yml' + - "docker-compose.yml" - "r/**" concurrency: @@ -256,6 +258,16 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + # This must be done before r-lib/actions/setup-r because curl in + # Rtools doesn't work on non Rtools' MSYS2 environment. If we + # use "shell: bash" after r-lib/actions/setup-r, bash in Rtools + # is used on non Rtools' MSYS2 environment. + - name: Install MinIO + shell: bash + run: | + mkdir -p "$HOME/.local/bin" + ci/scripts/install_minio.sh latest "$HOME/.local" + echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts uses: actions/download-artifact@v3 @@ -282,15 +294,6 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck - - name: Install MinIO - shell: bash - run: | - mkdir -p "$HOME/.local/bin" - curl \ - --output "$HOME/.local/bin/minio.exe" \ - https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z - chmod +x "$HOME/.local/bin/minio.exe" - echo "$HOME/.local/bin" >> $GITHUB_PATH # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows # - name: Install Google Cloud Storage Testbench # shell: bash diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 6ea8e1a095c39..e493a183b4543 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -17,7 +17,15 @@ # specific language governing permissions and limitations # under the License. 
-set -e +set -eu + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +version=$1 +prefix=$2 declare -A archs archs=([x86_64]=amd64 @@ -25,45 +33,60 @@ archs=([x86_64]=amd64 [aarch64]=arm64 [s390x]=s390x) -declare -A platforms -platforms=([Linux]=linux - [Darwin]=darwin) - arch=$(uname -m) -platform=$(uname) -version=$1 -prefix=$2 - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit 1 -elif [ -z ${archs[$arch]} ]; then +if [ -z ${archs[$arch]} ]; then echo "Unsupported architecture: ${arch}" exit 0 -elif [ -z ${platforms[$platform]} ]; then - echo "Unsupported platform: ${platform}" - exit 0 -elif [ "${version}" != "latest" ]; then +fi +arch=${archs[$arch]} + +platform=$(uname) +case ${platform} in + Linux) + platform=linux + ;; + Darwin) + platform=darwin + ;; + MSYS_NT*|MINGW64_NT*) + platform=windows + ;; + *) + echo "Unsupported platform: ${platform}" + exit 0 + ;; +esac + +if [ "${version}" != "latest" ]; then echo "Cannot fetch specific versions of minio, only latest is supported." exit 1 fi -arch=${archs[$arch]} -platform=${platforms[$platform]} - # Use specific versions for minio server and client to avoid CI failures on new releases. minio_version="minio.RELEASE.2022-05-26T05-48-41Z" mc_version="mc.RELEASE.2022-05-09T04-08-26Z" +download() +{ + local output=$1 + local url=$2 + + if type wget > /dev/null 2>&1; then + wget -nv --output-document ${output} ${url} + else + curl --fail --location --output ${output} ${url} + fi +} + if [[ ! -x ${prefix}/bin/minio ]]; then url="https://dl.min.io/server/minio/release/${platform}-${arch}/archive/${minio_version}" echo "Fetching ${url}..." - wget -nv --output-document ${prefix}/bin/minio ${url} + download ${prefix}/bin/minio ${url} chmod +x ${prefix}/bin/minio fi if [[ ! -x ${prefix}/bin/mc ]]; then url="https://dl.min.io/client/mc/release/${platform}-${arch}/archive/${mc_version}" echo "Fetching ${url}..." - wget -nv --output-document ${prefix}/bin/mc ${url} + download ${prefix}/bin/mc ${url} chmod +x ${prefix}/bin/mc fi From 9db823b45fd4ae455c531e944681c898bede7d53 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 6 Feb 2024 08:50:58 +0900 Subject: [PATCH 43/74] GH-39930: [C++] Use Requires instead of Libs for system RE2 in arrow.pc (#39932) ### Rationale for this change We chose Libs{,.private} with libre2.a for system RE2 in GH-10626. Because "Require{,.private} re2" may add "-std=c++11". If "-std=c++11" was added, users can't build Apache Arrow C++ because Apache Arrow C++ requires C++17 or later. But this approach doesn't work with RE2 2024-06-01 or later because it at least requires Abseil. If we keep the Libs{,.private} approach, we also need to add Abseil libraries to Libs{,.private}. But it's unmaintainable. ### What changes are included in this PR? Let's use "Requires{,.private} re2" instead of Libs{,.private}. I hope recent re2.pc doesn't add "-std=c++11". ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
* Closes: #39930 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 6bb9c0f6af2ca..0238c26c0fb51 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2594,16 +2594,11 @@ macro(build_re2) endmacro() if(ARROW_WITH_RE2) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. - resolve_dependency(re2 HAVE_ALT TRUE) - if(${re2_SOURCE} STREQUAL "SYSTEM" AND ARROW_BUILD_STATIC) - get_target_property(RE2_TYPE re2::re2 TYPE) - if(NOT RE2_TYPE STREQUAL "INTERFACE_LIBRARY") - string(APPEND ARROW_PC_LIBS_PRIVATE " $") - endif() - endif() + resolve_dependency(re2 + HAVE_ALT + TRUE + PC_PACKAGE_NAMES + re2) add_definitions(-DARROW_WITH_RE2) endif() From 0896d5b86510b9d410fd849610e2e1dedc77bf03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 6 Feb 2024 01:41:27 +0100 Subject: [PATCH 44/74] GH-39943: [CI][Python] Update manylinux images to avoid GPG problems downloading packages (#39944) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Old manylinux images seem to have issues with a GPG key in order to download packages. ### What changes are included in this PR? Update the manylinux image used for the latest one. ### Are these changes tested? Via archery jobs ### Are there any user-facing changes? No * Closes: #39943 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0252c4ec8a896..5c84d24fd7df7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1030,7 +1030,7 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2023-10-03-72cdc42 + base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2024-02-04-ea37246 vcpkg: ${VCPKG} python: ${PYTHON} manylinux: 2014 @@ -1053,7 +1053,7 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2023-10-03-72cdc42 + base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2024-02-04-ea37246 vcpkg: ${VCPKG} python: ${PYTHON} manylinux: 2_28 From 15525102992fbe83e6ce0943fe09e3f23a1287f4 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 6 Feb 2024 00:48:57 +0000 Subject: [PATCH 45/74] GH-39621: [CI][Packaging] Update vcpkg to 2023.11.20 release (#39622) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Old version of vcpkg is blocking https://github.com/apache/arrow/issues/39352 ### What changes are included in this PR? - Upgrade vcpkg - Fix ports patches - Upgrade visual studio used in windows wheel builds. VS2019 is now required for the vcpkg `abseil` build. - Move `VCPKG_FORCE_SYSTEM_BINARIES` to be set before vcpkg install to fix vcpkg install on linux ARM. - Fix for LLVM 17 which requires that an executable exports "llvm_orc_registerEHFrameSectionWrapper()" and "llvm_orc_unregisterEHFrameSectionWrapper()". This effects the java builds that depend on llvm from vcpkg for gandiva. - Update image used for python wheel builds on windows to 2024-02-05 ### Are these changes tested? 
Does not change any behaviour so should be covered by existing tests ### Are there any user-facing changes? There shouldn't be * Closes: #39621 Lead-authored-by: Thomas Newton Co-authored-by: Sutou Kouhei Co-authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- .env | 6 ++--- ci/docker/python-wheel-manylinux.dockerfile | 7 ++--- ...thon-wheel-windows-test-vs2019.dockerfile} | 4 +-- ...=> python-wheel-windows-vs2019.dockerfile} | 4 +-- ci/scripts/python_wheel_windows_build.bat | 8 +++--- ci/vcpkg/ports.patch | 27 ++++++++++--------- ci/vcpkg/vcpkg.json | 5 +++- cpp/src/gandiva/CMakeLists.txt | 9 +++++++ dev/tasks/python-wheels/github.windows.yml | 12 ++++----- docker-compose.yml | 16 +++++------ 10 files changed, 57 insertions(+), 41 deletions(-) rename ci/docker/{python-wheel-windows-test-vs2017.dockerfile => python-wheel-windows-test-vs2019.dockerfile} (96%) rename ci/docker/{python-wheel-windows-vs2017.dockerfile => python-wheel-windows-vs2019.dockerfile} (98%) diff --git a/.env b/.env index 427a4ab0bf398..eb87dc62bdd8c 100644 --- a/.env +++ b/.env @@ -92,13 +92,13 @@ DEVTOOLSET_VERSION= # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="501db0f17ef6df184fcdbfbe0f87cde2313b6ab1" # 2023.04.15 Release +VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release # This must be updated when we update -# ci/docker/python-wheel-windows-vs2017.dockerfile. +# ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2023-08-02 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-02-05 # Use conanio/${CONAN} for "docker-compose run --rm conan". See # https://github.com/conan-io/conan-docker-tools#readme for available diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index a07c727ac76fa..2831440d5a967 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -62,15 +62,16 @@ COPY ci/vcpkg/*.patch \ COPY ci/scripts/install_vcpkg.sh \ arrow/ci/scripts/ ENV VCPKG_ROOT=/opt/vcpkg -RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} -ENV PATH="${PATH}:${VCPKG_ROOT}" - ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_FORCE_SYSTEM_BINARIES=1 \ VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg \ VCPKG_DEFAULT_TRIPLET=${arch_short}-linux-static-${build_type} \ VCPKG_FEATURE_FLAGS="manifests" + +RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} +ENV PATH="${PATH}:${VCPKG_ROOT}" + COPY ci/vcpkg/vcpkg.json arrow/ci/vcpkg/ # cannot use the S3 feature here because while aws-sdk-cpp=1.9.160 contains # ssl related fixes as well as we can patch the vcpkg portfile to support diff --git a/ci/docker/python-wheel-windows-test-vs2017.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile similarity index 96% rename from ci/docker/python-wheel-windows-test-vs2017.dockerfile rename to ci/docker/python-wheel-windows-test-vs2019.dockerfile index e842ede18454b..67d99fa9c5724 100644 --- a/ci/docker/python-wheel-windows-test-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -19,8 +19,8 @@ # when you update this file. 
# based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2017 preinstalled -FROM abrarov/msvc-2017:2.11.0 +# contains choco and vs2019 preinstalled +FROM abrarov/msvc-2019:2.11.0 # Add unix tools to path RUN setx path "%path%;C:\Program Files\Git\usr\bin" diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile similarity index 98% rename from ci/docker/python-wheel-windows-vs2017.dockerfile rename to ci/docker/python-wheel-windows-vs2019.dockerfile index 067105b3a7995..b8e8aad952b1c 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -19,8 +19,8 @@ # when you update this file. # based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2017 preinstalled -FROM abrarov/msvc-2017:2.11.0 +# contains choco and vs2019 preinstalled +FROM abrarov/msvc-2019:2.11.0 # Install CMake and Ninja ARG cmake=3.21.4 diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index ffb43b3481e55..73b0192d9bc97 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -19,7 +19,7 @@ echo "Building windows wheel..." -call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" +call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" echo "=== (%PYTHON_VERSION%) Clear output directories and leftovers ===" del /s /q C:\arrow-build @@ -50,7 +50,8 @@ set ARROW_WITH_SNAPPY=ON set ARROW_WITH_ZLIB=ON set ARROW_WITH_ZSTD=ON set CMAKE_UNITY_BUILD=ON -set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +set CMAKE_GENERATOR=Visual Studio 16 2019 +set CMAKE_PLATFORM=x64 set VCPKG_ROOT=C:\vcpkg set VCPKG_FEATURE_FLAGS=-manifests set VCGPK_TARGET_TRIPLET=amd64-windows-static-md-%CMAKE_BUILD_TYPE% @@ -96,6 +97,7 @@ cmake ^ -DVCPKG_MANIFEST_MODE=OFF ^ -DVCPKG_TARGET_TRIPLET=%VCGPK_TARGET_TRIPLET% ^ -G "%CMAKE_GENERATOR%" ^ + -A "%CMAKE_PLATFORM%" ^ C:\arrow\cpp || exit /B 1 cmake --build . 
--config %CMAKE_BUILD_TYPE% --target install || exit /B 1 popd @@ -121,6 +123,6 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python @REM bundle the msvc runtime -cp "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Redist\MSVC\14.16.27012\x64\Microsoft.VC141.CRT\msvcp140.dll" pyarrow\ +cp "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Redist\MSVC\14.28.29325\x64\Microsoft.VC142.CRT\msvcp140.dll" pyarrow\ python setup.py bdist_wheel || exit /B 1 popd diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 68f6cae5addc9..0d4fb540a2003 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,13 +1,14 @@ diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index 5a14562..924b1b7 100644 +index bdc544e9e..53f6bbc3b 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -87,8 +87,11 @@ vcpkg_cmake_configure( +@@ -74,9 +74,12 @@ vcpkg_cmake_configure( -DENABLE_MANUAL=OFF -DCURL_CA_FALLBACK=ON -DCURL_USE_LIBPSL=OFF + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none + -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON OPTIONS_DEBUG -DENABLE_DEBUG=ON + ${EXTRA_ARGS_DEBUG} @@ -15,29 +16,29 @@ index 5a14562..924b1b7 100644 vcpkg_cmake_install() vcpkg_copy_pdbs() diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 8f3f3f9..745b0fb 100644 +index 0c7098082..c603c3653 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake -@@ -9,6 +9,7 @@ vcpkg_from_github( - HEAD_REF master +@@ -10,6 +10,7 @@ vcpkg_from_github( PATCHES fix_clang-cl_build.patch + no-werror.patch + "snappy-disable-bmi.patch" ) vcpkg_cmake_configure( diff --git a/ports/snappy/snappy-disable-bmi.patch b/ports/snappy/snappy-disable-bmi.patch new file mode 100644 -index 0000000..a57ce0c +index 000000000..e839c93a4 --- /dev/null +++ b/ports/snappy/snappy-disable-bmi.patch @@ -0,0 +1,19 @@ +diff --git a/snappy.cc b/snappy.cc -+index 79dc0e8..f3153ea 100644 ++index d414718..7b49d2a 100644 +--- a/snappy.cc ++++ b/snappy.cc -+@@ -965,14 +965,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, -+ static inline uint32_t ExtractLowBytes(uint32_t v, int n) { ++@@ -1014,14 +1014,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, ++ static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) { + assert(n >= 0); + assert(n <= 4); +-#if SNAPPY_HAVE_BMI2 @@ -52,13 +53,13 @@ index 0000000..a57ce0c + + static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { diff --git a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake -index 4d7e26a..1f054a2 100644 +index bf9397b66..c3112b673 100644 --- a/ports/llvm/portfile.cmake +++ b/ports/llvm/portfile.cmake -@@ -274,6 +274,8 @@ vcpkg_cmake_configure( +@@ -293,6 +293,8 @@ vcpkg_cmake_configure( + ${FEATURE_OPTIONS} + MAYBE_UNUSED_VARIABLES COMPILER_RT_ENABLE_IOS - OPENMP_TOOLS_INSTALL_DIR - MLIR_TOOLS_INSTALL_DIR + BOLT_TOOLS_INSTALL_DIR + LIBOMP_INSTALL_ALIASES ) diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index 71c23165e61f0..99771728ecf18 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -81,8 +81,11 @@ "default-features": false, "features": [ "clang", - "default-options", "default-targets", + "enable-bindings", + "enable-terminfo", + "enable-zlib", + "enable-zstd", "enable-rtti", "lld", "tools" diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 3f038f54a7b27..d773fb5ff5895 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -229,6 
+229,15 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(TEST_NAME gandiva-${REL_TEST_NAME}) string(REPLACE "_" "-" TEST_NAME ${TEST_NAME}) + + if(ARG_USE_STATIC_LINKING OR ARROW_TEST_LINKAGE STREQUAL "static") + # LLVM 17 or later requires that an executable exports + # "llvm_orc_registerEHFrameSectionWrapper()" and + # "llvm_orc_unregisterEHFrameSectionWrapper()". We need to do + # nothing when we use libLLVM.so. But we need to export symbols + # explicitly when we use libLLVM*.a. + set_target_properties(${TEST_NAME} PROPERTIES ENABLE_EXPORTS TRUE) + endif() endfunction() add_gandiva_test(internals-test diff --git a/dev/tasks/python-wheels/github.windows.yml b/dev/tasks/python-wheels/github.windows.yml index 1641796a719e2..01f4977a9b0b1 100644 --- a/dev/tasks/python-wheels/github.windows.yml +++ b/dev/tasks/python-wheels/github.windows.yml @@ -29,7 +29,7 @@ jobs: # this is a private repository at the moment (mostly because of licensing # consideration of windows images with visual studio), but anyone can # recreate the image by manually building it via: - # `archery build python-wheel-windows-vs2017` + # `archery build python-wheel-windows-vs2019` # note that we don't run docker build since there wouldn't be a cache hit # and rebuilding the dependencies takes a fair amount of time REPO: ghcr.io/ursacomputing/arrow @@ -46,17 +46,17 @@ jobs: run: | cd arrow @rem We want to use only - @rem archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2017 + @rem archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019 @rem but it doesn't use pulled caches. @rem It always build an image from scratch. @rem We can remove this workaround once we find a way to use @rem pulled caches when build an image. echo on - archery docker pull --no-ignore-pull-failures python-wheel-windows-vs2017 + archery docker pull --no-ignore-pull-failures python-wheel-windows-vs2019 if errorlevel 1 ( - archery docker build --no-pull python-wheel-windows-vs2017 || exit /B 1 + archery docker build --no-pull python-wheel-windows-vs2019 || exit /B 1 ) - archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2017 + archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019 - uses: actions/upload-artifact@v3 with: @@ -77,5 +77,5 @@ jobs: shell: cmd run: | cd arrow - archery docker push python-wheel-windows-vs2017 + archery docker push python-wheel-windows-vs2019 {% endif %} diff --git a/docker-compose.yml b/docker-compose.yml index 5c84d24fd7df7..8a7223b57632f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -172,7 +172,7 @@ x-hierarchy: - python-wheel-manylinux-2-28 - python-wheel-manylinux-test-imports - python-wheel-manylinux-test-unittests - - python-wheel-windows-vs2017 + - python-wheel-windows-vs2019 - python-wheel-windows-test volumes: @@ -1098,19 +1098,19 @@ services: CHECK_UNITTESTS: "ON" command: /arrow/ci/scripts/python_wheel_unix_test.sh /arrow - python-wheel-windows-vs2017: - image: ${REPO}:python-${PYTHON}-wheel-windows-vs2017-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + python-wheel-windows-vs2019: + image: ${REPO}:python-${PYTHON}-wheel-windows-vs2019-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} build: args: vcpkg: ${VCPKG} python: ${PYTHON} context: . 
- dockerfile: ci/docker/python-wheel-windows-vs2017.dockerfile + dockerfile: ci/docker/python-wheel-windows-vs2019.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. # Uncomment if no local cache is available. # cache_from: - # - abrarov/msvc-2017:2.11.0 - # - ${REPO}:python-${PYTHON}-wheel-windows-vs2017-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + # - abrarov/msvc-2019:2.11.0 + # - ${REPO}:python-${PYTHON}-wheel-windows-vs2019-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} volumes: - "${DOCKER_VOLUME_PREFIX}python-wheel-windows-clcache:C:/clcache" - type: bind @@ -1119,12 +1119,12 @@ services: command: arrow\\ci\\scripts\\python_wheel_windows_build.bat python-wheel-windows-test: - image: ${REPO}:python-${PYTHON}-wheel-windows-test-vs2017-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + image: ${REPO}:python-${PYTHON}-wheel-windows-test-vs2019-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} build: args: python: ${PYTHON} context: . - dockerfile: ci/docker/python-wheel-windows-test-vs2017.dockerfile + dockerfile: ci/docker/python-wheel-windows-test-vs2019.dockerfile volumes: - "${DOCKER_VOLUME_PREFIX}python-wheel-windows-clcache:C:/clcache" - type: bind From 874e59670773bd0d52d9c6811483c00abc5ee736 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 6 Feb 2024 01:54:13 +0100 Subject: [PATCH 46/74] GH-39737: [Release][Docs] Update post release documentation task (#39762) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR updates the `dev/release/post-08-docs.sh` task so that - `DOCUMENTATION_OPTIONS.theme_switcher_version_match` changes from `""` to `"{previous_version}"` - `DOCUMENTATION_OPTIONS.show_version_warning_banner` changes from `false` to `true` for the documentation that is moved to a subfolder when a new major release is done. * Closes: #39737 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/release/post-08-docs.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh index f18f7d10c73e6..4df574700e812 100755 --- a/dev/release/post-08-docs.sh +++ b/dev/release/post-08-docs.sh @@ -86,6 +86,21 @@ if [ "$is_major_release" = "yes" ] ; then fi git add docs git commit -m "[Website] Update documentations for ${version}" + +# Update DOCUMENTATION_OPTIONS.theme_switcher_version_match and +# DOCUMENTATION_OPTIONS.show_version_warning_banner +pushd docs/${previous_series} +find ./ \ + -type f \ + -exec \ + sed -i.bak \ + -e "s/DOCUMENTATION_OPTIONS.theme_switcher_version_match = '';/DOCUMENTATION_OPTIONS.theme_switcher_version_match = '${previous_version}';/g" \ + -e "s/DOCUMENTATION_OPTIONS.show_version_warning_banner = false/DOCUMENTATION_OPTIONS.show_version_warning_banner = true/g" \ + {} \; +find ./ -name '*.bak' -delete +popd +git add docs/${previous_series} +git commit -m "[Website] Update warning banner for ${previous_series}" git clean -d -f -x popd From 062c841836642ab95b1ffde031d271fffd29987d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 6 Feb 2024 10:56:16 +0900 Subject: [PATCH 47/74] GH-39057: [CI][C++][Go] Don't run jobs that use a self-hosted GitHub Actions Runner on fork (#39903) ### Rationale for this change If jobs that use a self-hosted GitHub Actions Runner on fork are submitted on fork, they will timeout eventually and report noisy failure notifications. ### What changes are included in this PR? 
We can't use `jobs..if` to reject jobs that use self-hosted GitHub Actions Runner because `jobs..if` is evaluated before `jobs..strategy.matrix`. https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idif > Note: The `jobs..if` condition is evaluated before > `jobs..strategy.matrix` is applied. We can use output `jobs.outputs` instead. See also: * https://docs.github.com/en/actions/using-jobs/defining-outputs-for-jobs * https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idoutputs ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #39057 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 74 +++++++++++++++++++++++++++------------ .github/workflows/go.yml | 65 +++++++++++++++++++++++++--------- 2 files changed, 99 insertions(+), 40 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 9fbad06692bd2..e9409f1cd6248 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -57,37 +57,65 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: + docker-targets: + name: Docker targets + runs-on: ubuntu-latest + outputs: + targets: ${{ steps.detect-targets.outputs.targets }} + steps: + - name: Detect targets + id: detect-targets + run: | + echo "targets<> "$GITHUB_OUTPUT" + echo "[" >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch": "amd64", + "clang-tools": "14", + "image": "conda-cpp", + "llvm": "14", + "runs-on": "ubuntu-latest", + "simd-level": "AVX2", + "title": "AMD64 Conda C++ AVX2", + "ubuntu": "22.04" + }, + { + "arch": "amd64", + "clang-tools": "14", + "image": "ubuntu-cpp-sanitizer", + "llvm": "14", + "runs-on": "ubuntu-latest", + "title": "AMD64 Ubuntu 22.04 C++ ASAN UBSAN", + "ubuntu": "22.04" + } + JSON + if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then + echo "," >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch": "arm64v8", + "clang-tools": "10", + "image": "ubuntu-cpp", + "llvm": "10", + "runs-on": ["self-hosted", "arm", "linux"], + "title": "ARM64 Ubuntu 20.04 C++", + "ubuntu": "20.04" + } + JSON + fi + echo "]" >> "$GITHUB_OUTPUT" + echo "JSON" >> "$GITHUB_OUTPUT" + docker: name: ${{ matrix.title }} + needs: docker-targets runs-on: ${{ matrix.runs-on }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 strategy: fail-fast: false matrix: - include: - - arch: amd64 - clang-tools: "14" - image: conda-cpp - llvm: "14" - runs-on: ubuntu-latest - simd-level: AVX2 - title: AMD64 Conda C++ AVX2 - ubuntu: "22.04" - - arch: amd64 - clang-tools: "14" - image: ubuntu-cpp-sanitizer - llvm: "14" - runs-on: ubuntu-latest - title: AMD64 Ubuntu 22.04 C++ ASAN UBSAN - ubuntu: "22.04" - - arch: arm64v8 - clang-tools: "10" - image: ubuntu-cpp - llvm: "10" - runs-on: ["self-hosted", "arm", "linux"] - title: ARM64 Ubuntu 20.04 C++ - ubuntu: "20.04" + include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index cd44e65e8811b..bbffab6704087 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -43,31 +43,62 @@ permissions: jobs: + docker-targets: + name: Docker targets + runs-on: ubuntu-latest + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + outputs: + targets: ${{ steps.detect-targets.outputs.targets }} + steps: + - name: Detect targets + id: detect-targets + run: | + echo "targets<> 
"$GITHUB_OUTPUT" + echo "[" >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch-label": "AMD64", + "arch": "amd64", + "go": "1.19", + "runs-on": "ubuntu-latest" + }, + { + "arch-label": "AMD64", + "arch": "amd64", + "go": "1.20", + "runs-on": "ubuntu-latest" + } + JSON + if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then + echo "," >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch-label": "ARM64", + "arch": "arm64v8", + "go": "1.19", + "runs-on": ["self-hosted", "arm", "linux"] + }, + { + "arch-label": "ARM64", + "arch": "arm64v8", + "go": "1.20", + "runs-on": ["self-hosted", "arm", "linux"] + } + JSON + fi + echo "]" >> "$GITHUB_OUTPUT" + echo "JSON" >> "$GITHUB_OUTPUT" + docker: name: ${{ matrix.arch-label }} Debian 11 Go ${{ matrix.go }} + needs: docker-targets runs-on: ${{ matrix.runs-on }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 strategy: fail-fast: false matrix: - include: - - arch-label: AMD64 - arch: amd64 - go: 1.19 - runs-on: ubuntu-latest - - arch-label: AMD64 - arch: amd64 - go: '1.20' - runs-on: ubuntu-latest - - arch-label: ARM64 - arch: arm64v8 - go: 1.19 - runs-on: ["self-hosted", "arm", "linux"] - - arch-label: ARM64 - arch: arm64v8 - go: '1.20' - runs-on: ["self-hosted", "arm", "linux"] + include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} GO: ${{ matrix.go }} From f38ae607983264dc52a938d1930916c73292a92e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:03:44 +0900 Subject: [PATCH 48/74] MINOR: [Java] Bump com.fasterxml.jackson:jackson-bom from 2.16.0 to 2.16.1 in /java (#39947) Bumps [com.fasterxml.jackson:jackson-bom](https://github.com/FasterXML/jackson-bom) from 2.16.0 to 2.16.1.
Commits
  • f70e9cf [maven-release-plugin] prepare release jackson-bom-2.16.1
  • 22a8c3a Prepare for 2.16.1 release
  • 4203816 back to snapshot deps
  • 4fb9d50 [maven-release-plugin] prepare for next development iteration
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=com.fasterxml.jackson:jackson-bom&package-manager=maven&previous-version=2.16.0&new-version=2.16.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index b2b300b2f3fed..1faeb46d02afc 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -36,7 +36,7 @@ 4.1.106.Final 1.60.0 3.23.1 - 2.16.0 + 2.16.1 3.3.6 23.5.26 1.11.3 From 1950f8000fa25368602b530dbec4b3d286aed819 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:04:08 +0900 Subject: [PATCH 49/74] MINOR: [Java] Bump org.cyclonedx:cyclonedx-maven-plugin from 2.7.10 to 2.7.11 in /java (#39948) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.cyclonedx:cyclonedx-maven-plugin](https://github.com/CycloneDX/cyclonedx-maven-plugin) from 2.7.10 to 2.7.11.
Release notes

Sourced from org.cyclonedx:cyclonedx-maven-plugin's releases.

2.7.11

🚀 New features and improvements

📦 Dependency updates

  • define plugin-tools.version property (#453) @​hboutemy
  • Bump org.apache.maven.plugin-tools:maven-plugin-annotations from 3.10.2 to 3.11.0 (#451) @​dependabot
  • Bump org.apache.maven.plugins:maven-plugin-report-plugin from 3.10.2 to 3.11.0 (#450) @​dependabot
  • Bump org.apache.maven.plugins:maven-plugin-plugin from 3.10.2 to 3.11.0 (#449) @​dependabot
  • Bump org.apache.maven.plugins:maven-compiler-plugin from 3.11.0 to 3.12.1 (#447) @​dependabot
  • Bump org.apache.maven.plugins:maven-plugin-plugin from 3.10.1 to 3.10.2 (#445) @​dependabot
  • Bump org.apache.maven.plugins:maven-project-info-reports-plugin from 3.4.5 to 3.5.0 (#442) @​dependabot
  • Bump org.apache.commons:commons-lang3 from 3.13.0 to 3.14.0 (#443) @​dependabot
  • Bump org.apache.maven.plugin-tools:maven-plugin-annotations from 3.10.1 to 3.10.2 (#444) @​dependabot
  • Bump org.junit:junit-bom from 5.10.0 to 5.10.1 (#422) @​dependabot
  • Bump org.apache.maven.plugins:maven-plugin-report-plugin from 3.10.1 to 3.10.2 (#424) @​dependabot
  • Bump org.apache.maven.plugins:maven-javadoc-plugin from 3.6.0 to 3.6.3 (#438) @​dependabot
  • Bump actions/setup-java from 3 to 4 (#437) @​dependabot
  • Bump org.apache.maven.plugins:maven-plugin-report-plugin from 3.9.0 to 3.10.1 (#417) @​dependabot
Commits
  • 349fe7c [maven-release-plugin] prepare release cyclonedx-maven-plugin-2.7.11
  • 2d130a0 rename convert methods to explicit project vs dependency
  • 051be8e cleanup unused code
  • d0e6cb5 test dependency type=zip for #431 (reverts #9)
  • 46837cd Update DefaultModelConverter.java to support Zip files
  • dc90b21 define plugin-tools.version property
  • 8836cbd Add support for custom external references (#428)
  • 86410aa Bump org.apache.maven.plugin-tools:maven-plugin-annotations
  • 4d71b50 Bump org.apache.maven.plugins:maven-plugin-report-plugin
  • 70aae8e Bump org.apache.maven.plugins:maven-plugin-plugin from 3.10.2 to 3.11.0
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.cyclonedx:cyclonedx-maven-plugin&package-manager=maven&previous-version=2.7.10&new-version=2.7.11)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/maven/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 7fdca7db7b8d8..9842777c15495 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -271,7 +271,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.10 + 2.7.11 package diff --git a/java/pom.xml b/java/pom.xml index 1faeb46d02afc..e928960182ab2 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -364,7 +364,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.10 + 2.7.11 package From c7a166fc5aeec3f1b6e5d68cc7746b228a8dad04 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:04:37 +0900 Subject: [PATCH 50/74] MINOR: [Java] Bump org.apache.maven.plugins:maven-project-info-reports-plugin from 3.0.0 to 3.5.0 in /java (#39949) Bumps [org.apache.maven.plugins:maven-project-info-reports-plugin](https://github.com/apache/maven-project-info-reports-plugin) from 3.0.0 to 3.5.0.
Commits
  • 28ac4b5 [maven-release-plugin] prepare release maven-project-info-reports-plugin-3.5.0
  • 9c4fc33 Remove unneeded incompatibility notice
  • 482ea62 Fix formatting
  • 60cfdea [MPIR-453] Replace Commons IO in favor of standard APIs
  • 4d94edc [MPIR-446] Update to Maven SCM 2.0.
  • 91a065b [MPIR-452] Upgrade to Parent 41
  • ceac0bf Consistently use MavenReport#getReportOutputDirectory()
  • c16ec94 [MNG-7416] Simplify Boolean expressions and returns (#63)
  • abd0e76 Fix style value
  • 18aedbb Reduce IT runtime by invoking goal directly
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-project-info-reports-plugin&package-manager=maven&previous-version=3.0.0&new-version=3.5.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/maven/pom.xml | 2 +- java/pom.xml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 9842777c15495..c2b13119fc440 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -333,7 +333,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins diff --git a/java/pom.xml b/java/pom.xml index e928960182ab2..258e45a519c59 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -395,7 +395,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins @@ -598,7 +598,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins @@ -803,7 +803,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins From 672238ff6352fa388b54182d8ae1667f9e99c327 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:05:00 +0900 Subject: [PATCH 51/74] MINOR: [Java] Bump io.grpc:grpc-bom from 1.60.0 to 1.61.1 in /java (#39950) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.grpc:grpc-bom](https://github.com/grpc/grpc-java) from 1.60.0 to 1.61.1.
Release notes

Sourced from io.grpc:grpc-bom's releases.

v1.61.1

Bug Fixes

xds: Fix a bug in WeightedRoundRobinLoadBalancer policy that could raise NullPointerException and further cause channel panic when picking a subchannel. This bug can only be triggered when connection can not be established and the channel reports TRANSIENT_FAILURE state. (#10868)

v1.61.0

API Changes

  • Remove unused experimental API ManagedChannelBuilder.enableFullStreamDecompression (#10744)
  • api: Deprecate LoadBalancer.EMPTY_PICKER added in 1.58.0 in favor of FixedResultPicker (860b5cb1f)

New Features

  • binder: Experimental support for asynchronous security policies (#10566)

Improvements

  • core: reduce CompositeReadableBuffer allocation (#3279)
  • core: Improve error message clarity when a channel leak is detected (201893f5e)
  • util: use shared index across round_robin pickers (dca89b25b). This makes its implementation more similar to weighted_round_robin.
  • xds: Implement ADS stream flow control mechanism (#10674). This limits the maximum memory consumed if the control plane sends updates more rapidly than they can be processed.

Bug Fixes

  • core: Check outbound maximum message size for the compressed size in addition to the already-checked uncompressed size (#10739). Fixed the status code to be RESOURCE_EXHAUSTED instead of UNKNOWN.
  • util: Fix NPE when multiple addresses are in an address group for petiole load balancer policies (#10769)
  • util: Disable publishing of fixtures (8ac43dd81). The Gradle test fixtures are for use by grpc-java's internal tests.
  • okhttp: Ignore known conscrypt socket close issue (#10812). This stops an exception from being thrown when a known Conscrypt synchronization issue happens.

Dependencies

  • Drop support for Bazel 5 (55a9c012c). Bazel 7 is available, and Protobuf has already dropped support for Bazel 5.
  • Change many compile deps to runtime deps (d6830d7f9). This reduces the transitive classes "leaked" into the compile classpath. In particular, grpc-core (io.grpc.internal) will be less frequently included transitively at compile time.
  • Upgrade dependencies (c985797d9)
    • Protobuf to 3.25.1
    • auto-value-annotations to 1.10.4
    • error_prone_annotations to 2.23.0
    • proto-google-common-protos to 2.29.0
    • google-cloud-logging to 3.15.14
    • guava to 32.1.3-android
    • okio to 3.4.0

Acknowledgements

v1.60.2

Bug Fixes

xds: Fix a bug in WeightedRoundRobinLoadBalancer policy that could raise NullPointerException and further cause channel panic when picking a subchannel. This bug can only be triggered when connection can not be established and the channel reports TRANSIENT_FAILURE state. (#10868)

v1.60.1

Bug Fixes

  • util: Fix NPE when multiple addresses in an address group for petiole load balancer policies (#10770)
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.grpc:grpc-bom&package-manager=maven&previous-version=1.60.0&new-version=1.61.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 258e45a519c59..6442987f5a192 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -34,7 +34,7 @@ 2.0.11 33.0.0-jre 4.1.106.Final - 1.60.0 + 1.61.1 3.23.1 2.16.1 3.3.6 From 0993b369c4b91d81a17166d1427e7c26cd9beee4 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Mon, 5 Feb 2024 21:35:34 -0500 Subject: [PATCH 52/74] GH-39900: [Java][CI] To upload Maven and Memory Netty Buffer Patch into Apache Nightly repository (#39901) ### Rationale for this change To upload Maven and Memory Netty Buffer Patch into Apache Nightly repository ### What changes are included in this PR? Upload Maven and Memory Netty Buffer Patch into Apache Nightly repository ### Are these changes tested? Needed to run https://github.com/apache/arrow/actions/workflows/java_nightly.yml ### Are there any user-facing changes? No * Closes: #39900 Authored-by: david dali susanibar arce Signed-off-by: Sutou Kouhei --- dev/tasks/tasks.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 0f8c58391fa66..cf04d29715306 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -748,6 +748,10 @@ tasks: - arrow-jdbc-{no_rc_snapshot_version}-tests.jar - arrow-jdbc-{no_rc_snapshot_version}.jar - arrow-jdbc-{no_rc_snapshot_version}.pom + - arrow-maven-plugins-{no_rc_snapshot_version}-cyclonedx.json + - arrow-maven-plugins-{no_rc_snapshot_version}-cyclonedx.xml + - arrow-maven-plugins-{no_rc_snapshot_version}-src.zip + - arrow-maven-plugins-{no_rc_snapshot_version}.pom - arrow-memory-core-{no_rc_snapshot_version}-cyclonedx.json - arrow-memory-core-{no_rc_snapshot_version}-cyclonedx.xml - arrow-memory-core-{no_rc_snapshot_version}-javadoc.jar @@ -762,6 +766,13 @@ tasks: - arrow-memory-netty-{no_rc_snapshot_version}-tests.jar - arrow-memory-netty-{no_rc_snapshot_version}.jar - arrow-memory-netty-{no_rc_snapshot_version}.pom + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-cyclonedx.json + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-cyclonedx.xml + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-javadoc.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-sources.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-tests.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}.pom - arrow-memory-unsafe-{no_rc_snapshot_version}-cyclonedx.json - arrow-memory-unsafe-{no_rc_snapshot_version}-cyclonedx.xml - arrow-memory-unsafe-{no_rc_snapshot_version}-javadoc.jar @@ -839,6 +850,13 @@ tasks: - flight-sql-jdbc-driver-{no_rc_snapshot_version}-tests.jar - flight-sql-jdbc-driver-{no_rc_snapshot_version}.jar - flight-sql-jdbc-driver-{no_rc_snapshot_version}.pom + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-cyclonedx.json + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-cyclonedx.xml + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-javadoc.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-sources.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-src.zip + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}.pom ############################## NuGet packages 
############################### From cd5faafb0c811d5985156c1fd1aecd1aa7130e9f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 6 Feb 2024 13:53:54 +0900 Subject: [PATCH 53/74] GH-39955: [C++] Use make -j1 to install bundled bzip2 (#39956) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change It seems that parallel "make install" isn't stable with "-G 'Unix Makefiles'" ("read jobs pipe: Bad file descriptor. Stop." is the important part): [ 19%] Performing install step for 'bzip2_ep' CMake Error at /tmp/Rtmp5v99SJ/file70b591df48f/bzip2_ep-prefix/src/bzip2_ep-stamp/bzip2_ep-install-RELEASE.cmake:37 (message): Command failed: 2 '/bin/make' 'install' 'PREFIX=/tmp/Rtmp5v99SJ/file70b591df48f/bzip2_ep-install' 'CC=/bin/gcc' 'CFLAGS=-g -O2 -ffile-prefix-map=/build/reproducible-path/r-base-4.3.2=. -fstack-protector-strong -fstack-clash-protection -Wformat -Werror=format-security -fcf-protection -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC' 'AR=/bin/ar' 'RANLIB=/bin/ranlib' See also /tmp/Rtmp5v99SJ/file70b591df48f/bzip2_ep-prefix/src/bzip2_ep-stamp/bzip2_ep-install-*.log -- stdout output is: -- stderr output is: make[3]: *** read jobs pipe: Bad file descriptor. Stop. make[3]: *** Waiting for unfinished jobs.... bzip2.c: In function ‘applySavedFileAttrToOutputFile’: bzip2.c:1073:11: warning: ignoring return value of ‘fchown’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 1073 | (void) fchown ( fd, fileMetaInfo.st_uid, fileMetaInfo.st_gid ); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CMake Error at /tmp/Rtmp5v99SJ/file70b591df48f/bzip2_ep-prefix/src/bzip2_ep-stamp/bzip2_ep-install-RELEASE.cmake:47 (message): Stopping after outputting logs. make[2]: *** [CMakeFiles/bzip2_ep.dir/build.make:104: bzip2_ep-prefix/src/bzip2_ep-stamp/bzip2_ep-install] Error 1 make[1]: *** [CMakeFiles/Makefile2:1205: CMakeFiles/bzip2_ep.dir/all] Error 2 make[1]: *** Waiting for unfinished jobs.... ### What changes are included in this PR? Force to disable parallel processing for `make install`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #39955 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 0238c26c0fb51..b16ee07756013 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2629,7 +2629,7 @@ macro(build_bzip2) BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} libbz2.a ${MAKE_BUILD_ARGS} ${BZIP2_EXTRA_ARGS} - INSTALL_COMMAND ${MAKE} install PREFIX=${BZIP2_PREFIX} + INSTALL_COMMAND ${MAKE} install -j1 PREFIX=${BZIP2_PREFIX} ${BZIP2_EXTRA_ARGS} INSTALL_DIR ${BZIP2_PREFIX} URL ${ARROW_BZIP2_SOURCE_URL} From a6e577d031d20a1a7d3dd01536b9a77db5d1bff8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 6 Feb 2024 16:19:03 +0100 Subject: [PATCH 54/74] GH-39857: [C++] Improve error message for "chunker out of sync" condition (#39892) ### Rationale for this change When writing the CSV reader, we thought that the parser not finding the same line limits as the chunker should never happen, hence the terse "chunker out of sync" error message. It turns out that, if the input contains multiline cell values and the `newlines_in_values` option was not enabled, the chunker can happily delimit a block on a newline that's inside a quoted string. 
The parser will then see truncated data and will stop parsing, yielding a parsed size that's smaller than the first block (see added comment in the code). ### What changes are included in this PR? * Add some parser tests that showcase the condition encountered in GH-39857 * Improve error message to guide users towards the solution ### Are these changes tested? There's no functional change, the error message itself isn't tested. ### Are there any user-facing changes? No. * Closes: #39857 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/csv/parser_test.cc | 22 +++++++++++++++++++++ cpp/src/arrow/csv/reader.cc | 34 +++++++++++++++++++++++++++----- python/pyarrow/tests/test_csv.py | 25 +++++++++++++++++++++++ 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc index 960a69c59db5d..dd3d025202018 100644 --- a/cpp/src/arrow/csv/parser_test.cc +++ b/cpp/src/arrow/csv/parser_test.cc @@ -175,6 +175,13 @@ void AssertParsePartial(BlockParser& parser, const std::string& str, ASSERT_EQ(parsed_size, expected_size); } +void AssertParsePartial(BlockParser& parser, const std::vector& data, + uint32_t expected_size) { + uint32_t parsed_size = static_cast(-1); + ASSERT_OK(parser.Parse(data, &parsed_size)); + ASSERT_EQ(parsed_size, expected_size); +} + void AssertLastRowEq(const BlockParser& parser, const std::vector& expected) { std::vector values; @@ -376,6 +383,21 @@ TEST(BlockParser, TruncatedData) { } } +TEST(BlockParser, TruncatedDataViews) { + // The BlockParser API mandates that, when passing a vector of views, + // only the last view may be a truncated CSV block. + // In the current implementation, receiving a truncated non-last view + // simply stops parsing after that view. + BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/3); + AssertParsePartial(parser, Views({"a,b,", "c\n"}), 0); + AssertParsePartial(parser, Views({"a,b,c\nd,", "e,f\n"}), 6); + + // More sophisticated: non-last block ends on some newline inside a quoted string + // (terse reproducer of gh-39857) + AssertParsePartial(parser, Views({"a,b,\"c\n", "\"\n"}), 0); + AssertParsePartial(parser, Views({"a,b,c\n\"d\n", "\",e,f\n"}), 6); +} + TEST(BlockParser, Final) { // Tests for ParseFinal() BlockParser parser(ParseOptions::Defaults()); diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 332fad054fea3..1ac25e290a814 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -261,11 +261,10 @@ class SerialBlockReader : public BlockReader { auto consume_bytes = [this, bytes_before_buffer, next_buffer](int64_t nbytes) -> Status { DCHECK_GE(nbytes, 0); - auto offset = nbytes - bytes_before_buffer; - if (offset < 0) { - // Should not happen - return Status::Invalid("CSV parser got out of sync with chunker"); - } + int64_t offset = nbytes - bytes_before_buffer; + // All data before the buffer should have been consumed. + // This is checked in Parse() and BlockParsingOperator::operator(). + DCHECK_GE(offset, 0); partial_ = SliceBuffer(buffer_, offset); buffer_ = next_buffer; return Status::OK(); @@ -400,6 +399,7 @@ class BlockParsingOperator { count_rows_(first_row >= 0), num_rows_seen_(first_row) {} + // TODO: this is almost entirely the same as ReaderMixin::Parse(). Refactor? 
Result operator()(const CSVBlock& block) { constexpr int32_t max_num_rows = std::numeric_limits::max(); auto parser = std::make_shared( @@ -427,9 +427,24 @@ class BlockParsingOperator { } else { RETURN_NOT_OK(parser->Parse(views, &parsed_size)); } + + // `partial + completion` should have been entirely consumed. + const int64_t bytes_before_buffer = block.partial->size() + block.completion->size(); + if (static_cast(parsed_size) < bytes_before_buffer) { + // This can happen if `newlines_in_values` is not enabled and + // `partial + completion` ends with a newline inside a quoted string. + // In this case, the BlockParser stops at the truncated data in the first + // block (see gh-39857). + return Status::Invalid( + "CSV parser got out of sync with chunker. This can mean the data file " + "contains cell values spanning multiple lines; please consider enabling " + "the option 'newlines_in_values'."); + } + if (count_rows_) { num_rows_seen_ += parser->total_num_rows(); } + RETURN_NOT_OK(block.consume_bytes(parsed_size)); return ParsedBlock{std::move(parser), block.block_index, static_cast(parsed_size) + block.bytes_skipped}; @@ -738,6 +753,15 @@ class ReaderMixin { } else { RETURN_NOT_OK(parser->Parse(views, &parsed_size)); } + // See BlockParsingOperator for explanation. + const int64_t bytes_before_buffer = partial->size() + completion->size(); + if (static_cast(parsed_size) < bytes_before_buffer) { + return Status::Invalid( + "CSV parser got out of sync with chunker. This can mean the data file " + "contains cell values spanning multiple lines; please consider enabling " + "the option 'newlines_in_values'."); + } + if (count_rows_) { num_rows_seen_ += parser->total_num_rows(); } diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 31f24187e3b37..bc1dd8a09a768 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -667,6 +667,31 @@ def row_num(x): 'b': ["e", "j"], } + def test_chunker_out_of_sync(self): + # GH-39892: if there are newlines in values, the parser may become + # out of sync with the chunker. In this case, we try to produce an + # informative error message. + rows = b"""a,b,c\nd,e,"f\n"\ng,h,i\n""" + expected = { + 'a': ["d", "g"], + 'b': ["e", "h"], + 'c': ["f\n", "i"], + } + for block_size in range(8, 15): + # Sanity check: parsing works with newlines_in_values=True + d = self.read_bytes( + rows, parse_options=ParseOptions(newlines_in_values=True), + read_options=ReadOptions(block_size=block_size)).to_pydict() + assert d == expected + # With these block sizes, a block would end on the physical newline + # inside the quoted cell value, leading to a mismatch between + # CSV chunker and parser. + for block_size in range(8, 11): + with pytest.raises(ValueError, + match="cell values spanning multiple lines"): + self.read_bytes( + rows, read_options=ReadOptions(block_size=block_size)) + class BaseCSVTableRead(BaseTestCSV): From 0a05626f08836152526babf103aec95d0e4ec507 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Thu, 8 Feb 2024 00:01:46 +1100 Subject: [PATCH 55/74] MINOR: [Rust][Docs] Update Rust FlightSQL status doc (#39959) ### Rationale for this change Updating arrow-rs FlightSQL support status on site: https://arrow.apache.org/docs/status.html#flight-sql arrow-rs issue: https://github.com/apache/arrow-rs/issues/4337 ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? 
--- docs/source/status.rst | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/status.rst b/docs/source/status.rst index 11dd9c2c2965c..a0375585dbee2 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -257,9 +257,9 @@ support/not support individual features. +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | CancelQuery | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| ClosePreparedStatement | ✓ | ✓ | ✓ | | ✓ | | | | +| ClosePreparedStatement | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| CreatePreparedStatement | ✓ | ✓ | ✓ | | ✓ | | | | +| CreatePreparedStatement | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | CreatePreparedSubstraitPlan | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -267,35 +267,35 @@ support/not support individual features. +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | EndTransaction | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetCatalogs | ✓ | ✓ | ✓ | | ✓ | | | | +| GetCatalogs | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetCrossReference | ✓ | ✓ | ✓ | | ✓ | | | | +| GetCrossReference | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetDbSchemas | ✓ | ✓ | ✓ | | ✓ | | | | +| GetDbSchemas | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetExportedKeys | ✓ | ✓ | ✓ | | ✓ | | | | +| GetExportedKeys | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetImportedKeys | ✓ | ✓ | ✓ | | ✓ | | | | +| GetImportedKeys | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetPrimaryKeys | ✓ | ✓ | ✓ | | ✓ | | | | +| GetPrimaryKeys | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetSqlInfo | ✓ | ✓ | ✓ | | ✓ | | | | +| GetSqlInfo | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetTables | ✓ | ✓ | ✓ | | ✓ | | | | +| GetTables | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetTableTypes | ✓ | ✓ | ✓ | | ✓ | | | | +| GetTableTypes | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| GetXdbcTypeInfo | ✓ | ✓ | ✓ | | ✓ | | | | +| GetXdbcTypeInfo | ✓ | ✓ | ✓ | | ✓ | ✓ | | | 
+--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| PreparedStatementQuery | ✓ | ✓ | ✓ | | ✓ | | | | +| PreparedStatementQuery | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| PreparedStatementUpdate | ✓ | ✓ | ✓ | | ✓ | | | | +| PreparedStatementUpdate | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | StatementSubstraitPlan | ✓ | ✓ | | | | | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| StatementQuery | ✓ | ✓ | ✓ | | ✓ | | | | +| StatementQuery | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| StatementUpdate | ✓ | ✓ | ✓ | | ✓ | | | | +| StatementUpdate | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ .. seealso:: From c9f6e04323a0b714487a0f707b46fc3c55b909e0 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 8 Feb 2024 02:32:16 +1300 Subject: [PATCH 56/74] GH-24834: [C#] Support writing compressed IPC data (#39871) ### Rationale for this change This allows using compression when writing IPC streams and files with the Arrow .NET library ### What changes are included in this PR? * Adds a compress method to the `ICompressionCodec` interface and implements this for Zstd and LZ4Frame in the `Apache.Arrow.Compression` package * Adds new compression related options to `IpcOptions` * Implements buffer compression in `ArrowStreamWriter` ### Are these changes tested? Yes, new unit tests have been added ### Are there any user-facing changes? Yes, this is a new user-facing feature and the `status.rst` and `csharp/README` files have been updated * Closes: #24834 Authored-by: Adam Reeve Signed-off-by: Curt Hagenlocher --- csharp/README.md | 8 +- .../CompressionCodecFactory.cs | 9 +- .../Lz4CompressionCodec.cs | 32 ++- .../ZstdCompressionCodec.cs | 22 ++- .../src/Apache.Arrow/Ipc/ArrowFileWriter.cs | 10 +- .../src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 117 +++++++++-- .../src/Apache.Arrow/Ipc/ICompressionCodec.cs | 15 ++ .../Ipc/ICompressionCodecFactory.cs | 21 ++ csharp/src/Apache.Arrow/Ipc/IpcOptions.cs | 17 ++ .../Apache.Arrow.Compression.Tests.csproj | 1 + .../ArrowFileWriterTests.cs | 147 ++++++++++++++ .../ArrowStreamWriterTests.cs | 184 ++++++++++++++++++ .../Apache.Arrow.IntegrationTest.csproj | 1 + .../IntegrationCommand.cs | 10 +- dev/archery/archery/integration/runner.py | 1 - docs/source/status.rst | 4 +- 16 files changed, 564 insertions(+), 35 deletions(-) create mode 100644 csharp/test/Apache.Arrow.Compression.Tests/ArrowFileWriterTests.cs create mode 100644 csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamWriterTests.cs diff --git a/csharp/README.md b/csharp/README.md index 6e6ed9c756873..b36eb899db2d5 100644 --- a/csharp/README.md +++ b/csharp/README.md @@ -115,10 +115,10 @@ for currently available features. ### Compression -- Buffer compression is not supported when writing IPC files or streams -- Buffer decompression is supported, but requires installing the `Apache.Arrow.Compression` package, - and passing an `Apache.Arrow.Compression.CompressionCodecFactory` instance to the - `ArrowFileReader` or `ArrowStreamReader` constructor. 
+- Buffer compression and decompression is supported, but requires installing the `Apache.Arrow.Compression` package. + When reading compressed data, you must pass an `Apache.Arrow.Compression.CompressionCodecFactory` instance to the + `ArrowFileReader` or `ArrowStreamReader` constructor, and when writing compressed data a + `CompressionCodecFactory` must be set in the `IpcOptions`. Alternatively, a custom implementation of `ICompressionCodecFactory` can be used. ## Not Implemented diff --git a/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs index 3e0a537a89a8f..4bfcdf6544f9d 100644 --- a/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs +++ b/csharp/src/Apache.Arrow.Compression/CompressionCodecFactory.cs @@ -24,11 +24,16 @@ namespace Apache.Arrow.Compression public sealed class CompressionCodecFactory : ICompressionCodecFactory { public ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType) + { + return CreateCodec(compressionCodecType, null); + } + + public ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType, int? compressionLevel) { return compressionCodecType switch { - CompressionCodecType.Lz4Frame => Lz4CompressionCodec.Instance, - CompressionCodecType.Zstd => new ZstdCompressionCodec(), + CompressionCodecType.Lz4Frame => new Lz4CompressionCodec(compressionLevel), + CompressionCodecType.Zstd => new ZstdCompressionCodec(compressionLevel), _ => throw new NotImplementedException($"Compression type {compressionCodecType} is not supported") }; } diff --git a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs index ebbcfbc3e095f..df19c16a30213 100644 --- a/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/Lz4CompressionCodec.cs @@ -14,17 +14,35 @@ // limitations under the License. using System; +using System.IO; using Apache.Arrow.Ipc; +using K4os.Compression.LZ4; using K4os.Compression.LZ4.Streams; namespace Apache.Arrow.Compression { internal sealed class Lz4CompressionCodec : ICompressionCodec { - /// - /// Singleton instance, used as this class doesn't need to be disposed and has no state - /// - public static readonly Lz4CompressionCodec Instance = new Lz4CompressionCodec(); + private readonly LZ4EncoderSettings _settings = null; + + public Lz4CompressionCodec(int? 
compressionLevel = null) + { + if (compressionLevel.HasValue) + { + if (Enum.IsDefined(typeof(LZ4Level), compressionLevel)) + { + _settings = new LZ4EncoderSettings + { + CompressionLevel = (LZ4Level) compressionLevel, + }; + } + else + { + throw new ArgumentException( + $"Invalid LZ4 compression level ({compressionLevel})", nameof(compressionLevel)); + } + } + } public int Decompress(ReadOnlyMemory source, Memory destination) { @@ -32,6 +50,12 @@ public int Decompress(ReadOnlyMemory source, Memory destination) return decoder.ReadManyBytes(destination.Span); } + public void Compress(ReadOnlyMemory source, Stream destination) + { + using var encoder = LZ4Frame.Encode(destination, _settings, leaveOpen: true); + encoder.WriteManyBytes(source.Span); + } + public void Dispose() { } diff --git a/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs b/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs index 92c2e65371612..cc340a7cd1b9f 100644 --- a/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs +++ b/csharp/src/Apache.Arrow.Compression/ZstdCompressionCodec.cs @@ -14,6 +14,7 @@ // limitations under the License. using System; +using System.IO; using Apache.Arrow.Ipc; using ZstdSharp; @@ -22,10 +23,21 @@ namespace Apache.Arrow.Compression internal sealed class ZstdCompressionCodec : ICompressionCodec { private readonly Decompressor _decompressor; + private readonly Compressor _compressor; - public ZstdCompressionCodec() + public ZstdCompressionCodec(int? compressionLevel = null) { + if (compressionLevel.HasValue && + (compressionLevel.Value < Compressor.MinCompressionLevel || + compressionLevel.Value > Compressor.MaxCompressionLevel)) + { + throw new ArgumentException( + $"Zstd compression level must be between {Compressor.MinCompressionLevel} and {Compressor.MaxCompressionLevel}", + nameof(compressionLevel)); + } + _decompressor = new Decompressor(); + _compressor = new Compressor(compressionLevel ?? 
Compressor.DefaultCompressionLevel); } public int Decompress(ReadOnlyMemory source, Memory destination) @@ -33,9 +45,17 @@ public int Decompress(ReadOnlyMemory source, Memory destination) return _decompressor.Unwrap(source.Span, destination.Span); } + public void Compress(ReadOnlyMemory source, Stream destination) + { + using var compressor = new CompressionStream( + destination, _compressor, preserveCompressor: true, leaveOpen: true); + compressor.Write(source.Span); + } + public void Dispose() { _decompressor.Dispose(); + _compressor.Dispose(); } } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs index 547fa800ec71e..a643012bab1a2 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs @@ -20,6 +20,7 @@ using System.IO; using System.Threading; using System.Threading.Tasks; +using Apache.Arrow.Memory; namespace Apache.Arrow.Ipc { @@ -37,12 +38,17 @@ public ArrowFileWriter(Stream stream, Schema schema) } public ArrowFileWriter(Stream stream, Schema schema, bool leaveOpen) - : this(stream, schema, leaveOpen, options: null) + : this(stream, schema, leaveOpen, options: null, allocator: null) { } public ArrowFileWriter(Stream stream, Schema schema, bool leaveOpen, IpcOptions options) - : base(stream, schema, leaveOpen, options) + : this(stream, schema, leaveOpen, options, allocator: null) + { + } + + public ArrowFileWriter(Stream stream, Schema schema, bool leaveOpen, IpcOptions options, MemoryAllocator allocator) + : base(stream, schema, leaveOpen, options, allocator) { if (!stream.CanWrite) { diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 07d1dcfdb171d..b002f8c8b1578 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -22,6 +22,7 @@ using System.Threading; using System.Threading.Tasks; using Apache.Arrow.Arrays; +using Apache.Arrow.Memory; using Apache.Arrow.Types; using Google.FlatBuffers; @@ -29,7 +30,7 @@ namespace Apache.Arrow.Ipc { public class ArrowStreamWriter : IDisposable { - internal class ArrowRecordBatchFlatBufferBuilder : + private class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -81,14 +82,21 @@ public Buffer(ArrowBuffer buffer, int offset) } private readonly List _buffers; + private readonly ICompressionCodec _compressionCodec; + private readonly MemoryAllocator _allocator; + private readonly MemoryStream _compressionStream; public IReadOnlyList Buffers => _buffers; public List VariadicCounts { get; private set; } public int TotalLength { get; private set; } - public ArrowRecordBatchFlatBufferBuilder() + public ArrowRecordBatchFlatBufferBuilder( + ICompressionCodec compressionCodec, MemoryAllocator allocator, MemoryStream compressionStream) { + _compressionCodec = compressionCodec; + _compressionStream = compressionStream; + _allocator = allocator; _buffers = new List(); TotalLength = 0; } @@ -238,11 +246,50 @@ private void CreateBuffers(PrimitiveArray array) private Buffer CreateBuffer(ArrowBuffer buffer) { int offset = TotalLength; + const int UncompressedLengthSize = 8; - int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(buffer.Length)); + ArrowBuffer bufferToWrite; + if (_compressionCodec == null) + { + bufferToWrite = buffer; + } + else if (buffer.Length == 0) + { + // Write zero length and skip compression + var uncompressedLengthBytes = 
_allocator.Allocate(UncompressedLengthSize); + BinaryPrimitives.WriteInt64LittleEndian(uncompressedLengthBytes.Memory.Span, 0); + bufferToWrite = new ArrowBuffer(uncompressedLengthBytes); + } + else + { + // See format/Message.fbs, and the BUFFER BodyCompressionMethod for documentation on how + // compressed buffers are stored. + _compressionStream.Seek(0, SeekOrigin.Begin); + _compressionStream.SetLength(0); + _compressionCodec.Compress(buffer.Memory, _compressionStream); + if (_compressionStream.Length < buffer.Length) + { + var newBuffer = _allocator.Allocate((int) _compressionStream.Length + UncompressedLengthSize); + BinaryPrimitives.WriteInt64LittleEndian(newBuffer.Memory.Span, buffer.Length); + _compressionStream.Seek(0, SeekOrigin.Begin); + _compressionStream.ReadFullBuffer(newBuffer.Memory.Slice(UncompressedLengthSize)); + bufferToWrite = new ArrowBuffer(newBuffer); + } + else + { + // If the compressed buffer is larger than the uncompressed buffer, use the uncompressed + // buffer instead, and indicate this by setting the uncompressed length to -1 + var newBuffer = _allocator.Allocate(buffer.Length + UncompressedLengthSize); + BinaryPrimitives.WriteInt64LittleEndian(newBuffer.Memory.Span, -1); + buffer.Memory.CopyTo(newBuffer.Memory.Slice(UncompressedLengthSize)); + bufferToWrite = new ArrowBuffer(newBuffer); + } + } + + int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(bufferToWrite.Length)); TotalLength += paddedLength; - return new Buffer(buffer, offset); + return new Buffer(bufferToWrite, offset); } public void Visit(IArrowArray array) @@ -269,6 +316,9 @@ public void Visit(IArrowArray array) private readonly bool _leaveOpen; private readonly IpcOptions _options; + private readonly MemoryAllocator _allocator; + // Reuse a single memory stream for writing compressed data to, to reduce memory allocations + private readonly MemoryStream _compressionStream = new MemoryStream(); private protected const Flatbuf.MetadataVersion CurrentMetadataVersion = Flatbuf.MetadataVersion.V5; @@ -285,15 +335,21 @@ public ArrowStreamWriter(Stream baseStream, Schema schema) } public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen) - : this(baseStream, schema, leaveOpen, options: null) + : this(baseStream, schema, leaveOpen, options: null, allocator: null) { } public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOptions options) + : this(baseStream, schema, leaveOpen, options, allocator: null) + { + } + + public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOptions options, MemoryAllocator allocator) { BaseStream = baseStream ?? throw new ArgumentNullException(nameof(baseStream)); Schema = schema ?? throw new ArgumentNullException(nameof(schema)); _leaveOpen = leaveOpen; + _allocator = allocator ?? MemoryAllocator.Default.Value; Buffers = ArrayPool.Create(); Builder = new FlatBufferBuilder(1024); @@ -301,6 +357,13 @@ public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOp _fieldTypeBuilder = new ArrowTypeFlatbufferBuilder(Builder); _options = options ?? 
IpcOptions.Default; + + if (_options.CompressionCodec.HasValue && _options.CompressionCodecFactory == null) + { + throw new ArgumentException( + $"A {nameof(_options.CompressionCodecFactory)} must be provided when a {nameof(_options.CompressionCodec)} is specified", + nameof(options)); + } } private void CreateSelfAndChildrenFieldNodes(ArrayData data) @@ -326,6 +389,23 @@ private static int CountAllNodes(IReadOnlyList fields) return count; } + private Offset GetBodyCompression() + { + if (_options.CompressionCodec == null) + { + return default; + } + + var compressionType = _options.CompressionCodec.Value switch + { + CompressionCodecType.Lz4Frame => Flatbuf.CompressionType.LZ4_FRAME, + CompressionCodecType.Zstd => Flatbuf.CompressionType.ZSTD, + _ => throw new ArgumentOutOfRangeException() + }; + return Flatbuf.BodyCompression.CreateBodyCompression( + Builder, compressionType, Flatbuf.BodyCompressionMethod.BUFFER); + } + private static void CountSelfAndChildrenNodes(IArrowType type, ref int count) { if (type is NestedType nestedType) @@ -356,7 +436,7 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) } (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = - PreparingWritingRecordBatch(recordBatch); + PrepareWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -367,7 +447,7 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, buffersVectorOffset, - default, + GetBodyCompression(), variadicCountsOffset); long metadataLength = WriteMessage(Flatbuf.MessageHeader.RecordBatch, @@ -397,7 +477,7 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat } (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = - PreparingWritingRecordBatch(recordBatch); + PrepareWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -408,7 +488,7 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, buffersVectorOffset, - default, + GetBodyCompression(), variadicCountsOffset); long metadataLength = await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch, @@ -482,12 +562,12 @@ private async ValueTask WriteBufferDataAsync(IReadOnlyList PreparingWritingRecordBatch(RecordBatch recordBatch) + private Tuple PrepareWritingRecordBatch(RecordBatch recordBatch) { - return PreparingWritingRecordBatch(recordBatch.Schema.FieldsList, recordBatch.ArrayList); + return PrepareWritingRecordBatch(recordBatch.Schema.FieldsList, recordBatch.ArrayList); } - private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) + private Tuple PrepareWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) { Builder.Clear(); @@ -507,7 +587,13 @@ private Tuple Pre // Serialize buffers - var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder(); + // CompressionCodec can be disposed after all data is visited by the builder, + // and doesn't need to be alive for the full lifetime of the ArrowRecordBatchFlatBufferBuilder + using var compressionCodec = _options.CompressionCodec.HasValue + ? 
_options.CompressionCodecFactory.CreateCodec(_options.CompressionCodec.Value, _options.CompressionLevel) + : null; + + var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder(compressionCodec, _allocator, _compressionStream); for (int i = 0; i < fieldCount; i++) { IArrowArray fieldArray = arrays[i]; @@ -599,7 +685,7 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, var arrays = new List { dictionary }; (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = - PreparingWritingRecordBatch(fields, arrays); + PrepareWritingRecordBatch(fields, arrays); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -607,7 +693,7 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, dictionary.Length, fieldNodesVectorOffset, buffersVectorOffset, - default, + GetBodyCompression(), variadicCountsOffset); // TODO: Support delta. @@ -994,6 +1080,7 @@ public virtual void Dispose() { BaseStream.Dispose(); } + _compressionStream.Dispose(); } } diff --git a/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs b/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs index b18ca3a5e4190..16c01d7130fb5 100644 --- a/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs +++ b/csharp/src/Apache.Arrow/Ipc/ICompressionCodec.cs @@ -14,6 +14,7 @@ // limitations under the License. using System; +using System.IO; namespace Apache.Arrow.Ipc { @@ -29,5 +30,19 @@ public interface ICompressionCodec : IDisposable /// Data buffer to write decompressed data to /// The number of decompressed bytes written into the destination int Decompress(ReadOnlyMemory source, Memory destination); + + /// + /// Write compressed data + /// + /// The data to compress + /// The stream to write compressed data to + void Compress(ReadOnlyMemory source, Stream destination) +#if NET6_0_OR_GREATER + { + throw new NotImplementedException("This codec does not support compression"); + } +#else + ; +#endif } } diff --git a/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs b/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs index 5422a033bd6d2..f367b15574b6e 100644 --- a/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs +++ b/csharp/src/Apache.Arrow/Ipc/ICompressionCodecFactory.cs @@ -20,6 +20,27 @@ namespace Apache.Arrow.Ipc /// public interface ICompressionCodecFactory { + /// + /// Create a new compression codec + /// + /// The type of codec to create + /// The created codec ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType); + + /// + /// Create a new compression codec with a specified compression level + /// + /// The type of codec to create + /// The compression level to use when compressing data + /// The created codec + ICompressionCodec CreateCodec(CompressionCodecType compressionCodecType, int? compressionLevel) +#if NET6_0_OR_GREATER + { + // Default implementation ignores the compression level + return CreateCodec(compressionCodecType); + } +#else + ; +#endif } } diff --git a/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs b/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs index b6cc3a1cb4b51..8484c9a04ab2d 100644 --- a/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs +++ b/csharp/src/Apache.Arrow/Ipc/IpcOptions.cs @@ -25,6 +25,23 @@ public class IpcOptions /// public bool WriteLegacyIpcFormat { get; set; } + /// + /// The compression codec to use to compress data buffers. 
+ /// If null (the default value), no compression is used. + /// + public CompressionCodecType? CompressionCodec { get; set; } + + /// + /// The compression codec factory used to create compression codecs. + /// Must be provided if a CompressionCodec is specified. + /// + public ICompressionCodecFactory CompressionCodecFactory { get; set; } + + /// + /// Sets the compression level to use for codecs that support this. + /// + public int? CompressionLevel { get; set; } + public IpcOptions() { } diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 8ed7a93bdcf27..f5e2a0ef8e16e 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -15,6 +15,7 @@ + diff --git a/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileWriterTests.cs b/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileWriterTests.cs new file mode 100644 index 0000000000000..a237f9c1d0660 --- /dev/null +++ b/csharp/test/Apache.Arrow.Compression.Tests/ArrowFileWriterTests.cs @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using Apache.Arrow.Ipc; +using Apache.Arrow.Tests; +using K4os.Compression.LZ4; +using Xunit; + +namespace Apache.Arrow.Compression.Tests +{ + public class ArrowFileWriterTests + { + [Fact] + public void ThrowsWhenNoCompressionFactoryProvided() + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var options = new IpcOptions + { + CompressionCodec = CompressionCodecType.Zstd, + }; + + using var stream = new MemoryStream(); + var exception = Assert.Throws( + () => new ArrowFileWriter(stream, batch.Schema, leaveOpen: true, options)); + + Assert.Contains("A CompressionCodecFactory must be provided", exception.Message); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd, null)] + [InlineData(CompressionCodecType.Zstd, 2)] + [InlineData(CompressionCodecType.Lz4Frame, null)] + [InlineData(CompressionCodecType.Lz4Frame, (int)LZ4Level.L03_HC)] + public void CanWriteCompressedIpcFile(CompressionCodecType codec, int? 
compressionLevel) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + CompressionLevel = compressionLevel, + }; + TestRoundTripRecordBatches(new [] {batch}, options, codecFactory); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd)] + [InlineData(CompressionCodecType.Lz4Frame)] + public async Task CanWriteCompressedIpcFileAsync(CompressionCodecType codec) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + }; + await TestRoundTripRecordBatchesAsync(new [] {batch}, options, codecFactory); + } + + private static void TestRoundTripRecordBatches( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowFileWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + writer.WriteRecordBatch(originalBatch); + } + writer.WriteEnd(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = Assert.Throws(() => + { + using var reader = new ArrowFileReader(stream, leaveOpen: true); + reader.ReadNextRecordBatch(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowFileReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = reader.ReadNextRecordBatch(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + + private static async Task TestRoundTripRecordBatchesAsync( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowFileWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + await writer.WriteRecordBatchAsync(originalBatch); + } + await writer.WriteEndAsync(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = await Assert.ThrowsAsync(async () => + { + using var reader = new ArrowFileReader(stream, leaveOpen: true); + await reader.ReadNextRecordBatchAsync(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowFileReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = await reader.ReadNextRecordBatchAsync(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + } +} + diff --git a/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamWriterTests.cs b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamWriterTests.cs new file mode 100644 index 0000000000000..3b09dc26a343f --- /dev/null +++ b/csharp/test/Apache.Arrow.Compression.Tests/ArrowStreamWriterTests.cs @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using Apache.Arrow.Ipc; +using Apache.Arrow.Tests; +using K4os.Compression.LZ4; +using Xunit; + +namespace Apache.Arrow.Compression.Tests +{ + public class ArrowStreamWriterTests + { + [Fact] + public void ThrowsWhenNoCompressionFactoryProvided() + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var options = new IpcOptions + { + CompressionCodec = CompressionCodecType.Zstd, + }; + + using var stream = new MemoryStream(); + var exception = Assert.Throws( + () => new ArrowStreamWriter(stream, batch.Schema, leaveOpen: true, options)); + + Assert.Contains("A CompressionCodecFactory must be provided", exception.Message); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd, null)] + [InlineData(CompressionCodecType.Zstd, 2)] + [InlineData(CompressionCodecType.Lz4Frame, null)] + [InlineData(CompressionCodecType.Lz4Frame, (int)LZ4Level.L03_HC)] + public void CanWriteCompressedIpcStream(CompressionCodecType codec, int? compressionLevel) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + CompressionLevel = compressionLevel, + }; + TestRoundTripRecordBatches(new [] {batch}, options, codecFactory); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd)] + [InlineData(CompressionCodecType.Lz4Frame)] + public async Task CanWriteCompressedIpcStreamAsync(CompressionCodecType codec) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + }; + await TestRoundTripRecordBatchesAsync(new [] {batch}, options, codecFactory); + } + + [Fact] + public void CanWriteEmptyBatches() + { + var batch = TestData.CreateSampleRecordBatch(length: 0); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = CompressionCodecType.Lz4Frame, + }; + TestRoundTripRecordBatches(new [] {batch}, options, codecFactory); + } + + [Theory] + [InlineData(CompressionCodecType.Zstd)] + [InlineData(CompressionCodecType.Lz4Frame)] + public void ThrowsForInvalidCompressionLevel(CompressionCodecType codec) + { + var batch = TestData.CreateSampleRecordBatch(length: 100); + var codecFactory = new CompressionCodecFactory(); + var options = new IpcOptions + { + CompressionCodecFactory = codecFactory, + CompressionCodec = codec, + CompressionLevel = 12345, + }; + + using var stream = new MemoryStream(); + + Assert.Throws(() => + { + using var writer = new ArrowStreamWriter(stream, batch.Schema, leaveOpen: false, options); + writer.WriteRecordBatch(batch); + writer.WriteEnd(); + }); + } + + private static void 
TestRoundTripRecordBatches( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + writer.WriteRecordBatch(originalBatch); + } + writer.WriteEnd(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = Assert.Throws(() => + { + using var reader = new ArrowStreamReader(stream, leaveOpen: true); + reader.ReadNextRecordBatch(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowStreamReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = reader.ReadNextRecordBatch(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + + private static async Task TestRoundTripRecordBatchesAsync( + IReadOnlyList originalBatches, IpcOptions options, ICompressionCodecFactory codecFactory) + { + using var stream = new MemoryStream(); + + using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options)) + { + foreach (var originalBatch in originalBatches) + { + await writer.WriteRecordBatchAsync(originalBatch); + } + await writer.WriteEndAsync(); + } + + // Should throw if trying to read without an ICompressionCodecFactory + stream.Position = 0; + var exception = await Assert.ThrowsAsync(async () => + { + using var reader = new ArrowStreamReader(stream, leaveOpen: true); + await reader.ReadNextRecordBatchAsync(); + }); + Assert.Contains(nameof(ICompressionCodecFactory), exception.Message); + + stream.Position = 0; + using (var reader = new ArrowStreamReader(stream, codecFactory)) + { + foreach (var originalBatch in originalBatches) + { + var newBatch = await reader.ReadNextRecordBatchAsync(); + ArrowReaderVerifier.CompareBatches(originalBatch, newBatch); + } + } + } + } +} + diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj index cb7f7ae896ee2..e77f329bf2a15 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj +++ b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj @@ -10,6 +10,7 @@ + diff --git a/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs b/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs index 6a1e91240989b..3886846833c27 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/IntegrationCommand.cs @@ -16,6 +16,7 @@ using System; using System.IO; using System.Threading.Tasks; +using Apache.Arrow.Compression; using Apache.Arrow.Ipc; using Apache.Arrow.Tests; using Apache.Arrow.Types; @@ -65,8 +66,9 @@ private async Task Validate() { JsonFile jsonFile = await ParseJsonFile(); + var compressionFactory = new CompressionCodecFactory(); using FileStream arrowFileStream = ArrowFileInfo.OpenRead(); - using ArrowFileReader reader = new ArrowFileReader(arrowFileStream); + using ArrowFileReader reader = new ArrowFileReader(arrowFileStream, compressionCodecFactory: compressionFactory); int batchCount = await reader.RecordBatchCountAsync(); if (batchCount != jsonFile.Batches.Count) @@ -122,7 +124,8 @@ private async Task JsonToArrow() private async 
Task StreamToFile() { - using ArrowStreamReader reader = new ArrowStreamReader(Console.OpenStandardInput()); + var compressionFactory = new CompressionCodecFactory(); + using ArrowStreamReader reader = new ArrowStreamReader(Console.OpenStandardInput(), compressionCodecFactory: compressionFactory); RecordBatch batch = await reader.ReadNextRecordBatchAsync(); @@ -145,7 +148,8 @@ private async Task StreamToFile() private async Task FileToStream() { using FileStream fileStream = ArrowFileInfo.OpenRead(); - using ArrowFileReader fileReader = new ArrowFileReader(fileStream); + var compressionFactory = new CompressionCodecFactory(); + using ArrowFileReader fileReader = new ArrowFileReader(fileStream, compressionCodecFactory: compressionFactory); // read the record batch count to initialize the Schema await fileReader.RecordBatchCountAsync(); diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 7fadb7e47cf93..299983f62f283 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -158,7 +158,6 @@ def _gold_tests(self, gold_dir): skip_testers.add("JS") skip_testers.add("Rust") if prefix == '2.0.0-compression': - skip_testers.add("C#") skip_testers.add("JS") # See https://github.com/apache/arrow/pull/9822 for how to diff --git a/docs/source/status.rst b/docs/source/status.rst index a0375585dbee2..4bff37c8527fa 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -145,7 +145,7 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Sparse tensors | ✓ | | | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Buffer compression | ✓ | ✓ (3) | ✓ | | ✓ (4) | ✓ | ✓ | | +| Buffer compression | ✓ | ✓ (3) | ✓ | | ✓ | ✓ | ✓ | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Endianness conversion | ✓ (2) | | ✓ (2) | | | | | | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -160,8 +160,6 @@ Notes: * \(3) LZ4 Codec currently is quite inefficient. ARROW-11901 tracks improving performance. -* \(4) Compression when writing is not supported, only decompression when reading. - .. seealso:: The :ref:`format-ipc` specification. From 478755f0ef79a921aaa14822c8829c122bbbe92e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 7 Feb 2024 09:45:36 -0400 Subject: [PATCH 57/74] GH-38717: [C++] Add ImportChunkedArray and ExportChunkedArray to/from ArrowArrayStream (#39455) ### Rationale for this change The `ChunkedArray` has no equivalent in the C data interface; however, it is the primary array structure that higher level bindings interact with (because it is a column in a `Table`). In the Python capsule interface, this means that ChunkedArrays always require a workaround involving loops in Python. ### What changes are included in this PR? - Added `ImportChunkedArray()` and `ExportChunkedArray()` - Generalized the classes that support import/export to relax the assumption that every `ArrowArray` in an `ArrowArrayStream` is a `RecordBatch`. ### Are these changes tested? TODO ### Are there any user-facing changes? Yes, two new functions are added to bridge.h. 
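As a rough usage sketch (not part of this diff), the two new functions round-trip a `ChunkedArray` through a single `ArrowArrayStream`; the `RoundTrip` wrapper below is hypothetical, but the calls match the declarations added to `bridge.h`:

```cpp
#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"
#include "arrow/chunked_array.h"
#include "arrow/result.h"

// Hypothetical helper: export a ChunkedArray as a C stream and import it back.
// A real consumer would hand the ArrowArrayStream to another library instead.
arrow::Result<std::shared_ptr<arrow::ChunkedArray>> RoundTrip(
    const std::shared_ptr<arrow::ChunkedArray>& chunked) {
  struct ArrowArrayStream c_stream;
  // Each chunk becomes one ArrowArray in the stream; the stream schema is the chunk type.
  ARROW_RETURN_NOT_OK(arrow::ExportChunkedArray(chunked, &c_stream));
  // ImportChunkedArray consumes and releases the stream, reassembling the chunks.
  return arrow::ImportChunkedArray(&c_stream);
}
```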
* Closes: #38717 Lead-authored-by: Dewey Dunnington Co-authored-by: Dewey Dunnington Co-authored-by: Antoine Pitrou Signed-off-by: Dewey Dunnington --- cpp/src/arrow/c/bridge.cc | 252 +++++++++++++++++++++++++-------- cpp/src/arrow/c/bridge.h | 22 +++ cpp/src/arrow/c/bridge_test.cc | 115 +++++++++++++++ 3 files changed, 328 insertions(+), 61 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 119249da99a6d..022fce72f59b8 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -2002,13 +2002,49 @@ Result> ImportDeviceRecordBatch( namespace { +Status ExportStreamSchema(const std::shared_ptr& src, + struct ArrowSchema* out_schema) { + return ExportSchema(*src->schema(), out_schema); +} + +Status ExportStreamSchema(const std::shared_ptr& src, + struct ArrowSchema* out_schema) { + return ExportType(*src->type(), out_schema); +} + +Status ExportStreamNext(const std::shared_ptr& src, int64_t i, + struct ArrowArray* out_array) { + std::shared_ptr batch; + RETURN_NOT_OK(src->ReadNext(&batch)); + if (batch == nullptr) { + // End of stream + ArrowArrayMarkReleased(out_array); + return Status::OK(); + } else { + return ExportRecordBatch(*batch, out_array); + } +} + +Status ExportStreamNext(const std::shared_ptr& src, int64_t i, + struct ArrowArray* out_array) { + if (i >= src->num_chunks()) { + // End of stream + ArrowArrayMarkReleased(out_array); + return Status::OK(); + } else { + return ExportArray(*src->chunk(static_cast(i)), out_array); + } +} + +template class ExportedArrayStream { public: struct PrivateData { - explicit PrivateData(std::shared_ptr reader) - : reader_(std::move(reader)) {} + explicit PrivateData(std::shared_ptr reader) + : reader_(std::move(reader)), batch_num_(0) {} - std::shared_ptr reader_; + std::shared_ptr reader_; + int64_t batch_num_; std::string last_error_; PrivateData() = default; @@ -2018,19 +2054,11 @@ class ExportedArrayStream { explicit ExportedArrayStream(struct ArrowArrayStream* stream) : stream_(stream) {} Status GetSchema(struct ArrowSchema* out_schema) { - return ExportSchema(*reader()->schema(), out_schema); + return ExportStreamSchema(reader(), out_schema); } Status GetNext(struct ArrowArray* out_array) { - std::shared_ptr batch; - RETURN_NOT_OK(reader()->ReadNext(&batch)); - if (batch == nullptr) { - // End of stream - ArrowArrayMarkReleased(out_array); - return Status::OK(); - } else { - return ExportRecordBatch(*batch, out_array); - } + return ExportStreamNext(reader(), next_batch_num(), out_array); } const char* GetLastError() { @@ -2070,6 +2098,15 @@ class ExportedArrayStream { return ExportedArrayStream{stream}.GetLastError(); } + static Status Make(std::shared_ptr reader, struct ArrowArrayStream* out) { + out->get_schema = ExportedArrayStream::StaticGetSchema; + out->get_next = ExportedArrayStream::StaticGetNext; + out->get_last_error = ExportedArrayStream::StaticGetLastError; + out->release = ExportedArrayStream::StaticRelease; + out->private_data = new ExportedArrayStream::PrivateData{std::move(reader)}; + return Status::OK(); + } + private: int ToCError(const Status& status) { if (ARROW_PREDICT_TRUE(status.ok())) { @@ -2093,7 +2130,9 @@ class ExportedArrayStream { return reinterpret_cast(stream_->private_data); } - const std::shared_ptr& reader() { return private_data()->reader_; } + const std::shared_ptr& reader() { return private_data()->reader_; } + + int64_t next_batch_num() { return private_data()->batch_num_++; } struct ArrowArrayStream* stream_; }; @@ -2102,12 +2141,12 @@ class 
ExportedArrayStream { Status ExportRecordBatchReader(std::shared_ptr reader, struct ArrowArrayStream* out) { - out->get_schema = ExportedArrayStream::StaticGetSchema; - out->get_next = ExportedArrayStream::StaticGetNext; - out->get_last_error = ExportedArrayStream::StaticGetLastError; - out->release = ExportedArrayStream::StaticRelease; - out->private_data = new ExportedArrayStream::PrivateData{std::move(reader)}; - return Status::OK(); + return ExportedArrayStream::Make(std::move(reader), out); +} + +Status ExportChunkedArray(std::shared_ptr chunked_array, + struct ArrowArrayStream* out) { + return ExportedArrayStream::Make(std::move(chunked_array), out); } ////////////////////////////////////////////////////////////////////////// @@ -2115,66 +2154,58 @@ Status ExportRecordBatchReader(std::shared_ptr reader, namespace { -class ArrayStreamBatchReader : public RecordBatchReader { +class ArrayStreamReader { public: - explicit ArrayStreamBatchReader(std::shared_ptr schema, - struct ArrowArrayStream* stream) - : schema_(std::move(schema)) { + explicit ArrayStreamReader(struct ArrowArrayStream* stream) { ArrowArrayStreamMove(stream, &stream_); DCHECK(!ArrowArrayStreamIsReleased(&stream_)); } - ~ArrayStreamBatchReader() override { + ~ArrayStreamReader() { ReleaseStream(); } + + void ReleaseStream() { if (!ArrowArrayStreamIsReleased(&stream_)) { ArrowArrayStreamRelease(&stream_); } DCHECK(ArrowArrayStreamIsReleased(&stream_)); } - std::shared_ptr schema() const override { return schema_; } - - Status ReadNext(std::shared_ptr* batch) override { - struct ArrowArray c_array; - if (ArrowArrayStreamIsReleased(&stream_)) { - return Status::Invalid( - "Attempt to read from a reader that has already been closed"); - } - RETURN_NOT_OK(StatusFromCError(stream_.get_next(&stream_, &c_array))); - if (ArrowArrayIsReleased(&c_array)) { - // End of stream - batch->reset(); - return Status::OK(); - } else { - return ImportRecordBatch(&c_array, schema_).Value(batch); + protected: + Status ReadNextArrayInternal(struct ArrowArray* array) { + ArrowArrayMarkReleased(array); + Status status = StatusFromCError(stream_.get_next(&stream_, array)); + if (!status.ok() && !ArrowArrayIsReleased(array)) { + ArrowArrayRelease(array); } + + return status; } - Status Close() override { - if (!ArrowArrayStreamIsReleased(&stream_)) { - ArrowArrayStreamRelease(&stream_); - } - return Status::OK(); + Result> ReadSchema() { + struct ArrowSchema c_schema = {}; + ARROW_RETURN_NOT_OK( + StatusFromCError(&stream_, stream_.get_schema(&stream_, &c_schema))); + ARROW_ASSIGN_OR_RAISE(auto schema, ImportSchema(&c_schema)); + return schema; } - static Result> Make( - struct ArrowArrayStream* stream) { - if (ArrowArrayStreamIsReleased(stream)) { - return Status::Invalid("Cannot import released ArrowArrayStream"); - } - std::shared_ptr schema; + Result> ReadField() { struct ArrowSchema c_schema = {}; - auto status = StatusFromCError(stream, stream->get_schema(stream, &c_schema)); - if (status.ok()) { - status = ImportSchema(&c_schema).Value(&schema); - } - if (!status.ok()) { - ArrowArrayStreamRelease(stream); - return status; + ARROW_RETURN_NOT_OK( + StatusFromCError(&stream_, stream_.get_schema(&stream_, &c_schema))); + ARROW_ASSIGN_OR_RAISE(auto schema, ImportField(&c_schema)); + return schema; + } + + Status CheckNotReleased() { + if (ArrowArrayStreamIsReleased(&stream_)) { + return Status::Invalid( + "Attempt to read from a stream that has already been closed"); + } else { + return Status::OK(); } - return std::make_shared(std::move(schema), 
stream); } - private: Status StatusFromCError(int errno_like) const { return StatusFromCError(&stream_, errno_like); } @@ -2203,15 +2234,114 @@ class ArrayStreamBatchReader : public RecordBatchReader { return {code, last_error ? std::string(last_error) : ""}; } + private: mutable struct ArrowArrayStream stream_; +}; + +class ArrayStreamBatchReader : public RecordBatchReader, public ArrayStreamReader { + public: + explicit ArrayStreamBatchReader(struct ArrowArrayStream* stream) + : ArrayStreamReader(stream) {} + + Status Init() { + ARROW_ASSIGN_OR_RAISE(schema_, ReadSchema()); + return Status::OK(); + } + + std::shared_ptr schema() const override { return schema_; } + + Status ReadNext(std::shared_ptr* batch) override { + ARROW_RETURN_NOT_OK(CheckNotReleased()); + + struct ArrowArray c_array; + ARROW_RETURN_NOT_OK(ReadNextArrayInternal(&c_array)); + + if (ArrowArrayIsReleased(&c_array)) { + // End of stream + batch->reset(); + return Status::OK(); + } else { + return ImportRecordBatch(&c_array, schema_).Value(batch); + } + } + + Status Close() override { + ReleaseStream(); + return Status::OK(); + } + + private: std::shared_ptr schema_; }; +class ArrayStreamArrayReader : public ArrayStreamReader { + public: + explicit ArrayStreamArrayReader(struct ArrowArrayStream* stream) + : ArrayStreamReader(stream) {} + + Status Init() { + ARROW_ASSIGN_OR_RAISE(field_, ReadField()); + return Status::OK(); + } + + std::shared_ptr data_type() const { return field_->type(); } + + Status ReadNext(std::shared_ptr* array) { + ARROW_RETURN_NOT_OK(CheckNotReleased()); + + struct ArrowArray c_array; + ARROW_RETURN_NOT_OK(ReadNextArrayInternal(&c_array)); + + if (ArrowArrayIsReleased(&c_array)) { + // End of stream + array->reset(); + return Status::OK(); + } else { + return ImportArray(&c_array, field_->type()).Value(array); + } + } + + private: + std::shared_ptr field_; +}; + } // namespace Result> ImportRecordBatchReader( struct ArrowArrayStream* stream) { - return ArrayStreamBatchReader::Make(stream); + if (ArrowArrayStreamIsReleased(stream)) { + return Status::Invalid("Cannot import released ArrowArrayStream"); + } + + auto reader = std::make_shared(stream); + ARROW_RETURN_NOT_OK(reader->Init()); + return reader; +} + +Result> ImportChunkedArray( + struct ArrowArrayStream* stream) { + if (ArrowArrayStreamIsReleased(stream)) { + return Status::Invalid("Cannot import released ArrowArrayStream"); + } + + auto reader = std::make_shared(stream); + ARROW_RETURN_NOT_OK(reader->Init()); + + std::shared_ptr data_type = reader->data_type(); + + ArrayVector chunks; + std::shared_ptr chunk; + while (true) { + ARROW_RETURN_NOT_OK(reader->ReadNext(&chunk)); + if (!chunk) { + break; + } + + chunks.push_back(std::move(chunk)); + } + + reader->ReleaseStream(); + return ChunkedArray::Make(std::move(chunks), std::move(data_type)); } } // namespace arrow diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h index 45583109a761f..e98a42818f628 100644 --- a/cpp/src/arrow/c/bridge.h +++ b/cpp/src/arrow/c/bridge.h @@ -302,6 +302,17 @@ ARROW_EXPORT Status ExportRecordBatchReader(std::shared_ptr reader, struct ArrowArrayStream* out); +/// \brief Export C++ ChunkedArray using the C data interface format. +/// +/// The resulting ArrowArrayStream struct keeps the chunked array data and buffers alive +/// until its release callback is called by the consumer. 
+/// +/// \param[in] chunked_array ChunkedArray object to export +/// \param[out] out C struct where to export the stream +ARROW_EXPORT +Status ExportChunkedArray(std::shared_ptr chunked_array, + struct ArrowArrayStream* out); + /// \brief Import C++ RecordBatchReader from the C stream interface. /// /// The ArrowArrayStream struct has its contents moved to a private object @@ -313,6 +324,17 @@ ARROW_EXPORT Result> ImportRecordBatchReader( struct ArrowArrayStream* stream); +/// \brief Import C++ ChunkedArray from the C stream interface +/// +/// The ArrowArrayStream struct has its contents moved to a private object, +/// is consumed in its entirity, and released before returning all chunks +/// as a ChunkedArray. +/// +/// \param[in,out] stream C stream interface struct +/// \return Imported ChunkedArray object +ARROW_EXPORT +Result> ImportChunkedArray(struct ArrowArrayStream* stream); + /// @} } // namespace arrow diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index b8d5e0fcd3845..dba6e4736b673 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -4400,6 +4400,17 @@ class TestArrayStreamExport : public BaseArrayStreamTest { ASSERT_OK_AND_ASSIGN(auto batch, ImportRecordBatch(&c_array, expected.schema())); AssertBatchesEqual(expected, *batch); } + + void AssertStreamNext(struct ArrowArrayStream* c_stream, const Array& expected) { + struct ArrowArray c_array; + ASSERT_EQ(0, c_stream->get_next(c_stream, &c_array)); + + ArrayExportGuard guard(&c_array); + ASSERT_FALSE(ArrowArrayIsReleased(&c_array)); + + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_array, expected.type())); + AssertArraysEqual(expected, *array); + } }; TEST_F(TestArrayStreamExport, Empty) { @@ -4495,6 +4506,67 @@ TEST_F(TestArrayStreamExport, Errors) { ASSERT_EQ(EINVAL, c_stream.get_next(&c_stream, &c_array)); } +TEST_F(TestArrayStreamExport, ChunkedArrayExportEmpty) { + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({}, int32())); + + struct ArrowArrayStream c_stream; + struct ArrowSchema c_schema; + + ASSERT_OK(ExportChunkedArray(chunked_array, &c_stream)); + ArrayStreamExportGuard guard(&c_stream); + + { + ArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + AssertStreamEnd(&c_stream); + } + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_type, ImportType(&c_schema)); + AssertTypeEqual(*chunked_array->type(), *got_type); + } +} + +TEST_F(TestArrayStreamExport, ChunkedArrayExport) { + ASSERT_OK_AND_ASSIGN(auto chunked_array, + ChunkedArray::Make({ArrayFromJSON(int32(), "[1, 2]"), + ArrayFromJSON(int32(), "[4, 5, null]")})); + + struct ArrowArrayStream c_stream; + struct ArrowSchema c_schema; + struct ArrowArray c_array0, c_array1; + + ASSERT_OK(ExportChunkedArray(chunked_array, &c_stream)); + ArrayStreamExportGuard guard(&c_stream); + + { + ArrayStreamExportGuard guard(&c_stream); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + ASSERT_EQ(0, c_stream.get_schema(&c_stream, &c_schema)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array0)); + ASSERT_EQ(0, c_stream.get_next(&c_stream, &c_array1)); + AssertStreamEnd(&c_stream); + } + + ArrayExportGuard guard0(&c_array0), guard1(&c_array1); + + { + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK_AND_ASSIGN(auto got_type, ImportType(&c_schema)); + AssertTypeEqual(*chunked_array->type(), *got_type); + } + + ASSERT_GT(pool_->bytes_allocated(), 
orig_allocated_); + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_array0, chunked_array->type())); + AssertArraysEqual(*chunked_array->chunk(0), *array); + ASSERT_OK_AND_ASSIGN(array, ImportArray(&c_array1, chunked_array->type())); + AssertArraysEqual(*chunked_array->chunk(1), *array); +} + //////////////////////////////////////////////////////////////////////////// // Array stream roundtrip tests @@ -4534,6 +4606,29 @@ class TestArrayStreamRoundtrip : public BaseArrayStreamTest { ASSERT_TRUE(weak_reader.expired()); } + void Roundtrip(std::shared_ptr src, + std::function&)> check_func) { + ArrowArrayStream c_stream; + + // One original copy which to compare the result, one copy held by the stream + std::weak_ptr weak_src(src); + int64_t initial_use_count = weak_src.use_count(); + + ASSERT_OK(ExportChunkedArray(std::move(src), &c_stream)); + ASSERT_FALSE(ArrowArrayStreamIsReleased(&c_stream)); + + { + ASSERT_OK_AND_ASSIGN(auto dst, ImportChunkedArray(&c_stream)); + // Stream was moved, consumed, and released + ASSERT_TRUE(ArrowArrayStreamIsReleased(&c_stream)); + + // Stream was released by ImportChunkedArray but original copy remains + ASSERT_EQ(weak_src.use_count(), initial_use_count - 1); + + check_func(dst); + } + } + void AssertReaderNext(const std::shared_ptr& reader, const RecordBatch& expected) { ASSERT_OK_AND_ASSIGN(auto batch, reader->Next()); @@ -4631,4 +4726,24 @@ TEST_F(TestArrayStreamRoundtrip, SchemaError) { ASSERT_TRUE(state.released); } +TEST_F(TestArrayStreamRoundtrip, ChunkedArrayRoundtrip) { + ASSERT_OK_AND_ASSIGN(auto src, + ChunkedArray::Make({ArrayFromJSON(int32(), "[1, 2]"), + ArrayFromJSON(int32(), "[4, 5, null]")})); + + Roundtrip(src, [&](const std::shared_ptr& dst) { + AssertTypeEqual(*dst->type(), *src->type()); + AssertChunkedEqual(*dst, *src); + }); +} + +TEST_F(TestArrayStreamRoundtrip, ChunkedArrayRoundtripEmpty) { + ASSERT_OK_AND_ASSIGN(auto src, ChunkedArray::Make({}, int32())); + + Roundtrip(src, [&](const std::shared_ptr& dst) { + AssertTypeEqual(*dst->type(), *src->type()); + AssertChunkedEqual(*dst, *src); + }); +} + } // namespace arrow From 7e2fe4fe7634c359017213b79255c9040786fc06 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Feb 2024 15:21:37 +0100 Subject: [PATCH 58/74] GH-39852: [Python] Support creating Binary/StringView arrays from python objects (#39853) Next step for Binary/StringView support in Python (https://github.com/apache/arrow/issues/39633), now adding it to the python->arrow conversion code path. 
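As a quick illustration (mirroring the tests added below), the view types can now be requested directly when converting Python objects; before this change the converter for view types was not implemented and such calls raised:

```python
import pyarrow as pa

# String views are built straight from Python strings now.
arr = pa.array(["foo", "bar", None, "mañana"], type=pa.string_view())
assert arr.type == pa.string_view()
assert arr.null_count == 1

# Bytes-like objects convert to binary views the same way.
bin_arr = pa.array([b"foo", bytearray(b"bar"), None], type=pa.binary_view())
assert bin_arr.type == pa.binary_view()
```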
* Closes: #39852 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .../src/arrow/python/python_to_arrow.cc | 35 ++++++++++++------- python/pyarrow/tests/test_convert_builtin.py | 19 ++++++++-- python/pyarrow/tests/test_scalars.py | 28 +++------------ 3 files changed, 42 insertions(+), 40 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index d1d94ac17a13e..3c4d59d6594a2 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -486,6 +486,10 @@ class PyValue { return view.ParseString(obj); } + static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) { + return view.ParseString(obj); + } + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { ARROW_RETURN_NOT_OK(view.ParseString(obj)); @@ -499,8 +503,8 @@ class PyValue { } template - static enable_if_string Convert(const T*, const O& options, I obj, - PyBytesView& view) { + static enable_if_t::value || is_string_view_type::value, Status> + Convert(const T*, const O& options, I obj, PyBytesView& view) { if (options.strict) { // Strict conversion, force output to be unicode / utf8 and validate that // any binary values are utf8 @@ -570,18 +574,12 @@ struct PyConverterTrait; template struct PyConverterTrait< - T, - enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value && !is_binary_view_like_type::value) || - std::is_same::value>> { + T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; -template -struct PyConverterTrait> { - // not implemented -}; - template struct PyConverterTrait> { using type = PyListConverter; @@ -699,11 +697,22 @@ class PyPrimitiveConverter:: PyBytesView view_; }; +template +struct OffsetTypeTrait { + using type = typename T::offset_type; +}; + +template +struct OffsetTypeTrait> { + using type = int64_t; +}; + template -class PyPrimitiveConverter> +class PyPrimitiveConverter< + T, enable_if_t::value || is_binary_view_like_type::value>> : public PrimitiveConverter { public: - using OffsetType = typename T::offset_type; + using OffsetType = typename OffsetTypeTrait::type; Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 49c4f1a6e79d6..55ea28f50fbb3 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -763,6 +763,16 @@ def test_sequence_unicode(): assert arr.to_pylist() == data +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()]) +def test_sequence_unicode_explicit_type(ty): + data = ['foo', 'bar', None, 'mañana'] + arr = pa.array(data, type=ty) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == ty + assert arr.to_pylist() == data + + def check_array_mixed_unicode_bytes(binary_type, string_type): values = ['qux', b'foo', bytearray(b'barz')] b_values = [b'qux', b'foo', b'barz'] @@ -787,6 +797,7 @@ def check_array_mixed_unicode_bytes(binary_type, string_type): def test_array_mixed_unicode_bytes(): check_array_mixed_unicode_bytes(pa.binary(), pa.string()) check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string()) + check_array_mixed_unicode_bytes(pa.binary_view(), 
pa.string_view()) @pytest.mark.large_memory @@ -818,7 +829,7 @@ def test_large_binary_value(ty): @pytest.mark.large_memory -@pytest.mark.parametrize("ty", [pa.binary(), pa.string()]) +@pytest.mark.parametrize("ty", [pa.binary(), pa.string(), pa.string_view()]) def test_string_too_large(ty): # Construct a binary array with a single value larger than 4GB s = b"0123456789abcdefghijklmnopqrstuvwxyz" @@ -836,7 +847,7 @@ def test_sequence_bytes(): u1.decode('utf-8'), # unicode gets encoded, bytearray(b'bar'), None] - for ty in [None, pa.binary(), pa.large_binary()]: + for ty in [None, pa.binary(), pa.large_binary(), pa.binary_view()]: arr = pa.array(data, type=ty) assert len(arr) == 6 assert arr.null_count == 1 @@ -844,7 +855,7 @@ def test_sequence_bytes(): assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None] -@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()]) +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()]) def test_sequence_utf8_to_unicode(ty): # ARROW-1225 data = [b'foo', None, b'bar'] @@ -2431,6 +2442,8 @@ def test_array_from_pylist_offset_overflow(): pa.binary(3)), ([b"a"], [pa.scalar("a", type=pa.large_binary())], pa.large_binary()), (["a"], [pa.scalar("a", type=pa.large_string())], pa.large_string()), + ([b"a"], [pa.scalar("a", type=pa.binary_view())], pa.binary_view()), + (["a"], [pa.scalar("a", type=pa.string_view())], pa.string_view()), ( ["a"], [pa.scalar("a", type=pa.dictionary(pa.int64(), pa.string()))], diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 4a239b23d5676..eed5f045be945 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -51,9 +51,8 @@ (b"bytes", None, pa.BinaryScalar), ("largestring", pa.large_string(), pa.LargeStringScalar), (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar), - # TODO(GH-39633) pa.scalar(..) 
requires python->arrow conversion to be implemented - # ("string_view", pa.string_view(), pa.StringViewScalar), - # (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar), + ("string_view", pa.string_view(), pa.StringViewScalar), + (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar), (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar), ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), @@ -492,7 +491,7 @@ def test_month_day_nano_interval(): @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.string(), pa.StringScalar), (pa.large_string(), pa.LargeStringScalar), - # (pa.string_view(), pa.StringViewScalar), + (pa.string_view(), pa.StringViewScalar), ]) def test_string(value, ty, scalar_typ): s = pa.scalar(value, type=ty) @@ -507,30 +506,11 @@ def test_string(value, ty, scalar_typ): assert buf.to_pybytes() == value.encode() -@pytest.mark.parametrize('value', ['foo', 'mañana']) -def test_string_view(value): - # TODO: replace with normal scalar construction - builder = pa.lib.StringViewBuilder() - builder.append(value) - arr = builder.finish() - - s = arr[0] - assert isinstance(s, pa.StringViewScalar) - assert s.as_py() == value - assert s.as_py() != 'something' - assert repr(value) in repr(s) - assert str(s) == str(value) - - buf = s.as_buffer() - assert isinstance(buf, pa.Buffer) - assert buf.to_pybytes() == value.encode() - - @pytest.mark.parametrize('value', [b'foo', b'bar']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.binary(), pa.BinaryScalar), (pa.large_binary(), pa.LargeBinaryScalar), - # (pa.binary_view(), pa.BinaryViewScalar), + (pa.binary_view(), pa.BinaryViewScalar), ]) def test_binary(value, ty, scalar_typ): s = pa.scalar(value, type=ty) From 8ffc2140af3c994240b62f3c6412b8dbc889f604 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 7 Feb 2024 10:29:46 -0400 Subject: [PATCH 59/74] GH-39933: [R] Fix pointer conversion to Python for latest reticulate (#39969) ### Rationale for this change The integration tests and documentation build is failing ### What changes are included in this PR? Instead of relying on how reticulate converts an R external pointer, use a Python integer instead. We can't use an R integer (because they're only 32 bits); we can't use an R double (because the static cast to/from uintptr_t is a bit iffy); however, we can use Python to convert a string to Python integer. This is probably how I should have written it the first time but it didn't occur to me at the time. ### Are these changes tested? Yes, covered by existing tests. ### Are there any user-facing changes? No * Closes: #39933 Lead-authored-by: Dewey Dunnington Co-authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- r/R/python.R | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/r/R/python.R b/r/R/python.R index 023d914f16a9e..1159806bf7c25 100644 --- a/r/R/python.R +++ b/r/R/python.R @@ -339,15 +339,9 @@ install_pyarrow <- function(envname = NULL, nightly = FALSE, ...) 
{ } pyarrow_compatible_pointer <- function(ptr) { - pa <- reticulate::import("pyarrow") - version_string <- pa$`__version__` - # remove trailing .devXXX because it won't work with package_version() - pyarrow_version <- package_version(gsub("\\.dev.*?$", "", version_string)) - - # pyarrow pointers changed in version 7.0.0 - if (pyarrow_version >= "7.0.0") { - return(ptr) - } else { - return(external_pointer_addr_double(ptr)) - } + # GH-39933: Workaround because there is no built-in way to send a + # 64-bit integer to Python from an R object + py <- reticulate::import_builtins(convert = FALSE) + addr <- external_pointer_addr_character(ptr) + py$int(addr) } From e83295b1aafbea985f0be61983b0b4fc9094854c Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Wed, 7 Feb 2024 11:02:23 -0500 Subject: [PATCH 60/74] MINOR: [Documentation] Fix LargeListView format string in example (#39974) ### Rationale for this change https://github.com/apache/arrow/pull/38899 fixed the format string for LargeListView in the table but left the incorrect format string in the example below. This fixes that. ### What changes are included in this PR? Documentation change ### Are these changes tested? N/A --- docs/source/format/CDataInterface.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index 812212f536169..ef4bf1cf3238d 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -251,7 +251,7 @@ Examples array has format string ``d:12,5``. * A ``list`` array has format string ``+l``, and its single child has format string ``L``. -* A ``large_list_view`` array has format string ``+Lv``, and its single +* A ``large_list_view`` array has format string ``+vL``, and its single child has format string ``L``. * A ``struct`` has format string ``+s``; its two children have names ``ints`` and ``floats``, and format strings ``i`` and From f609bb171a8bce973d7b040d8684b04a60e806ed Mon Sep 17 00:00:00 2001 From: abandy Date: Wed, 7 Feb 2024 16:01:55 -0500 Subject: [PATCH 61/74] GH-39910: [Go] Add func to load prepared statement from ActionCreatePreparedStatementResult (#39913) Currently, in order to create a PreparedStatement a DoAction call will always be made via the client. I need to be able to make a PreparedStatement from persisted data that will not trigger the DoAction call to the server. 
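For illustration, a minimal sketch of the intended usage, assuming an already-connected `flightsql.Client` and handle/schema bytes persisted earlier. The helper name and the module-path version suffix are placeholders, not part of this change; the `CreatePreparedStatementResult` fields and `LoadPreparedStatementFromResult` call are the ones added in this PR.

```go
package example

// Module path shown for illustration; the major-version suffix (v15/v16/...)
// depends on the Arrow release in use.
import (
	"github.com/apache/arrow/go/v16/arrow/flight/flightsql"
)

// loadPersistedStatement is a hypothetical helper: it rebuilds a prepared
// statement from bytes persisted earlier (the statement handle plus optional
// IPC-serialized dataset/parameter schemas) without triggering the
// CreatePreparedStatement DoAction call on the server.
func loadPersistedStatement(client *flightsql.Client,
	handle, datasetSchema, paramSchema []byte) (*flightsql.PreparedStatement, error) {
	result := &flightsql.CreatePreparedStatementResult{
		PreparedStatementHandle: handle,
		DatasetSchema:           datasetSchema, // may be nil
		ParameterSchema:         paramSchema,   // may be nil
	}
	return client.LoadPreparedStatementFromResult(result)
}
```
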
* Closes: #39910 Authored-by: Alva Bandy Signed-off-by: Matt Topol --- go/arrow/flight/flightsql/client.go | 65 ++++++++++++++++++++++++ go/arrow/flight/flightsql/client_test.go | 30 +++++++++++ go/arrow/flight/flightsql/types.go | 2 + 3 files changed, 97 insertions(+) diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 441f88f39f43a..068bfa84c3144 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -450,6 +450,31 @@ func (c *Client) PrepareSubstrait(ctx context.Context, plan SubstraitPlan, opts return parsePreparedStatementResponse(c, c.Alloc, stream) } +func (c *Client) LoadPreparedStatementFromResult(result *CreatePreparedStatementResult) (*PreparedStatement, error) { + var ( + err error + dsSchema, paramSchema *arrow.Schema + ) + if result.DatasetSchema != nil { + dsSchema, err = flight.DeserializeSchema(result.DatasetSchema, c.Alloc) + if err != nil { + return nil, err + } + } + if result.ParameterSchema != nil { + paramSchema, err = flight.DeserializeSchema(result.ParameterSchema, c.Alloc) + if err != nil { + return nil, err + } + } + return &PreparedStatement{ + client: c, + handle: result.PreparedStatementHandle, + datasetSchema: dsSchema, + paramSchema: paramSchema, + }, nil +} + func parsePreparedStatementResponse(c *Client, mem memory.Allocator, results pb.FlightService_DoActionClient) (*PreparedStatement, error) { if err := results.CloseSend(); err != nil { return nil, err @@ -1027,6 +1052,46 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption return p.client.getFlightInfo(ctx, desc, opts...) } +// ExecutePut calls DoPut for the prepared statement on the server. If SetParameters +// has been called then the parameter bindings will be sent before execution. +// +// Will error if already closed. +func (p *PreparedStatement) ExecutePut(ctx context.Context, opts ...grpc.CallOption) error { + if p.closed { + return errors.New("arrow/flightsql: prepared statement already closed") + } + + cmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: p.handle} + + desc, err := descForCommand(cmd) + if err != nil { + return err + } + + if p.hasBindParameters() { + pstream, err := p.client.Client.DoPut(ctx, opts...) + if err != nil { + return err + } + + wr, err := p.writeBindParameters(pstream, desc) + if err != nil { + return err + } + if err = wr.Close(); err != nil { + return err + } + pstream.CloseSend() + + // wait for the server to ack the result + if _, err = pstream.Recv(); err != nil && err != io.EOF { + return err + } + } + + return nil +} + // ExecutePoll executes the prepared statement on the server and returns a PollInfo // indicating the progress of execution. 
// diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index c8b9f7f1246c1..f35aeefcf4628 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -665,6 +665,36 @@ func (s *FlightSqlClientSuite) TestRenewFlightEndpoint() { s.Equal(&mockedRenewedEndpoint, renewedEndpoint) } +func (s *FlightSqlClientSuite) TestPreparedStatementLoadFromResult() { + const query = "query" + + result := &pb.ActionCreatePreparedStatementResult{ + PreparedStatementHandle: []byte(query), + } + + parameterSchemaResult := arrow.NewSchema([]arrow.Field{{Name: "p_id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil) + result.ParameterSchema = flight.SerializeSchema(parameterSchemaResult, memory.DefaultAllocator) + datasetSchemaResult := arrow.NewSchema([]arrow.Field{{Name: "ds_id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil) + result.DatasetSchema = flight.SerializeSchema(datasetSchemaResult, memory.DefaultAllocator) + + prepared, err := s.sqlClient.LoadPreparedStatementFromResult(result) + s.NoError(err) + + s.Equal(string(prepared.Handle()), "query") + + paramSchema := prepared.ParameterSchema() + paramRec, _, err := array.RecordFromJSON(memory.DefaultAllocator, paramSchema, strings.NewReader(`[{"p_id": 1}]`)) + s.NoError(err) + defer paramRec.Release() + + datasetSchema := prepared.DatasetSchema() + datasetRec, _, err := array.RecordFromJSON(memory.DefaultAllocator, datasetSchema, strings.NewReader(`[{"ds_id": 1}]`)) + s.NoError(err) + defer datasetRec.Release() + + s.Equal(string(prepared.Handle()), "query") +} + func TestFlightSqlClient(t *testing.T) { suite.Run(t, new(FlightSqlClientSuite)) } diff --git a/go/arrow/flight/flightsql/types.go b/go/arrow/flight/flightsql/types.go index d89e68f028bb8..c70a8bdc4ec26 100644 --- a/go/arrow/flight/flightsql/types.go +++ b/go/arrow/flight/flightsql/types.go @@ -852,3 +852,5 @@ const ( // cancellation request. CancelResultNotCancellable = pb.ActionCancelQueryResult_CANCEL_RESULT_NOT_CANCELLABLE ) + +type CreatePreparedStatementResult = pb.ActionCreatePreparedStatementResult From 66b41c48554cf79fb449fd6c627e44cd0a202cd8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 7 Feb 2024 20:21:07 -0400 Subject: [PATCH 62/74] GH-39738: [R] Support build against the last three released versions of Arrow (#39739) ### Rationale for this change Development velocity of the R package has slowed considerably since early versions of Arrow such that the commit-level integration that we once relied on is no longer necessary. The ability to build against older versions of Arrow also opens up more options for our CRAN submissions, since we may be able to work with CRAN to build a version of Arrow C++ they are happy with. This change doesn't require us to *do* anything about it...it just adds a check so that we are aware of the first PR that breaks the ability to build against a previous version. There is a possibility that an accidentally but previously installed version will end up being used via pkg-config, which I believe is how the version checking came into existence in the first place. ### What changes are included in this PR? - An `#if` to guard code that was added to support the string view/binary view - Changes to the version checker script to not error for supported Arrow C++ versions - CI job that checks build against supported Arrow versions ### Are these changes tested? Yes, a CI job was added ### Are there any user-facing changes? 
Yes, but I'll wait until there's consensus on this before documenting what our intended support policy will be. * Closes: #39738 Lead-authored-by: Dewey Dunnington Co-authored-by: Jacob Wujciak-Jens Co-authored-by: Dewey Dunnington Signed-off-by: Jacob Wujciak-Jens --- .github/workflows/r.yml | 57 +++++++++++++++++++++++++++++++++++ r/PACKAGING.md | 1 + r/src/r_to_arrow.cpp | 9 ++++++ r/tools/check-versions.R | 35 +++++++++++++-------- r/tools/test-check-versions.R | 40 ++++++++++++++++-------- 5 files changed, 116 insertions(+), 26 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 3d1f75ede4bb5..8c47915b7b6d3 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -54,6 +54,63 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: + ubuntu-minimum-cpp-version: + name: Check minimum supported Arrow C++ Version (${{ matrix.cpp_version }}) + runs-on: ubuntu-latest + strategy: + matrix: + include: + - cpp_version: "13.0.0" + steps: + - name: Checkout Arrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + path: src + submodules: recursive + + - name: Install Arrow C++ (${{ matrix.cpp_version }}) + run: | + sudo apt update + sudo apt install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt update + # We have to list all packages to avoid version conflicts. + sudo apt install -y -V libarrow-dev=${{ matrix.cpp_version }}-1 \ + libarrow-acero-dev=${{ matrix.cpp_version }}-1 \ + libparquet-dev=${{ matrix.cpp_version }}-1 \ + libarrow-dataset-dev=${{ matrix.cpp_version }}-1 + + - name: Install checkbashisms + run: | + sudo apt-get install devscripts + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + working-directory: src/r + + - uses: r-lib/actions/check-r-package@v2 + with: + working-directory: src/r + env: + LIBARROW_BINARY: "false" + LIBARROW_BUILD: "false" + ARROW_R_VERBOSE_TEST: "true" + ARROW_R_ALLOW_CPP_VERSION_MISMATCH: "true" + + - name: Show install output + if: always() + run: find src/r/check -name '00install.out*' -exec cat '{}' \; || true + shell: bash + + ubuntu: name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }} runs-on: ubuntu-latest diff --git a/r/PACKAGING.md b/r/PACKAGING.md index 7f42ecf562e59..4edeb4f2130cc 100644 --- a/r/PACKAGING.md +++ b/r/PACKAGING.md @@ -26,6 +26,7 @@ For a high-level overview of the release process see the ## Before the release candidate is cut - [ ] [Create a GitHub issue](https://github.com/apache/arrow/issues/new/) entitled `[R] CRAN packaging checklist for version X.X.X` and copy this checklist to the issue. +- [ ] Review deprecated functions to advance their deprecation status, including removing preprocessor directives that no longer apply (search for `ARROW_VERSION_MAJOR` in r/src). - [ ] Evaluate the status of any failing [nightly tests and nightly packaging builds](http://crossbow.voltrondata.com). These checks replicate most of the checks that CRAN runs, so we need them all to be passing or to understand that the failures may (though won't necessarily) result in a rejection from CRAN. 
- [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_arrow.html) - [ ] Ensure the contents of the README are accurate and up to date. diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index d2db11e14a787..a81210f0ad914 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1050,6 +1050,7 @@ class RDictionaryConverter> template struct RConverterTrait; +#if ARROW_VERSION_MAJOR >= 15 template struct RConverterTrait< T, enable_if_t::value && !is_interval_type::value && @@ -1061,6 +1062,14 @@ template struct RConverterTrait> { // not implemented }; +#else +template +struct RConverterTrait< + T, enable_if_t::value && !is_interval_type::value && + !is_extension_type::value>> { + using type = RPrimitiveConverter; +}; +#endif template struct RConverterTrait> { diff --git a/r/tools/check-versions.R b/r/tools/check-versions.R index 3d8cbf02a14c9..34b2ef680c547 100644 --- a/r/tools/check-versions.R +++ b/r/tools/check-versions.R @@ -20,6 +20,20 @@ args <- commandArgs(TRUE) # TESTING is set in test-check-version.R; it won't be set when called from configure test_mode <- exists("TESTING") +release_version_supported <- function(r_version, cpp_version) { + r_version <- package_version(r_version) + cpp_version <- package_version(cpp_version) + major <- function(x) as.numeric(x[1, 1]) + minimum_cpp_version <- package_version("13.0.0") + + allow_mismatch <- identical(tolower(Sys.getenv("ARROW_R_ALLOW_CPP_VERSION_MISMATCH", "false")), "true") + # If we allow a version mismatch we still need to cover the minimum version (13.0.0 for now) + # we don't allow newer C++ versions as new features without additional feature gates are likely to + # break the R package + version_valid <- cpp_version >= minimum_cpp_version && major(cpp_version) <= major(r_version) + allow_mismatch && version_valid || major(r_version) == major(cpp_version) +} + check_versions <- function(r_version, cpp_version) { r_parsed <- package_version(r_version) r_dev_version <- r_parsed[1, 4] @@ -39,20 +53,10 @@ check_versions <- function(r_version, cpp_version) { "*** > or retry with FORCE_BUNDLED_BUILD=true" ) cat(paste0(msg, "\n", collapse = "")) - } else if (r_is_patch && as.character(r_parsed[1, 1:3]) == cpp_version) { - # Patch releases we do for CRAN feedback get an extra x.y.z.1 version. - # These should work with the x.y.z C++ library (which never has .1 added) - cat( - sprintf( - "*** > Using C++ library version %s with R package %s\n", - cpp_version, - r_version - ) - ) - } else if (r_version != cpp_version) { + } else if (cpp_is_dev || !release_version_supported(r_version, cpp_parsed)) { cat( sprintf( - "**** Not using: C++ library version (%s) does not match R package (%s)\n", + "**** Not using: C++ library version (%s): not supported by R package version %s\n", cpp_version, r_version ) @@ -61,7 +65,12 @@ check_versions <- function(r_version, cpp_version) { # Add ALLOW_VERSION_MISMATCH env var to override stop()? 
(Could be useful for debugging) } else { # OK - cat(sprintf("**** C++ and R library versions match: %s\n", cpp_version)) + cat( + sprintf( + "**** C++ library version %s is supported by R version %s\n", + cpp_version, r_version + ) + ) } } diff --git a/r/tools/test-check-versions.R b/r/tools/test-check-versions.R index 9c284507b8801..f558648bed1e3 100644 --- a/r/tools/test-check-versions.R +++ b/r/tools/test-check-versions.R @@ -24,10 +24,10 @@ TESTING <- TRUE source("check-versions.R", local = TRUE) -test_that("check_versions", { +test_that("check_versions without mismatch", { expect_output( check_versions("10.0.0", "10.0.0"), - "**** C++ and R library versions match: 10.0.0", + "**** C++ library version 10.0.0 is supported by R version 10.0.0", fixed = TRUE ) expect_output( @@ -35,7 +35,7 @@ test_that("check_versions", { check_versions("10.0.0", "10.0.0-SNAPSHOT"), "version mismatch" ), - "**** Not using: C++ library version (10.0.0-SNAPSHOT) does not match R package (10.0.0)", + "**** Not using: C++ library version (10.0.0-SNAPSHOT): not supported by R package version 10.0.0", fixed = TRUE ) expect_output( @@ -43,20 +43,12 @@ test_that("check_versions", { check_versions("10.0.0.9000", "10.0.0-SNAPSHOT"), "version mismatch" ), - "**** Not using: C++ library version (10.0.0-SNAPSHOT) does not match R package (10.0.0.9000)", - fixed = TRUE - ) - expect_output( - expect_error( - check_versions("10.0.0.9000", "10.0.0"), - "version mismatch" - ), - "**** Not using: C++ library version (10.0.0) does not match R package (10.0.0.9000)", + "**** Not using: C++ library version (10.0.0-SNAPSHOT): not supported by R package version 10.0.0.9000", fixed = TRUE ) expect_output( check_versions("10.0.0.3", "10.0.0"), - "*** > Using C++ library version 10.0.0 with R package 10.0.0.3", + "**** C++ library version 10.0.0 is supported by R version 10.0.0.3", fixed = TRUE ) expect_output( @@ -65,3 +57,25 @@ test_that("check_versions", { fixed = TRUE ) }) + +test_that("check_versions with mismatch", { + withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "false")) + + expect_false( + release_version_supported("15.0.0", "13.0.0") + ) + + withr::local_envvar(.new = c(ARROW_R_ALLOW_CPP_VERSION_MISMATCH = "true")) + + expect_true( + release_version_supported("15.0.0", "13.0.0") + ) + + expect_false( + release_version_supported("15.0.0", "16.0.0") + ) + + expect_false( + release_version_supported("15.0.0", "12.0.0") + ) +}) From e1241e74a92561d65c134c06b9d5a95deeb273f3 Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Thu, 8 Feb 2024 01:37:12 +0100 Subject: [PATCH 63/74] GH-39987: [R] Make it possible to use a rtools libarrow on windows (#39986) This enables the use of libarrow from rtools. This is currently only possible by cross compiling manually but will be part of a future rtools version. These changes can't be tested, there are no user facing changes for now. * Closes: #39987 Lead-authored-by: Jacob Wujciak-Jens Co-authored-by: Neal Richardson Signed-off-by: Jacob Wujciak-Jens --- r/configure.win | 201 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 187 insertions(+), 14 deletions(-) diff --git a/r/configure.win b/r/configure.win index 2d9e5cdf54e44..b6ac19faea2d4 100755 --- a/r/configure.win +++ b/r/configure.win @@ -17,33 +17,58 @@ # specific language governing permissions and limitations # under the License. 
+: ${PKG_CONFIG:="pkg-config"} +# Library settings +PKG_CONFIG_NAME="arrow" +PKG_TEST_HEADER="" + +VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //` + +# Development mode, also increases verbosity in the bundled build +ARROW_R_DEV=`echo $ARROW_R_DEV | tr '[:upper:]' '[:lower:]'` +# If present, `pkg-config` will be used to find libarrow on the system, +# unless this is set to false +ARROW_USE_PKG_CONFIG=`echo $ARROW_USE_PKG_CONFIG | tr '[:upper:]' '[:lower:]'` # generate code -if [ "$ARROW_R_DEV" == "TRUE" ]; then +if [ "$ARROW_R_DEV" == "true" ]; then echo "*** Generating code with data-raw/codegen.R" "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" data-raw/codegen.R fi -OPENSSL_LIBS="-lcrypto -lcrypt32" -MIMALLOC_LIBS="-lbcrypt -lpsapi" -BROTLI_LIBS="-lbrotlienc -lbrotlidec -lbrotlicommon" # Common goes last since dec and enc depend on it -AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management \ - -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ - -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ - -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp" -# pkg-config --libs libcurl -GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ - -lz -lws2_32 -lnghttp2 -ldbghelp" +# Test if pkg-config is available to use +if ${PKG_CONFIG} --version >/dev/null 2>&1; then + PKG_CONFIG_AVAILABLE="true" + echo "*** pkg-config found." +else + echo "*** pkg-config not found." + PKG_CONFIG_AVAILABLE="false" + ARROW_USE_PKG_CONFIG="false" +fi -function configure_release() { - VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) + +function configure_binaries() { # Try to find/download a C++ Arrow binary, "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" "tools/nixlibs.R" $VERSION # If binary not found, script exits nonzero if [ $? -ne 0 ]; then + _LIBARROW_FOUND="false" echo "Arrow C++ library was not found" + # return 0 so set -e doesn't exit the script + return 0 fi + OPENSSL_LIBS="-lcrypto -lcrypt32" + MIMALLOC_LIBS="-lbcrypt -lpsapi" + BROTLI_LIBS="-lbrotlienc -lbrotlidec -lbrotlicommon" # Common goes last since dec and enc depend on it + AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management \ + -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 \ + -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common \ + -luserenv -lversion -lws2_32 -lbcrypt -lwininet -lwinhttp" + # pkg-config --libs libcurl + GCS_LIBS="-lcurl -lnormaliz -lssh2 -lgdi32 -lssl -lcrypto -lcrypt32 -lwldap32 \ + -lz -lws2_32 -lnghttp2 -ldbghelp" + # Set the right flags to point to and enable arrow/parquet if [ -d "windows/arrow-$VERSION" ]; then RWINLIB="../windows/arrow-$VERSION" @@ -75,12 +100,160 @@ function configure_release() { # It seems that order matters PKG_LIBS="${PKG_LIBS} -lws2_32" fi + +} + +# Once libarrow is obtained, this function sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` +# either from pkg-config or by inferring things about the directory in $1 +set_pkg_vars () { + set_lib_dir_with_pc + + # Check cmake options for enabled features. This uses LIB_DIR that + # is set by the above set_lib_dir_* call. + add_feature_flags + set_pkg_vars_with_pc + + # Set any user-defined CXXFLAGS + if [ "$ARROW_R_CXXFLAGS" ]; then + PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" + fi + + # We use expr because the product version returns more than just 10.13 and we want to + # match the substring. 
However, expr always outputs the number of matched characters + # to stdout, to avoid noise in the log we redirect the output to /dev/null + if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then + # avoid C++17 availability warnings on macOS < 11 + PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY" + fi +} + +# If we have pkg-config, it will tell us what libarrow needs +set_lib_dir_with_pc () { + LIB_DIR="`${PKG_CONFIG} --variable=libdir ${PKG_CONFIG_NAME}`" +} +set_pkg_vars_with_pc () { + pkg_config_names="${PKG_CONFIG_NAME} ${PKG_CONFIG_NAMES_FEATURES}" + PKG_CFLAGS="`${PKG_CONFIG} --cflags ${pkg_config_names}` $PKG_CFLAGS" + PKG_CFLAGS="$PKG_CFLAGS $PKG_CFLAGS_FEATURES" + PKG_LIBS=`${PKG_CONFIG} --libs-only-l --libs-only-other ${pkg_config_names}` + PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES" + PKG_DIRS=`${PKG_CONFIG} --libs-only-L ${pkg_config_names}` +} + +add_feature_flags () { + PKG_CFLAGS_FEATURES="" + PKG_CONFIG_NAMES_FEATURES="" + PKG_LIBS_FEATURES="" + PKG_LIBS_FEATURES_WITHOUT_PC="" + + # Now we need to check what features it was built with and enable + # the corresponding feature flags in the R bindings (-DARROW_R_WITH_stuff). + # We do this by inspecting ArrowOptions.cmake, which the libarrow build + # generates. + ARROW_OPTS_CMAKE="$LIB_DIR/cmake/Arrow/ArrowOptions.cmake" + if [ ! -f "${ARROW_OPTS_CMAKE}" ]; then + echo "*** $ARROW_OPTS_CMAKE not found; some features will not be enabled" + else + if arrow_built_with ARROW_PARQUET; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_PARQUET" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES parquet" + PKG_LIBS_FEATURES_WITHOUT_PC="-lparquet $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: parquet is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_DATASET; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_DATASET" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-dataset" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_dataset $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_dataset is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_ACERO; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_ACERO" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-acero" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_acero $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_acero is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_SUBSTRAIT; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_SUBSTRAIT" + PKG_CONFIG_NAMES_FEATURES="$PKG_CONFIG_NAMES_FEATURES arrow-substrait" + PKG_LIBS_FEATURES_WITHOUT_PC="-larrow_substrait $PKG_LIBS_FEATURES_WITHOUT_PC" + # NOTE: arrow_substrait is assumed to have the same -L flag as arrow + # so there is no need to add its location to PKG_DIRS + fi + if arrow_built_with ARROW_JSON; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_JSON" + fi + if arrow_built_with ARROW_S3; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" + fi + if arrow_built_with ARROW_GCS; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" + fi + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + # If pkg-config is available it will handle this for us automatically + SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" + fi + fi +} + + 
+arrow_built_with() { + # Function to check cmake options for features + grep -i 'set('"$1"' "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 +} + +function configure_rtools() { + # Use pkg-config to find arrow from rtools + _LIBARROW_PREFIX="`${PKG_CONFIG} --variable=prefix ${PKG_CONFIG_NAME}`" + _LIBARROW_FOUND="true" + echo "*** Trying Arrow C++ found by pkg-config: $_LIBARROW_PREFIX" + + PC_LIB_VERSION=`${PKG_CONFIG} --modversion ${PKG_CONFIG_NAME}` + # This is in an R script for convenience and testability. + # Success means the found C++ library is ok to use. + # Error means the versions don't line up and we shouldn't use it. + # More specific messaging to the user is in the R script + if ! ${R_HOME}/bin/Rscript tools/check-versions.R $VERSION $PC_LIB_VERSION 2> /dev/null; then + _LIBARROW_FOUND="false" + fi + + # We should have a valid libarrow build in $_LIBARROW_FOUND +# Now set `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` based on that. +if [ "$_LIBARROW_FOUND" == "true" ]; then + set_pkg_vars ${_LIBARROW_PREFIX} + # add mingw specific windows flags + PKG_LIBS="$PKG_LIBS -lws2_32 -lole32 -lwldap32 -lsecur32 -lncrypt -lcrypt32 -lshlwapi" + # override -fno-exceptions from aws-cpp-sdk pc file + PKG_CFLAGS="$PKG_CFLAGS -fexceptions" +else + # To make it easier to debug which code path was taken add a specific + # message to the log in addition to the 'NOTE' + echo "*** Failed to find Arrow C++ libraries in rtools" +fi +} + +function configure_release() { + if [ "$ARROW_USE_PKG_CONFIG" != "false" ] && $PKG_CONFIG --exists $PKG_CONFIG_NAME; then + configure_rtools + else + configure_binaries + fi + + if [ "$_LIBARROW_FOUND" == "false" ]; then + echo "------------------------- NOTE ---------------------------" + echo "There was an issue preparing the Arrow C++ libraries." + echo "See https://arrow.apache.org/docs/r/articles/install.html" + echo "----------------------------------------------------------" + exit 1 + fi } # Returns 1 if CMAKE options is set "ON", otherwise 0 function cmake_option() { ARROW_OPTS_CMAKE="$ARROW_HOME/lib/cmake/Arrow/ArrowOptions.cmake" - grep -cm1 "set($1 \"ON\")" $ARROW_OPTS_CMAKE + arrow_built_with $1 } function configure_dev() { From c38b0f33f3361350dd1321a93b53716e64489a69 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 8 Feb 2024 11:40:12 +0900 Subject: [PATCH 64/74] GH-39992: [CI][Docs][Java] ubuntu-docs uses Maven version in .env (#39993) ### Rationale for this change GH-39696 updated Maven version but `ubuntu-docs` haven't used it yet. ### What changes are included in this PR? Use `MAVEN` in `.env` in `ubuntu-docs`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* Closes: #39992 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ci/docker/linux-apt-docs.dockerfile | 2 +- docker-compose.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index c51600a1e5920..3d102796b8c00 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -60,7 +60,7 @@ RUN apt-get update -y && \ ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 -ARG maven=3.5.4 +ARG maven=3.6.3 COPY ci/scripts/util_download_apache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/util_download_apache.sh \ "maven/maven-3/${maven}/binaries/apache-maven-${maven}-bin.tar.gz" /opt diff --git a/docker-compose.yml b/docker-compose.yml index 8a7223b57632f..a31fa0d9aa659 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1740,6 +1740,7 @@ services: args: r: ${R} jdk: ${JDK} + maven: ${MAVEN} node: ${NODE} base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 environment: From fa26fa56ce579f52de8f1fa07dfb103a122e2d8a Mon Sep 17 00:00:00 2001 From: av8or1 Date: Wed, 7 Feb 2024 23:53:01 -0600 Subject: [PATCH 65/74] GH-38703: [C++][FS][Azure] Implement DeleteFile() (#39840) ### Rationale for this change `DeleteFile()` API isn't implemented yet. ### What changes are included in this PR? Implement `DeleteFile()` by the "Delete Blob" API: https://learn.microsoft.com/en-us/rest/api/storageservices/delete-blob ### Are these changes tested? I tested the modification by creating a file via the web browser on our internal ADLS, then ran a sample program that deleted the file. I added three regression tests to cover the use case scenarios of: * A valid delete attempt, where "valid" means that the file exists and is indeed a file * An intentional failure where a file delete is attempted, but the file does not exist * An intentional failure where a file delete is attempted, but the target is a container * An intentional failure where a file delete is attempted, but the target is a directory ### Are there any user-facing changes? Yes. * Closes: #38703 Lead-authored-by: av8or1 Co-authored-by: jerry.adair Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 23 ++++++++++++++++- cpp/src/arrow/filesystem/azurefs_test.cc | 33 +++++++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a5179c22190e1..87b9822878cce 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1690,6 +1690,26 @@ class AzureFileSystem::Impl { } } + Status DeleteFile(const AzureLocation& location) { + RETURN_NOT_OK(ValidateFileLocation(location)); + auto file_client = datalake_service_client_->GetFileSystemClient(location.container) + .GetFileClient(location.path); + try { + auto response = file_client.Delete(); + // Only the "*IfExists" functions ever set Deleted to false. + // All the others either succeed or throw an exception. 
+ DCHECK(response.Value.Deleted); + } catch (const Storage::StorageException& exception) { + if (exception.ErrorCode == "FilesystemNotFound" || + exception.ErrorCode == "PathNotFound") { + return PathNotFound(location); + } + return ExceptionToStatus(exception, "Failed to delete a file: ", location.path, + ": ", file_client.GetUrl()); + } + return Status::OK(); + } + Status CopyFile(const AzureLocation& src, const AzureLocation& dest) { RETURN_NOT_OK(ValidateFileLocation(src)); RETURN_NOT_OK(ValidateFileLocation(dest)); @@ -1875,7 +1895,8 @@ Status AzureFileSystem::DeleteRootDirContents() { } Status AzureFileSystem::DeleteFile(const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->DeleteFile(location); } Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 6104b04411b32..4d123028ea86e 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1382,6 +1382,38 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { this->TestDeleteDirContentsFailureNonexistent(); } +TEST_F(TestAzuriteFileSystem, DeleteFileSuccess) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + ASSERT_OK(fs()->CreateDir(container_name)); + const auto file_name = ConcatAbstractPath(container_name, "abc"); + CreateFile(fs(), file_name, "data"); + arrow::fs::AssertFileInfo(fs(), file_name, FileType::File); + ASSERT_OK(fs()->DeleteFile(file_name)); + arrow::fs::AssertFileInfo(fs(), file_name, FileType::NotFound); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureNonexistent) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + ASSERT_OK(fs()->CreateDir(container_name)); + const auto nonexistent_file_name = ConcatAbstractPath(container_name, "nonexistent"); + ASSERT_RAISES(IOError, fs()->DeleteFile(nonexistent_file_name)); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureContainer) { + const auto container_name = PreexistingData::RandomContainerName(rng_); + ASSERT_OK(fs()->CreateDir(container_name)); + arrow::fs::AssertFileInfo(fs(), container_name, FileType::Directory); + ASSERT_RAISES(IOError, fs()->DeleteFile(container_name)); +} + +TEST_F(TestAzuriteFileSystem, DeleteFileFailureDirectory) { + const auto directory_name = + ConcatAbstractPath(PreexistingData::RandomContainerName(rng_), "directory"); + ASSERT_OK(fs()->CreateDir(directory_name)); + arrow::fs::AssertFileInfo(fs(), directory_name, FileType::Directory); + ASSERT_RAISES(IOError, fs()->DeleteFile(directory_name)); +} + TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); @@ -1868,6 +1900,5 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); ASSERT_RAISES(Invalid, stream->Seek(2)); } - } // namespace fs } // namespace arrow From 026188e3bb36c58573f23215aedc14e6392264c2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 8 Feb 2024 12:25:38 +0100 Subject: [PATCH 66/74] GH-37484: [Python] Add a FixedSizeTensorScalar class (#37533) ### Rationale for this change When working with `FixedSizeTensorArray` we want to access individual tensors. 
This would be enabled by adding: ```python def FixedSizeTensorScalar(pa.ExtensionScalar): def to_numpy_ndarray(): ... ``` See #37484. ### What changes are included in this PR? This adds `FixedSizeTensorScalar` and tests for it. ### Are there any user-facing changes? Yes, when calling `FixedSizeTensorArray[i]` we would get back `FixedSizeTensorScalar` instead of `ExtensionScalar`. * Closes: #37484 Lead-authored-by: Rok Mihevc Co-authored-by: Antoine Pitrou Co-authored-by: Joris Van den Bossche Co-authored-by: Alenka Frim Signed-off-by: Antoine Pitrou --- cpp/src/arrow/extension/fixed_shape_tensor.cc | 130 +++++++--- cpp/src/arrow/extension/fixed_shape_tensor.h | 11 +- .../extension/fixed_shape_tensor_test.cc | 226 ++++++++++++++++-- cpp/src/arrow/extension/tensor_internal.h | 45 ++++ python/pyarrow/array.pxi | 77 ++++-- python/pyarrow/includes/libarrow.pxd | 12 +- python/pyarrow/scalar.pxi | 42 ++++ python/pyarrow/tests/test_extension_type.py | 123 ++++++++-- python/pyarrow/types.pxi | 22 +- 9 files changed, 566 insertions(+), 122 deletions(-) create mode 100644 cpp/src/arrow/extension/tensor_internal.h diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index af8305a025291..02e0a890e4b3d 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -19,6 +19,8 @@ #include #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/tensor_internal.h" +#include "arrow/scalar.h" #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" @@ -86,7 +88,7 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const { if (extension_name() != other.extension_name()) { return false; } - const auto& other_ext = static_cast(other); + const auto& other_ext = internal::checked_cast(other); auto is_permutation_trivial = [](const std::vector& permutation) { for (size_t i = 1; i < permutation.size(); ++i) { @@ -143,7 +145,7 @@ std::string FixedShapeTensorType::Serialize() const { if (!dim_names_.empty()) { rj::Value dim_names(rj::kArrayType); - for (std::string v : dim_names_) { + for (const std::string& v : dim_names_) { dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator); } document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator); @@ -199,10 +201,52 @@ std::shared_ptr FixedShapeTensorType::MakeArray( std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("arrow.fixed_shape_tensor", - static_cast(*data->type).extension_name()); + internal::checked_cast(*data->type).extension_name()); return std::make_shared(data); } +Result> FixedShapeTensorType::MakeTensor( + const std::shared_ptr& scalar) { + const auto ext_scalar = internal::checked_pointer_cast(scalar); + const auto ext_type = + internal::checked_pointer_cast(scalar->type); + if (!is_fixed_width(*ext_type->value_type())) { + return Status::TypeError("Cannot convert non-fixed-width values to Tensor."); + } + const auto array = + internal::checked_pointer_cast(ext_scalar->value)->value; + if (array->null_count() > 0) { + return Status::Invalid("Cannot convert data with nulls to Tensor."); + } + const auto value_type = + internal::checked_pointer_cast(ext_type->value_type()); + const auto byte_width = value_type->byte_width(); + + std::vector permutation = ext_type->permutation(); + if (permutation.empty()) { + permutation.resize(ext_type->ndim()); + std::iota(permutation.begin(), permutation.end(), 0); + } + + std::vector 
shape = ext_type->shape(); + internal::Permute(permutation, &shape); + + std::vector dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + internal::Permute(permutation, &dim_names); + } + + std::vector strides; + RETURN_NOT_OK(ComputeStrides(*value_type.get(), shape, permutation, &strides)); + const auto start_position = array->offset() * byte_width; + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); + const auto buffer = + SliceBuffer(array->data()->buffers[1], start_position, size * byte_width); + + return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names); +} + Result> FixedShapeTensorArray::FromTensor( const std::shared_ptr& tensor) { auto permutation = internal::ArgSort(tensor->strides(), std::greater<>()); @@ -293,53 +337,71 @@ const Result> FixedShapeTensorArray::ToTensor() const { // To convert an array of n dimensional tensors to a n+1 dimensional tensor we // interpret the array's length as the first dimension the new tensor. - auto ext_arr = std::static_pointer_cast(this->storage()); - auto ext_type = internal::checked_pointer_cast(this->type()); - ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()), - Status::Invalid(ext_arr->value_type()->ToString(), - " is not valid data type for a tensor")); - auto permutation = ext_type->permutation(); - - std::vector dim_names; - if (!ext_type->dim_names().empty()) { - for (auto i : permutation) { - dim_names.emplace_back(ext_type->dim_names()[i]); - } - dim_names.insert(dim_names.begin(), 1, ""); + const auto ext_type = + internal::checked_pointer_cast(this->type()); + const auto value_type = ext_type->value_type(); + ARROW_RETURN_IF( + !is_fixed_width(*value_type), + Status::TypeError(value_type->ToString(), " is not valid data type for a tensor")); + + // ext_type->permutation() gives us permutation for a single row with values in + // range [0, ndim). Here want to create a ndim + 1 dimensional tensor from the entire + // array and we assume the first dimension will always have the greatest stride, so it + // will get permutation index 0 and remaining values from ext_type->permutation() need + // to be shifted to fill the [1, ndim+1) range. Computed permutation will be used to + // generate the new tensor's shape, strides and dim_names. 
+ std::vector permutation = ext_type->permutation(); + if (permutation.empty()) { + permutation.resize(ext_type->ndim() + 1); + std::iota(permutation.begin(), permutation.end(), 0); } else { - dim_names = {}; + for (auto i = 0; i < static_cast(ext_type->ndim()); i++) { + permutation[i] += 1; + } + permutation.insert(permutation.begin(), 1, 0); } - std::vector shape; - for (int64_t& i : permutation) { - shape.emplace_back(ext_type->shape()[i]); - ++i; + std::vector dim_names = ext_type->dim_names(); + if (!dim_names.empty()) { + dim_names.insert(dim_names.begin(), 1, ""); + internal::Permute(permutation, &dim_names); } + + std::vector shape = ext_type->shape(); + auto cell_size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies<>()); shape.insert(shape.begin(), 1, this->length()); - permutation.insert(permutation.begin(), 1, 0); + internal::Permute(permutation, &shape); std::vector tensor_strides; - auto value_type = internal::checked_pointer_cast(ext_arr->value_type()); + const auto fw_value_type = internal::checked_pointer_cast(value_type); ARROW_RETURN_NOT_OK( - ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides)); - ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten()); + ComputeStrides(*fw_value_type.get(), shape, permutation, &tensor_strides)); + + const auto raw_buffer = this->storage()->data()->child_data[0]->buffers[1]; ARROW_ASSIGN_OR_RAISE( - auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape, - tensor_strides, dim_names)); - return tensor; + const auto buffer, + SliceBufferSafe(raw_buffer, this->offset() * cell_size * value_type->byte_width())); + + return Tensor::Make(value_type, buffer, shape, tensor_strides, dim_names); } Result> FixedShapeTensorType::Make( const std::shared_ptr& value_type, const std::vector& shape, const std::vector& permutation, const std::vector& dim_names) { - if (!permutation.empty() && shape.size() != permutation.size()) { - return Status::Invalid("permutation size must match shape size. Expected: ", - shape.size(), " Got: ", permutation.size()); + const auto ndim = shape.size(); + if (!permutation.empty() && ndim != permutation.size()) { + return Status::Invalid("permutation size must match shape size. Expected: ", ndim, + " Got: ", permutation.size()); + } + if (!dim_names.empty() && ndim != dim_names.size()) { + return Status::Invalid("dim_names size must match shape size. Expected: ", ndim, + " Got: ", dim_names.size()); } - if (!dim_names.empty() && shape.size() != dim_names.size()) { - return Status::Invalid("dim_names size must match shape size. 
Expected: ", - shape.size(), " Got: ", dim_names.size()); + if (!permutation.empty()) { + RETURN_NOT_OK(internal::IsPermutationValid(permutation)); } + const auto size = std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies<>()); return std::make_shared(value_type, static_cast(size), diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index fcfb1ebbab96a..591a7cee32a34 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -64,7 +64,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { std::string ToString() const override; /// Number of dimensions of tensor elements - size_t ndim() { return shape_.size(); } + size_t ndim() const { return shape_.size(); } /// Shape of tensor elements const std::vector shape() const { return shape_; } @@ -94,6 +94,15 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { /// Create a FixedShapeTensorArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; + /// \brief Create a Tensor from an ExtensionScalar from a FixedShapeTensorArray + /// + /// This method will return a Tensor from ExtensionScalar with strides + /// derived from shape and permutation of FixedShapeTensorType. Shape and + /// dim_names will be permuted according to permutation stored in the + /// FixedShapeTensorType metadata. + static Result> MakeTensor( + const std::shared_ptr& scalar); + /// \brief Create a FixedShapeTensorType instance static Result> Make( const std::shared_ptr& value_type, const std::vector& shape, diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 2b8e703d3c66e..3fd39a11ff50d 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -28,6 +28,7 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/sort.h" namespace arrow { @@ -39,34 +40,34 @@ class TestExtensionType : public ::testing::Test { public: void SetUp() override { shape_ = {3, 3, 4}; - cell_shape_ = {3, 4}; + element_shape_ = {3, 4}; value_type_ = int64(); - cell_type_ = fixed_size_list(value_type_, 12); + element_type_ = fixed_size_list(value_type_, 12); dim_names_ = {"x", "y"}; ext_type_ = internal::checked_pointer_cast( - fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_)); + fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_)); values_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; values_partial_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; shape_partial_ = {2, 3, 4}; tensor_strides_ = {96, 32, 8}; - cell_strides_ = {32, 8}; + element_strides_ = {32, 8}; serialized_ = R"({"shape":[3,4],"dim_names":["x","y"]})"; } protected: std::vector shape_; std::vector shape_partial_; - std::vector cell_shape_; + std::vector element_shape_; std::shared_ptr value_type_; - std::shared_ptr cell_type_; + std::shared_ptr element_type_; std::vector dim_names_; std::shared_ptr ext_type_; std::vector values_; std::vector values_partial_; std::vector tensor_strides_; - std::vector cell_strides_; + std::vector element_strides_; std::string serialized_; }; @@ -96,8 +97,8 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test ExtensionType methods 
ASSERT_EQ(ext_type_->extension_name(), "arrow.fixed_shape_tensor"); ASSERT_TRUE(ext_type_->Equals(*exact_ext_type)); - ASSERT_FALSE(ext_type_->Equals(*cell_type_)); - ASSERT_TRUE(ext_type_->storage_type()->Equals(*cell_type_)); + ASSERT_FALSE(ext_type_->Equals(*element_type_)); + ASSERT_TRUE(ext_type_->storage_type()->Equals(*element_type_)); ASSERT_EQ(ext_type_->Serialize(), serialized_); ASSERT_OK_AND_ASSIGN(auto ds, ext_type_->Deserialize(ext_type_->storage_type(), serialized_)); @@ -106,18 +107,28 @@ TEST_F(TestExtensionType, CreateExtensionType) { // Test FixedShapeTensorType methods ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION); - ASSERT_EQ(exact_ext_type->ndim(), cell_shape_.size()); - ASSERT_EQ(exact_ext_type->shape(), cell_shape_); + ASSERT_EQ(exact_ext_type->ndim(), element_shape_.size()); + ASSERT_EQ(exact_ext_type->shape(), element_shape_); ASSERT_EQ(exact_ext_type->value_type(), value_type_); - ASSERT_EQ(exact_ext_type->strides(), cell_strides_); + ASSERT_EQ(exact_ext_type->strides(), element_strides_); ASSERT_EQ(exact_ext_type->dim_names(), dim_names_); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: permutation size must match shape size."), - FixedShapeTensorType::Make(value_type_, cell_shape_, {0})); + FixedShapeTensorType::Make(value_type_, element_shape_, {0})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Invalid: dim_names size must match shape size."), - FixedShapeTensorType::Make(value_type_, cell_shape_, {}, {"x"})); + FixedShapeTensorType::Make(value_type_, element_shape_, {}, {"x"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 2 dimensional tensors must be " + "unique and within [0, 1] range. Got: [3,0]"), + FixedShapeTensorType::Make(value_type_, {5, 6}, {3, 0})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Permutation indices for 3 dimensional tensors must be " + "unique and within [0, 2] range. 
Got: [0,1,1]"), + FixedShapeTensorType::Make(value_type_, {1, 2, 3}, {0, 1, 1})); } TEST_F(TestExtensionType, EqualsCases) { @@ -148,7 +159,7 @@ TEST_F(TestExtensionType, CreateFromArray) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); ASSERT_EQ(ext_arr->length(), shape_[0]); ASSERT_EQ(ext_arr->null_count(), 0); @@ -200,7 +211,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { std::vector> buffers = {nullptr, Buffer::Wrap(values_)}; auto arr_data = std::make_shared(value_type_, values_.size(), buffers, 0, 0); auto arr = std::make_shared(arr_data); - ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type_)); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type_)); auto ext_arr = ExtensionType::WrapArray(ext_type_, fsla_arr); // Pass extension array, expect getting back extension array @@ -215,7 +226,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { auto ext_metadata = key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, {"ARROW:extension:metadata", serialized_}}); - ext_field = field(/*name=*/"f0", /*type=*/cell_type_, /*nullable=*/true, + ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); RoundtripBatch(batch2, &read_batch2); @@ -270,7 +281,7 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto ext_arr_5 = std::static_pointer_cast( ExtensionType::WrapArray(ext_type_5, fsla_arr)); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("binary is not valid data type for a tensor"), + TypeError, testing::HasSubstr("binary is not valid data type for a tensor"), ext_arr_5->ToTensor()); auto ext_type_6 = internal::checked_pointer_cast( @@ -278,6 +289,10 @@ TEST_F(TestExtensionType, CreateFromTensor) { auto arr_with_null = ArrayFromJSON(int64(), "[1, 0, null, null, 1, 2]"); ASSERT_OK_AND_ASSIGN(auto fsla_arr_6, FixedSizeListArray::FromArrays( arr_with_null, fixed_size_list(int64(), 2))); + + auto ext_type_7 = internal::checked_pointer_cast( + fixed_shape_tensor(int64(), {3, 4}, {})); + ASSERT_OK_AND_ASSIGN(auto ext_arr_7, FixedShapeTensorArray::FromTensor(tensor)); } void CheckFromTensorType(const std::shared_ptr& tensor, @@ -308,7 +323,7 @@ TEST_F(TestExtensionType, TestFromTensorType) { auto dim_names = std::vector>{ {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"y", "z"}, {"y", "z"}, {"y", "z"}, {"y", "z"}}; - auto cell_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; + auto element_shapes = std::vector>{{3, 4}, {4, 3}, {4, 3}, {3, 4}}; auto permutations = std::vector>{{0, 1}, {1, 0}, {0, 1}, {1, 0}}; for (size_t i = 0; i < shapes.size(); i++) { @@ -316,11 +331,82 @@ TEST_F(TestExtensionType, TestFromTensorType) { strides[i], tensor_dim_names[i])); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); auto ext_type = - fixed_shape_tensor(value_type_, cell_shapes[i], permutations[i], dim_names[i]); + fixed_shape_tensor(value_type_, element_shapes[i], permutations[i], dim_names[i]); CheckFromTensorType(tensor, ext_type); } } +template +void CheckToTensor(const 
std::vector& values, const std::shared_ptr typ, + const int32_t& element_size, const std::vector& element_shape, + const std::vector& element_permutation, + const std::vector& element_dim_names, + const std::vector& tensor_shape, + const std::vector& tensor_dim_names, + const std::vector& tensor_strides) { + auto buffer = Buffer::Wrap(values); + const std::shared_ptr element_type = fixed_size_list(typ, element_size); + std::vector> buffers = {nullptr, buffer}; + auto arr_data = std::make_shared(typ, values.size(), buffers); + auto arr = std::make_shared(arr_data); + ASSERT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, element_type)); + + ASSERT_OK_AND_ASSIGN( + auto expected_tensor, + Tensor::Make(typ, buffer, tensor_shape, tensor_strides, tensor_dim_names)); + const auto ext_type = + fixed_shape_tensor(typ, element_shape, element_permutation, element_dim_names); + + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + const auto tensor_array = std::static_pointer_cast(ext_arr); + ASSERT_OK_AND_ASSIGN(const auto actual_tensor, tensor_array->ToTensor()); + ASSERT_OK(actual_tensor->Validate()); + + ASSERT_EQ(actual_tensor->type(), expected_tensor->type()); + ASSERT_EQ(actual_tensor->shape(), expected_tensor->shape()); + ASSERT_EQ(actual_tensor->strides(), expected_tensor->strides()); + ASSERT_EQ(actual_tensor->dim_names(), expected_tensor->dim_names()); + ASSERT_TRUE(actual_tensor->data()->Equals(*expected_tensor->data())); + ASSERT_TRUE(actual_tensor->Equals(*expected_tensor)); +} + +TEST_F(TestExtensionType, ToTensor) { + std::vector float_values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; + + auto element_sizes = std::vector{6, 6, 18, 18, 18, 18}; + + auto element_shapes = std::vector>{{2, 3}, {3, 2}, {3, 6}, + {6, 3}, {3, 2, 3}, {3, 2, 3}}; + auto tensor_shapes = std::vector>{ + {6, 2, 3}, {6, 2, 3}, {2, 3, 6}, {2, 3, 6}, {2, 3, 2, 3}, {2, 3, 2, 3}}; + + auto element_permutations = std::vector>{ + {0, 1}, {1, 0}, {0, 1}, {1, 0}, {0, 1, 2}, {2, 1, 0}}; + auto tensor_strides_32 = + std::vector>{{24, 12, 4}, {24, 4, 8}, {72, 24, 4}, + {72, 4, 12}, {72, 24, 12, 4}, {72, 4, 12, 24}}; + auto tensor_strides_64 = + std::vector>{{48, 24, 8}, {48, 8, 16}, {144, 48, 8}, + {144, 8, 24}, {144, 48, 24, 8}, {144, 8, 24, 48}}; + + auto element_dim_names = std::vector>{ + {"y", "z"}, {"z", "y"}, {"y", "z"}, {"z", "y"}, {"H", "W", "C"}, {"H", "W", "C"}}; + auto tensor_dim_names = std::vector>{ + {"", "y", "z"}, {"", "y", "z"}, {"", "y", "z"}, + {"", "y", "z"}, {"", "H", "W", "C"}, {"", "C", "W", "H"}}; + + for (size_t i = 0; i < element_shapes.size(); i++) { + CheckToTensor(float_values, float32(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_32[i]); + CheckToTensor(values_, int64(), element_sizes[i], element_shapes[i], + element_permutations[i], element_dim_names[i], tensor_shapes[i], + tensor_dim_names[i], tensor_strides_64[i]); + } +} + void CheckTensorRoundtrip(const std::shared_ptr& tensor) { ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); ASSERT_OK_AND_ASSIGN(auto tensor_from_array, ext_arr->ToTensor()); @@ -364,7 +450,7 @@ TEST_F(TestExtensionType, SliceTensor) { Tensor::Make(value_type_, Buffer::Wrap(values_partial_), shape_partial_)); ASSERT_EQ(tensor->strides(), tensor_strides_); ASSERT_EQ(tensor_partial->strides(), tensor_strides_); - auto 
ext_type = fixed_shape_tensor(value_type_, cell_shape_, {}, dim_names_); + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); auto exact_ext_type = internal::checked_pointer_cast(ext_type_); ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); @@ -404,11 +490,11 @@ TEST_F(TestExtensionType, ComputeStrides) { auto exact_ext_type = internal::checked_pointer_cast(ext_type_); auto ext_type_1 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_2 = internal::checked_pointer_cast( - fixed_shape_tensor(int64(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int64(), element_shape_, {}, dim_names_)); auto ext_type_3 = internal::checked_pointer_cast( - fixed_shape_tensor(int32(), cell_shape_, {}, dim_names_)); + fixed_shape_tensor(int32(), element_shape_, {}, dim_names_)); ASSERT_TRUE(ext_type_1->Equals(*ext_type_2)); ASSERT_FALSE(ext_type_1->Equals(*ext_type_3)); @@ -462,4 +548,96 @@ TEST_F(TestExtensionType, ToString) { ASSERT_EQ(expected_3, result_3); } +TEST_F(TestExtensionType, GetTensor) { + auto arr = ArrayFromJSON(element_type_, + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]," + "[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]"); + auto element_values = + std::vector>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}}; + + auto ext_type = fixed_shape_tensor(value_type_, element_shape_, {}, dim_names_); + auto permuted_ext_type = fixed_shape_tensor(value_type_, {3, 4}, {1, 0}, {"x", "y"}); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto exact_permuted_ext_type = + internal::checked_pointer_cast(permuted_ext_type); + + auto array = std::static_pointer_cast( + ExtensionType::WrapArray(ext_type, arr)); + auto permuted_array = std::static_pointer_cast( + ExtensionType::WrapArray(permuted_ext_type, arr)); + + for (size_t i = 0; i < element_values.size(); i++) { + // Get tensor from extension array with trivial permutation + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(i)); + auto actual_ext_scalar = internal::checked_pointer_cast(scalar); + ASSERT_OK_AND_ASSIGN(auto actual_tensor, + exact_ext_type->MakeTensor(actual_ext_scalar)); + ASSERT_OK(actual_tensor->Validate()); + ASSERT_OK_AND_ASSIGN(auto expected_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {3, 4}, {}, {"x", "y"})); + ASSERT_EQ(expected_tensor->shape(), actual_tensor->shape()); + ASSERT_EQ(expected_tensor->dim_names(), actual_tensor->dim_names()); + ASSERT_EQ(expected_tensor->strides(), actual_tensor->strides()); + ASSERT_EQ(actual_tensor->strides(), std::vector({32, 8})); + ASSERT_EQ(expected_tensor->type(), actual_tensor->type()); + ASSERT_TRUE(expected_tensor->Equals(*actual_tensor)); + + // Get tensor from extension array with non-trivial permutation + ASSERT_OK_AND_ASSIGN(auto expected_permuted_tensor, + Tensor::Make(value_type_, Buffer::Wrap(element_values[i]), + {4, 3}, {8, 24}, {"y", "x"})); + ASSERT_OK_AND_ASSIGN(scalar, permuted_array->GetScalar(i)); + ASSERT_OK_AND_ASSIGN(auto actual_permuted_tensor, + exact_permuted_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + ASSERT_OK(actual_permuted_tensor->Validate()); + ASSERT_EQ(expected_permuted_tensor->strides(), actual_permuted_tensor->strides()); + ASSERT_EQ(expected_permuted_tensor->shape(), actual_permuted_tensor->shape()); + ASSERT_EQ(expected_permuted_tensor->dim_names(), 
actual_permuted_tensor->dim_names()); + ASSERT_EQ(expected_permuted_tensor->type(), actual_permuted_tensor->type()); + ASSERT_EQ(expected_permuted_tensor->is_contiguous(), + actual_permuted_tensor->is_contiguous()); + ASSERT_EQ(expected_permuted_tensor->is_column_major(), + actual_permuted_tensor->is_column_major()); + ASSERT_TRUE(expected_permuted_tensor->Equals(*actual_permuted_tensor)); + } + + // Test null values fail + auto element_type = fixed_size_list(int64(), 1); + auto fsla_arr = ArrayFromJSON(element_type, "[[1], [null], null]"); + ext_type = fixed_shape_tensor(int64(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + auto ext_arr = ExtensionType::WrapArray(ext_type, fsla_arr); + auto tensor_array = internal::checked_pointer_cast(ext_arr); + + ASSERT_OK_AND_ASSIGN(auto scalar, tensor_array->GetScalar(0)); + ASSERT_OK_AND_ASSIGN(auto tensor, + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(1)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + ASSERT_OK_AND_ASSIGN(scalar, tensor_array->GetScalar(2)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid: Cannot convert data with nulls to Tensor."), + exact_ext_type->MakeTensor( + internal::checked_pointer_cast(scalar))); + + element_type = list(utf8()); + ext_type = fixed_shape_tensor(utf8(), {1}); + exact_ext_type = internal::checked_pointer_cast(ext_type); + scalar = std::make_shared(ArrayFromJSON(element_type, R"([["a", "b"]])")); + auto ext_scalar = std::make_shared(scalar, ext_type); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + testing::HasSubstr("Type error: Cannot convert non-fixed-width values to Tensor."), + exact_ext_type->MakeTensor(ext_scalar)); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h new file mode 100644 index 0000000000000..069880cb17c85 --- /dev/null +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/print.h" + +namespace arrow::internal { + +ARROW_EXPORT +Status IsPermutationValid(const std::vector& permutation) { + const auto size = static_cast(permutation.size()); + std::vector dim_seen(size, 0); + + for (const auto p : permutation) { + if (p < 0 || p >= size || dim_seen[p] != 0) { + return Status::Invalid( + "Permutation indices for ", size, + " dimensional tensors must be unique and within [0, ", size - 1, + "] range. 
Got: ", ::arrow::internal::PrintVector{permutation, ","}); + } + dim_seen[p] = 1; + } + return Status::OK(); +} + +} // namespace arrow::internal diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1029f3a629817..5ab07f21d5b71 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3541,7 +3541,7 @@ cdef class ExtensionArray(Array): return result -class FixedShapeTensorArray(ExtensionArray): +cdef class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. @@ -3582,17 +3582,48 @@ class FixedShapeTensorArray(ExtensionArray): def to_numpy_ndarray(self): """ - Convert fixed shape tensor extension array to a numpy array (with dim+1). + Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. - Note: ``permutation`` should be trivial (``None`` or ``[0, 1, ..., len(shape)-1]``). + The resulting ndarray will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + Ndarray representing tensors in the fixed shape tensor array concatenated + along the first dimension. """ - if self.type.permutation is None or self.type.permutation == list(range(len(self.type.shape))): - np_flat = np.asarray(self.storage.flatten()) - numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape)) - return numpy_tensor - else: - raise ValueError( - 'Only non-permuted tensors can be converted to numpy tensors.') + + return self.to_tensor().to_numpy() + + def to_tensor(self): + """ + Convert fixed shape tensor extension array to a pyarrow.Tensor. + + The resulting Tensor will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + + cdef: + CFixedShapeTensorArray* ext_array = (self.ap) + CResult[shared_ptr[CTensor]] ctensor + with nogil: + ctensor = ext_array.ToTensor() + return pyarrow_wrap_tensor(GetResultValue(ctensor)) @staticmethod def from_numpy_ndarray(obj): @@ -3600,9 +3631,7 @@ class FixedShapeTensorArray(ExtensionArray): Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. The first dimension of ndarray will become the length of the fixed shape tensor array. - - Numpy array needs to be C-contiguous in memory - (``obj.flags["C_CONTIGUOUS"]==True``). + If input array data is not contiguous a copy will be made. 
Parameters ---------- @@ -3636,17 +3665,25 @@ class FixedShapeTensorArray(ExtensionArray): ] ] """ - if not obj.flags["C_CONTIGUOUS"]: - raise ValueError('The data in the numpy array need to be in a single, ' - 'C-style contiguous segment.') + + if len(obj.shape) < 2: + raise ValueError( + "Cannot convert 1D array or scalar to fixed shape tensor array") + if np.prod(obj.shape) == 0: + raise ValueError("Expected a non-empty ndarray") + + permutation = (-np.array(obj.strides)).argsort(kind='stable') + if permutation[0] != 0: + raise ValueError('First stride needs to be largest to ensure that ' + 'individual tensor data is contiguous in memory.') arrow_type = from_numpy_dtype(obj.dtype) - shape = obj.shape[1:] - size = obj.size / obj.shape[0] + shape = np.take(obj.shape, permutation) + values = np.ravel(obj, order="K") return ExtensionArray.from_storage( - fixed_shape_tensor(arrow_type, shape), - FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size) + fixed_shape_tensor(arrow_type, shape[1:], permutation=permutation[1:] - 1), + FixedSizeListArray.from_arrays(values, shape[1:].prod()) ) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d92f09da779b6..6149bee97236f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2704,26 +2704,26 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() -cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": +cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): + CResult[shared_ptr[CTensor]] MakeTensor(const shared_ptr[CExtensionScalar]& scalar) const + @staticmethod CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, const vector[int64_t]& shape, const vector[int64_t]& permutation, const vector[c_string]& dim_names) - CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type, - const c_string& serialized_data) const - - c_string Serialize() const - const shared_ptr[CDataType] value_type() const vector[int64_t] shape() const vector[int64_t] permutation() const vector[c_string] dim_names() + cdef cppclass CFixedShapeTensorArray \ + " arrow::extension::FixedShapeTensorArray"(CExtensionArray): + const CResult[shared_ptr[CTensor]] ToTensor() const cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2772acf81861c..80ca3ea84187e 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1035,6 +1035,48 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +cdef class FixedShapeTensorScalar(ExtensionScalar): + """ + Concrete class for fixed shape tensor extension scalar. + """ + + def to_numpy(self): + """ + Convert fixed shape tensor scalar to a numpy.ndarray. + + The resulting ndarray's shape matches the permuted shape of the + fixed shape tensor scalar. + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + """ + return self.to_tensor().to_numpy() + + def to_tensor(self): + """ + Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape + and strides derived from corresponding FixedShapeTensorType. + + The conversion is zero-copy. 
+ + Returns + ------- + pyarrow.Tensor + Tensor represented stored in FixedShapeTensorScalar. + """ + cdef: + CFixedShapeTensorType* c_type = static_pointer_cast[CFixedShapeTensorType, CDataType]( + self.wrapped.get().type).get() + shared_ptr[CExtensionScalar] scalar = static_pointer_cast[CExtensionScalar, CScalar](self.wrapped) + shared_ptr[CTensor] ctensor + + with nogil: + ctensor = GetResultValue(c_type.MakeTensor(scalar)) + return pyarrow_wrap_tensor(ctensor) + + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index d8c792ef00c6b..fe38bf651baae 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1318,39 +1318,120 @@ def test_tensor_type(): assert tensor_type.permutation is None -def test_tensor_class_methods(): - tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) - storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], - pa.list_(pa.float32(), 6)) +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_class_methods(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3]) + storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) expected = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr.to_numpy_ndarray() + [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type) + np.testing.assert_array_equal(arr.to_tensor(), expected) + np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) + + expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type) + result = arr[1:].to_numpy_ndarray() np.testing.assert_array_equal(result, expected) - expected = np.array([[[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr[:1].to_numpy_ndarray() + values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] + flat_arr = np.array(values[0], dtype=value_type) + bw = value_type.itemsize + storage = pa.array(values, pa.list_(arrow_type, 12)) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = np.array( + [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + result = flat_arr.reshape(1, 2, 3, 2) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) np.testing.assert_array_equal(result, expected) - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="C") + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1]) + result = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = as_strided(flat_arr, shape=(1, 3, 2, 2), + strides=(bw * 12, bw, bw * 6, bw * 2)) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + assert result.type.permutation == [2, 
0, 1] + assert result.type.shape == [2, 2, 3] + assert result.to_tensor().shape == (1, 3, 2, 2) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + + +@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) +def test_tensor_array_from_numpy(value_type): + from numpy.lib.stride_tricks import as_strided + arrow_type = pa.from_numpy_dtype(value_type) + + arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + dtype=value_type, order="C") tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) - assert tensor_array_from_numpy.type.value_type == pa.float32() + assert tensor_array_from_numpy.type.value_type == arrow_type assert tensor_array_from_numpy.type.shape == [2, 3] - arr = np.array( - [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order="F") - with pytest.raises(ValueError, match="C-style contiguous segment"): + arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], + dtype=value_type, order="F") + with pytest.raises(ValueError, match="First stride needs to be largest"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]) - storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) - arr = pa.ExtensionArray.from_storage(tensor_type, storage) - with pytest.raises(ValueError, match="non-permuted tensors"): - arr.to_numpy_ndarray() + flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + bw = value_type.itemsize + + arr = flat_arr.reshape(1, 3, 4) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + assert tensor_array_from_numpy.type.shape == [3, 4] + assert tensor_array_from_numpy.type.permutation == [0, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = as_strided(flat_arr, shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + assert tensor_array_from_numpy.type.shape == [2, 2, 3] + assert tensor_array_from_numpy.type.permutation == [0, 2, 1] + assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) + + arr = flat_arr.reshape(1, 2, 3, 2) + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) + + arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=value_type) + expected = arr[1:] + result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() + np.testing.assert_array_equal(result, expected) + + arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + arr = np.array(1, dtype=value_type) + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + arr = np.array([], dtype=value_type) + + with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0))) + + with pytest.raises(ValueError, match="Expected a non-empty ndarray"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0, 3, 2))) + + with 
pytest.raises(ValueError, match="Expected a non-empty ndarray"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((3, 0, 2))) @pytest.mark.parametrize("tensor_type", ( diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ce3736b5af847..b9ba157a327a5 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1658,20 +1658,6 @@ cdef class FixedShapeTensorType(BaseExtensionType): else: return None - def __arrow_ext_serialize__(self): - """ - Serialized representation of metadata to reconstruct the type object. - """ - return self.tensor_ext_type.Serialize() - - @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): - """ - Return an FixedShapeTensor type instance from the storage type and serialized - metadata. - """ - return self.tensor_ext_type.Deserialize(storage_type, serialized) - def __arrow_ext_class__(self): return FixedShapeTensorArray @@ -1679,6 +1665,9 @@ cdef class FixedShapeTensorType(BaseExtensionType): return fixed_shape_tensor, (self.value_type, self.shape, self.dim_names, self.permutation) + def __arrow_ext_scalar_class__(self): + return FixedShapeTensorScalar + _py_extension_type_auto_load = False @@ -4976,8 +4965,9 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N cdef FixedShapeTensorType out = FixedShapeTensorType.__new__(FixedShapeTensorType) - c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make( - value_type.sp_type, c_shape, c_permutation, c_dim_names)) + with nogil: + c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make( + value_type.sp_type, c_shape, c_permutation, c_dim_names)) out.init(c_tensor_ext_type) From 42e35f101e87e689dcc48981abf81bc32c41d162 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Thu, 8 Feb 2024 09:44:19 -0500 Subject: [PATCH 67/74] GH-39812: [Python] Add bindings for ListView and LargeListView (#39813) ### Rationale for this change Add bindings to the ListView and LargeListView array formats. ### What changes are included in this PR? * Add initial implementation for ListView and LargeListView * Add basic unit tests ### Are these changes tested? * Basic unit tests only (follow up PRs will be needed to implement full functionality) ### Are there any user-facing changes? Yes, documentation is updated in this PR to include the new PyArrow objects. * Closes: #39812 Lead-authored-by: Dane Pitkin Co-authored-by: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/api/arrays.rst | 4 + docs/source/python/api/datatypes.rst | 4 + python/pyarrow/__init__.py | 14 +- python/pyarrow/array.pxi | 574 +++++++++++++++++++++++++++ python/pyarrow/includes/libarrow.pxd | 90 +++++ python/pyarrow/lib.pxd | 18 + python/pyarrow/lib.pyx | 2 + python/pyarrow/public-api.pxi | 4 + python/pyarrow/scalar.pxi | 10 + python/pyarrow/tests/test_array.py | 71 ++++ python/pyarrow/tests/test_misc.py | 4 + python/pyarrow/tests/test_scalars.py | 8 +- python/pyarrow/tests/test_types.py | 49 +++ python/pyarrow/types.pxi | 171 ++++++++ python/pyarrow/types.py | 10 + 15 files changed, 1027 insertions(+), 6 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index b858862dcff01..e6f6c3dbbd3d1 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -77,6 +77,8 @@ may expose data type-specific methods or properties. 
ListArray FixedSizeListArray LargeListArray + ListViewArray + LargeListViewArray MapArray RunEndEncodedArray StructArray @@ -135,6 +137,8 @@ classes may expose data type-specific methods or properties. RunEndEncodedScalar ListScalar LargeListScalar + ListViewScalar + LargeListViewScalar MapScalar StructScalar UnionScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 642c243b21af0..62bf4b7723558 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -60,6 +60,8 @@ These should be used to create Arrow data types and schemas. decimal128 list_ large_list + list_view + large_list_view map_ struct dictionary @@ -149,6 +151,8 @@ represents a given data type (such as ``int32``) or general category is_list is_large_list is_fixed_size_list + is_list_view + is_large_list_view is_struct is_union is_nested diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4dbd1258d3cea..2ee97ddb662e5 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,8 @@ def print_entry(label, value): binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, map_, struct, + list_, large_list, list_view, large_list_view, + map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, @@ -174,8 +175,9 @@ def print_entry(label, value): field, type_for_alias, DataType, DictionaryType, StructType, - ListType, LargeListType, MapType, FixedSizeListType, - UnionType, SparseUnionType, DenseUnionType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, @@ -201,8 +203,9 @@ def print_entry(label, value): Int32Array, UInt32Array, Int64Array, UInt64Array, HalfFloatArray, FloatArray, DoubleArray, - ListArray, LargeListArray, MapArray, - FixedSizeListArray, UnionArray, + ListArray, LargeListArray, FixedSizeListArray, + ListViewArray, LargeListViewArray, + MapArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, BinaryViewArray, StringViewArray, @@ -220,6 +223,7 @@ def print_entry(label, value): HalfFloatScalar, FloatScalar, DoubleScalar, Decimal128Scalar, Decimal256Scalar, ListScalar, LargeListScalar, FixedSizeListScalar, + ListViewScalar, LargeListViewScalar, Date32Scalar, Date64Scalar, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5ab07f21d5b71..ad01d45571ba1 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2460,6 +2460,578 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) +cdef class ListViewArray(Array): + """ + Concrete class for Arrow arrays of a list view data type. + """ + + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _sizes = asarray(sizes, type='int32') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. 
+ + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + def flatten(self, memory_pool=None): + """ + Unnest this ListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + memory_pool : MemoryPool, optional + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + +cdef class LargeListViewArray(Array): + """ + Concrete class for Arrow arrays of a large list view data type. + + Identical to ListViewArray, but with 64-bit offsets. + """ + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int64') + _sizes = asarray(sizes, type='int64') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CLargeListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list view sizes as an int64 array. 
+ + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + def flatten(self, memory_pool=None): + """ + Unnest this LargeListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + memory_pool : MemoryPool, optional + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + cdef class MapArray(ListArray): """ Concrete class for Arrow arrays of a map data type. @@ -3710,6 +4282,8 @@ cdef dict _array_classes = { _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, _Type_LARGE_LIST: LargeListArray, + _Type_LIST_VIEW: ListViewArray, + _Type_LARGE_LIST_VIEW: LargeListViewArray, _Type_MAP: MapArray, _Type_FIXED_SIZE_LIST: FixedSizeListArray, _Type_SPARSE_UNION: UnionArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6149bee97236f..8056d99354965 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -132,6 +132,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LIST" arrow::Type::LIST" _Type_LARGE_LIST" arrow::Type::LARGE_LIST" _Type_FIXED_SIZE_LIST" arrow::Type::FIXED_SIZE_LIST" + _Type_LIST_VIEW" arrow::Type::LIST_VIEW" + _Type_LARGE_LIST_VIEW" arrow::Type::LARGE_LIST_VIEW" _Type_STRUCT" arrow::Type::STRUCT" _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION" _Type_DENSE_UNION" arrow::Type::DENSE_UNION" @@ -366,6 +368,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] value_type() shared_ptr[CField] value_field() + cdef cppclass CListViewType" arrow::ListViewType"(CDataType): + CListViewType(const shared_ptr[CDataType]& value_type) + CListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + + cdef cppclass CLargeListViewType" arrow::LargeListViewType"(CDataType): + CLargeListViewType(const shared_ptr[CDataType]& value_type) + CLargeListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + cdef cppclass CMapType" arrow::MapType"(CDataType): CMapType(const shared_ptr[CField]& key_field, const shared_ptr[CField]& item_field, c_bool keys_sorted) @@ -485,6 +499,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] run_end_type, shared_ptr[CDataType] value_type) + 
cdef shared_ptr[CDataType] CMakeListViewType" arrow::list_view"( + shared_ptr[CField] value_type) + + cdef shared_ptr[CDataType] CMakeLargeListViewType" arrow::large_list_view"( + shared_ptr[CField] value_type) + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -690,6 +710,70 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CListViewArray" arrow::ListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + const int32_t* raw_value_offsets() + const int32_t* raw_value_sizes() + int32_t value_offset(int i) + int32_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + + cdef cppclass CLargeListViewArray" arrow::LargeListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + int64_t value_offset(int i) + int64_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + cdef cppclass CMapArray" arrow::MapArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( @@ -1150,6 +1234,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CListScalar" arrow::ListScalar"(CBaseListScalar): pass + cdef cppclass CListViewScalar" arrow::ListViewScalar"(CBaseListScalar): + pass + + cdef cppclass CLargeListViewScalar" arrow::LargeListViewScalar"(CBaseListScalar): + pass + cdef cppclass CMapScalar" arrow::MapScalar"(CListScalar): pass diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index c1104864066e9..48350212c2076 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -120,6 +120,16 @@ cdef class LargeListType(DataType): const CLargeListType* list_type +cdef class ListViewType(DataType): + cdef: + const CListViewType* list_view_type + + +cdef class LargeListViewType(DataType): + cdef: + const CLargeListViewType* list_view_type + + cdef class MapType(DataType): cdef: const CMapType* map_type @@ -425,6 +435,14 @@ cdef class LargeListArray(BaseListArray): pass +cdef class ListViewArray(Array): + pass + + +cdef class LargeListViewArray(Array): + pass + + cdef class MapArray(ListArray): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index b0368b67f790e..3245e50f0fe69 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -110,6 +110,8 @@ Type_BINARY_VIEW = _Type_BINARY_VIEW Type_STRING_VIEW = _Type_STRING_VIEW Type_LIST = _Type_LIST Type_LARGE_LIST = 
_Type_LARGE_LIST +Type_LIST_VIEW = _Type_LIST_VIEW +Type_LARGE_LIST_VIEW = _Type_LARGE_LIST_VIEW Type_MAP = _Type_MAP Type_FIXED_SIZE_LIST = _Type_FIXED_SIZE_LIST Type_STRUCT = _Type_STRUCT diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 72e16f2cec387..966273b4bea84 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -87,6 +87,10 @@ cdef api object pyarrow_wrap_data_type( out = ListType.__new__(ListType) elif type.get().id() == _Type_LARGE_LIST: out = LargeListType.__new__(LargeListType) + elif type.get().id() == _Type_LIST_VIEW: + out = ListViewType.__new__(ListViewType) + elif type.get().id() == _Type_LARGE_LIST_VIEW: + out = LargeListViewType.__new__(LargeListViewType) elif type.get().id() == _Type_MAP: out = MapType.__new__(MapType) elif type.get().id() == _Type_FIXED_SIZE_LIST: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 80ca3ea84187e..41bfde39adb6f 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -720,6 +720,14 @@ cdef class LargeListScalar(ListScalar): pass +cdef class ListViewScalar(ListScalar): + pass + + +cdef class LargeListViewScalar(ListScalar): + pass + + cdef class StructScalar(Scalar, collections.abc.Mapping): """ Concrete class for struct scalars. @@ -1108,6 +1116,8 @@ cdef dict _scalar_classes = { _Type_LIST: ListScalar, _Type_LARGE_LIST: LargeListScalar, _Type_FIXED_SIZE_LIST: FixedSizeListScalar, + _Type_LIST_VIEW: ListViewScalar, + _Type_LARGE_LIST_VIEW: LargeListViewScalar, _Type_STRUCT: StructScalar, _Type_MAP: MapScalar, _Type_DICTIONARY: DictionaryScalar, diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f851d4e0b6c29..bd9ae214b041e 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3573,3 +3573,74 @@ def test_run_end_encoded_from_buffers(): with pytest.raises(ValueError): pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers, 1, offset, children) + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_from_arrays(list_array_type): + # test in order offsets, similar to ListArray representation + values = [1, 2, 3, 4, 5, 6, None, 7] + offsets = [0, 2, 4, 6] + sizes = [2, 2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[1, 2], [3, 4], [5, 6], [None, 7]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test out of order offsets with overlapping values + values = [1, 2, 3, 4] + offsets = [2, 1, 0] + sizes = [2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[3, 4], [2, 3], [1, 2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test null offsets and empty list values + values = [] + offsets = [0, None] + sizes = [0, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[], None] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == [0, 0] + assert array.sizes.to_pylist() == sizes + + # test null sizes and empty list values + values = [] + offsets = [0, 0] + sizes = [None, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [None, []] + assert array.values.to_pylist() == values + assert 
array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == [0, 0] + + # test null bitmask + values = [1, 2] + offsets = [0, 0, 1] + sizes = [1, 0, 1] + mask = pa.array([False, True, False]) + array = list_array_type.from_arrays(offsets, sizes, values, mask=mask) + + assert array.to_pylist() == [[1], None, [2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_flatten(list_array_type): + values = [1, 2, 3, 4] + offsets = [3, 2, 1, 0] + sizes = [1, 1, 1, 1] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.flatten().to_pylist() == [4, 3, 2, 1] diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 8cec8783280dd..39dac4eb81dfb 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -154,6 +154,8 @@ def test_set_timezone_db_path_non_windows(): pa.ListType, pa.LargeListType, pa.FixedSizeListType, + pa.ListViewType, + pa.LargeListViewType, pa.UnionType, pa.SparseUnionType, pa.DenseUnionType, @@ -227,6 +229,8 @@ def test_set_timezone_db_path_non_windows(): pa.StringViewScalar, pa.ListScalar, pa.LargeListScalar, + pa.ListViewScalar, + pa.LargeListViewScalar, pa.MapScalar, pa.FixedSizeListScalar, pa.UnionScalar, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index eed5f045be945..074fb757e265a 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -57,6 +57,9 @@ ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar), + # TODO GH-39855 + # ([1, 2, 3], pa.list_view(pa.int8()), pa.ListViewScalar), + # ([1, 2, 3, 4], pa.large_list_view(pa.int8()), pa.LargeListViewScalar), (datetime.date.today(), None, pa.Date32Scalar), (datetime.date.today(), pa.date64(), pa.Date64Scalar), (datetime.datetime.now(), None, pa.TimestampScalar), @@ -537,7 +540,10 @@ def test_fixed_size_binary(): @pytest.mark.parametrize(('ty', 'klass'), [ (pa.list_(pa.string()), pa.ListScalar), - (pa.large_list(pa.string()), pa.LargeListScalar) + (pa.large_list(pa.string()), pa.LargeListScalar), + # TODO GH-39855 + # (pa.list_view(pa.string()), pa.ListViewScalar), + # (pa.large_list_view(pa.string()), pa.LargeListViewScalar) ]) def test_list(ty, klass): v = ['foo', None] diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index a5ab3128dc874..0add5786088d3 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -66,6 +66,8 @@ def get_many_types(): pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), + pa.list_view(pa.int32()), + pa.large_list_view(pa.uint16()), pa.map_(pa.string(), pa.int32()), pa.map_(pa.field('key', pa.int32(), nullable=False), pa.field('value', pa.int32())), @@ -169,6 +171,18 @@ def test_is_list(): assert not types.is_list(pa.int32()) +def test_is_list_view(): + a = pa.list_view(pa.int32()) + b = pa.large_list_view(pa.int32()) + + assert types.is_list_view(a) + assert not types.is_large_list_view(a) + assert not types.is_list(a) + assert types.is_large_list_view(b) + assert not types.is_list_view(b) + assert not types.is_large_list(b) + + def test_is_map(): m = pa.map_(pa.utf8(), pa.int32()) @@ -573,6 +587,41 @@ def test_large_list_type(): 
pa.large_list(None) +def test_list_view_type(): + ty = pa.list_view(pa.int64()) + assert isinstance(ty, pa.ListViewType) + assert ty.value_type == pa.int64() + assert ty.value_field == pa.field("item", pa.int64(), nullable=True) + + # nullability matters in comparison + ty_non_nullable = pa.list_view(pa.field("item", pa.int64(), nullable=False)) + assert ty != ty_non_nullable + + # field names don't matter by default + ty_named = pa.list_view(pa.field("element", pa.int64())) + assert ty == ty_named + assert not ty.equals(ty_named, check_metadata=True) + + # metadata doesn't matter by default + ty_metadata = pa.list_view( + pa.field("item", pa.int64(), metadata={"hello": "world"})) + assert ty == ty_metadata + assert not ty.equals(ty_metadata, check_metadata=True) + + with pytest.raises(TypeError): + pa.list_view(None) + + +def test_large_list_view_type(): + ty = pa.large_list_view(pa.utf8()) + assert isinstance(ty, pa.LargeListViewType) + assert ty.value_type == pa.utf8() + assert ty.value_field == pa.field("item", pa.utf8(), nullable=True) + + with pytest.raises(TypeError): + pa.large_list_view(None) + + def test_map_type(): ty = pa.map_(pa.utf8(), pa.int32()) assert isinstance(ty, pa.MapType) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index b9ba157a327a5..50b10c5512dc1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -557,6 +557,101 @@ cdef class LargeListType(DataType): return pyarrow_wrap_data_type(self.list_type.value_type()) +cdef class ListViewType(DataType): + """ + Concrete class for list view data types. + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) + + +cdef class LargeListViewType(DataType): + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return large_list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of large list view values. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) + + cdef class MapType(DataType): """ Concrete class for map data types. @@ -4517,6 +4612,82 @@ cpdef LargeListType large_list(value_type): return out +cpdef ListViewType list_view(value_type): + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('ListView requires DataType or Field') + + list_view_type = CMakeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + +cpdef LargeListViewType large_list_view(value_type): + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('LargeListView requires DataType or Field') + + list_view_type = CMakeLargeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + cpdef MapType map_(key_type, item_type, keys_sorted=False): """ Create MapType instance from key and item data types or fields. diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 32398dac9c5f5..0f68ca9fe574b 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -151,6 +151,16 @@ def is_fixed_size_list(t): return t.id == lib.Type_FIXED_SIZE_LIST +@doc(is_null, datatype="list view") +def is_list_view(t): + return t.id == lib.Type_LIST_VIEW + + +@doc(is_null, datatype="large list view") +def is_large_list_view(t): + return t.id == lib.Type_LARGE_LIST_VIEW + + @doc(is_null, datatype="struct") def is_struct(t): return t.id == lib.Type_STRUCT From 30f6fdbbd32b77f0351c08732c395bbd28af2850 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 8 Feb 2024 16:28:25 +0100 Subject: [PATCH 68/74] GH-39996: [Archery] Fix Crossbow build on a PR from a fork's main branch (#39997) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? 
* Closes: #39996 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- dev/archery/archery/bot.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/dev/archery/archery/bot.py b/dev/archery/archery/bot.py index 4e5104362254c..caab824aeb38f 100644 --- a/dev/archery/archery/bot.py +++ b/dev/archery/archery/bot.py @@ -324,7 +324,8 @@ def crossbow(obj, crossbow): obj['crossbow_repo'] = crossbow -def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): +def _clone_arrow_and_crossbow(dest, crossbow_repo, arrow_repo_url, + pr_number, pr_branch): """ Clone the repositories and initialize crossbow objects. @@ -338,22 +339,25 @@ def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): Object containing information about the pull request the comment bot was triggered from. """ + bare_arrow_path = dest / 'arrow_bare' arrow_path = dest / 'arrow' queue_path = dest / 'crossbow' - # clone arrow and checkout the pull request's branch - pull_request_ref = 'pull/{}/head:{}'.format( - pull_request.number, pull_request.head.ref - ) - git.clone(pull_request.base.repo.clone_url, str(arrow_path)) - git.fetch('origin', pull_request_ref, git_dir=arrow_path) - git.checkout(pull_request.head.ref, git_dir=arrow_path) - - # clone crossbow repository + # 1. clone arrow and checkout the PR's branch + pr_ref = f'pull/{pr_number}/head:{pr_branch}' + # we do a bare clone of upstream arrow to avoid issues when the PR is + # submitted from a fork's main branch (GH-39996) + git.clone('--bare', arrow_repo_url, str(bare_arrow_path)) + # fetch the PR's branch into the bare clone + git.fetch('origin', pr_ref, git_dir=bare_arrow_path) + # clone and checkout the PR's branch into a full local repo + git.clone(f'--branch={pr_branch}', bare_arrow_path, arrow_path) + + # 2. clone crossbow repository crossbow_url = 'https://github.com/{}'.format(crossbow_repo) git.clone(crossbow_url, str(queue_path)) - # initialize crossbow objects + # 3. initialize crossbow objects github_token = os.environ['CROSSBOW_GITHUB_TOKEN'] arrow = Repo(arrow_path) queue = Queue(queue_path, github_token=github_token, require_https=True) @@ -385,7 +389,9 @@ def submit(obj, tasks, groups, params, arrow_version, wait): arrow, queue = _clone_arrow_and_crossbow( dest=Path(tmpdir), crossbow_repo=crossbow_repo, - pull_request=pull_request, + arrow_repo_url=pull_request.base.repo.clone_url, + pr_number=pull_request.number, + pr_branch=pull_request.head.ref, ) # load available tasks configuration and groups from yaml config = Config.load_yaml(arrow.path / "dev" / "tasks" / "tasks.yml") From 98c422557cfe33e714ea009a270aab98818e2748 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 9 Feb 2024 00:05:50 +0800 Subject: [PATCH 69/74] GH-39976: [C++] Fix out-of-line data size calculation in BinaryViewBuilder::AppendArraySlice (#39994) ### Rationale for this change Fix the bug in `BinaryViewBuilder::AppendArraySlice` that, when calculating out-of-line data size, the array is wrongly iterated. ### What changes are included in this PR? Fix and UT. ### Are these changes tested? UT included. ### Are there any user-facing changes? No. 
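For readers less familiar with Arrow's slice semantics, a small pyarrow illustration of the offset bookkeeping involved; the fix itself lives in the C++ `BinaryViewBuilder` and is not directly reachable from Python, and a plain string array is used here for simplicity.

```python
# Sketch of why a slice's offset must be added when walking the parent array's buffers.
import pyarrow as pa

arr = pa.array([None, "long string; not inlined"])
sliced = arr.slice(1, 1)   # zero-copy: shares arr's buffers, starting at logical offset 1

print(sliced.offset)       # 1 -> readers must add this offset into the shared buffers
print(sliced.null_count)   # 0 -> the null at index 0 of `arr` lies outside the slice
```

The bug fixed here was of exactly this kind: the validity bitmap was visited from `array.offset` over the whole array instead of from `array.offset + offset` over the requested `length`.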
* Closes: #39976 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/src/arrow/array/array_test.cc | 23 +++++++++++++++++++++++ cpp/src/arrow/array/builder_binary.cc | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index e9d478f108584..21ac1a09f56e7 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -905,6 +905,29 @@ TEST_F(TestArray, TestAppendArraySlice) { } } +// GH-39976: Test out-of-line data size calculation in +// BinaryViewBuilder::AppendArraySlice. +TEST_F(TestArray, TestBinaryViewAppendArraySlice) { + BinaryViewBuilder src_builder(pool_); + ASSERT_OK(src_builder.AppendNull()); + ASSERT_OK(src_builder.Append("long string; not inlined")); + ASSERT_EQ(2, src_builder.length()); + ASSERT_OK_AND_ASSIGN(auto src, src_builder.Finish()); + ASSERT_OK(src->ValidateFull()); + + ArraySpan span; + span.SetMembers(*src->data()); + BinaryViewBuilder dst_builder(pool_); + ASSERT_OK(dst_builder.AppendArraySlice(span, 0, 1)); + ASSERT_EQ(1, dst_builder.length()); + ASSERT_OK(dst_builder.AppendArraySlice(span, 1, 1)); + ASSERT_EQ(2, dst_builder.length()); + ASSERT_OK_AND_ASSIGN(auto dst, dst_builder.Finish()); + ASSERT_OK(dst->ValidateFull()); + + AssertArraysEqual(*src, *dst); +} + TEST_F(TestArray, ValidateBuffersPrimitive) { auto empty_buffer = std::make_shared(""); auto null_buffer = Buffer::FromString("\xff"); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index f85852fa0eda6..7e5721917f3a0 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -54,7 +54,7 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse int64_t out_of_line_total = 0, i = 0; VisitNullBitmapInline( - array.buffers[0].data, array.offset, array.length, array.null_count, + array.buffers[0].data, array.offset + offset, length, array.null_count, [&] { if (!values[i].is_inline()) { out_of_line_total += static_cast(values[i].size()); From a946214b127ff50ea0cf7e68946c186fa66009a2 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Fri, 9 Feb 2024 01:02:04 +0800 Subject: [PATCH 70/74] GH-39973: [C++][CI] Disable debug memory pool for ASAN and Valgrind (#39975) ### Rationale for this change Disable debug memory pool for ASAN and Valgrind so that they can detect more subtle memory issues regarding to buffer tail bytes. ### What changes are included in this PR? 1. Add a `none` option to debug memory pool env var to make other things slightly easier. 2. Change `*_test.sh` scripts to conditionally set debug memory pool env var. 3. Top-level docker compose change to pass none to debug memory pool env var for ASAN and Valgrind. ### Are these changes tested? The CI should cover it well. ### Are there any user-facing changes? No. 
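As a usage note (not part of the patch): with the new `none` value, a run that must not use the debug memory pool can opt out explicitly. A minimal Python sketch, assuming a pyarrow build that honors `ARROW_DEBUG_MEMORY_POOL`; the variable has to be set before Arrow initializes its memory pools, so it is set here before importing pyarrow.

```python
# Minimal sketch: explicitly disable the debug memory pool via the new "none" value.
import os
os.environ["ARROW_DEBUG_MEMORY_POOL"] = "none"   # must happen before the pools are created

import pyarrow as pa

buf = pa.allocate_buffer(64)                     # allocated without debug-pool guard checks
print(pa.default_memory_pool().backend_name, buf.size)
```

The CI scripts changed below do the equivalent at the shell level, defaulting to `trap` only when the variable is not already set.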
* Closes: #39973 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- ci/appveyor-cpp-build.bat | 5 ++++- ci/scripts/c_glib_test.sh | 6 ++++-- ci/scripts/cpp_test.sh | 6 ++++-- ci/scripts/python_test.sh | 6 ++++-- ci/scripts/r_test.sh | 6 ++++-- ci/scripts/ruby_test.sh | 6 ++++-- cpp/src/arrow/memory_pool.cc | 4 ++-- docker-compose.yml | 4 ++++ docs/source/cpp/env_vars.rst | 4 +++- python/pyarrow/tests/test_memory.py | 30 +++++++++++++++++++++++++---- 10 files changed, 59 insertions(+), 18 deletions(-) diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 5e561a0461ea3..ab85032fe9924 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -26,7 +26,10 @@ git submodule update --init || exit /B set ARROW_TEST_DATA=%CD%\testing\data set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data -set ARROW_DEBUG_MEMORY_POOL=trap +@rem Enable memory debug checks if the env is not set already +IF "%ARROW_DEBUG_MEMORY_POOL%"=="" ( + set ARROW_DEBUG_MEMORY_POOL=trap +) set CMAKE_BUILD_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% set CTEST_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% diff --git a/ci/scripts/c_glib_test.sh b/ci/scripts/c_glib_test.sh index cea600191ae05..f8083c7759d8a 100755 --- a/ci/scripts/c_glib_test.sh +++ b/ci/scripts/c_glib_test.sh @@ -28,8 +28,10 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi pushd ${source_dir} diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 0c6e1c6ef7057..1d685c51a9326 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -37,8 +37,10 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib}:${LD_LIBRARY_P # to retrieve metadata. Disable this so that S3FileSystem tests run faster. export AWS_EC2_METADATA_DISABLED=TRUE -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi ctest_options=() case "$(uname)" in diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 341c2dd0577ef..8dfedb2880b50 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -32,8 +32,10 @@ export ARROW_GDB_SCRIPT=${arrow_dir}/cpp/gdb_arrow.py # Enable some checks inside Python itself export PYTHONDEVMODE=1 -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi # By default, force-test all optional components : ${PYARROW_TEST_ACERO:=${ARROW_ACERO:-ON}} diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 22ec551edb9fa..72078ab3c06c2 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -72,8 +72,10 @@ export _R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_=TRUE # to retrieve metadata. Disable this so that S3FileSystem tests run faster. export AWS_EC2_METADATA_DISABLED=TRUE -# Enable memory debug checks. 
-export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi # Hack so that texlive2020 doesn't pollute the home dir export TEXMFCONFIG=/tmp/texmf-config diff --git a/ci/scripts/ruby_test.sh b/ci/scripts/ruby_test.sh index 4fd6a85fe3966..56c33a4d6378a 100755 --- a/ci/scripts/ruby_test.sh +++ b/ci/scripts/ruby_test.sh @@ -26,7 +26,9 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 -# Enable memory debug checks. -export ARROW_DEBUG_MEMORY_POOL=trap +# Enable memory debug checks if the env is not set already +if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then + export ARROW_DEBUG_MEMORY_POOL=trap +fi rake -f ${source_dir}/Rakefile BUILD_DIR=${build_dir} USE_BUNDLER=yes diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 843329c17bc28..d58c203d2ae27 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -195,7 +195,7 @@ bool IsDebugEnabled() { return false; } auto env_value = *std::move(maybe_env_value); - if (env_value.empty()) { + if (env_value.empty() || env_value == "none") { return false; } auto debug_state = DebugState::Instance(); @@ -212,7 +212,7 @@ bool IsDebugEnabled() { return true; } ARROW_LOG(WARNING) << "Invalid value for " << kDebugMemoryEnvVar << ": '" << env_value - << "'. Valid values are 'abort', 'trap', 'warn'."; + << "'. Valid values are 'abort', 'trap', 'warn', 'none'."; return false; }(); diff --git a/docker-compose.yml b/docker-compose.yml index a31fa0d9aa659..7ae625a017417 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -320,6 +320,8 @@ services: # Shrink test runtime by enabling minimal optimizations ARROW_C_FLAGS_DEBUG: "-g1 -Og" ARROW_CXX_FLAGS_DEBUG: "-g1 -Og" + # GH-39973: Do not use debug memory pool for valgrind + ARROW_DEBUG_MEMORY_POOL: "none" ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_FLIGHT: "OFF" ARROW_FLIGHT_SQL: "OFF" @@ -598,6 +600,8 @@ services: CXX: clang++-${CLANG_TOOLS} # Avoid creating huge static libraries ARROW_BUILD_STATIC: "OFF" + # GH-39973: Do not use debug memory pool for ASAN + ARROW_DEBUG_MEMORY_POOL: "none" ARROW_ENABLE_TIMING_TESTS: # inherit # GH-33920: Disable Flight SQL to reduce build time. # We'll be able to re-enable this with Ubuntu 24.04 because diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 0fa80aa1106c1..eb7c797df5e27 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -58,8 +58,10 @@ that changing their value later will have an effect. - ``abort`` exits the processus with a non-zero return value; - ``trap`` issues a platform-specific debugger breakpoint / trap instruction; - ``warn`` prints a warning on stderr and continues execution; + - ``none`` disables memory checks; - If this variable is not set, or has empty an value, memory checks are disabled. + If this variable is not set, or has an empty value, it has the same effect + as the value ``none`` - memory checks are disabled. .. 
note:: While this functionality can be useful and has little overhead, it diff --git a/python/pyarrow/tests/test_memory.py b/python/pyarrow/tests/test_memory.py index d9fdeb152c35e..4f199952344f2 100644 --- a/python/pyarrow/tests/test_memory.py +++ b/python/pyarrow/tests/test_memory.py @@ -243,13 +243,35 @@ def test_debug_memory_pool_warn(pool_factory): assert "Wrong size on deallocation" in res.stderr -@pytest.mark.parametrize('pool_factory', supported_factories()) -def test_debug_memory_pool_disabled(pool_factory): - res = run_debug_memory_pool(pool_factory.__name__, "") +def check_debug_memory_pool_disabled(pool_factory, env_value, msg): + res = run_debug_memory_pool(pool_factory.__name__, env_value) # The subprocess either returned successfully or was killed by a signal # (due to writing out of bounds), depending on the underlying allocator. if os.name == "posix": assert res.returncode <= 0 else: res.check_returncode() - assert res.stderr == "" + if msg == "": + assert res.stderr == "" + else: + assert msg in res.stderr + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_none(pool_factory): + check_debug_memory_pool_disabled(pool_factory, "none", "") + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_empty(pool_factory): + check_debug_memory_pool_disabled(pool_factory, "", "") + + +@pytest.mark.parametrize('pool_factory', supported_factories()) +def test_debug_memory_pool_unknown(pool_factory): + env_value = "some_arbitrary_value" + msg = ( + f"Invalid value for ARROW_DEBUG_MEMORY_POOL: '{env_value}'. " + "Valid values are 'abort', 'trap', 'warn', 'none'." + ) + check_debug_memory_pool_disabled(pool_factory, env_value, msg) From de3cdc00c21fd3e9d8d763099591f23720ca8d1f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 8 Feb 2024 18:45:40 +0100 Subject: [PATCH 71/74] GH-39962: [C++] Small CSV reader refactoring (#39963) ### What changes are included in this PR? Factor our some shared functionality in the CSV reader to avoid code duplication. ### Are these changes tested? Yes, by existing tests. ### Are there any user-facing changes? No. * Closes: #39962 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/csv/reader.cc | 144 +++++++++++------------------------- 1 file changed, 45 insertions(+), 99 deletions(-) diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 1ac25e290a814..e981fafe8e780 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -445,16 +445,20 @@ class BlockParsingOperator { num_rows_seen_ += parser->total_num_rows(); } - RETURN_NOT_OK(block.consume_bytes(parsed_size)); + if (block.consume_bytes) { + RETURN_NOT_OK(block.consume_bytes(parsed_size)); + } return ParsedBlock{std::move(parser), block.block_index, static_cast(parsed_size) + block.bytes_skipped}; } + int num_csv_cols() const { return num_csv_cols_; } + private: io::IOContext io_context_; - ParseOptions parse_options_; - int num_csv_cols_; - bool count_rows_; + const ParseOptions parse_options_; + const int num_csv_cols_; + const bool count_rows_; int64_t num_rows_seen_; }; @@ -570,7 +574,6 @@ class ReaderMixin { parse_options_(parse_options), convert_options_(convert_options), count_rows_(count_rows), - num_rows_seen_(count_rows_ ? 
1 : -1), input_(std::move(input)) {} protected: @@ -581,6 +584,7 @@ class ReaderMixin { const uint8_t* data = buf->data(); const auto data_end = data + buf->size(); DCHECK_GT(data_end - data, 0); + int64_t num_rows_seen = 1; if (read_options_.skip_rows) { // Skip initial rows (potentially invalid CSV data) @@ -593,14 +597,14 @@ class ReaderMixin { "either file is too short or header is larger than block size"); } if (count_rows_) { - num_rows_seen_ += num_skipped_rows; + num_rows_seen += num_skipped_rows; } } if (read_options_.column_names.empty()) { // Parse one row (either to read column names or to know the number of columns) - BlockParser parser(io_context_.pool(), parse_options_, num_csv_cols_, - num_rows_seen_, 1); + BlockParser parser(io_context_.pool(), parse_options_, /*num_cols=*/-1, + /*first_row=*/num_rows_seen, /*max_num_rows=*/1); uint32_t parsed_size = 0; RETURN_NOT_OK(parser.Parse( std::string_view(reinterpret_cast(data), data_end - data), @@ -627,7 +631,7 @@ class ReaderMixin { // Skip parsed header row data += parsed_size; if (count_rows_) { - ++num_rows_seen_; + ++num_rows_seen; } } } else { @@ -636,14 +640,17 @@ class ReaderMixin { if (count_rows_) { // increase rows seen to skip past rows which will be skipped - num_rows_seen_ += read_options_.skip_rows_after_names; + num_rows_seen += read_options_.skip_rows_after_names; } auto bytes_consumed = data - buf->data(); *rest = SliceBuffer(buf, bytes_consumed); - num_csv_cols_ = static_cast(column_names_.size()); - DCHECK_GT(num_csv_cols_, 0); + int32_t num_csv_cols = static_cast(column_names_.size()); + DCHECK_GT(num_csv_cols, 0); + // Since we know the number of columns, we can instantiate the BlockParsingOperator + parsing_operator_.emplace(io_context_, parse_options_, num_csv_cols, + count_rows_ ? num_rows_seen : -1); RETURN_NOT_OK(MakeConversionSchema()); return bytes_consumed; @@ -691,7 +698,7 @@ class ReaderMixin { if (convert_options_.include_columns.empty()) { // Include all columns in CSV file order - for (int32_t col_index = 0; col_index < num_csv_cols_; ++col_index) { + for (int32_t col_index = 0; col_index < num_csv_cols(); ++col_index) { append_csv_column(column_names_[col_index], col_index); } } else { @@ -719,66 +726,25 @@ class ReaderMixin { return Status::OK(); } - struct ParseResult { - std::shared_ptr parser; - int64_t parsed_bytes; - }; - - Result Parse(const std::shared_ptr& partial, - const std::shared_ptr& completion, - const std::shared_ptr& block, int64_t block_index, - bool is_final) { - static constexpr int32_t max_num_rows = std::numeric_limits::max(); - auto parser = std::make_shared( - io_context_.pool(), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows); - - std::shared_ptr straddling; - std::vector views; - if (partial->size() != 0 || completion->size() != 0) { - if (partial->size() == 0) { - straddling = completion; - } else if (completion->size() == 0) { - straddling = partial; - } else { - ARROW_ASSIGN_OR_RAISE( - straddling, ConcatenateBuffers({partial, completion}, io_context_.pool())); - } - views = {std::string_view(*straddling), std::string_view(*block)}; - } else { - views = {std::string_view(*block)}; - } - uint32_t parsed_size; - if (is_final) { - RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size)); - } else { - RETURN_NOT_OK(parser->Parse(views, &parsed_size)); - } - // See BlockParsingOperator for explanation. 
- const int64_t bytes_before_buffer = partial->size() + completion->size(); - if (static_cast(parsed_size) < bytes_before_buffer) { - return Status::Invalid( - "CSV parser got out of sync with chunker. This can mean the data file " - "contains cell values spanning multiple lines; please consider enabling " - "the option 'newlines_in_values'."); - } + Result Parse(const CSVBlock& block) { + DCHECK(parsing_operator_.has_value()); + return (*parsing_operator_)(block); + } - if (count_rows_) { - num_rows_seen_ += parser->total_num_rows(); - } - return ParseResult{std::move(parser), static_cast(parsed_size)}; + int num_csv_cols() const { + DCHECK(parsing_operator_.has_value()); + return parsing_operator_->num_csv_cols(); } io::IOContext io_context_; - ReadOptions read_options_; - ParseOptions parse_options_; - ConvertOptions convert_options_; - - // Number of columns in the CSV file - int32_t num_csv_cols_ = -1; - // Whether num_rows_seen_ tracks the number of rows seen in the CSV being parsed - bool count_rows_; - // Number of rows seen in the csv. Not used if count_rows is false - int64_t num_rows_seen_; + const ReadOptions read_options_; + const ParseOptions parse_options_; + const ConvertOptions convert_options_; + // Whether to track the number of rows seen in the CSV being parsed + const bool count_rows_; + + std::optional parsing_operator_; + // Column names in the CSV file std::vector column_names_; ConversionSchema conversion_schema_; @@ -822,14 +788,10 @@ class BaseTableReader : public ReaderMixin, public csv::TableReader { return Status::OK(); } - Result ParseAndInsert(const std::shared_ptr& partial, - const std::shared_ptr& completion, - const std::shared_ptr& block, - int64_t block_index, bool is_final) { - ARROW_ASSIGN_OR_RAISE(auto result, - Parse(partial, completion, block, block_index, is_final)); - RETURN_NOT_OK(ProcessData(result.parser, block_index)); - return result.parsed_bytes; + Status ParseAndInsert(const CSVBlock& block) { + ARROW_ASSIGN_OR_RAISE(auto result, Parse(block)); + RETURN_NOT_OK(ProcessData(result.parser, result.block_index)); + return Status::OK(); } // Trigger conversion of parsed block data @@ -921,8 +883,6 @@ class StreamingReaderImpl : public ReaderMixin, ProcessHeader(first_buffer, &after_header)); bytes_decoded_->fetch_add(header_bytes_consumed); - auto parser_op = - BlockParsingOperator(io_context_, parse_options_, num_csv_cols_, num_rows_seen_); ARROW_ASSIGN_OR_RAISE( auto decoder_op, BlockDecodingOperator::Make(io_context_, convert_options_, conversion_schema_)); @@ -930,8 +890,7 @@ class StreamingReaderImpl : public ReaderMixin, auto block_gen = SerialBlockReader::MakeAsyncIterator( std::move(buffer_generator), MakeChunker(parse_options_), std::move(after_header), read_options_.skip_rows_after_names); - auto parsed_block_gen = - MakeMappedGenerator(std::move(block_gen), std::move(parser_op)); + auto parsed_block_gen = MakeMappedGenerator(std::move(block_gen), *parsing_operator_); auto rb_gen = MakeMappedGenerator(std::move(parsed_block_gen), std::move(decoder_op)); auto self = shared_from_this(); @@ -1035,11 +994,7 @@ class SerialTableReader : public BaseTableReader { // EOF break; } - ARROW_ASSIGN_OR_RAISE( - int64_t parsed_bytes, - ParseAndInsert(maybe_block.partial, maybe_block.completion, maybe_block.buffer, - maybe_block.block_index, maybe_block.is_final)); - RETURN_NOT_OK(maybe_block.consume_bytes(parsed_bytes)); + RETURN_NOT_OK(ParseAndInsert(maybe_block)); } // Finish conversion, create schema and table 
RETURN_NOT_OK(task_group_->Finish()); @@ -1110,13 +1065,8 @@ class AsyncThreadedTableReader DCHECK(!maybe_block.consume_bytes); // Launch parse task - self->task_group_->Append([self, maybe_block] { - return self - ->ParseAndInsert(maybe_block.partial, maybe_block.completion, - maybe_block.buffer, maybe_block.block_index, - maybe_block.is_final) - .status(); - }); + self->task_group_->Append( + [self, maybe_block] { return self->ParseAndInsert(maybe_block); }); return Status::OK(); }; @@ -1239,12 +1189,8 @@ class CSVRowCounter : public ReaderMixin, // IterationEnd. std::function>(const CSVBlock&)> count_cb = [self](const CSVBlock& maybe_block) -> Result> { - ARROW_ASSIGN_OR_RAISE( - auto parser, - self->Parse(maybe_block.partial, maybe_block.completion, maybe_block.buffer, - maybe_block.block_index, maybe_block.is_final)); - RETURN_NOT_OK(maybe_block.consume_bytes(parser.parsed_bytes)); - int32_t total_row_count = parser.parser->total_num_rows(); + ARROW_ASSIGN_OR_RAISE(auto parsed_block, self->Parse(maybe_block)); + int32_t total_row_count = parsed_block.parser->total_num_rows(); self->row_count_ += total_row_count; return total_row_count; }; From 8f1537468a7e5f5a526ddf1e9c634740c923404b Mon Sep 17 00:00:00 2001 From: Lubo Slivka Date: Thu, 8 Feb 2024 22:58:07 +0100 Subject: [PATCH 72/74] GH-40004: [Python][FlightRPC] Release GIL in GeneratorStream (#40005) Fixes #40004. * Closes: #40004 Authored-by: lupko Signed-off-by: David Li --- python/pyarrow/_flight.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index a2ff045f256ac..67ee7590560f0 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -2013,8 +2013,9 @@ cdef CStatus _data_stream_next(void* self, CFlightPayload* payload) except *: max_attempts = 128 for _ in range(max_attempts): if stream.current_stream != nullptr: - check_flight_status( - stream.current_stream.get().Next().Value(payload)) + with nogil: + check_flight_status( + stream.current_stream.get().Next().Value(payload)) # If the stream ended, see if there's another stream from the # generator if payload.ipc_message.metadata != nullptr: From f9a88e5398fe6ec406759e55935d17bb09f9569b Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Thu, 8 Feb 2024 14:26:06 -0800 Subject: [PATCH 73/74] GH-39916: [C#] Restore support for .NET 4.6.2 (#40008) ### What changes are included in this PR? Project targets have been added for net462 which is still in support. A few tests have been modified to allow them to build against that target. ### Are these changes tested? Yes. ### Are there any user-facing changes? There are new build artifacts for Apache.Arrow.dll and Apache.Arrow.Compression.dll. 
* Closes: #39916 Authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.csproj | 8 +++++++- csharp/src/Apache.Arrow/Apache.Arrow.csproj | 12 +++++++++--- .../Extensions/TupleExtensions.netstandard.cs | 7 +++++++ .../Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- .../Apache.Arrow.Tests/BinaryArrayBuilderTests.cs | 8 ++++---- 5 files changed, 28 insertions(+), 9 deletions(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index fded62911262c..6988567193db4 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -1,10 +1,16 @@ - netstandard2.0 Provides decompression support for the Arrow IPC format + + netstandard2.0;net462 + + + netstandard2.0 + + diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 3a229f4ffcaf8..c4bb64b73a9ed 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -1,14 +1,20 @@ - netstandard2.0;net6.0 true $(DefineConstants);UNSAFE_BYTEBUFFER;BYTEBUFFER_NO_BOUNDS_CHECK;ENABLE_SPAN_T Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. - + + netstandard2.0;net6.0;net462 + + + netstandard2.0;net6.0 + + + @@ -34,7 +40,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs b/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs index fe42075f14f73..e0e0f5707086b 100644 --- a/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs +++ b/csharp/src/Apache.Arrow/Extensions/TupleExtensions.netstandard.cs @@ -25,5 +25,12 @@ public static void Deconstruct(this Tuple value, out T1 item1, o item1 = value.Item1; item2 = value.Item2; } + + public static void Deconstruct(this Tuple value, out T1 item1, out T2 item2, out T3 item3) + { + item1 = value.Item1; + item2 = value.Item2; + item3 = value.Item3; + } } } diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index d8a92ff756751..c422da56b4cef 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -7,7 +7,7 @@ - net7.0;net472 + net7.0;net472;net462 net7.0 diff --git a/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs index 4c2b050d0c8ba..447572dda0eea 100644 --- a/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/BinaryArrayBuilderTests.cs @@ -83,7 +83,7 @@ public void AppendSingleByte(byte[][] initialContents, byte singleByte) builder.AppendRange(initialContents); int initialLength = builder.Length; int expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(new[] { singleByte }); + var expectedArrayContents = initialContents.Concat(new[] { new[] { singleByte } }); // Act var actualReturnValue = builder.Append(singleByte); @@ -130,7 +130,7 @@ public void AppendNull(byte[][] initialContents) builder.AppendRange(initialContents); int initialLength = builder.Length; int expectedLength = initialLength + 1; - var expectedArrayContents = 
initialContents.Append(null); + var expectedArrayContents = initialContents.Concat(new byte[][] { null }); // Act var actualReturnValue = builder.AppendNull(); @@ -180,7 +180,7 @@ public void AppendReadOnlySpan(byte[][] initialContents, byte[] bytes) int initialLength = builder.Length; var span = (ReadOnlySpan)bytes; int expectedLength = initialLength + 1; - var expectedArrayContents = initialContents.Append(bytes); + var expectedArrayContents = initialContents.Concat(new[] { bytes }); // Act var actualReturnValue = builder.Append(span); @@ -230,7 +230,7 @@ public void AppendEnumerable(byte[][] initialContents, byte[] bytes) int initialLength = builder.Length; int expectedLength = initialLength + 1; var enumerable = (IEnumerable)bytes; - var expectedArrayContents = initialContents.Append(bytes); + var expectedArrayContents = initialContents.Concat(new[] { bytes }); // Act var actualReturnValue = builder.Append(enumerable); From a0dec7f39394e619c8bdfe0eacb6ecde73a9ec12 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 9 Feb 2024 01:41:36 +0000 Subject: [PATCH 74/74] GH-39352: [FS][Azure] Enable azure in builds (#39971) ### Rationale for this change ### What changes are included in this PR? Enable Azure in linux and mac os wheel builds. Tried to copy GCS Don't enable Azure for windows builds because windows builds where all failing. Failures were a combination of cmake version or `Could not find a package configuration file provided by "wil"`. I think it makes sense to come back to windows builds in another PR. ### Are these changes tested? There is no new functionality to test. ### Are there any user-facing changes? No * Closes: #39352 Authored-by: Thomas Newton Signed-off-by: Sutou Kouhei --- ci/docker/python-wheel-manylinux.dockerfile | 1 + ci/scripts/cpp_build.sh | 1 + ci/scripts/python_build.sh | 1 + ci/scripts/python_test.sh | 1 + ci/scripts/python_wheel_macos_build.sh | 3 +++ ci/scripts/python_wheel_manylinux_build.sh | 3 +++ ci/scripts/python_wheel_unix_test.sh | 6 ++++-- ci/vcpkg/vcpkg.json | 10 ++++++++++ dev/tasks/python-wheels/github.osx.amd64.yml | 1 + dev/tasks/python-wheels/github.osx.arm64.yml | 1 + 10 files changed, 26 insertions(+), 2 deletions(-) diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 2831440d5a967..b1d9ed5ab88d9 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -82,6 +82,7 @@ RUN vcpkg install \ --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=/arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 69d86e871ac5f..60cab1a9feaba 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -54,6 +54,7 @@ if [ "${GITHUB_ACTIONS:-false}" = "true" ]; then fi if [ "${ARROW_ENABLE_THREADING:-ON}" = "OFF" ]; then + ARROW_AZURE=OFF ARROW_FLIGHT=OFF ARROW_FLIGHT_SQL=OFF ARROW_GCS=OFF diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index c0a27e6e705e9..9bdcc4d687584 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -55,6 +55,7 @@ export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja} export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug} export PYARROW_WITH_ACERO=${ARROW_ACERO:-OFF} +export PYARROW_WITH_AZURE=${ARROW_AZURE:-OFF} export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON} export 
PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF} diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 8dfedb2880b50..20ca3300c0538 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -39,6 +39,7 @@ fi # By default, force-test all optional components : ${PYARROW_TEST_ACERO:=${ARROW_ACERO:-ON}} +: ${PYARROW_TEST_AZURE:=${ARROW_AZURE:-ON}} : ${PYARROW_TEST_CUDA:=${ARROW_CUDA:-ON}} : ${PYARROW_TEST_DATASET:=${ARROW_DATASET:-ON}} : ${PYARROW_TEST_FLIGHT:=${ARROW_FLIGHT:-ON}} diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 8123a9fdf1c48..bea5409100770 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -63,6 +63,7 @@ pip install "delocate>=0.10.3" echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_ACERO:=ON} +: ${ARROW_AZURE:=ON} : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} @@ -95,6 +96,7 @@ pushd ${build_dir}/build cmake \ -DARROW_ACERO=${ARROW_ACERO} \ + -DARROW_AZURE=${ARROW_AZURE} \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ @@ -148,6 +150,7 @@ export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} +export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 58e42fea88088..4d4d4fb694e0b 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -49,6 +49,7 @@ rm -rf /arrow/python/pyarrow/*.so.* echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_ACERO:=ON} +: ${ARROW_AZURE:=ON} : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} @@ -87,6 +88,7 @@ pushd /tmp/arrow-build cmake \ -DARROW_ACERO=${ARROW_ACERO} \ + -DARROW_AZURE=${ARROW_AZURE} \ -DARROW_BUILD_SHARED=ON \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ @@ -141,6 +143,7 @@ export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} +export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 01250ff7ef40c..a25e5c51bddbc 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -28,15 +28,17 @@ fi source_dir=${1} +: ${ARROW_AZURE:=ON} : ${ARROW_FLIGHT:=ON} -: ${ARROW_SUBSTRAIT:=ON} -: ${ARROW_S3:=ON} : ${ARROW_GCS:=ON} +: ${ARROW_S3:=ON} +: ${ARROW_SUBSTRAIT:=ON} : ${CHECK_IMPORTS:=ON} : ${CHECK_UNITTESTS:=ON} : ${INSTALL_PYARROW:=ON} export PYARROW_TEST_ACERO=ON +export PYARROW_TEST_AZURE=${ARROW_AZURE} export PYARROW_TEST_CYTHON=OFF export PYARROW_TEST_DATASET=ON export PYARROW_TEST_FLIGHT=${ARROW_FLIGHT} diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index 99771728ecf18..e86479a7c32fc 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -105,6 +105,16 @@ } ] }, + "azure": { + "description": "Azure blob storage support", + "dependencies": [ + "azure-core-cpp", + "azure-identity-cpp", + "azure-storage-blobs-cpp", + "azure-storage-common-cpp", + 
"azure-storage-files-datalake-cpp" + ] + }, "orc": { "description": "ORC support", "dependencies": [ diff --git a/dev/tasks/python-wheels/github.osx.amd64.yml b/dev/tasks/python-wheels/github.osx.amd64.yml index 526412f84214b..e31a681653b37 100644 --- a/dev/tasks/python-wheels/github.osx.amd64.yml +++ b/dev/tasks/python-wheels/github.osx.amd64.yml @@ -85,6 +85,7 @@ jobs: --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index 35d74f1462453..380c2e42f1d88 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -71,6 +71,7 @@ jobs: --clean-after-build \ --x-install-root=${VCPKG_ROOT}/installed \ --x-manifest-root=arrow/ci/vcpkg \ + --x-feature=azure \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \